forked from jiuyuan/InfiniTensor
Support bang c kernel wanghailu 0927 (#43)
* fix a little bug which found by new verison CMake * add code for support BangC language kernel , just like Cuda kernel, not library * add bangc kernel * support BangC kernel * add code for support BangC kernel * support bangc kernel * fix some code from reviewer * fix code of template fumction * add code for support bangc kernel * fix bangc format Co-authored-by: wanghailu <wanghailu@qiyuanlab.com> Co-authored-by: Haojie Wang <haojie0429@gmail.com>
This commit is contained in:
parent
26cee55e81
commit
b0c2a08252
|
@ -121,6 +121,7 @@ if(USE_CUDA)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(USE_BANG)
|
if(USE_BANG)
|
||||||
|
include_directories(src/kernels/mlu/include)
|
||||||
################################################################################
|
################################################################################
|
||||||
# Neuware Evironment
|
# Neuware Evironment
|
||||||
################################################################################
|
################################################################################
|
||||||
|
@ -151,53 +152,12 @@ if(USE_BANG)
|
||||||
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
|
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
# Sample Kernels
|
# BangC Kernels
|
||||||
################################################################################
|
################################################################################
|
||||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{NEUWARE_HOME}/cmake" "$ENV{NEUWARE_HOME}/cmake/modules")
|
add_subdirectory(src/kernels/mlu)
|
||||||
find_package(BANG)
|
|
||||||
if(NOT BANG_FOUND)
|
|
||||||
message(FATAL_ERROR "BANG cannot be found.")
|
|
||||||
elseif(NOT BANG_CNCC_EXECUTABLE)
|
|
||||||
message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
|
|
||||||
endif()
|
|
||||||
set(BANG_CNCC_FLAGS "-Wall -Werror -fPIC -std=c++11 --target=${TARGET_CPU_ARCH} -O3")
|
|
||||||
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}"
|
|
||||||
"--bang-arch=compute_20"
|
|
||||||
"--bang-arch=compute_30"
|
|
||||||
"--bang-mlu-arch=mtp_322"
|
|
||||||
"--bang-wram-align64"
|
|
||||||
)
|
|
||||||
|
|
||||||
if(${TARGET_CPU_ARCH} MATCHES "aarch64-linux-gnu")
|
|
||||||
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
|
|
||||||
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
|
|
||||||
execute_process(
|
|
||||||
COMMAND uname -m
|
|
||||||
OUTPUT_VARIABLE _uname_m
|
|
||||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
|
||||||
)
|
|
||||||
if (NOT ("${TARGET_CPU_ARCH}" MATCHES ".*${_uname_m}.*" AND "${_uname_m}" MATCHES "aarch64"))
|
|
||||||
execute_process(
|
|
||||||
COMMAND "${CMAKE_CXX_COMPILER}" "-v" "-c" "-x" "c++" "/dev/null" "-M"
|
|
||||||
ERROR_VARIABLE _cxx_verbose
|
|
||||||
)
|
|
||||||
execute_process(
|
|
||||||
COMMAND "echo" "${_cxx_verbose}"
|
|
||||||
COMMAND "sed" "-n" "/include.*search starts here/,/End of search list/{s/^ //p}"
|
|
||||||
COMMAND "tr" "'\n'" ";"
|
|
||||||
OUTPUT_VARIABLE _cxx_includes
|
|
||||||
)
|
|
||||||
list(REMOVE_ITEM _cxx_includes "/usr/include")
|
|
||||||
foreach(_include ${_cxx_includes})
|
|
||||||
message(STATUS "add include path: ${_include}")
|
|
||||||
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -idirafter ${_include}")
|
|
||||||
endforeach()
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
#bang_add_library(bangops SHARED ${SRC_BANG})
|
|
||||||
#target_link_libraries(bangops ${CAMBRICON_CNDRV})
|
|
||||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||||
#target_link_libraries(InfiniTensor bangops)
|
target_link_libraries(InfiniTensor bangops)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# # Python bindings
|
# # Python bindings
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
#pragma once
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "bang_div.h"
|
||||||
|
#include "operators/element_wise.h"
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
|
||||||
|
auto op = as<ElementWiseObj>(_op);
|
||||||
|
float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
|
||||||
|
float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
|
||||||
|
float *const cData = (op->getOutput()->getRawDataPtr<float *>());
|
||||||
|
|
||||||
|
auto dim = op->getInputs(0)->getDims();
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(obj);
|
||||||
|
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
|
||||||
|
if (op->getOpType() == OpType::Div)
|
||||||
|
div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
|
||||||
|
else
|
||||||
|
IT_TODO_HALT();
|
||||||
|
}
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -48,20 +48,22 @@ class BangRuntimeObj : public RuntimeObj {
|
||||||
return workspace;
|
return workspace;
|
||||||
}
|
}
|
||||||
|
|
||||||
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
|
void copyBlobFromCPU(void *dst, const void *src,
|
||||||
checkBangError(
|
size_t bytes) const override {
|
||||||
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_HOST2DEV));
|
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||||
|
CNRT_MEM_TRANS_DIR_HOST2DEV));
|
||||||
}
|
}
|
||||||
|
|
||||||
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
|
void copyBlobToCPU(void *dst, const void *src,
|
||||||
checkBangError(
|
size_t bytes) const override {
|
||||||
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_DEV2HOST));
|
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||||
|
CNRT_MEM_TRANS_DIR_DEV2HOST));
|
||||||
}
|
}
|
||||||
|
|
||||||
void copyBlobInsideRuntime(void *dst, void *src,
|
void copyBlobInsideRuntime(void *dst, const void *src,
|
||||||
size_t bytes) const override {
|
size_t bytes) const override {
|
||||||
checkBangError(
|
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||||
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_PEER2PEER));
|
CNRT_MEM_TRANS_DIR_PEER2PEER));
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#include "operators/element_wise.h"
|
#include "operators/element_wise.h"
|
||||||
|
#include "bang/bang_element_wise.h"
|
||||||
#include "bang/bang_kernel_without_config.h"
|
#include "bang/bang_kernel_without_config.h"
|
||||||
#include "bang/bang_runtime.h"
|
#include "bang/bang_runtime.h"
|
||||||
|
|
||||||
|
@ -80,12 +81,12 @@ class MulCnnl : public ElementWiseCnnl {
|
||||||
cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
|
cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
|
||||||
};
|
};
|
||||||
|
|
||||||
// class ElementWiseBang : public BangKernelWithoutConfig {
|
class ElementWiseBang : public BangKernelWithoutConfig {
|
||||||
// void compute(const Operator &_op,
|
void compute(const Operator &_op,
|
||||||
// const RuntimeObj *_context) const override {
|
const RuntimeObj *_context) const override {
|
||||||
// element_wise_kernel(_op);
|
element_wise_kernel(_context, _op);
|
||||||
// }
|
}
|
||||||
// };
|
};
|
||||||
|
|
||||||
REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
|
REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
|
||||||
"Add_cnnl_BANG_Float32");
|
"Add_cnnl_BANG_Float32");
|
||||||
|
@ -94,9 +95,8 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
|
||||||
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
|
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
|
||||||
"Mul_cnnl_BANG_Float32");
|
"Mul_cnnl_BANG_Float32");
|
||||||
|
|
||||||
// REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32,
|
REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
|
||||||
// ElementWiseBang,
|
"Div_Bang_Float32");
|
||||||
// "Div_Bang_Float32");
|
|
||||||
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
|
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
|
||||||
// ElementWiseBang,
|
// ElementWiseBang,
|
||||||
// "Pow_Bang_Float32");
|
// "Pow_Bang_Float32");
|
||||||
|
|
|
@ -0,0 +1,46 @@
|
||||||
|
cmake_minimum_required(VERSION 3.3)
|
||||||
|
project(bangops)
|
||||||
|
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
|
||||||
|
set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe")
|
||||||
|
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3")
|
||||||
|
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC")
|
||||||
|
|
||||||
|
# check `NEUWARE_HOME` env
|
||||||
|
message(${NEUWARE_HOME})
|
||||||
|
if(EXISTS ${NEUWARE_HOME})
|
||||||
|
include_directories("${NEUWARE_HOME}/include")
|
||||||
|
link_directories("${NEUWARE_HOME}/lib64")
|
||||||
|
link_directories("${NEUWARE_HOME}/lib")
|
||||||
|
set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}")
|
||||||
|
else()
|
||||||
|
message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# setup cmake search path
|
||||||
|
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
|
||||||
|
"${CMAKE_SOURCE_DIR}/cmake"
|
||||||
|
"${NEUWARE_HOME}/cmake"
|
||||||
|
"${NEUWARE_HOME}/cmake/modules"
|
||||||
|
)
|
||||||
|
|
||||||
|
# include FindBANG.cmake and check cncc
|
||||||
|
find_package(BANG)
|
||||||
|
if(NOT BANG_FOUND)
|
||||||
|
message(FATAL_ERROR "BANG cannot be found.")
|
||||||
|
elseif (NOT BANG_CNCC_EXECUTABLE)
|
||||||
|
message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# setup cncc flags
|
||||||
|
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread")
|
||||||
|
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
|
||||||
|
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220"
|
||||||
|
"--bang-mlu-arch=mtp_270"
|
||||||
|
"--bang-mlu-arch=mtp_290"
|
||||||
|
"--bang-mlu-arch=mtp_372"
|
||||||
|
)
|
||||||
|
|
||||||
|
file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu")
|
||||||
|
bang_add_library(bangops SHARED ${src_files})
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
#pragma once
|
||||||
|
#include "cnnl.h"
|
||||||
|
namespace infini {
|
||||||
|
void div_kernel(cnnlHandle_t handle, const float *input1, const float *input2,
|
||||||
|
float *output, const uint32_t num);
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -0,0 +1,7 @@
|
||||||
|
#ifndef BANG_KERNELS_DIVOPERATION_DIV_H_
|
||||||
|
#define BANG_KERNELS_DIVOPERATION_DIV_H_
|
||||||
|
|
||||||
|
__mlu_global__ void MLUDivKernelUnion1(float *output, float *input1,
|
||||||
|
float *input2, uint32_t num);
|
||||||
|
|
||||||
|
#endif // BANG_KERNELS_DIVOPERATION_DIV_H_
|
|
@ -0,0 +1,24 @@
|
||||||
|
#include "bang_div.h"
|
||||||
|
#include "div.h"
|
||||||
|
namespace infini {
|
||||||
|
void div_kernel(cnnlHandle_t handle,
|
||||||
|
const float *input1,
|
||||||
|
const float *input2,
|
||||||
|
float *output,
|
||||||
|
const uint32_t num) {
|
||||||
|
// 任务类型和调度方法
|
||||||
|
cnrtDim3_t k_dim;
|
||||||
|
cnrtFunctionType_t k_type;
|
||||||
|
cnrtQueue_t queue;
|
||||||
|
cnnlGetQueue(handle, &queue);
|
||||||
|
k_dim.x = 4;
|
||||||
|
k_dim.y = 8;
|
||||||
|
k_dim.z = 1;
|
||||||
|
k_type = CNRT_FUNC_TYPE_UNION1;
|
||||||
|
// launch 任务
|
||||||
|
MLUDivKernelUnion1<<<k_dim, k_type, queue>>>((float*)output,
|
||||||
|
(float*)input1,
|
||||||
|
(float*)input2,
|
||||||
|
num);
|
||||||
|
}
|
||||||
|
};
|
|
@ -0,0 +1,50 @@
|
||||||
|
#include "div.h"
|
||||||
|
|
||||||
|
#define NRAM_USE_SIZE 102400
|
||||||
|
|
||||||
|
__nram__ char left[NRAM_USE_SIZE];
|
||||||
|
__nram__ char right[NRAM_USE_SIZE];
|
||||||
|
__nram__ char output[NRAM_USE_SIZE];
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
__mlu_device__ void DivFunction(T* output1, T* input1, T* input2, size_t num) {
|
||||||
|
int use_nram_size = NRAM_USE_SIZE;
|
||||||
|
int deal_align = use_nram_size / sizeof(T);
|
||||||
|
int num_per_core = num / taskDim;
|
||||||
|
int num_rem = num % taskDim;
|
||||||
|
int easy = num_per_core;
|
||||||
|
int hard = num_per_core + (num_rem != 0 ? 1 : 0);
|
||||||
|
int my = taskId < num_rem ? hard : easy;
|
||||||
|
int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
|
||||||
|
char* input1_start = (char*)input1 + start * sizeof(T);
|
||||||
|
char* input2_start = (char*)input2 + start * sizeof(T);
|
||||||
|
char* output_start = (char*)output1 + start * sizeof(T);
|
||||||
|
|
||||||
|
int my_repeat = my / deal_align;
|
||||||
|
int my_rem = my % deal_align;
|
||||||
|
for(int i = 0; i < my_repeat; ++i) {
|
||||||
|
__memcpy(left, input1_start, use_nram_size, GDRAM2NRAM);
|
||||||
|
__memcpy(right, input2_start, use_nram_size, GDRAM2NRAM);
|
||||||
|
__bang_active_recip((T*)right, (T*)right, deal_align);
|
||||||
|
__bang_mul((T*)output, (T*)left, (T*)right, deal_align);
|
||||||
|
__memcpy(output_start, output, use_nram_size, NRAM2GDRAM);
|
||||||
|
input1_start += use_nram_size;
|
||||||
|
input2_start += use_nram_size;
|
||||||
|
output_start += use_nram_size;
|
||||||
|
}
|
||||||
|
if(my_rem) {
|
||||||
|
__memcpy(left, input1_start, my_rem * sizeof(T), GDRAM2NRAM);
|
||||||
|
__memcpy(right, input2_start, my_rem * sizeof(T), GDRAM2NRAM);
|
||||||
|
__bang_active_recip((T*)right, (T*)right, deal_align);
|
||||||
|
__bang_mul((T*)output, (T*)left, (T*)right, deal_align);
|
||||||
|
__memcpy(output_start, output, my_rem * sizeof(T), NRAM2GDRAM);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__mlu_global__ void MLUDivKernelUnion1(float *output,
|
||||||
|
float *input1,
|
||||||
|
float *input2,
|
||||||
|
uint32_t num) {
|
||||||
|
DivFunction((float*)output, (float*)input1, (float*)input2, num);
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "core/blob.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/element_wise.h"
|
||||||
|
#include "utils/validation.h"
|
||||||
|
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void testBangcKernel(
|
||||||
|
const std::function<void(void *, size_t, DataType)> &generator,
|
||||||
|
const Shape &shape) {
|
||||||
|
// Runtime
|
||||||
|
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor inputCpu1 =
|
||||||
|
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||||
|
inputCpu1->dataMalloc();
|
||||||
|
inputCpu1->setData(generator);
|
||||||
|
Tensor inputCpu2 =
|
||||||
|
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||||
|
inputCpu2->dataMalloc();
|
||||||
|
inputCpu2->setData(generator);
|
||||||
|
|
||||||
|
// inputCpu1->printData();
|
||||||
|
// inputCpu2->printData();
|
||||||
|
|
||||||
|
// GPU
|
||||||
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
|
||||||
|
auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
|
||||||
|
auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
|
||||||
|
bangGraph->dataMalloc();
|
||||||
|
bangRuntime->run(bangGraph);
|
||||||
|
auto outputGpu = gpuOp->getOutput();
|
||||||
|
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||||
|
// outputGpu2Cpu->printData();
|
||||||
|
// CPU
|
||||||
|
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||||
|
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
|
||||||
|
cpuGraph->dataMalloc();
|
||||||
|
cpuRuntime->run(cpuGraph);
|
||||||
|
auto outputCpu = cpuOp->getOutput();
|
||||||
|
// outputCpu->printData();
|
||||||
|
// Check
|
||||||
|
float *const cpuRes =
|
||||||
|
(float *)(outputCpu->template getRawDataPtr<float *>());
|
||||||
|
float *const mluRes =
|
||||||
|
(float *)(outputGpu2Cpu->template getRawDataPtr<float *>());
|
||||||
|
EXPECT_LE(computeDifference2(cpuRes, mluRes, outputCpu->size()), 0.003);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BangcKernel_Div, run) {
|
||||||
|
testBangcKernel<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
Loading…
Reference in New Issue