diff --git a/CMakeLists.txt b/CMakeLists.txt
index f760f934..52f4ff4e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -121,6 +121,7 @@ if(USE_CUDA)
 endif()
 
 if(USE_BANG)
+  include_directories(src/kernels/mlu/include)
   ################################################################################
   # Neuware Evironment
   ################################################################################
@@ -151,53 +152,12 @@ if(USE_BANG)
   message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
 
   ################################################################################
-  # Sample Kernels
+  # BangC Kernels
   ################################################################################
-  set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{NEUWARE_HOME}/cmake" "$ENV{NEUWARE_HOME}/cmake/modules")
-  find_package(BANG)
-  if(NOT BANG_FOUND)
-    message(FATAL_ERROR "BANG cannot be found.")
-  elseif(NOT BANG_CNCC_EXECUTABLE)
-    message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
-  endif()
-  set(BANG_CNCC_FLAGS "-Wall -Werror -fPIC -std=c++11 --target=${TARGET_CPU_ARCH} -O3")
-  set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}"
-                      "--bang-arch=compute_20"
-                      "--bang-arch=compute_30"
-                      "--bang-mlu-arch=mtp_322"
-                      "--bang-wram-align64"
-  )
-
-  if(${TARGET_CPU_ARCH} MATCHES "aarch64-linux-gnu")
-    set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
-    add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
-    execute_process(
-      COMMAND uname -m
-      OUTPUT_VARIABLE _uname_m
-      OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-    if (NOT ("${TARGET_CPU_ARCH}" MATCHES ".*${_uname_m}.*" AND "${_uname_m}" MATCHES "aarch64"))
-      execute_process(
-        COMMAND "${CMAKE_CXX_COMPILER}" "-v" "-c" "-x" "c++" "/dev/null" "-M"
-        ERROR_VARIABLE _cxx_verbose
-      )
-      execute_process(
-        COMMAND "echo" "${_cxx_verbose}"
-        COMMAND "sed" "-n" "/include.*search starts here/,/End of search list/{s/^ //p}"
-        COMMAND "tr" "'\n'" ";"
-        OUTPUT_VARIABLE _cxx_includes
-      )
-      list(REMOVE_ITEM _cxx_includes "/usr/include")
-      foreach(_include ${_cxx_includes})
-        message(STATUS "add include path: ${_include}")
-        set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -idirafter ${_include}")
-      endforeach()
-    endif()
-  endif()
-  #bang_add_library(bangops SHARED ${SRC_BANG})
-  #target_link_libraries(bangops ${CAMBRICON_CNDRV})
+  add_subdirectory(src/kernels/mlu)
+
   target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-  #target_link_libraries(InfiniTensor bangops)
+  target_link_libraries(InfiniTensor bangops)
 endif()
 
 # # Python bindings
diff --git a/include/bang/bang_element_wise.h b/include/bang/bang_element_wise.h
new file mode 100644
index 00000000..a0bf03c4
--- /dev/null
+++ b/include/bang/bang_element_wise.h
@@ -0,0 +1,31 @@
+#pragma once
+#include "bang/bang_runtime.h"
+#include "bang_div.h"
+#include "operators/element_wise.h"
+namespace infini {
+
+// Runs an element-wise operator through a hand-written BANG C kernel.
+// Only OpType::Div is handled; any other op type, or operands whose element
+// counts differ, ends in IT_TODO_HALT().
+//
+// `inline` because the definition lives in a header: a second translation
+// unit including it would otherwise emit a duplicate symbol at link time.
+inline void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
+    auto op = as<ElementWiseObj>(_op);
+    float *const aData = op->getInputs(0)->getRawDataPtr<float *>();
+    float *const bData = op->getInputs(1)->getRawDataPtr<float *>();
+    float *const cData = op->getOutput()->getRawDataPtr<float *>();
+
+    auto context = dynamic_cast<const BangRuntimeObj *>(obj);
+    // Count every element of the tensor instead of reading dims 0..3, so the
+    // kernel is not limited to (and does not index past) 4-D shapes.
+    const size_t count = op->getInputs(0)->size();
+    const bool sameSize = op->getInputs(1)->size() == count;
+    if (op->getOpType() == OpType::Div && sameSize)
+        div_kernel(context->cnnlHandle(), aData, bData, cData,
+                   static_cast<uint32_t>(count));
+    else
+        IT_TODO_HALT();
+}
+
+} // namespace infini
diff --git a/include/bang/bang_runtime.h b/include/bang/bang_runtime.h
index 311c807b..6b43988c 100644
--- a/include/bang/bang_runtime.h
+++ b/include/bang/bang_runtime.h
@@ -48,20 +48,22 @@ class BangRuntimeObj : public RuntimeObj {
         return workspace;
     }
 
-    void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
-        checkBangError(
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_HOST2DEV));
+    void copyBlobFromCPU(void *dst, const void *src,
+                         size_t bytes) const override {
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                                  CNRT_MEM_TRANS_DIR_HOST2DEV));
     }
 
-    void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
-        checkBangError(
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_DEV2HOST));
+    void copyBlobToCPU(void *dst, const void *src,
+                       size_t bytes) const override {
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                                  CNRT_MEM_TRANS_DIR_DEV2HOST));
     }
 
-    void copyBlobInsideRuntime(void *dst, void *src,
+    void copyBlobInsideRuntime(void *dst, const void *src,
                                size_t bytes) const override {
-        checkBangError(
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_PEER2PEER));
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                                  CNRT_MEM_TRANS_DIR_PEER2PEER));
     }
 
   private:
diff --git a/include/core/blob.h b/include/core/blob.h
index 4ff05ea8..c3b9f41b 100644
--- a/include/core/blob.h
+++ b/include/core/blob.h
@@ -21,4 +21,4 @@ class BlobObj {
     template <typename T> T getPtr() const { return reinterpret_cast<T>(ptr); }
 };
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/src/kernels/bang/element_wise.cc b/src/kernels/bang/element_wise.cc
index 07a841b1..74505e9b 100644
--- a/src/kernels/bang/element_wise.cc
+++ b/src/kernels/bang/element_wise.cc
@@ -1,4 +1,5 @@
 #include "operators/element_wise.h"
+#include "bang/bang_element_wise.h"
 #include "bang/bang_kernel_without_config.h"
 #include "bang/bang_runtime.h"
 
@@ -80,12 +81,12 @@ class MulCnnl : public ElementWiseCnnl {
     cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
 };
 
-// class ElementWiseBang : public BangKernelWithoutConfig {
-//     void compute(const Operator &_op,
-//                  const RuntimeObj *_context) const override {
-//         element_wise_kernel(_op);
-//     }
-// };
+class ElementWiseBang : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        element_wise_kernel(_context, _op);
+    }
+};
 
 REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
                 "Add_cnnl_BANG_Float32");
@@ -94,9 +95,8 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
 REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
                 "Mul_cnnl_BANG_Float32");
 
-// REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32,
-// ElementWiseBang,
-//                 "Div_Bang_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
+                "Div_Bang_Float32");
 // REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
 // ElementWiseBang,
 //                 "Pow_Bang_Float32");
diff --git a/src/kernels/mlu/CMakeLists.txt b/src/kernels/mlu/CMakeLists.txt
new file mode 100644
index 00000000..b8cd41b0
--- /dev/null
+++ b/src/kernels/mlu/CMakeLists.txt
@@ -0,0 +1,46 @@
+cmake_minimum_required(VERSION 3.3)
+project(bangops)
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
+set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3")
+set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC")
+
+# check `NEUWARE_HOME`; quoted so an empty value reaches the FATAL_ERROR below
+message(STATUS "NEUWARE_HOME: ${NEUWARE_HOME}")
+if(EXISTS "${NEUWARE_HOME}")
+  include_directories("${NEUWARE_HOME}/include")
+  link_directories("${NEUWARE_HOME}/lib64")
+  link_directories("${NEUWARE_HOME}/lib")
+  set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}")
+else()
+  message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.")
+endif()
+
+# setup cmake search path
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
+  "${CMAKE_SOURCE_DIR}/cmake"
+  "${NEUWARE_HOME}/cmake"
+  "${NEUWARE_HOME}/cmake/modules"
+)
+
+# include FindBANG.cmake and check cncc
+find_package(BANG)
+if(NOT BANG_FOUND)
+  message(FATAL_ERROR "BANG cannot be found.")
+elseif (NOT BANG_CNCC_EXECUTABLE)
+  message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
+endif()
+
+# setup cncc flags
+set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread")
+set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
+set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220"
+                                         "--bang-mlu-arch=mtp_270"
+                                         "--bang-mlu-arch=mtp_290"
+                                         "--bang-mlu-arch=mtp_372"
+)
+
+file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu")
+bang_add_library(bangops SHARED ${src_files})
+
diff --git a/src/kernels/mlu/include/bang_div.h b/src/kernels/mlu/include/bang_div.h
new file mode 100644
index 00000000..0f81c721
--- /dev/null
+++ b/src/kernels/mlu/include/bang_div.h
@@ -0,0 +1,11 @@
+#pragma once
+#include "cnnl.h"
+#include <cstdint>
+namespace infini {
+// Launches the BANG C division kernel on the queue bound to `handle`.
+// input1, input2 and output are passed straight to the device kernel, which
+// writes output[i] = input1[i] / input2[i] for `num` float elements.
+void div_kernel(cnnlHandle_t handle, const float *input1, const float *input2,
+                float *output, const uint32_t num);
+
+} // namespace infini
diff --git a/src/kernels/mlu/include/div.h b/src/kernels/mlu/include/div.h
new file mode 100644
index 00000000..83520d1f
--- /dev/null
+++ b/src/kernels/mlu/include/div.h
@@ -0,0 +1,9 @@
+#ifndef BANG_KERNELS_DIVOPERATION_DIV_H_
+#define BANG_KERNELS_DIVOPERATION_DIV_H_
+
+// Device entry point for element-wise float division over `num` elements;
+// defined in div_device.mlu.
+__mlu_global__ void MLUDivKernelUnion1(float *output, float *input1,
+                                       float *input2, uint32_t num);
+
+#endif // BANG_KERNELS_DIVOPERATION_DIV_H_
diff --git a/src/kernels/mlu/src/div.mlu b/src/kernels/mlu/src/div.mlu
new file mode 100644
index 00000000..63456f96
--- /dev/null
+++ b/src/kernels/mlu/src/div.mlu
@@ -0,0 +1,25 @@
+#include "bang_div.h"
+#include "div.h"
+namespace infini {
+// Host-side launcher: enqueues MLUDivKernelUnion1 on the queue owned by the
+// CNNL handle. No queue synchronisation is performed here.
+void div_kernel(cnnlHandle_t handle,
+                const float *input1,
+                const float *input2,
+                float *output,
+                const uint32_t num) {
+    // Task type and scheduling dimensions.
+    cnrtDim3_t k_dim;
+    cnrtFunctionType_t k_type;
+    cnrtQueue_t queue;
+    cnnlGetQueue(handle, &queue);
+    k_dim.x = 4;
+    k_dim.y = 8;
+    k_dim.z = 1;
+    k_type = CNRT_FUNC_TYPE_UNION1;
+    // Launch the task. The kernel signature takes non-const pointers, so
+    // constness is dropped explicitly; the kernel only reads the inputs.
+    MLUDivKernelUnion1<<<k_dim, k_type, queue>>>(
+        output, const_cast<float *>(input1), const_cast<float *>(input2), num);
+}
+} // namespace infini
diff --git a/src/kernels/mlu/src/div_device.mlu b/src/kernels/mlu/src/div_device.mlu
new file mode 100644
index 00000000..bee83eb8
--- /dev/null
+++ b/src/kernels/mlu/src/div_device.mlu
@@ -0,0 +1,62 @@
+#include "div.h"
+
+// Size in bytes of each of the three NRAM staging buffers below.
+#define NRAM_USE_SIZE 102400
+
+__nram__ char nram_left[NRAM_USE_SIZE];
+__nram__ char nram_right[NRAM_USE_SIZE];
+__nram__ char nram_output[NRAM_USE_SIZE];
+
+// Element-wise division: output1[i] = input1[i] * recip(input2[i]) for the
+// slice of `num` elements owned by this task.
+//
+// Work split: each of the taskDim tasks gets num / taskDim elements and the
+// first num % taskDim tasks take one more. A task streams its slice through
+// the NRAM buffers NRAM_USE_SIZE bytes at a time, then handles the leftover.
+// NOTE(review): __bang_active_recip may restrict the divisor range and
+// precision on some targets - confirm against the BANG C built-in reference.
+template <typename T>
+__mlu_device__ void DivFunction(T* output1, T* input1, T* input2, size_t num) {
+  int use_nram_size = NRAM_USE_SIZE;
+  int deal_align = use_nram_size / sizeof(T);
+  int num_per_core = num / taskDim;
+  int num_rem = num % taskDim;
+  int easy = num_per_core;
+  int hard = num_per_core + (num_rem != 0 ? 1 : 0);
+  int my = taskId < num_rem ? hard : easy;
+  int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
+  char* input1_start = (char*)input1 + start * sizeof(T);
+  char* input2_start = (char*)input2 + start * sizeof(T);
+  char* output_start = (char*)output1 + start * sizeof(T);
+
+  int my_repeat = my / deal_align;
+  int my_rem = my % deal_align;
+  for(int i = 0; i < my_repeat; ++i) {
+    __memcpy(nram_left, input1_start, use_nram_size, GDRAM2NRAM);
+    __memcpy(nram_right, input2_start, use_nram_size, GDRAM2NRAM);
+    __bang_active_recip((T*)nram_right, (T*)nram_right, deal_align);
+    __bang_mul((T*)nram_output, (T*)nram_left, (T*)nram_right, deal_align);
+    __memcpy(output_start, nram_output, use_nram_size, NRAM2GDRAM);
+    input1_start += use_nram_size;
+    input2_start += use_nram_size;
+    output_start += use_nram_size;
+  }
+  if(my_rem) {
+    // Only my_rem elements are loaded and stored; the vector ops still run
+    // over the whole buffer, so lanes past my_rem hold stale NRAM contents
+    // whose results are never copied out.
+    __memcpy(nram_left, input1_start, my_rem * sizeof(T), GDRAM2NRAM);
+    __memcpy(nram_right, input2_start, my_rem * sizeof(T), GDRAM2NRAM);
+    __bang_active_recip((T*)nram_right, (T*)nram_right, deal_align);
+    __bang_mul((T*)nram_output, (T*)nram_left, (T*)nram_right, deal_align);
+    __memcpy(output_start, nram_output, my_rem * sizeof(T), NRAM2GDRAM);
+  }
+}
+
+__mlu_global__ void MLUDivKernelUnion1(float *output,
+                                       float *input1,
+                                       float *input2,
+                                       uint32_t num) {
+  DivFunction<float>(output, input1, input2, num);
+}
+
diff --git a/test/kernels/bang/test_bang_bangcKernel.cc b/test/kernels/bang/test_bang_bangcKernel.cc
new file mode 100644
index 00000000..f29a0e50
--- /dev/null
+++ b/test/kernels/bang/test_bang_bangcKernel.cc
@@ -0,0 +1,63 @@
+#include "bang/bang_runtime.h"
+#include "core/blob.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+#include "utils/validation.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testBangcKernel(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // inputCpu1->printData();
+    // inputCpu2->printData();
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // outputGpu2Cpu->printData();
+    // CPU
+    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
+    auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
+    cpuGraph->dataMalloc();
+    cpuRuntime->run(cpuGraph);
+    auto outputCpu = cpuOp->getOutput();
+    // outputCpu->printData();
+    // Check
+    float *const cpuRes =
+        (float *)(outputCpu->template getRawDataPtr<float *>());
+    float *const mluRes =
+        (float *)(outputGpu2Cpu->template getRawDataPtr<float *>());
+    EXPECT_LE(computeDifference2(cpuRes, mluRes, outputCpu->size()), 0.003);
+}
+
+TEST(BangcKernel_Div, run) { // Div on BANG checked against the CPU result
+    testBangcKernel<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini