Support bang c kernel wanghailu 0927 (#43)

* fix a little bug which found by new verison CMake * add code for support BangC language kernel , just like Cuda kernel, not library * add bangc kernel * support BangC kernel * add code for support BangC kernel * support bangc kernel * fix some code from reviewer * fix code of template fumction * add code for support bangc kernel * fix bangc format Co-authored-by: wanghailu <wanghailu@qiyuanlab.com> Co-authored-by: Haojie Wang <haojie0429@gmail.com>
2022-09-30 11:01:52 +08:00 · 2022-09-30 11:01:52 +08:00 · b0c2a08252
parent 26cee55e81
commit b0c2a08252
11 changed files with 245 additions and 64 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -121,6 +121,7 @@ if(USE_CUDA)
 endif()
 if(USE_BANG)
  include_directories(src/kernels/mlu/include)
  ################################################################################
  # Neuware Evironment
  ################################################################################
@ -151,53 +152,12 @@ if(USE_BANG)
  message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
  ################################################################################
-  # Sample Kernels
+  # BangC Kernels
  ################################################################################
-  set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{NEUWARE_HOME}/cmake" "$ENV{NEUWARE_HOME}/cmake/modules")
+  add_subdirectory(src/kernels/mlu)
  find_package(BANG)
  if(NOT BANG_FOUND)
    message(FATAL_ERROR "BANG cannot be found.")
  elseif(NOT BANG_CNCC_EXECUTABLE)
    message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
  endif()
  set(BANG_CNCC_FLAGS "-Wall -Werror -fPIC -std=c++11 --target=${TARGET_CPU_ARCH} -O3")
  set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}"
                                             "--bang-arch=compute_20"
                                             "--bang-arch=compute_30"
                                             "--bang-mlu-arch=mtp_322"
                                             "--bang-wram-align64"
  )
  if(${TARGET_CPU_ARCH} MATCHES "aarch64-linux-gnu")
    set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
    add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
    execute_process(
      COMMAND uname -m
      OUTPUT_VARIABLE _uname_m
      OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    if (NOT ("${TARGET_CPU_ARCH}" MATCHES ".*${_uname_m}.*" AND "${_uname_m}" MATCHES "aarch64"))
      execute_process(
        COMMAND "${CMAKE_CXX_COMPILER}" "-v" "-c" "-x" "c++" "/dev/null" "-M"
        ERROR_VARIABLE _cxx_verbose
      )
      execute_process(
        COMMAND "echo" "${_cxx_verbose}"
        COMMAND "sed" "-n" "/include.*search starts here/,/End of search list/{s/^ //p}"
        COMMAND "tr" "'\n'" ";"
        OUTPUT_VARIABLE _cxx_includes
      )
      list(REMOVE_ITEM _cxx_includes "/usr/include")
      foreach(_include ${_cxx_includes})
        message(STATUS "add include path: ${_include}")
        set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -idirafter ${_include}")
      endforeach()
    endif()
  endif()
  #bang_add_library(bangops SHARED ${SRC_BANG})
  #target_link_libraries(bangops ${CAMBRICON_CNDRV})
  target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-  #target_link_libraries(InfiniTensor bangops)
+  target_link_libraries(InfiniTensor bangops)
 endif()
 # # Python bindings
--- a/include/bang/bang_element_wise.h
+++ b/include/bang/bang_element_wise.h
@ -0,0 +1,22 @@
 #pragma once
 #include "bang/bang_runtime.h"
 #include "bang_div.h"
 #include "operators/element_wise.h"
 namespace infini {
 void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
    auto op = as<ElementWiseObj>(_op);
    float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
    float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
    float *const cData = (op->getOutput()->getRawDataPtr<float *>());
    auto dim = op->getInputs(0)->getDims();
    auto context = dynamic_cast<const BangRuntimeObj *>(obj);
    int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
    if (op->getOpType() == OpType::Div)
        div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
    else
        IT_TODO_HALT();
 }
 }; // namespace infini
--- a/include/bang/bang_runtime.h
+++ b/include/bang/bang_runtime.h
@ -48,20 +48,22 @@ class BangRuntimeObj : public RuntimeObj {
        return workspace;
    }
-    void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
+    void copyBlobFromCPU(void *dst, const void *src,
-        checkBangError(
+                         size_t bytes) const override {
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_HOST2DEV));
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
                                  CNRT_MEM_TRANS_DIR_HOST2DEV));
    }
-    void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
+    void copyBlobToCPU(void *dst, const void *src,
-        checkBangError(
+                       size_t bytes) const override {
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_DEV2HOST));
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
                                  CNRT_MEM_TRANS_DIR_DEV2HOST));
    }
-    void copyBlobInsideRuntime(void *dst, void *src,
+    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override {
-        checkBangError(
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_PEER2PEER));
+                                  CNRT_MEM_TRANS_DIR_PEER2PEER));
    }
  private:
--- a/src/kernels/bang/element_wise.cc
+++ b/src/kernels/bang/element_wise.cc
@ -1,4 +1,5 @@
 #include "operators/element_wise.h"
 #include "bang/bang_element_wise.h"
 #include "bang/bang_kernel_without_config.h"
 #include "bang/bang_runtime.h"
@ -80,12 +81,12 @@ class MulCnnl : public ElementWiseCnnl {
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
 };
-// class ElementWiseBang : public BangKernelWithoutConfig {
+class ElementWiseBang : public BangKernelWithoutConfig {
-//     void compute(const Operator &_op,
+    void compute(const Operator &_op,
-//                  const RuntimeObj *_context) const override {
+                 const RuntimeObj *_context) const override {
-//         element_wise_kernel(_op);
+        element_wise_kernel(_context, _op);
-//     }
+    }
-// };
+};
 REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
                "Add_cnnl_BANG_Float32");
@ -94,9 +95,8 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
 REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
                "Mul_cnnl_BANG_Float32");
-// REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32,
+REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
-// ElementWiseBang,
+                "Div_Bang_Float32");
 //                 "Div_Bang_Float32");
 // REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
 // ElementWiseBang,
 //                 "Pow_Bang_Float32");
--- a/src/kernels/mlu/CMakeLists.txt
+++ b/src/kernels/mlu/CMakeLists.txt
@ -0,0 +1,46 @@
 cmake_minimum_required(VERSION 3.3)
 project(bangops)
 include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
 set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3")
 set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC")
 # check `NEUWARE_HOME` env
 message(${NEUWARE_HOME})
 if(EXISTS ${NEUWARE_HOME})
  include_directories("${NEUWARE_HOME}/include")
  link_directories("${NEUWARE_HOME}/lib64")
  link_directories("${NEUWARE_HOME}/lib")
  set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}")
 else()
  message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.")
 endif()
 # setup cmake search path
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
  "${CMAKE_SOURCE_DIR}/cmake"
  "${NEUWARE_HOME}/cmake"
  "${NEUWARE_HOME}/cmake/modules"
 )
 # include FindBANG.cmake and check cncc
 find_package(BANG)
 if(NOT BANG_FOUND)
  message(FATAL_ERROR "BANG cannot be found.")
 elseif (NOT BANG_CNCC_EXECUTABLE)
  message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
 endif()
 # setup cncc flags
 set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread")
 set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
 set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220"
                                         "--bang-mlu-arch=mtp_270"
                                         "--bang-mlu-arch=mtp_290"
                                         "--bang-mlu-arch=mtp_372"
 )
 file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu")
 bang_add_library(bangops SHARED ${src_files})
--- a/src/kernels/mlu/include/bang_div.h
+++ b/src/kernels/mlu/include/bang_div.h
@ -0,0 +1,7 @@
 #pragma once
 #include "cnnl.h"
 namespace infini {
 void div_kernel(cnnlHandle_t handle, const float *input1, const float *input2,
                float *output, const uint32_t num);
 }; // namespace infini
--- a/src/kernels/mlu/include/div.h
+++ b/src/kernels/mlu/include/div.h
@ -0,0 +1,7 @@
 #ifndef BANG_KERNELS_DIVOPERATION_DIV_H_
 #define BANG_KERNELS_DIVOPERATION_DIV_H_
 __mlu_global__ void MLUDivKernelUnion1(float *output, float *input1,
                                       float *input2, uint32_t num);
 #endif // BANG_KERNELS_DIVOPERATION_DIV_H_
--- a/src/kernels/mlu/src/div.mlu
+++ b/src/kernels/mlu/src/div.mlu
@ -0,0 +1,24 @@
 #include "bang_div.h"
 #include "div.h"
 namespace infini {
 void div_kernel(cnnlHandle_t handle,
                const float *input1,
                const float *input2,
                float *output,
                const uint32_t num) {
    // 任务类型和调度方法
    cnrtDim3_t k_dim;
    cnrtFunctionType_t k_type;
    cnrtQueue_t queue;
    cnnlGetQueue(handle, &queue);
    k_dim.x = 4;
    k_dim.y = 8;
    k_dim.z = 1;
    k_type = CNRT_FUNC_TYPE_UNION1;
    // launch 任务
    MLUDivKernelUnion1<<<k_dim, k_type, queue>>>((float*)output,
                                                  (float*)input1,
                                                  (float*)input2,
                                                  num);
 }
 };
--- a/src/kernels/mlu/src/div_device.mlu
+++ b/src/kernels/mlu/src/div_device.mlu
@ -0,0 +1,50 @@
 #include "div.h"
 #define NRAM_USE_SIZE 102400
 __nram__ char left[NRAM_USE_SIZE];
 __nram__ char right[NRAM_USE_SIZE];
 __nram__ char output[NRAM_USE_SIZE];
 template<typename T>
 __mlu_device__ void DivFunction(T* output1, T* input1, T* input2, size_t num) {
    int use_nram_size = NRAM_USE_SIZE;
    int deal_align = use_nram_size / sizeof(T);
    int num_per_core = num / taskDim;
    int num_rem = num % taskDim;
    int easy = num_per_core;
    int hard = num_per_core + (num_rem != 0 ? 1 : 0);
    int my = taskId < num_rem ? hard : easy;
    int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
    char* input1_start = (char*)input1 + start * sizeof(T);
    char* input2_start = (char*)input2 + start * sizeof(T);
    char* output_start = (char*)output1 + start * sizeof(T);
    int my_repeat = my / deal_align;
    int my_rem = my % deal_align;
    for(int i = 0; i < my_repeat; ++i) {
        __memcpy(left, input1_start, use_nram_size, GDRAM2NRAM);
        __memcpy(right, input2_start, use_nram_size, GDRAM2NRAM);
        __bang_active_recip((T*)right, (T*)right, deal_align);
        __bang_mul((T*)output, (T*)left, (T*)right, deal_align);
        __memcpy(output_start, output, use_nram_size, NRAM2GDRAM);
        input1_start += use_nram_size;
        input2_start += use_nram_size;
        output_start += use_nram_size;
    }
    if(my_rem) {
        __memcpy(left, input1_start, my_rem * sizeof(T), GDRAM2NRAM);
        __memcpy(right, input2_start, my_rem * sizeof(T), GDRAM2NRAM);
        __bang_active_recip((T*)right, (T*)right, deal_align);
        __bang_mul((T*)output, (T*)left, (T*)right, deal_align);
        __memcpy(output_start, output, my_rem * sizeof(T), NRAM2GDRAM);
    }
 }
 __mlu_global__ void MLUDivKernelUnion1(float *output,
                                       float *input1,
                                       float *input2,
                                       uint32_t num) {
    DivFunction((float*)output, (float*)input1, (float*)input2, num);
 }
--- a/test/kernels/bang/test_bang_bangcKernel.cc
+++ b/test/kernels/bang/test_bang_bangcKernel.cc
@ -0,0 +1,63 @@
 #include "bang/bang_runtime.h"
 #include "core/blob.h"
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/runtime.h"
 #include "operators/element_wise.h"
 #include "utils/validation.h"
 #include "test.h"
 namespace infini {
 template <class T>
 void testBangcKernel(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();
    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);
    // inputCpu1->printData();
    // inputCpu2->printData();
    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // outputGpu2Cpu->printData();
    // CPU
    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
    auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
    cpuGraph->dataMalloc();
    cpuRuntime->run(cpuGraph);
    auto outputCpu = cpuOp->getOutput();
    // outputCpu->printData();
    // Check
    float *const cpuRes =
        (float *)(outputCpu->template getRawDataPtr<float *>());
    float *const mluRes =
        (float *)(outputGpu2Cpu->template getRawDataPtr<float *>());
    EXPECT_LE(computeDifference2(cpuRes, mluRes, outputCpu->size()), 0.003);
 }
 TEST(BangcKernel_Div, run) {
    testBangcKernel<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
 }
 } // namespace infini