Support bang c kernel wanghailu 0927 (#43)

* fix a little bug which found by new verison CMake * add code for support BangC language kernel , just like Cuda kernel, not library * add bangc kernel * support BangC kernel * add code for support BangC kernel * support bangc kernel * fix some code from reviewer * fix code of template fumction * add code for support bangc kernel * fix bangc format Co-authored-by: wanghailu <wanghailu@qiyuanlab.com> Co-authored-by: Haojie Wang <haojie0429@gmail.com>
2022-09-30 11:01:52 +08:00 · 2022-09-30 11:01:52 +08:00 · b0c2a08252
parent 26cee55e81
commit b0c2a08252
11 changed files with 245 additions and 64 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -121,6 +121,7 @@ if(USE_CUDA)
 endif()

 if(USE_BANG)
+  include_directories(src/kernels/mlu/include)
  ################################################################################
  # Neuware Evironment
  ################################################################################
@ -151,53 +152,12 @@ if(USE_BANG)
  message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")

  ################################################################################
-  # Sample Kernels
+  # BangC Kernels
  ################################################################################
-  set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{NEUWARE_HOME}/cmake" "$ENV{NEUWARE_HOME}/cmake/modules")
-  find_package(BANG)
-  if(NOT BANG_FOUND)
-    message(FATAL_ERROR "BANG cannot be found.")
-  elseif(NOT BANG_CNCC_EXECUTABLE)
-    message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
-  endif()
-  set(BANG_CNCC_FLAGS "-Wall -Werror -fPIC -std=c++11 --target=${TARGET_CPU_ARCH} -O3")
-  set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}"
-                                             "--bang-arch=compute_20"
-                                             "--bang-arch=compute_30"
-                                             "--bang-mlu-arch=mtp_322"
-                                             "--bang-wram-align64"
-  )
-  
-  if(${TARGET_CPU_ARCH} MATCHES "aarch64-linux-gnu")
-    set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
-    add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
-    execute_process(
-      COMMAND uname -m
-      OUTPUT_VARIABLE _uname_m
-      OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-    if (NOT ("${TARGET_CPU_ARCH}" MATCHES ".*${_uname_m}.*" AND "${_uname_m}" MATCHES "aarch64"))
-      execute_process(
-        COMMAND "${CMAKE_CXX_COMPILER}" "-v" "-c" "-x" "c++" "/dev/null" "-M"
-        ERROR_VARIABLE _cxx_verbose
-      )
-      execute_process(
-        COMMAND "echo" "${_cxx_verbose}"
-        COMMAND "sed" "-n" "/include.*search starts here/,/End of search list/{s/^ //p}"
-        COMMAND "tr" "'\n'" ";"
-        OUTPUT_VARIABLE _cxx_includes
-      )
-      list(REMOVE_ITEM _cxx_includes "/usr/include")
-      foreach(_include ${_cxx_includes})
-        message(STATUS "add include path: ${_include}")
-        set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -idirafter ${_include}")
-      endforeach()
-    endif()
-  endif()
-  #bang_add_library(bangops SHARED ${SRC_BANG})
-  #target_link_libraries(bangops ${CAMBRICON_CNDRV})
+  add_subdirectory(src/kernels/mlu)
+
  target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-  #target_link_libraries(InfiniTensor bangops)
+  target_link_libraries(InfiniTensor bangops)
 endif()

 # # Python bindings
--- a/include/bang/bang_element_wise.h
+++ b/include/bang/bang_element_wise.h
@ -0,0 +1,22 @@
+#pragma once
+#include "bang/bang_runtime.h"
+#include "bang_div.h"
+#include "operators/element_wise.h"
+namespace infini {
+
+void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
+    auto op = as<ElementWiseObj>(_op);
+    float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
+    float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
+    float *const cData = (op->getOutput()->getRawDataPtr<float *>());
+
+    auto dim = op->getInputs(0)->getDims();
+    auto context = dynamic_cast<const BangRuntimeObj *>(obj);
+    int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
+    if (op->getOpType() == OpType::Div)
+        div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
+    else
+        IT_TODO_HALT();
+}
+
+}; // namespace infini
--- a/include/bang/bang_runtime.h
+++ b/include/bang/bang_runtime.h
@ -48,20 +48,22 @@ class BangRuntimeObj : public RuntimeObj {
        return workspace;
    }

-    void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
-        checkBangError(
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_HOST2DEV));
+    void copyBlobFromCPU(void *dst, const void *src,
+                         size_t bytes) const override {
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                                  CNRT_MEM_TRANS_DIR_HOST2DEV));
    }

-    void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
-        checkBangError(
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_DEV2HOST));
+    void copyBlobToCPU(void *dst, const void *src,
+                       size_t bytes) const override {
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                                  CNRT_MEM_TRANS_DIR_DEV2HOST));
    }

-    void copyBlobInsideRuntime(void *dst, void *src,
+    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override {
-        checkBangError(
-            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_PEER2PEER));
+        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                                  CNRT_MEM_TRANS_DIR_PEER2PEER));
    }

  private:
--- a/include/core/blob.h
+++ b/include/core/blob.h
@ -21,4 +21,4 @@ class BlobObj {
    template <typename T> T getPtr() const { return reinterpret_cast<T>(ptr); }
 };

-} // namespace infini
+} // namespace infini
--- a/src/kernels/bang/element_wise.cc
+++ b/src/kernels/bang/element_wise.cc
@ -1,4 +1,5 @@
 #include "operators/element_wise.h"
+#include "bang/bang_element_wise.h"
 #include "bang/bang_kernel_without_config.h"
 #include "bang/bang_runtime.h"

@ -80,12 +81,12 @@ class MulCnnl : public ElementWiseCnnl {
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
 };

-// class ElementWiseBang : public BangKernelWithoutConfig {
-//     void compute(const Operator &_op,
-//                  const RuntimeObj *_context) const override {
-//         element_wise_kernel(_op);
-//     }
-// };
+class ElementWiseBang : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        element_wise_kernel(_context, _op);
+    }
+};

 REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
                "Add_cnnl_BANG_Float32");
@ -94,9 +95,8 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
 REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
                "Mul_cnnl_BANG_Float32");

-// REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32,
-// ElementWiseBang,
-//                 "Div_Bang_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
+                "Div_Bang_Float32");
 // REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
 // ElementWiseBang,
 //                 "Pow_Bang_Float32");
--- a/src/kernels/mlu/CMakeLists.txt
+++ b/src/kernels/mlu/CMakeLists.txt
@ -0,0 +1,46 @@
+cmake_minimum_required(VERSION 3.3)
+project(bangops)
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
+set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3")
+set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC")
+
+# check `NEUWARE_HOME` env
+message(${NEUWARE_HOME})
+if(EXISTS ${NEUWARE_HOME})
+  include_directories("${NEUWARE_HOME}/include")
+  link_directories("${NEUWARE_HOME}/lib64")
+  link_directories("${NEUWARE_HOME}/lib")
+  set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}")
+else()
+  message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.")
+endif()
+
+# setup cmake search path
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
+  "${CMAKE_SOURCE_DIR}/cmake"
+  "${NEUWARE_HOME}/cmake"
+  "${NEUWARE_HOME}/cmake/modules"
+)
+
+# include FindBANG.cmake and check cncc
+find_package(BANG)
+if(NOT BANG_FOUND)
+  message(FATAL_ERROR "BANG cannot be found.")
+elseif (NOT BANG_CNCC_EXECUTABLE)
+  message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
+endif()
+
+# setup cncc flags
+set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread")
+set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
+set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220"
+                                         "--bang-mlu-arch=mtp_270"
+                                         "--bang-mlu-arch=mtp_290"
+                                         "--bang-mlu-arch=mtp_372"
+)
+
+file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu")
+bang_add_library(bangops SHARED ${src_files})
+
--- a/src/kernels/mlu/include/bang_div.h
+++ b/src/kernels/mlu/include/bang_div.h
@ -0,0 +1,7 @@
+#pragma once
+#include "cnnl.h"
+namespace infini {
+void div_kernel(cnnlHandle_t handle, const float *input1, const float *input2,
+                float *output, const uint32_t num);
+
+}; // namespace infini
--- a/src/kernels/mlu/include/div.h
+++ b/src/kernels/mlu/include/div.h
@ -0,0 +1,7 @@
+#ifndef BANG_KERNELS_DIVOPERATION_DIV_H_
+#define BANG_KERNELS_DIVOPERATION_DIV_H_
+
+__mlu_global__ void MLUDivKernelUnion1(float *output, float *input1,
+                                       float *input2, uint32_t num);
+
+#endif // BANG_KERNELS_DIVOPERATION_DIV_H_
--- a/src/kernels/mlu/src/div.mlu
+++ b/src/kernels/mlu/src/div.mlu
@ -0,0 +1,24 @@
+#include "bang_div.h"
+#include "div.h"
+namespace infini {
+void div_kernel(cnnlHandle_t handle,
+                const float *input1,
+                const float *input2,
+                float *output,
+                const uint32_t num) {
+    // 任务类型和调度方法
+    cnrtDim3_t k_dim;
+    cnrtFunctionType_t k_type;
+    cnrtQueue_t queue;
+    cnnlGetQueue(handle, &queue);
+    k_dim.x = 4;
+    k_dim.y = 8;
+    k_dim.z = 1;
+    k_type = CNRT_FUNC_TYPE_UNION1;
+    // launch 任务
+    MLUDivKernelUnion1<<<k_dim, k_type, queue>>>((float*)output,
+                                                  (float*)input1,
+                                                  (float*)input2,
+                                                  num);
+}
+};
--- a/src/kernels/mlu/src/div_device.mlu
+++ b/src/kernels/mlu/src/div_device.mlu
@ -0,0 +1,50 @@
+#include "div.h"
+
+#define NRAM_USE_SIZE 102400
+
+__nram__ char left[NRAM_USE_SIZE];
+__nram__ char right[NRAM_USE_SIZE];
+__nram__ char output[NRAM_USE_SIZE];
+
+template<typename T>
+__mlu_device__ void DivFunction(T* output1, T* input1, T* input2, size_t num) {
+    int use_nram_size = NRAM_USE_SIZE;
+    int deal_align = use_nram_size / sizeof(T);
+    int num_per_core = num / taskDim;
+    int num_rem = num % taskDim;
+    int easy = num_per_core;
+    int hard = num_per_core + (num_rem != 0 ? 1 : 0);
+    int my = taskId < num_rem ? hard : easy;
+    int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
+    char* input1_start = (char*)input1 + start * sizeof(T);
+    char* input2_start = (char*)input2 + start * sizeof(T);
+    char* output_start = (char*)output1 + start * sizeof(T);
+
+    int my_repeat = my / deal_align;
+    int my_rem = my % deal_align;
+    for(int i = 0; i < my_repeat; ++i) {
+        __memcpy(left, input1_start, use_nram_size, GDRAM2NRAM);
+        __memcpy(right, input2_start, use_nram_size, GDRAM2NRAM);
+        __bang_active_recip((T*)right, (T*)right, deal_align);
+        __bang_mul((T*)output, (T*)left, (T*)right, deal_align);
+        __memcpy(output_start, output, use_nram_size, NRAM2GDRAM);
+        input1_start += use_nram_size;
+        input2_start += use_nram_size;
+        output_start += use_nram_size;
+    }
+    if(my_rem) {
+        __memcpy(left, input1_start, my_rem * sizeof(T), GDRAM2NRAM);
+        __memcpy(right, input2_start, my_rem * sizeof(T), GDRAM2NRAM);
+        __bang_active_recip((T*)right, (T*)right, deal_align);
+        __bang_mul((T*)output, (T*)left, (T*)right, deal_align);
+        __memcpy(output_start, output, my_rem * sizeof(T), NRAM2GDRAM);
+    }
+}
+
+__mlu_global__ void MLUDivKernelUnion1(float *output,
+                                       float *input1,
+                                       float *input2,
+                                       uint32_t num) {
+    DivFunction((float*)output, (float*)input1, (float*)input2, num);
+}
+
--- a/test/kernels/bang/test_bang_bangcKernel.cc
+++ b/test/kernels/bang/test_bang_bangcKernel.cc
@ -0,0 +1,63 @@
+#include "bang/bang_runtime.h"
+#include "core/blob.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+#include "utils/validation.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testBangcKernel(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // inputCpu1->printData();
+    // inputCpu2->printData();
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // outputGpu2Cpu->printData();
+    // CPU
+    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
+    auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
+    cpuGraph->dataMalloc();
+    cpuRuntime->run(cpuGraph);
+    auto outputCpu = cpuOp->getOutput();
+    // outputCpu->printData();
+    // Check
+    float *const cpuRes =
+        (float *)(outputCpu->template getRawDataPtr<float *>());
+    float *const mluRes =
+        (float *)(outputGpu2Cpu->template getRawDataPtr<float *>());
+    EXPECT_LE(computeDifference2(cpuRes, mluRes, outputCpu->size()), 0.003);
+}
+
+TEST(BangcKernel_Div, run) {
+    testBangcKernel<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini