Support bang c kernel wanghailu 0927 (#43)

* fix a little bug which found by new verison CMake

* add code for support BangC language kernel , just like Cuda kernel, not
library

* add bangc kernel

* support BangC kernel

* add code for support BangC kernel

* support bangc kernel

* fix some code from reviewer

* fix code of template fumction

* add code for support bangc kernel

* fix bangc format

Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
This commit is contained in:
Hardy 2022-09-30 11:01:52 +08:00 committed by GitHub
parent 26cee55e81
commit b0c2a08252
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 245 additions and 64 deletions

View File

@ -121,6 +121,7 @@ if(USE_CUDA)
endif()
if(USE_BANG)
include_directories(src/kernels/mlu/include)
################################################################################
# Neuware Evironment
################################################################################
@ -151,53 +152,12 @@ if(USE_BANG)
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
################################################################################
# Sample Kernels
# BangC Kernels
################################################################################
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{NEUWARE_HOME}/cmake" "$ENV{NEUWARE_HOME}/cmake/modules")
find_package(BANG)
if(NOT BANG_FOUND)
message(FATAL_ERROR "BANG cannot be found.")
elseif(NOT BANG_CNCC_EXECUTABLE)
message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
endif()
set(BANG_CNCC_FLAGS "-Wall -Werror -fPIC -std=c++11 --target=${TARGET_CPU_ARCH} -O3")
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}"
"--bang-arch=compute_20"
"--bang-arch=compute_30"
"--bang-mlu-arch=mtp_322"
"--bang-wram-align64"
)
add_subdirectory(src/kernels/mlu)
if(${TARGET_CPU_ARCH} MATCHES "aarch64-linux-gnu")
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
execute_process(
COMMAND uname -m
OUTPUT_VARIABLE _uname_m
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if (NOT ("${TARGET_CPU_ARCH}" MATCHES ".*${_uname_m}.*" AND "${_uname_m}" MATCHES "aarch64"))
execute_process(
COMMAND "${CMAKE_CXX_COMPILER}" "-v" "-c" "-x" "c++" "/dev/null" "-M"
ERROR_VARIABLE _cxx_verbose
)
execute_process(
COMMAND "echo" "${_cxx_verbose}"
COMMAND "sed" "-n" "/include.*search starts here/,/End of search list/{s/^ //p}"
COMMAND "tr" "'\n'" ";"
OUTPUT_VARIABLE _cxx_includes
)
list(REMOVE_ITEM _cxx_includes "/usr/include")
foreach(_include ${_cxx_includes})
message(STATUS "add include path: ${_include}")
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -idirafter ${_include}")
endforeach()
endif()
endif()
#bang_add_library(bangops SHARED ${SRC_BANG})
#target_link_libraries(bangops ${CAMBRICON_CNDRV})
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
#target_link_libraries(InfiniTensor bangops)
target_link_libraries(InfiniTensor bangops)
endif()
# # Python bindings

View File

@ -0,0 +1,22 @@
#pragma once
#include "bang/bang_runtime.h"
#include "bang_div.h"
#include "operators/element_wise.h"
namespace infini {
void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
auto op = as<ElementWiseObj>(_op);
float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
float *const cData = (op->getOutput()->getRawDataPtr<float *>());
auto dim = op->getInputs(0)->getDims();
auto context = dynamic_cast<const BangRuntimeObj *>(obj);
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
if (op->getOpType() == OpType::Div)
div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
else
IT_TODO_HALT();
}
}; // namespace infini

View File

@ -48,20 +48,22 @@ class BangRuntimeObj : public RuntimeObj {
return workspace;
}
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
checkBangError(
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_HOST2DEV));
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
CNRT_MEM_TRANS_DIR_HOST2DEV));
}
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
checkBangError(
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_DEV2HOST));
void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const override {
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
CNRT_MEM_TRANS_DIR_DEV2HOST));
}
void copyBlobInsideRuntime(void *dst, void *src,
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override {
checkBangError(
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_PEER2PEER));
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
CNRT_MEM_TRANS_DIR_PEER2PEER));
}
private:

View File

@ -1,4 +1,5 @@
#include "operators/element_wise.h"
#include "bang/bang_element_wise.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
@ -80,12 +81,12 @@ class MulCnnl : public ElementWiseCnnl {
cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
};
// class ElementWiseBang : public BangKernelWithoutConfig {
// void compute(const Operator &_op,
// const RuntimeObj *_context) const override {
// element_wise_kernel(_op);
// }
// };
class ElementWiseBang : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
element_wise_kernel(_context, _op);
}
};
REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
"Add_cnnl_BANG_Float32");
@ -94,9 +95,8 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
"Mul_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32,
// ElementWiseBang,
// "Div_Bang_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
"Div_Bang_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
// ElementWiseBang,
// "Pow_Bang_Float32");

View File

@ -0,0 +1,46 @@
cmake_minimum_required(VERSION 3.3)
project(bangops)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC")
# check `NEUWARE_HOME` env
message(${NEUWARE_HOME})
if(EXISTS ${NEUWARE_HOME})
include_directories("${NEUWARE_HOME}/include")
link_directories("${NEUWARE_HOME}/lib64")
link_directories("${NEUWARE_HOME}/lib")
set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}")
else()
message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.")
endif()
# setup cmake search path
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
"${CMAKE_SOURCE_DIR}/cmake"
"${NEUWARE_HOME}/cmake"
"${NEUWARE_HOME}/cmake/modules"
)
# include FindBANG.cmake and check cncc
find_package(BANG)
if(NOT BANG_FOUND)
message(FATAL_ERROR "BANG cannot be found.")
elseif (NOT BANG_CNCC_EXECUTABLE)
message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
endif()
# setup cncc flags
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread")
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220"
"--bang-mlu-arch=mtp_270"
"--bang-mlu-arch=mtp_290"
"--bang-mlu-arch=mtp_372"
)
file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu")
bang_add_library(bangops SHARED ${src_files})

View File

@ -0,0 +1,7 @@
#pragma once
#include "cnnl.h"
namespace infini {
void div_kernel(cnnlHandle_t handle, const float *input1, const float *input2,
float *output, const uint32_t num);
}; // namespace infini

View File

@ -0,0 +1,7 @@
#ifndef BANG_KERNELS_DIVOPERATION_DIV_H_
#define BANG_KERNELS_DIVOPERATION_DIV_H_
__mlu_global__ void MLUDivKernelUnion1(float *output, float *input1,
float *input2, uint32_t num);
#endif // BANG_KERNELS_DIVOPERATION_DIV_H_

View File

@ -0,0 +1,24 @@
#include "bang_div.h"
#include "div.h"
namespace infini {
void div_kernel(cnnlHandle_t handle,
const float *input1,
const float *input2,
float *output,
const uint32_t num) {
// 任务类型和调度方法
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
cnrtQueue_t queue;
cnnlGetQueue(handle, &queue);
k_dim.x = 4;
k_dim.y = 8;
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
// launch 任务
MLUDivKernelUnion1<<<k_dim, k_type, queue>>>((float*)output,
(float*)input1,
(float*)input2,
num);
}
};

View File

@ -0,0 +1,50 @@
#include "div.h"
#define NRAM_USE_SIZE 102400
__nram__ char left[NRAM_USE_SIZE];
__nram__ char right[NRAM_USE_SIZE];
__nram__ char output[NRAM_USE_SIZE];
template<typename T>
__mlu_device__ void DivFunction(T* output1, T* input1, T* input2, size_t num) {
int use_nram_size = NRAM_USE_SIZE;
int deal_align = use_nram_size / sizeof(T);
int num_per_core = num / taskDim;
int num_rem = num % taskDim;
int easy = num_per_core;
int hard = num_per_core + (num_rem != 0 ? 1 : 0);
int my = taskId < num_rem ? hard : easy;
int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
char* input1_start = (char*)input1 + start * sizeof(T);
char* input2_start = (char*)input2 + start * sizeof(T);
char* output_start = (char*)output1 + start * sizeof(T);
int my_repeat = my / deal_align;
int my_rem = my % deal_align;
for(int i = 0; i < my_repeat; ++i) {
__memcpy(left, input1_start, use_nram_size, GDRAM2NRAM);
__memcpy(right, input2_start, use_nram_size, GDRAM2NRAM);
__bang_active_recip((T*)right, (T*)right, deal_align);
__bang_mul((T*)output, (T*)left, (T*)right, deal_align);
__memcpy(output_start, output, use_nram_size, NRAM2GDRAM);
input1_start += use_nram_size;
input2_start += use_nram_size;
output_start += use_nram_size;
}
if(my_rem) {
__memcpy(left, input1_start, my_rem * sizeof(T), GDRAM2NRAM);
__memcpy(right, input2_start, my_rem * sizeof(T), GDRAM2NRAM);
__bang_active_recip((T*)right, (T*)right, deal_align);
__bang_mul((T*)output, (T*)left, (T*)right, deal_align);
__memcpy(output_start, output, my_rem * sizeof(T), NRAM2GDRAM);
}
}
__mlu_global__ void MLUDivKernelUnion1(float *output,
float *input1,
float *input2,
uint32_t num) {
DivFunction((float*)output, (float*)input1, (float*)input2, num);
}

View File

@ -0,0 +1,63 @@
#include "bang/bang_runtime.h"
#include "core/blob.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "utils/validation.h"
#include "test.h"
namespace infini {
template <class T>
void testBangcKernel(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generator);
Tensor inputCpu2 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generator);
// inputCpu1->printData();
// inputCpu2->printData();
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
bangGraph->dataMalloc();
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// outputGpu2Cpu->printData();
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
cpuGraph->dataMalloc();
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// outputCpu->printData();
// Check
float *const cpuRes =
(float *)(outputCpu->template getRawDataPtr<float *>());
float *const mluRes =
(float *)(outputGpu2Cpu->template getRawDataPtr<float *>());
EXPECT_LE(computeDifference2(cpuRes, mluRes, outputCpu->size()), 0.003);
}
TEST(BangcKernel_Div, run) {
testBangcKernel<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini