forked from jiuyuan/InfiniTensor
Support bang c kernel wanghailu 0927 (#43)
* fix a small bug found by a new version of CMake * add code to support BangC language kernels, which are written like CUDA kernels rather than called from a library * add bangc kernel * support BangC kernel * add code to support BangC kernel * support bangc kernel * fix some code per reviewer feedback * fix code of template function * add code to support bangc kernel * fix bangc format Co-authored-by: wanghailu <wanghailu@qiyuanlab.com> Co-authored-by: Haojie Wang <haojie0429@gmail.com>
This commit is contained in:
parent
26cee55e81
commit
b0c2a08252
|
@ -121,6 +121,7 @@ if(USE_CUDA)
|
|||
endif()
|
||||
|
||||
if(USE_BANG)
|
||||
include_directories(src/kernels/mlu/include)
|
||||
################################################################################
|
||||
# Neuware Environment
|
||||
################################################################################
|
||||
|
@ -151,53 +152,12 @@ if(USE_BANG)
|
|||
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
|
||||
|
||||
################################################################################
|
||||
# Sample Kernels
|
||||
# BangC Kernels
|
||||
################################################################################
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{NEUWARE_HOME}/cmake" "$ENV{NEUWARE_HOME}/cmake/modules")
|
||||
find_package(BANG)
|
||||
if(NOT BANG_FOUND)
|
||||
message(FATAL_ERROR "BANG cannot be found.")
|
||||
elseif(NOT BANG_CNCC_EXECUTABLE)
|
||||
message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
|
||||
endif()
|
||||
set(BANG_CNCC_FLAGS "-Wall -Werror -fPIC -std=c++11 --target=${TARGET_CPU_ARCH} -O3")
|
||||
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}"
|
||||
"--bang-arch=compute_20"
|
||||
"--bang-arch=compute_30"
|
||||
"--bang-mlu-arch=mtp_322"
|
||||
"--bang-wram-align64"
|
||||
)
|
||||
add_subdirectory(src/kernels/mlu)
|
||||
|
||||
if(${TARGET_CPU_ARCH} MATCHES "aarch64-linux-gnu")
|
||||
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
|
||||
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
|
||||
execute_process(
|
||||
COMMAND uname -m
|
||||
OUTPUT_VARIABLE _uname_m
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
if (NOT ("${TARGET_CPU_ARCH}" MATCHES ".*${_uname_m}.*" AND "${_uname_m}" MATCHES "aarch64"))
|
||||
execute_process(
|
||||
COMMAND "${CMAKE_CXX_COMPILER}" "-v" "-c" "-x" "c++" "/dev/null" "-M"
|
||||
ERROR_VARIABLE _cxx_verbose
|
||||
)
|
||||
execute_process(
|
||||
COMMAND "echo" "${_cxx_verbose}"
|
||||
COMMAND "sed" "-n" "/include.*search starts here/,/End of search list/{s/^ //p}"
|
||||
COMMAND "tr" "'\n'" ";"
|
||||
OUTPUT_VARIABLE _cxx_includes
|
||||
)
|
||||
list(REMOVE_ITEM _cxx_includes "/usr/include")
|
||||
foreach(_include ${_cxx_includes})
|
||||
message(STATUS "add include path: ${_include}")
|
||||
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -idirafter ${_include}")
|
||||
endforeach()
|
||||
endif()
|
||||
endif()
|
||||
#bang_add_library(bangops SHARED ${SRC_BANG})
|
||||
#target_link_libraries(bangops ${CAMBRICON_CNDRV})
|
||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||
#target_link_libraries(InfiniTensor bangops)
|
||||
target_link_libraries(InfiniTensor bangops)
|
||||
endif()
|
||||
|
||||
# # Python bindings
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
#pragma once
|
||||
#include "bang/bang_runtime.h"
|
||||
#include "bang_div.h"
|
||||
#include "operators/element_wise.h"
|
||||
namespace infini {
|
||||
|
||||
void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
|
||||
float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
|
||||
float *const cData = (op->getOutput()->getRawDataPtr<float *>());
|
||||
|
||||
auto dim = op->getInputs(0)->getDims();
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(obj);
|
||||
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
|
||||
if (op->getOpType() == OpType::Div)
|
||||
div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
}; // namespace infini
|
|
@ -48,20 +48,22 @@ class BangRuntimeObj : public RuntimeObj {
|
|||
return workspace;
|
||||
}
|
||||
|
||||
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
|
||||
checkBangError(
|
||||
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_HOST2DEV));
|
||||
void copyBlobFromCPU(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||
CNRT_MEM_TRANS_DIR_HOST2DEV));
|
||||
}
|
||||
|
||||
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
|
||||
checkBangError(
|
||||
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_DEV2HOST));
|
||||
void copyBlobToCPU(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||
CNRT_MEM_TRANS_DIR_DEV2HOST));
|
||||
}
|
||||
|
||||
void copyBlobInsideRuntime(void *dst, void *src,
|
||||
void copyBlobInsideRuntime(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
checkBangError(
|
||||
cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_PEER2PEER));
|
||||
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||
CNRT_MEM_TRANS_DIR_PEER2PEER));
|
||||
}
|
||||
|
||||
private:
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include "operators/element_wise.h"
|
||||
#include "bang/bang_element_wise.h"
|
||||
#include "bang/bang_kernel_without_config.h"
|
||||
#include "bang/bang_runtime.h"
|
||||
|
||||
|
@ -80,12 +81,12 @@ class MulCnnl : public ElementWiseCnnl {
|
|||
cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
|
||||
};
|
||||
|
||||
// class ElementWiseBang : public BangKernelWithoutConfig {
|
||||
// void compute(const Operator &_op,
|
||||
// const RuntimeObj *_context) const override {
|
||||
// element_wise_kernel(_op);
|
||||
// }
|
||||
// };
|
||||
class ElementWiseBang : public BangKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
element_wise_kernel(_context, _op);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
|
||||
"Add_cnnl_BANG_Float32");
|
||||
|
@ -94,9 +95,8 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
|
|||
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
|
||||
"Mul_cnnl_BANG_Float32");
|
||||
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32,
|
||||
// ElementWiseBang,
|
||||
// "Div_Bang_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
|
||||
"Div_Bang_Float32");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
|
||||
// ElementWiseBang,
|
||||
// "Pow_Bang_Float32");
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
# Builds the `bangops` shared library from the BangC (*.mlu) sources under
# src/ using the BANG toolchain located through NEUWARE_HOME.
cmake_minimum_required(VERSION 3.3)
project(bangops)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC")

# check the `NEUWARE_HOME` variable
# The expansions are quoted on purpose: an unset or empty NEUWARE_HOME would
# otherwise expand to zero arguments and make message()/if(EXISTS) abort with
# a generic argument error before the explanatory FATAL_ERROR below is reached.
message("${NEUWARE_HOME}")
if(EXISTS "${NEUWARE_HOME}")
  include_directories("${NEUWARE_HOME}/include")
  link_directories("${NEUWARE_HOME}/lib64")
  link_directories("${NEUWARE_HOME}/lib")
  set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}")
else()
  message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.")
endif()

# setup cmake search path
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
  "${CMAKE_SOURCE_DIR}/cmake"
  "${NEUWARE_HOME}/cmake"
  "${NEUWARE_HOME}/cmake/modules"
)

# include FindBANG.cmake and check cncc
find_package(BANG)
if(NOT BANG_FOUND)
  message(FATAL_ERROR "BANG cannot be found.")
elseif (NOT BANG_CNCC_EXECUTABLE)
  message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
endif()

# setup cncc flags
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread")
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
# One --bang-mlu-arch entry per MLU architecture to generate code for.
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220"
                                         "--bang-mlu-arch=mtp_270"
                                         "--bang-mlu-arch=mtp_290"
                                         "--bang-mlu-arch=mtp_372"
)

# Collect every BangC source and build the shared library from them.
file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu")
bang_add_library(bangops SHARED ${src_files})
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
#pragma once
|
||||
#include "cnnl.h"
|
||||
namespace infini {
|
||||
void div_kernel(cnnlHandle_t handle, const float *input1, const float *input2,
|
||||
float *output, const uint32_t num);
|
||||
|
||||
}; // namespace infini
|
|
@ -0,0 +1,7 @@
|
|||
#ifndef BANG_KERNELS_DIVOPERATION_DIV_H_
|
||||
#define BANG_KERNELS_DIVOPERATION_DIV_H_
|
||||
|
||||
// Device kernel entry point for element-wise float division over `num`
// elements; `output` receives the result of `input1` divided by `input2`.
__mlu_global__ void MLUDivKernelUnion1(float *output,
                                       float *input1,
                                       float *input2,
                                       uint32_t num);
|
||||
|
||||
#endif // BANG_KERNELS_DIVOPERATION_DIV_H_
|
|
@ -0,0 +1,24 @@
|
|||
#include "bang_div.h"
|
||||
#include "div.h"
|
||||
namespace infini {
|
||||
// Launches MLUDivKernelUnion1 on the queue that belongs to `handle`.
//   input1 / input2  operand buffers (dividend / divisor)
//   output           result buffer
//   num              number of float elements to process
void div_kernel(cnnlHandle_t handle,
                const float *input1,
                const float *input2,
                float *output,
                const uint32_t num) {
  // The kernel runs on the queue associated with the CNNL handle.
  cnrtQueue_t queue;
  cnnlGetQueue(handle, &queue);

  // Task type and scheduling dimensions.
  cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1;
  cnrtDim3_t launch_dim;
  launch_dim.x = 4;
  launch_dim.y = 8;
  launch_dim.z = 1;

  // Launch the task. The kernel signature takes non-const pointers, so the
  // constness of the two inputs is cast away for the call.
  MLUDivKernelUnion1<<<launch_dim, func_type, queue>>>(
      output, const_cast<float *>(input1), const_cast<float *>(input2), num);
}
|
||||
};
|
|
@ -0,0 +1,50 @@
|
|||
#include "div.h"
|
||||
|
||||
#define NRAM_USE_SIZE 102400
|
||||
|
||||
__nram__ char left[NRAM_USE_SIZE];
|
||||
__nram__ char right[NRAM_USE_SIZE];
|
||||
__nram__ char output[NRAM_USE_SIZE];
|
||||
|
||||
template<typename T>
|
||||
__mlu_device__ void DivFunction(T* output1, T* input1, T* input2, size_t num) {
|
||||
int use_nram_size = NRAM_USE_SIZE;
|
||||
int deal_align = use_nram_size / sizeof(T);
|
||||
int num_per_core = num / taskDim;
|
||||
int num_rem = num % taskDim;
|
||||
int easy = num_per_core;
|
||||
int hard = num_per_core + (num_rem != 0 ? 1 : 0);
|
||||
int my = taskId < num_rem ? hard : easy;
|
||||
int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
|
||||
char* input1_start = (char*)input1 + start * sizeof(T);
|
||||
char* input2_start = (char*)input2 + start * sizeof(T);
|
||||
char* output_start = (char*)output1 + start * sizeof(T);
|
||||
|
||||
int my_repeat = my / deal_align;
|
||||
int my_rem = my % deal_align;
|
||||
for(int i = 0; i < my_repeat; ++i) {
|
||||
__memcpy(left, input1_start, use_nram_size, GDRAM2NRAM);
|
||||
__memcpy(right, input2_start, use_nram_size, GDRAM2NRAM);
|
||||
__bang_active_recip((T*)right, (T*)right, deal_align);
|
||||
__bang_mul((T*)output, (T*)left, (T*)right, deal_align);
|
||||
__memcpy(output_start, output, use_nram_size, NRAM2GDRAM);
|
||||
input1_start += use_nram_size;
|
||||
input2_start += use_nram_size;
|
||||
output_start += use_nram_size;
|
||||
}
|
||||
if(my_rem) {
|
||||
__memcpy(left, input1_start, my_rem * sizeof(T), GDRAM2NRAM);
|
||||
__memcpy(right, input2_start, my_rem * sizeof(T), GDRAM2NRAM);
|
||||
__bang_active_recip((T*)right, (T*)right, deal_align);
|
||||
__bang_mul((T*)output, (T*)left, (T*)right, deal_align);
|
||||
__memcpy(output_start, output, my_rem * sizeof(T), NRAM2GDRAM);
|
||||
}
|
||||
}
|
||||
|
||||
// Kernel entry point: runs the float instantiation of DivFunction on the
// given buffers for `num` elements.
__mlu_global__ void MLUDivKernelUnion1(float *output,
                                       float *input1,
                                       float *input2,
                                       uint32_t num) {
  DivFunction<float>(output, input1, input2, num);
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
#include "bang/bang_runtime.h"
|
||||
#include "core/blob.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/element_wise.h"
|
||||
#include "utils/validation.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testBangcKernel(
|
||||
const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const Shape &shape) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generator);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generator);
|
||||
|
||||
// inputCpu1->printData();
|
||||
// inputCpu2->printData();
|
||||
|
||||
// GPU
|
||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||
auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
|
||||
auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
|
||||
auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
|
||||
bangGraph->dataMalloc();
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
// outputGpu2Cpu->printData();
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
|
||||
cpuGraph->dataMalloc();
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// outputCpu->printData();
|
||||
// Check
|
||||
float *const cpuRes =
|
||||
(float *)(outputCpu->template getRawDataPtr<float *>());
|
||||
float *const mluRes =
|
||||
(float *)(outputGpu2Cpu->template getRawDataPtr<float *>());
|
||||
EXPECT_LE(computeDifference2(cpuRes, mluRes, outputCpu->size()), 0.003);
|
||||
}
|
||||
|
||||
// Division through the BangC kernel on a small 4-D tensor.
TEST(BangcKernel_Div, run) {
    const Shape shape{1, 2, 2, 3};
    testBangcKernel<DivObj>(IncrementalGenerator(), shape);
}
|
||||
|
||||
} // namespace infini
|
Loading…
Reference in New Issue