forked from jiuyuan/InfiniTensor
Add bangc runtime and element-wise kernels
* add code for Cambricon MLU, BANG, and CNNL
* add code to support Cambricon MLU, CNNL, and CNRT
* add code to support MLU
* add code to support Cambricon CNNL
* add code to support MLU
* add code for MLU
* add code for MLU
* Update CMakeLists.txt

Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: zhengly123 <zhengly123@outlook.com>
parent 90eb9d05a8
commit c7c974f07a
CMakeLists.txt:
@@ -3,7 +3,8 @@ include(CMakeDependentOption)
 project(InfiniTensor C CXX)

 # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
-option(USE_CUDA "Support CUDA GPU" ON)
+option(USE_CUDA "Support CUDA GPU" OFF)
+option(USE_BANG "Support BANG MLU" OFF)
 option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
 option(USE_PROTOBUF "Serialize and deserialize tensors" ON)
 option(BUILD_TEST "Build tests" ON)
@@ -81,6 +82,11 @@ if(USE_CUDA)
     list (APPEND SRC ${SRC_CUDA})
 endif()

+if(USE_BANG)
+    file(GLOB_RECURSE SRC_BANG src/bang/*.cc src/kernels/bang/*.cc)
+    list (APPEND SRC ${SRC_BANG})
+endif()
+
 # Libraries
 add_library(InfiniTensor SHARED ${SRC})
 if(USE_PROTOBUF)
@@ -109,6 +115,86 @@ if(USE_CUDA)
     target_link_libraries(InfiniTensor cudnn curand cublas ${CUDA_LIBRARIES})
 endif()

+if(USE_BANG)
+    ################################################################################
+    # Neuware Environment
+    ################################################################################
+    # cnrt cndrv cnnl
+    if ((NOT DEFINED NEUWARE_HOME) AND (NOT DEFINED ENV{NEUWARE_HOME}))
+        message(FATAL_ERROR "NEUWARE_HOME is not defined from cmake or env")
+    elseif (DEFINED NEUWARE_HOME)
+        set(NEUWARE_HOME ${NEUWARE_HOME} CACHE STRING "NEUWARE_HOME directory for Cambricon Neuware development")
+    else()
+        set(NEUWARE_HOME $ENV{NEUWARE_HOME} CACHE STRING "NEUWARE_HOME directory for Cambricon Neuware development")
+    endif()
+    message(STATUS "NEUWARE_HOME: ${NEUWARE_HOME}")
+
+    include_directories("${NEUWARE_HOME}/include")
+    find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
+    find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
+    find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
+
+    if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
+        execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
+        set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
+    elseif(DEFINED TARGET_CPU_ARCH)
+        set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
+    else()
+        set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
+    endif()
+    message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
+
+    ################################################################################
+    # Sample Kernels
+    ################################################################################
+    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{NEUWARE_HOME}/cmake" "$ENV{NEUWARE_HOME}/cmake/modules")
+    find_package(BANG)
+    if(NOT BANG_FOUND)
+        message(FATAL_ERROR "BANG cannot be found.")
+    elseif(NOT BANG_CNCC_EXECUTABLE)
+        message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check the path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
+    endif()
+    set(BANG_CNCC_FLAGS "-Wall -Werror -fPIC -std=c++11 --target=${TARGET_CPU_ARCH} -O3")
+    set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}"
+        "--bang-arch=compute_20"
+        "--bang-arch=compute_30"
+        "--bang-mlu-arch=mtp_322"
+        "--bang-wram-align64"
+    )
+
+    if(${TARGET_CPU_ARCH} MATCHES "aarch64-linux-gnu")
+        set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
+        add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
+        execute_process(
+            COMMAND uname -m
+            OUTPUT_VARIABLE _uname_m
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+        if (NOT ("${TARGET_CPU_ARCH}" MATCHES ".*${_uname_m}.*" AND "${_uname_m}" MATCHES "aarch64"))
+            execute_process(
+                COMMAND "${CMAKE_CXX_COMPILER}" "-v" "-c" "-x" "c++" "/dev/null" "-M"
+                ERROR_VARIABLE _cxx_verbose
+            )
+            execute_process(
+                COMMAND "echo" "${_cxx_verbose}"
+                COMMAND "sed" "-n" "/include.*search starts here/,/End of search list/{s/^ //p}"
+                COMMAND "tr" "'\n'" ";"
+                OUTPUT_VARIABLE _cxx_includes
+            )
+            list(REMOVE_ITEM _cxx_includes "/usr/include")
+            foreach(_include ${_cxx_includes})
+                message(STATUS "add include path: ${_include}")
+                set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -idirafter ${_include}")
+            endforeach()
+        endif()
+    endif()
+    #bang_add_library(bangops SHARED ${SRC_BANG})
+    #target_link_libraries(bangops ${CAMBRICON_CNDRV})
+    target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
+    #target_link_libraries(InfiniTensor bangops)
+endif()
+
 # # Python bindings
 # pybind11_add_module(infini MODULE ${FFI})
 # target_link_libraries(infini PRIVATE infini_cpp)
@@ -135,6 +221,9 @@ if(BUILD_TEST)
     if (USE_CUDA)
         build_test(test/kernels/cuda/*.cc)
     endif()
+    if (USE_BANG)
+        build_test(test/kernels/bang/*.cc)
+    endif()
 endif()
 if(BUILD_TEST_PET)
     build_test(test/pet/*.cc)
bang/bang_common.h (new file):
@@ -0,0 +1,30 @@
#pragma once
#include "cnnl.h"
#include "cnrt.h"
#include "core/common.h"

#define checkBangError(call)                                                   \
    {                                                                          \
        auto err = call;                                                       \
        if (CNRT_RET_SUCCESS != err) {                                         \
            fprintf(stderr, "Bang error in %s:%i : %s.\n", __FILE__, __LINE__, \
                    cnrtGetErrorStr(err));                                     \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }

#define checkCnnlError(call)                                                   \
    {                                                                          \
        auto err = call;                                                       \
        if (CNNL_STATUS_SUCCESS != err) {                                      \
            fprintf(stderr, "cnnl error in %s:%i : %s.\n", __FILE__, __LINE__, \
                    cnnlGetErrorString(err));                                  \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }

namespace infini {

using BangPtr = void *;

} // namespace infini
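These macros wrap each raw CNRT/CNNL return code so that a failing call aborts with its file and line. A minimal sketch of a call site, assuming only the header above (the helper name allocOrDie is illustrative, not part of the commit):

    #include "bang/bang_common.h"

    // Hypothetical helper: allocate MLU device memory, aborting on failure.
    // checkBangError expands to: run the call, compare the result against
    // CNRT_RET_SUCCESS, print file:line plus cnrtGetErrorStr(err), then exit.
    void *allocOrDie(size_t bytes) {
        void *ptr = nullptr;
        checkBangError(cnrtMalloc(&ptr, bytes));
        return ptr;
    }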
bang/bang_kernel_without_config.h (new file):
@@ -0,0 +1,24 @@
#pragma once
#include "bang/bang_runtime.h"
#include "core/kernel.h"

namespace infini {

class BangKernelWithoutConfig : public Kernel {
  public:
    virtual void compute(const Operator &op, const PerfRecord &record,
                         const RuntimeObj *context) const {
        compute(op, context);
    }
    virtual void compute(const Operator &op,
                         const RuntimeObj *context) const = 0;
    // Premise: op is idempotent since it is called multiple times.
    virtual PerfRecord tune(const Operator &op,
                            const RuntimeObj *_context) const {
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
                                              [&]() { context->sync(); }));
    }
};

} // namespace infini
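Concrete BANG kernels only have to implement the two-argument compute; the record-taking compute overload and the default tune (which times compute around a queue sync) are inherited. A sketch of the minimal subclass shape, with a hypothetical class name and an empty body (the element-wise kernels later in this commit follow the same pattern):

    #include "bang/bang_kernel_without_config.h"

    namespace infini {

    // Hypothetical no-op kernel: tune() comes from the base class and times
    // compute() between BangRuntimeObj::sync() calls.
    class NoOpBang : public BangKernelWithoutConfig {
        void compute(const Operator &op,
                     const RuntimeObj *context) const override {
            // A real kernel casts context to const BangRuntimeObj *, pulls raw
            // tensor pointers from op, and issues CNNL calls on
            // context->cnnlHandle().
        }
    };

    } // namespace infini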
bang/bang_runtime.h (new file):
@@ -0,0 +1,71 @@
#pragma once
#include "bang/bang_common.h"
#include "core/runtime.h"

namespace infini {

class BangRuntimeObj : public RuntimeObj {
  private:
    cnnlHandle_t cnnl;
    BangPtr workspace;
    size_t workspaceSize;

  public:
    BangRuntimeObj() : RuntimeObj(Device::BANG) {
        checkBangError(cnrtInit(0));
        cnrtDev_t dev;
        checkBangError(cnrtGetDeviceHandle(&dev, 0));
        checkBangError(cnrtSetCurrentDevice(dev));
        cnrtQueue_t queue;
        checkBangError(cnrtCreateQueue(&queue));

        checkCnnlError(cnnlCreate(&cnnl));
        checkCnnlError(cnnlSetQueue(cnnl, queue));
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 7ll << 30; // 7 GB
        workspace = alloc(workspaceSize);
    }
    virtual ~BangRuntimeObj() {
        dealloc(workspace);
        checkCnnlError(cnnlDestroy(cnnl));
    }

    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
    // double runEvaluation(const Graph &graph, int nWarmups,
    //                      int nEvaluations) const;
    void sync() const;
    BangPtr alloc(size_t size) override {
        void *ptr;
        checkBangError(cnrtMalloc(&ptr, size));
        return ptr;
    }
    void dealloc(void *ptr) override { checkBangError(cnrtFree(ptr)); }
    cnnlHandle_t cnnlHandle() const { return cnnl; }
    BangPtr getWorkspace(size_t size) const {
        IT_ASSERT(size <= workspaceSize);
        return workspace;
    }

    void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
        checkBangError(
            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_HOST2DEV));
    }

    void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
        checkBangError(
            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_DEV2HOST));
    }

    void copyBlobInsideRuntime(void *dst, void *src,
                               size_t bytes) const override {
        checkBangError(
            cnrtMemcpy(dst, src, bytes, CNRT_MEM_TRANS_DIR_PEER2PEER));
    }

  private:
    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
};

} // namespace infini
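The runtime pairs cnrtMalloc/cnrtFree with directional cnrtMemcpy wrappers and holds one 7 GB workspace for its whole lifetime, so kernels never allocate scratch memory per call. A sketch of the host-to-device round trip these overrides enable, assuming make_ref and BangPtr resolve as declared above (the function roundTrip is illustrative):

    #include "bang/bang_runtime.h"
    #include <vector>

    // Hypothetical round trip: host -> MLU -> host through the runtime's
    // blob-copy overrides (cnrtMemcpy with HOST2DEV / DEV2HOST underneath).
    void roundTrip() {
        auto rt = infini::make_ref<infini::BangRuntimeObj>();
        std::vector<float> host(1024, 1.f), back(1024, 0.f);
        const size_t bytes = host.size() * sizeof(float);

        infini::BangPtr dev = rt->alloc(bytes);       // cnrtMalloc
        rt->copyBlobFromCPU(dev, host.data(), bytes); // HOST2DEV
        rt->copyBlobToCPU(back.data(), dev, bytes);   // DEV2HOST
        rt->dealloc(dev);                             // cnrtFree
    }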
core/runtime.h:
@@ -26,7 +26,7 @@ using OpVec = vector<Operator>;

 using VType = uint32_t;

-enum class Device { CPU = 1, CUDA };
+enum class Device { CPU = 1, CUDA, BANG };
 /***************** Forward declaration end *****************/

 class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
@@ -64,6 +64,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
     Blob allocBlob(size_t size);
     bool isCpu() const { return device == Device::CPU; }
     bool isCuda() const { return device == Device::CUDA; }
+    bool isBang() const { return device == Device::BANG; }
     void copyBlob(const TensorObj *dst, const TensorObj *src) const;

   protected:
@@ -99,4 +100,4 @@ class CpuRuntimeObj : public RuntimeObj {
                       size_t bytes) const override;
 };

 } // namespace infini
New file in src/bang/ (BANG runtime implementation):
@@ -0,0 +1,57 @@
#include "bang/bang_runtime.h"
#include "core/kernel.h"
#include "core/perf_engine.h"

namespace infini {

void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
                                    bool profiling = false) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
    double totalTime = 0;
    std::map<OpType, double> opTime;
    std::map<OpType, int> opCnt;
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Float32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);
        if (!perfData && !tune) {
            kernel->compute(op, this);
            continue;
        }

        PerfRecord record;
        if (!perfData) {
            record = kernel->tune(op, this);
            perfEngine.setPerfData(perfKey, record);
        } else
            record = perfData;

        double t = record->time;
        totalTime += t;

        if (profiling) {
            double t = timeit([&]() { kernel->compute(op, record, this); },
                              [&]() { sync(); }, 1, 1);
            op->print();
            printf(" op_time on bang %lf\n", t);
            totalTime += t;
            opTime[op->getOpType()] += t;
            opCnt[op->getOpType()]++;
        }
    }
}

void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
    if (profiling)
        IT_TODO_HALT();
    runWithoutSync(graph, tune, profiling);
    sync();
}

void BangRuntimeObj::sync() const { cnrtSyncDevice(); }

} // namespace infini
New file in src/kernels/bang/ (CNNL element-wise kernels):
@@ -0,0 +1,103 @@
#include "operators/element_wise.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class ElementWiseCnnl : public BangKernelWithoutConfig {
    virtual cnnlOpTensorDesc_t getOpType() const = 0;
    virtual tuple<float, float, float> getAlphBeta() const {
        return {1.f, 1.f, 0.f};
    }
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlOpTensorDescriptor_t opDesc;
        checkCnnlError(cnnlCreateOpTensorDescriptor(&opDesc));
        checkCnnlError(cnnlSetOpTensorDescriptor(
            opDesc, getOpType(), CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN));

        size_t wsSize;
        cnnlGetOpTensorWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                     cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        auto [aAlpha, bAlpha, beta] = getAlphBeta();
        cnnlStatus_t stat = cnnlOpTensor(context->cnnlHandle(), opDesc, &aAlpha,
                                         aDesc, aData, &bAlpha, bDesc, bData,
                                         wsData, wsSize, &beta, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors on BANG does not require a sync, but the CNNL
        // documentation does not state whether a sync is required beforehand.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyOpTensorDescriptor(opDesc));
    }
};

class AddCnnl : public ElementWiseCnnl {
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; }
};

class SubCnnl : public ElementWiseCnnl {
    // Subtraction is expressed as ADD with the second operand scaled by -1.
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; }
    tuple<float, float, float> getAlphBeta() const override {
        return {1.f, -1.f, 0.f};
    }
};

class MulCnnl : public ElementWiseCnnl {
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
};

// class ElementWiseBang : public BangKernelWithoutConfig {
//     void compute(const Operator &_op,
//                  const RuntimeObj *_context) const override {
//         element_wise_kernel(_op);
//     }
// };

REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
                "Add_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
                "Sub_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
                "Mul_cnnl_BANG_Float32");

// REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32,
//                 ElementWiseBang, "Div_Bang_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
//                 ElementWiseBang, "Pow_Bang_Float32");
} // namespace infini
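Note how SubCnnl maps subtraction onto CNNL_OP_TENSOR_ADD: the op-tensor call scales each operand before combining them. Assuming cnnlOpTensor follows the usual alpha/beta convention (as cuDNN's cudnnOpTensor does; the diff itself does not spell this out), it computes

    C = op(aAlpha * A, bAlpha * B) + beta * C

so with the ADD descriptor, getAlphBeta() returning {1, -1, 0} yields C = A - B, while the default {1, 1, 0} leaves AddCnnl and MulCnnl unscaled.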
New file in test/kernels/bang/ (element-wise kernel test):
@@ -0,0 +1,59 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

using ExpectOutput = vector<float>;
template <class T>
void testElementWiseCnnl(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape, const ExpectOutput &ansVec) {
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor acpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    acpu->dataMalloc();
    acpu->setData(generator);

    Tensor bcpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    bcpu->dataMalloc();
    bcpu->setData(generator);

    // Build BANG graph
    Graph g = make_ref<GraphObj>(bangRuntime);
    auto a = g->cloneTensor(acpu);
    auto b = g->cloneTensor(bcpu);
    auto op = g->addOp<T>(a, b, nullptr);

    // allocate BANG memory
    g->dataMalloc();

    // Execute on BANG
    bangRuntime->run(g);

    // clone BANG output to CPU
    auto c = op->getOutput();
    auto ccpu = c->clone(cpuRuntime);
    // check results on CPU
    EXPECT_TRUE(ccpu->equalData(ansVec));
}

TEST(cnnl_ElementWise, run) {
    testElementWiseCnnl<AddObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});
    testElementWiseCnnl<SubObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
    testElementWiseCnnl<MulObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121});
}

} // namespace infini
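The expected vectors follow directly from IncrementalGenerator filling both inputs with 0, 1, ..., 11: Add yields 2i, Sub yields all zeros (the two inputs are identical), and Mul yields i^2. A standalone sanity check of that arithmetic:

    #include <cstdio>

    // Recompute the three ExpectOutput vectors above from first principles.
    int main() {
        for (int i = 0; i < 12; ++i)
            std::printf("i=%2d  add=%2d  sub=%d  mul=%3d\n", i, 2 * i, 0, i * i);
        return 0;
    }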
New file in test/kernels/bang/ (CPU/BANG cross-check test):
@@ -0,0 +1,54 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testOptensor(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // MLU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // CPU
    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
    auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
    cpuGraph->dataMalloc();
    cpuRuntime->run(cpuGraph);
    auto outputCpu = cpuOp->getOutput();
    // Check
    EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}

TEST(cnnl_OpTensor, run) {
    testOptensor<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testOptensor<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testOptensor<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini