fix some MLU kernel registration & the gather op (#210)

* fix: fix bang build/kernel registration | test_onnx

* delete assert float

* fix gather

* fix CMakeLists and Reshape

* fix cncl ops

* add hardsigmoid/hardswish

* fix

* add invalid datatype exception

* fix gather

* fix gather indices type

* fix gather/prelu/hardsigmoid on mlu

* fix format

* fix

---------

Co-authored-by: Bolun Zhang <48948016+Chamberlain0w0@users.noreply.github.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
Co-authored-by: Zhang Bolun <Chamberlain0w0@gmail.com>
zhangyunze 2024-02-01 15:02:02 +08:00 committed by GitHub
parent 4813204a36
commit 67b2bcb7d5
13 changed files with 283 additions and 31 deletions

View File

@@ -245,7 +245,6 @@ if(USE_BANG)
find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
@@ -262,12 +261,13 @@ if(USE_BANG)
# BangC Kernels
################################################################################
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
if (BUILD_DIST)
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
add_compile_definitions(INFINI_USE_CNCL=1)
else()
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
endif()
endif()

View File

@@ -7,6 +7,7 @@ KUNLUN ?= OFF
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
DIST ?= OFF
NNET ?= OFF
FORMAT_ORIGIN ?=
# Docker build options
@@ -29,6 +30,7 @@ CMAKE_OPT += -DUSE_BANG=$(BANG)
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
CMAKE_OPT += -DBUILD_TEST=$(TEST)
CMAKE_OPT += -DBUILD_DIST=$(DIST)
CMAKE_OPT += -DBUILD_NNET=$(NNET)
ifeq ($(INTELCPU), ON)

View File

@@ -3,6 +3,9 @@
#include "cnrt.h"
#include "core/common.h"
#include "core/data_type.h"
#ifdef INFINI_USE_CNCL
#include "cncl.h"
#endif
#define checkBangError(call) \
{ \
@@ -56,7 +59,42 @@ inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
if (dataType == DataType::Bool) {
return CNNL_DTYPE_BOOL;
}
return CNNL_DTYPE_INVALID;
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNNL.");
}
#ifdef INFINI_USE_CNCL
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return cnclFloat32;
}
if (dataType == DataType::Float16) {
return cnclHalf;
}
if (dataType == DataType::Int8) {
return cnclInt8;
}
if (dataType == DataType::Int16) {
return cnclInt16;
}
if (dataType == DataType::Int32) {
return cnclInt32;
}
if (dataType == DataType::UInt8) {
return cnclUint8;
}
if (dataType == DataType::UInt16) {
return cnclUint16;
}
if (dataType == DataType::UInt32) {
return cnclUint32;
}
if (dataType == DataType::BFloat16) {
return cnclBfloat16;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNCL.");
}
#endif
} // namespace infini
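
The new converters are what let the collective kernels further down work on real element types instead of raw bytes. A minimal sketch of the call pattern they enable, assuming any collective operator `op` with one input tensor (illustrative, not taken verbatim from a single kernel):

// Sketch: derive an element count and a CNCL dtype for a collective call.
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();           // elements, not bytes
cnclDataType_t cnclType = cnclDataTypeConvert(op->getDType());
// Unsupported dtypes no longer fall through silently: both converters now stop
// with IT_TODO_HALT_MSG instead of returning CNNL_DTYPE_INVALID.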

View File

@@ -463,13 +463,20 @@ class TestStringMethods(unittest.TestCase):
def test_split(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
split = make_node("Split", ["input"], ["output"], name="split", axis=0)
make_and_import_model(make_graph([split], "split", [input], []))
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
make_and_import_model(make_graph([split], "split", [input], [output]))
def test_split1(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
splitAttr = make_tensor_value_info("split", TensorProto.INT64, [2, 1])
split = make_node("Split", ["input", "split"], ["output"], name="split", axis=1)
make_and_import_model(make_graph([split], "split", [input, splitAttr], []))
splitAttr = make_tensor("split", TensorProto.INT64, [2], [2, 1])
output1 = make_tensor_value_info("output1", TensorProto.FLOAT, [1, 2, 2, 4])
output2 = make_tensor_value_info("output2", TensorProto.FLOAT, [1, 1, 2, 4])
split = make_node(
"Split", ["input", "split"], ["output1", "output2"], name="split", axis=1
)
make_and_import_model(
make_graph([split], "split", [input], [output1, output2], [splitAttr])
)
def test_allBroadcast(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])

View File

@@ -2,12 +2,17 @@
#include "bang/bang_runtime.h"
#include "operators/softmax.h"
#include "operators/unary.h"
#include <iostream>
namespace infini {
class UnaryCnnl : public BangKernelWithoutConfig {
virtual cnnlActivationMode_t getOpType() const = 0;
virtual float getCoef() const = 0;
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
virtual float getSlicedDim() const { return 0.0; }
virtual float getGamma() const { return 0.0; }
virtual float getScale() const { return 0.0; }
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
@@ -30,9 +35,10 @@ class UnaryCnnl : public BangKernelWithoutConfig {
cDim.size(), cDim.data()));
cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
checkCnnlError(cnnlSetActivationDescriptor_v2(
checkCnnlError(cnnlSetActivationDescriptor_v5(
opDesc, getOpType(), CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, getCoef()));
CNNL_NOT_PROPAGATE_NAN, getCoef(), getSlicedDim(), getGamma(),
getScale(), true));
auto [alpha, beta] = getAlphBeta();
cnnlStatus_t stat =
@@ -91,6 +97,10 @@ class PReluCnnl : public BangKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
auto cDim = op->getOutput()->getDims();
if (auto alignSize = aDim.size() - bDim.size(); alignSize) {
bDim.insert(bDim.begin(), alignSize, 1);
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
@@ -215,6 +225,22 @@ class SigmoidCnnl : public UnaryCnnl {
float getCoef() const override { return 0.0; }
};
class HardSwishCnnl : public UnaryCnnl {
cnnlActivationMode_t getOpType() const override {
return CNNL_ACTIVATION_HARDSWISH;
}
float getCoef() const override { return 0.0; }
};
class HardSigmoidCnnl : public UnaryCnnl {
cnnlActivationMode_t getOpType() const override {
return CNNL_ACTIVATION_HARDSIGMOID;
}
float getCoef() const override { return 0.0; }
float getGamma() const override { return 1.f / 6.f; }
float getScale() const override { return 0.5f; }
};
REGISTER_KERNEL(Device::BANG, OpType::Relu, ReluCnnl, "Relu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
@@ -222,5 +248,9 @@ REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
"Softmax_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
"HardSigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
"HardSwish_cnnl_BANG");
}; // namespace infini
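
The switch to cnnlSetActivationDescriptor_v5 is what allows per-op gamma/scale values, which is how HardSigmoid is expressed here. Assuming CNNL follows the common definitions hard_sigmoid(x) = clamp(gamma*x + scale, 0, 1) and hard_swish(x) = x * clamp(x/6 + 1/2, 0, 1), the registrations above correspond to these reference formulas (a sketch for clarity, not code from this commit):

#include <algorithm> // std::max, std::min

// HardSigmoid with gamma = 1/6 and scale = 0.5, the values set by HardSigmoidCnnl.
inline float hardSigmoidRef(float x) {
    return std::max(0.0f, std::min(1.0f, x / 6.0f + 0.5f));
}
// HardSwish under the common definition x * hard_sigmoid(x).
inline float hardSwishRef(float x) {
    return x * hardSigmoidRef(x);
}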

View File

@@ -22,14 +22,15 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
checkBangError(cnrtMalloc(&output_temp,
op->getInputs(0)->getBytes() * world_size));
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / sizeof(uint8_t);
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
CNCL_CHECK(
cnclAllGather(input, output_temp, count, cnclUint8, comm, queue));
CNCL_CHECK(cnclAllGather(input, output_temp, count,
cnclDataTypeConvert(op->getDType()), comm,
queue));
checkBangError(cnrtQueueSync(queue));
for (int i = 0; i < world_size; ++i) {
Tensor output = op->getOutput(i);

View File

@@ -14,14 +14,15 @@ class AllReduceCNCL : public BangKernelWithoutConfig {
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / sizeof(uint8_t);
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// checkBangError(cnrtQueueSync(queue));
CNCL_CHECK(cnclAllReduce(input, output, count, cnclUint8, getRedOp(),
comm, queue));
CNCL_CHECK(cnclAllReduce(input, output, count,
cnclDataTypeConvert(op->getDType()),
getRedOp(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}

View File

@@ -14,15 +14,16 @@ class BroadcastCNCL : public BangKernelWithoutConfig {
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / sizeof(uint8_t);
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// TODO: Using default stream 0 for now.
CNCL_CHECK(cnclBroadcast(input, output, count, cnclUint8, op->getRoot(),
comm, queue));
CNCL_CHECK(cnclBroadcast(input, output, count,
cnclDataTypeConvert(op->getDType()),
op->getRoot(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
};

View File

@@ -12,6 +12,7 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
auto [aAlpha, bAlpha, beta] = getAlphBeta();
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@@ -51,12 +52,12 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
CNNL_NOT_PROPAGATE_NAN));
size_t wsSize;
cnnlGetOpTensorWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
&wsSize);
cnnlGetOpTensorWorkspaceSize_v2(context->cnnlHandle(), opDesc, &aAlpha,
aDesc, aData, &bAlpha, bDesc, bData,
&beta, cDesc, cData, &wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
auto [aAlpha, bAlpha, beta] = getAlphBeta();
cnnlStatus_t stat = cnnlOpTensor(context->cnnlHandle(), opDesc, &aAlpha,
aDesc, aData, &bAlpha, bDesc, bData,
wsData, wsSize, &beta, cDesc, cData);

View File

@@ -23,23 +23,52 @@ class GatherCnnl : public BangKernelWithoutConfig {
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(
cnnlSetTensorDescriptorPointerMode(bDesc, CNNL_POINTER_MODE_HOST));
if (bDim.size() == 0) {
bDim.push_back(1);
}
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_INT32, bDim.size(),
bDim.data()));
BangPtr indices;
DataType indicesDataType = op->getInputs(1)->getDType();
if (indicesDataType == DataType::Int64) {
// cnnlGatherV2 does not support int64 indices
int indicesSize =
op->getInputs(1)->getBytes() / indicesDataType.getSize();
indices = context->getWorkspace(indicesSize * sizeof(int));
cnnlTensorDescriptor_t bDescInt64;
checkCnnlError(cnnlCreateTensorDescriptor(&bDescInt64));
checkCnnlError(cnnlSetTensorDescriptor(
bDescInt64, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT64, bDim.size(),
bDim.data()));
checkCnnlError(cnnlCastDataType(context->cnnlHandle(), bDescInt64,
bData, CNNL_CAST_INT64_TO_INT32,
bDesc, indices));
cnrtQueueSync(context->getBangQueue());
checkCnnlError(cnnlDestroyTensorDescriptor(bDescInt64));
} else if (indicesDataType == DataType::Int32) {
indices = bData;
} else {
IT_TODO_HALT_MSG("Unsupported data type of indices: " +
indicesDataType.toString());
}
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
BangPtr wsData = context->getWorkspace(aDim.size() * 4);
context->copyBlobFromCPU(wsData, aDim.data(), aDim.size() * 4);
BangPtr wsData = context->getWorkspace(aDim.size() * sizeof(int));
context->copyBlobFromCPU(wsData, aDim.data(),
aDim.size() * sizeof(int));
auto axis = op->getAxis();
cnnlStatus_t stat =
cnnlGatherV2(context->cnnlHandle(), axis, aDesc, aData,
(int *)wsData, bDesc, (int *)bData, cDesc, cData);
reinterpret_cast<const int *>(wsData), bDesc,
reinterpret_cast<const int *>(indices), cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
return;
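
cnnlGatherV2 only accepts int32 indices, so the kernel above stages int64 indices through a workspace cast (CNNL_CAST_INT64_TO_INT32) before the gather itself. For reference, a plain CPU sketch of the same narrowing plus a gather along axis 0 of a rows x cols matrix (illustrative only; the real kernel handles arbitrary rank through cnnlGatherV2):

#include <cstdint>
#include <vector>

// Gather whole rows of a (rows x cols) matrix; indices arrive as int64 and are
// narrowed to int32 first, mirroring the cast done on the MLU.
std::vector<float> gatherRowsRef(const std::vector<float> &input, int cols,
                                 const std::vector<int64_t> &indices) {
    std::vector<float> out;
    out.reserve(indices.size() * cols);
    for (int64_t idx64 : indices) {
        int idx = static_cast<int>(idx64); // equivalent of the int64->int32 cast
        for (int c = 0; c < cols; ++c)
            out.push_back(input[static_cast<size_t>(idx) * cols + c]);
    }
    return out;
}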

View File

@@ -14,8 +14,8 @@ class CopyBang : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT8,
dim.size() * op->getDType().getSize(), dim.data()));
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlStatus_t stat =
cnnlCopy(context->cnnlHandle(), aDesc, inData, aDesc, outData);
if (stat != CNNL_STATUS_SUCCESS)
@@ -28,5 +28,7 @@ class CopyBang : public BangKernelWithoutConfig {
REGISTER_KERNEL(Device::BANG, OpType::Reshape, CopyBang, "Reshape_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Flatten, CopyBang, "Flatten_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Identity, CopyBang, "Identity_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Squeeze, CopyBang, "Squeeze_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Unsqueeze, CopyBang, "Unsqueeze_BANG");
} // namespace infini

View File

@@ -29,7 +29,8 @@ class TrigonCnnl : public BangKernelWithoutConfig {
cnnlTrigonDescriptor_t opDesc;
checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));
checkCnnlError(
cnnlSetTrigonDescriptor_v2(opDesc, getOpType(), getPrefer()));
cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc,
aDesc, aData, cDesc, cData);

View File

@@ -0,0 +1,139 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/gather.h"
#include "test.h"
namespace infini {
/*
test1:
input = [
[1, 2],
[3, 4],
[5, 6],
]
indices = [
[0, 1],
[1, 2],
]
output = [
[
[1, 2],
[3, 4],
],
[
[3, 4],
[5, 6],
],
]
axis=0
*/
/*
test2
input = [
[0, 1, 2],
[3, 4, 5],
[6, 7, 8],
]
indices = [
[0, 2],
]
axis = 1,
output = [
[[0, 2]],
[[3, 5]],
[[6, 8]],
]
*/
/*
test3
input=[[[ 0, 1],
[ 2, 3],
[ 4, 5],
[ 6, 7]],
[[ 8, 9],
[10, 11],
[12, 13],
[14, 15]]] //(2,4,2)
indices=[[0],[3],[1]] //(3,1)
axis=1
output=[[[[ 0, 1]],
         [[ 6, 7]],
         [[ 2, 3]]],
        [[[ 8, 9]],
         [[14, 15]],
         [[10, 11]]]] //(2,3,1,2)
*/
TEST(Gather, Mlu) {
{
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto input = gCpu->addTensor({3, 2}, DataType::Float32);
auto index = gCpu->addTensor({2, 2}, DataType::Int32);
gCpu->dataMalloc();
input->copyin(vector<float>{1, 2, 3, 4, 5, 6});
index->copyin(vector<int>{0, 1, 1, 2});
auto bangRuntime = make_ref<BangRuntimeObj>();
Graph gMlu = make_ref<GraphObj>(bangRuntime);
auto inputMlu = gMlu->cloneTensor(input);
auto indexMlu = gMlu->cloneTensor(index);
auto op = gMlu->addOp<GatherObj>(inputMlu, indexMlu, nullptr, 0);
gMlu->dataMalloc();
inputMlu->copyin(vector<float>{1, 2, 3, 4, 5, 6});
indexMlu->copyin(vector<int>{0, 1, 1, 2});
bangRuntime->run(gMlu);
// copy output from MLU to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput());
EXPECT_TRUE(oCpu->equalData(vector<float>{1, 2, 3, 4, 3, 4, 5, 6}));
}
{
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto input = gCpu->addTensor({3, 3}, DataType::Float32);
auto index = gCpu->addTensor({1, 2}, DataType::Int32);
gCpu->dataMalloc();
input->setData(IncrementalGenerator());
index->copyin(vector<int>{0, 2});
auto bangRuntime = make_ref<BangRuntimeObj>();
Graph gMlu = make_ref<GraphObj>(bangRuntime);
auto inputMlu = gMlu->cloneTensor(input);
auto indexMlu = gMlu->cloneTensor(index);
auto op = gMlu->addOp<GatherObj>(inputMlu, indexMlu, nullptr, 1);
gMlu->dataMalloc();
inputMlu->setData(IncrementalGenerator());
indexMlu->copyin(vector<int>{0, 2});
bangRuntime->run(gMlu);
// copy output from MLU to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput());
EXPECT_TRUE(oCpu->equalData(vector<float>{0, 2, 3, 5, 6, 8}));
}
{
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto input = gCpu->addTensor({2, 4, 2}, DataType::Float32);
auto index = gCpu->addTensor({3, 1}, DataType::Int32);
gCpu->dataMalloc();
input->setData(IncrementalGenerator());
index->copyin(vector<int>{0, 3, 1});
auto bangRuntime = make_ref<BangRuntimeObj>();
Graph gMlu = make_ref<GraphObj>(bangRuntime);
auto inputMlu = gMlu->cloneTensor(input);
auto indexMlu = gMlu->cloneTensor(index);
auto op = gMlu->addOp<GatherObj>(inputMlu, indexMlu, nullptr, 1);
gMlu->dataMalloc();
inputMlu->setData(IncrementalGenerator());
indexMlu->copyin(vector<int>{0, 3, 1});
bangRuntime->run(gMlu);
// copy output from MLU to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput());
EXPECT_TRUE(oCpu->equalData(
vector<float>{0, 1, 6, 7, 2, 3, 8, 9, 14, 15, 10, 11}));
}
}
} // namespace infini