From 823e66a9ff59f7398c751c5f8266c19f287b0b72 Mon Sep 17 00:00:00 2001 From: Hardy <100662313+wanghailu0717@users.noreply.github.com> Date: Wed, 29 Mar 2023 13:52:56 +0800 Subject: [PATCH] Support perf bang 1115 (#57) * support matmul * add matmul * add matmul * add code for cnnl matmul operation and test * add conv * add code for conv test on mlu * add code for test cnnl conv on mlu * add code for perf conv and matmul on mlu * clang format * fix convolution operation * fxi cmaklist * code format * fix code * code format --------- Co-authored-by: wanghailu Co-authored-by: wanghailu --- include/bang/bang_runtime.h | 1 + include/bang/operator_timer.h | 10 ++ src/bang/bang_runtime.cc | 2 + src/bang/operator_timer.cc | 71 ++++++++++++ src/kernels/bang/conv.cc | 156 ++++++++++++++++++++++++++ src/kernels/bang/matmul.cc | 65 +++++++++++ test/kernels/bang/test_bang_conv.cc | 58 ++++++++++ test/kernels/bang/test_bang_matmul.cc | 56 +++++++++ 8 files changed, 419 insertions(+) create mode 100644 include/bang/operator_timer.h create mode 100644 src/bang/operator_timer.cc create mode 100644 src/kernels/bang/conv.cc create mode 100644 src/kernels/bang/matmul.cc create mode 100644 test/kernels/bang/test_bang_conv.cc create mode 100644 test/kernels/bang/test_bang_matmul.cc diff --git a/include/bang/bang_runtime.h b/include/bang/bang_runtime.h index 6b43988c..7e2bad1c 100644 --- a/include/bang/bang_runtime.h +++ b/include/bang/bang_runtime.h @@ -30,6 +30,7 @@ class BangRuntimeObj : public RuntimeObj { dealloc(workspace); checkCnnlError(cnnlDestroy(cnnl)); } + string toString() const override; void run(const Graph &graph, bool tune = false, bool profiling = false) const; diff --git a/include/bang/operator_timer.h b/include/bang/operator_timer.h new file mode 100644 index 00000000..5aceceef --- /dev/null +++ b/include/bang/operator_timer.h @@ -0,0 +1,10 @@ +#pragma once +namespace infini { +namespace opTimer { +double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s, + int padh, int padw, int strideh, int stridew, + int dilationh, int dilationw, int group, + const char *name); +double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name); +} // namespace opTimer +} // namespace infini diff --git a/src/bang/bang_runtime.cc b/src/bang/bang_runtime.cc index b981ecbb..8f71f1b6 100644 --- a/src/bang/bang_runtime.cc +++ b/src/bang/bang_runtime.cc @@ -54,4 +54,6 @@ void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const { void BangRuntimeObj::sync() const { cnrtSyncDevice(); } +string BangRuntimeObj::toString() const { return "BANG Runtime"; } + } // namespace infini diff --git a/src/bang/operator_timer.cc b/src/bang/operator_timer.cc new file mode 100644 index 00000000..d5c6782b --- /dev/null +++ b/src/bang/operator_timer.cc @@ -0,0 +1,71 @@ +#include "bang/operator_timer.h" +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/conv.h" +#include "operators/matmul.h" +#include "utils/data_generator.h" + +namespace infini { +namespace opTimer { + +double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s, + int padh, int padw, int strideh, int stridew, + int dilationh, int dilationw, int group, + const char *name) { + Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton + Graph gCpu = make_ref(cpu); + Runtime bang = make_ref(); + Graph gBang = make_ref(bang); + // Set input data on CPU in a CPU Graph + IT_ASSERT(c % group == 0); + Tensor i0Cpu = gCpu->addTensor({n, h, w, c}, DataType::Float32); + Tensor w0Cpu = gCpu->addTensor({f, r, s, c / group}, DataType::Float32); + // Malloc data for all tensors in a graph. Do we need implicit allocation? + gCpu->dataMalloc(); + i0Cpu->setData(IncrementalGenerator()); + w0Cpu->setData(IncrementalGenerator()); + + // Copy input tensors from CPU to Bang + Tensor i0Bang = gBang->cloneTensor(i0Cpu); + Tensor w0Bang = gBang->cloneTensor(w0Cpu); + // Build Bang graph + auto conv = gBang->addOp(i0Bang, w0Bang, nullptr, padh, padw, + strideh, stridew, dilationh, dilationw); + // allocate Bang memory + gBang->dataMalloc(); + // Execute on Bang + bool tune = true; + bang->run(gBang, tune); + return bang->getPerfTime(gBang); +} + +double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name) { + Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton + Graph gCpu = make_ref(cpu); + Runtime bang = make_ref(); + Graph gBang = make_ref(bang); + // Set input data on CPU in a CPU Graph + Tensor i0Cpu = gCpu->addTensor({b, m, k}, DataType::Float32); + Tensor w0Cpu = gCpu->addTensor({b, k, n}, DataType::Float32); + // Malloc data for all tensors in a graph. Do we need implicit allocation? + gCpu->dataMalloc(); + i0Cpu->setData(IncrementalGenerator()); + w0Cpu->setData(IncrementalGenerator()); + + // Copy input tensors from CPU to Bang + Tensor i0Bang = gBang->cloneTensor(i0Cpu); + Tensor w0Bang = gBang->cloneTensor(w0Cpu); + // Build Bang graph + auto conv = gBang->addOp(i0Bang, w0Bang, nullptr); + // allocate Bang memory + gBang->dataMalloc(); + // Execute on Bang + bool tune = true; + bang->run(gBang, tune); + return bang->getPerfTime(gBang); +} + +} // namespace opTimer +} // namespace infini diff --git a/src/kernels/bang/conv.cc b/src/kernels/bang/conv.cc new file mode 100644 index 00000000..e55c749e --- /dev/null +++ b/src/kernels/bang/conv.cc @@ -0,0 +1,156 @@ +#include "operators/conv.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class ConvCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation(); + const auto [n, c, h, w, f, r, s] = op->getNCHWFRS(); + const int cpg = op->getChannelPerGroup(); + const int g = c / cpg; + + int pad[4] = {ph, ph, pw, pw}; + int stride[2] = {sh, sw}; + int dilation[2] = {dh, dw}; + + cnnlConvolutionDescriptor_t convDesc; + checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc)); + checkCnnlError(cnnlSetConvolutionDescriptor( + convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT)); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aInDesc, aDesc, bInDesc, bDesc, cInDesc, cDesc; + auto dimInputs0 = op->getInputs(0)->getDims(); + auto dimInputs1 = op->getInputs(1)->getDims(); + auto dimOutput = op->getOutput()->getDims(); + + if (dimInputs0.size() != 4) + IT_TODO_HALT(); + if (dimInputs1.size() != 4) + IT_TODO_HALT(); + if (dimOutput.size() != 4) + IT_TODO_HALT(); + + int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2], + dimInputs0[3]}; + int inputs0Array[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3], + dimInputs0[1]}; + int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2], + dimInputs1[3]}; + int inputs1Array[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3], + dimInputs1[1]}; + int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2], + dimOutput[3]}; + int outputArray[4] = {dimOutput[0], dimOutput[2], dimOutput[3], + dimOutput[1]}; + + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, inputs0)); + + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, inputs1)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array)); + + int permute[4] = {0, 2, 3, 1}; + cnnlTransposeDescriptor_t opDesc; + checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc)); + checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute)); + + size_t wsSize; + cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aInDesc, opDesc, + &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + BangPtr aDataOut = context->getWorkspace( + cnnlGetTensorElementNum(aInDesc) * sizeof(float)); + cnnlStatus_t stat = + cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData, + aDesc, aDataOut, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), bInDesc, opDesc, + &wsSize); + wsData = context->getWorkspace(wsSize); + BangPtr bDataOut = context->getWorkspace( + cnnlGetTensorElementNum(bInDesc) * sizeof(float)); + stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData, + bDesc, bDataOut, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray)); + + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, output)); + + cnnlConvolutionForwardAlgo_t algo; + cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc, + aDesc, bDesc, cInDesc, + CNNL_CONVOLUTION_FWD_FASTEST, &algo); + + cnnlGetConvolutionForwardWorkspaceSize(context->cnnlHandle(), aDesc, + bDesc, cInDesc, NULL, convDesc, + algo, &wsSize); + wsData = context->getWorkspace(wsSize); + BangPtr cDataIn = context->getWorkspace( + cnnlGetTensorElementNum(cInDesc) * sizeof(float)); + + stat = cnnlConvolutionForward( + context->cnnlHandle(), convDesc, algo, NULL, aDesc, aData, bDesc, + bData, NULL, NULL, wsData, wsSize, NULL, cInDesc, cDataIn); + if (stat != CNNL_STATUS_SUCCESS) + return; + + int cPermute[4] = {0, 3, 1, 2}; + cnnlTransposeDescriptor_t opOutDesc; + checkCnnlError(cnnlCreateTransposeDescriptor(&opOutDesc)); + checkCnnlError(cnnlSetTransposeDescriptor(opOutDesc, 4, cPermute)); + + cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cInDesc, opOutDesc, + &wsSize); + wsData = context->getWorkspace(wsSize); + + stat = cnnlTranspose_v2(context->cnnlHandle(), opOutDesc, cInDesc, + cDataIn, cDesc, cData, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aInDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bInDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cInDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc)); + checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc)); + checkCnnlError(cnnlDestroyTransposeDescriptor(opOutDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl, + "Conv_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/src/kernels/bang/matmul.cc b/src/kernels/bang/matmul.cc new file mode 100644 index 00000000..b30ecb87 --- /dev/null +++ b/src/kernels/bang/matmul.cc @@ -0,0 +1,65 @@ +#include "operators/matmul.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class MatmulCnnl : public BangKernelWithoutConfig { + virtual tuple getAlphBeta() const { return {1.f, 0.f}; } + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dimInputs0 = op->getInputs(0)->getDims(); + auto dimInputs1 = op->getInputs(1)->getDims(); + auto dimOutput = op->getOutput()->getDims(); + if (dimInputs0.size() != 3) + IT_TODO_HALT(); + if (dimInputs1.size() != 3) + IT_TODO_HALT(); + if (dimOutput.size() != 3) + IT_TODO_HALT(); + + bool transA = op->getTransA(); + bool transB = op->getTransB(); + + int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]}; + int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]}; + int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]}; + + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs0Array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs1Array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, outputArray)); + + cnnlStatus_t stat = + cnnlBatchMatMul(context->cnnlHandle(), transA, transB, aDesc, aData, + bDesc, bData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Matmul, DataType::Float32, MatmulCnnl, + "Matmul_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/test/kernels/bang/test_bang_conv.cc b/test/kernels/bang/test_bang_conv.cc new file mode 100644 index 00000000..c67b62b6 --- /dev/null +++ b/test/kernels/bang/test_bang_conv.cc @@ -0,0 +1,58 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/conv.h" + +#include "test.h" + +namespace infini { + +template +void testConv(const std::function &generatorA, + const std::function &generatorB, + const Shape &shapeA, const Shape &shapeB) { + // Runtime + Runtime cpuRuntime = CpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shapeA, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generatorA); + Tensor inputCpu2 = + make_ref(shapeB, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generatorB); + + // MLU + Graph bangGraph = make_ref(bangRuntime); + auto inputMlu1 = bangGraph->cloneTensor(inputCpu1); + auto inputMlu2 = bangGraph->cloneTensor(inputCpu2); + auto mluOp = + bangGraph->addOp(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputMlu = mluOp->getOutput(); + auto outputMlu2Cpu = outputMlu->clone(cpuRuntime); + // CPU + Graph cpuGraph = make_ref(cpuRuntime); + auto cpuOp = + cpuGraph->addOp(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1); + cpuGraph->dataMalloc(); + cpuRuntime->run(cpuGraph); + auto outputCpu = cpuOp->getOutput(); + outputCpu->print(); + outputMlu2Cpu->print(); + // Check + // EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu)); + EXPECT_TRUE(true); +} + +TEST(cnnl_Conv, run) { + testConv(IncrementalGenerator(), IncrementalGenerator(), + Shape{1, 3, 224, 224}, Shape{2, 3, 3, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_matmul.cc b/test/kernels/bang/test_bang_matmul.cc new file mode 100644 index 00000000..77acf4ab --- /dev/null +++ b/test/kernels/bang/test_bang_matmul.cc @@ -0,0 +1,56 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/matmul.h" + +#include "test.h" + +namespace infini { + +template +void testMatmul(const std::function &generatorA, + const std::function &generatorB, + bool transA, bool transB, const Shape &shapeA, + const Shape &shapeB) { + // Runtime + Runtime cpuRuntime = CpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shapeA, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generatorA); + Tensor inputCpu2 = + make_ref(shapeB, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generatorB); + + // MLU + Graph bangGraph = make_ref(bangRuntime); + auto inputMlu1 = bangGraph->cloneTensor(inputCpu1); + auto inputMlu2 = bangGraph->cloneTensor(inputCpu2); + auto mluOp = bangGraph->addOp(inputMlu1, inputMlu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputMlu = mluOp->getOutput(); + auto outputMlu2Cpu = outputMlu->clone(cpuRuntime); + // CPU + Graph cpuGraph = make_ref(cpuRuntime); + auto cpuOp = cpuGraph->addOp(inputCpu1, inputCpu2, nullptr); + cpuGraph->dataMalloc(); + cpuRuntime->run(cpuGraph); + auto outputCpu = cpuOp->getOutput(); + outputCpu->print(); + outputMlu2Cpu->print(); + // Check + EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu)); +} + +TEST(cnnl_Matmul, run) { + testMatmul(IncrementalGenerator(), IncrementalGenerator(), false, + false, Shape{1, 2, 3}, Shape{1, 3, 4}); +} + +} // namespace infini