Support perf bang 1115 (#57)

* support matmul * add matmul * add matmul * add code for cnnl matmul operation and test * add conv * add code for conv test on mlu * add code for test cnnl conv on mlu * add code for perf conv and matmul on mlu * clang format * fix convolution operation * fxi cmaklist * code format * fix code * code format --------- Co-authored-by: wanghailu <wanghailu@qiyuanlab.com> Co-authored-by: wanghailu <wanghailu0717@163.com>
2023-03-29 13:52:56 +08:00 · 2023-03-29 13:52:56 +08:00 · 823e66a9ff
parent 86ec4036ce
commit 823e66a9ff
8 changed files with 419 additions and 0 deletions
--- a/include/bang/bang_runtime.h
+++ b/include/bang/bang_runtime.h
@ -30,6 +30,7 @@ class BangRuntimeObj : public RuntimeObj {
        dealloc(workspace);
        checkCnnlError(cnnlDestroy(cnnl));
    }
+    string toString() const override;

    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
--- a/include/bang/operator_timer.h
+++ b/include/bang/operator_timer.h
@ -0,0 +1,10 @@
+#pragma once
+namespace infini {
+namespace opTimer {
+double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
+                       int padh, int padw, int strideh, int stridew,
+                       int dilationh, int dilationw, int group,
+                       const char *name);
+double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name);
+} // namespace opTimer
+} // namespace infini
--- a/src/bang/bang_runtime.cc
+++ b/src/bang/bang_runtime.cc
@ -54,4 +54,6 @@ void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {

 void BangRuntimeObj::sync() const { cnrtSyncDevice(); }

+string BangRuntimeObj::toString() const { return "BANG Runtime"; }
+
 } // namespace infini
--- a/src/bang/operator_timer.cc
+++ b/src/bang/operator_timer.cc
@ -0,0 +1,71 @@
+#include "bang/operator_timer.h"
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/conv.h"
+#include "operators/matmul.h"
+#include "utils/data_generator.h"
+
+namespace infini {
+namespace opTimer {
+
+double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
+                       int padh, int padw, int strideh, int stridew,
+                       int dilationh, int dilationw, int group,
+                       const char *name) {
+    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
+    Graph gCpu = make_ref<GraphObj>(cpu);
+    Runtime bang = make_ref<BangRuntimeObj>();
+    Graph gBang = make_ref<GraphObj>(bang);
+    // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
+    Tensor i0Cpu = gCpu->addTensor({n, h, w, c}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, r, s, c / group}, DataType::Float32);
+    // Malloc data for all tensors in a graph. Do we need implicit allocation?
+    gCpu->dataMalloc();
+    i0Cpu->setData(IncrementalGenerator());
+    w0Cpu->setData(IncrementalGenerator());
+
+    // Copy input tensors from CPU to Bang
+    Tensor i0Bang = gBang->cloneTensor(i0Cpu);
+    Tensor w0Bang = gBang->cloneTensor(w0Cpu);
+    // Build Bang graph
+    auto conv = gBang->addOp<ConvObj>(i0Bang, w0Bang, nullptr, padh, padw,
+                                      strideh, stridew, dilationh, dilationw);
+    // allocate Bang memory
+    gBang->dataMalloc();
+    // Execute on Bang
+    bool tune = true;
+    bang->run(gBang, tune);
+    return bang->getPerfTime(gBang);
+}
+
+double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name) {
+    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
+    Graph gCpu = make_ref<GraphObj>(cpu);
+    Runtime bang = make_ref<BangRuntimeObj>();
+    Graph gBang = make_ref<GraphObj>(bang);
+    // Set input data on CPU in a CPU Graph
+    Tensor i0Cpu = gCpu->addTensor({b, m, k}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({b, k, n}, DataType::Float32);
+    // Malloc data for all tensors in a graph. Do we need implicit allocation?
+    gCpu->dataMalloc();
+    i0Cpu->setData(IncrementalGenerator());
+    w0Cpu->setData(IncrementalGenerator());
+
+    // Copy input tensors from CPU to Bang
+    Tensor i0Bang = gBang->cloneTensor(i0Cpu);
+    Tensor w0Bang = gBang->cloneTensor(w0Cpu);
+    // Build Bang graph
+    auto conv = gBang->addOp<MatmulObj>(i0Bang, w0Bang, nullptr);
+    // allocate Bang memory
+    gBang->dataMalloc();
+    // Execute on Bang
+    bool tune = true;
+    bang->run(gBang, tune);
+    return bang->getPerfTime(gBang);
+}
+
+} // namespace opTimer
+} // namespace infini
--- a/src/kernels/bang/conv.cc
+++ b/src/kernels/bang/conv.cc
@ -0,0 +1,156 @@
+#include "operators/conv.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class ConvCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ConvObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        const int cpg = op->getChannelPerGroup();
+        const int g = c / cpg;
+
+        int pad[4] = {ph, ph, pw, pw};
+        int stride[2] = {sh, sw};
+        int dilation[2] = {dh, dw};
+
+        cnnlConvolutionDescriptor_t convDesc;
+        checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
+        checkCnnlError(cnnlSetConvolutionDescriptor(
+            convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aInDesc, aDesc, bInDesc, bDesc, cInDesc, cDesc;
+        auto dimInputs0 = op->getInputs(0)->getDims();
+        auto dimInputs1 = op->getInputs(1)->getDims();
+        auto dimOutput = op->getOutput()->getDims();
+
+        if (dimInputs0.size() != 4)
+            IT_TODO_HALT();
+        if (dimInputs1.size() != 4)
+            IT_TODO_HALT();
+        if (dimOutput.size() != 4)
+            IT_TODO_HALT();
+
+        int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
+                          dimInputs0[3]};
+        int inputs0Array[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
+                               dimInputs0[1]};
+        int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
+                          dimInputs1[3]};
+        int inputs1Array[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
+                               dimInputs1[1]};
+        int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
+                         dimOutput[3]};
+        int outputArray[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
+                              dimOutput[1]};
+
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, inputs0));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, inputs1));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
+
+        int permute[4] = {0, 2, 3, 1};
+        cnnlTransposeDescriptor_t opDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
+        checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
+
+        size_t wsSize;
+        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aInDesc, opDesc,
+                                      &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+        BangPtr aDataOut = context->getWorkspace(
+            cnnlGetTensorElementNum(aInDesc) * sizeof(float));
+        cnnlStatus_t stat =
+            cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
+                             aDesc, aDataOut, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), bInDesc, opDesc,
+                                      &wsSize);
+        wsData = context->getWorkspace(wsSize);
+        BangPtr bDataOut = context->getWorkspace(
+            cnnlGetTensorElementNum(bInDesc) * sizeof(float));
+        stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
+                                bDesc, bDataOut, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, output));
+
+        cnnlConvolutionForwardAlgo_t algo;
+        cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
+                                           aDesc, bDesc, cInDesc,
+                                           CNNL_CONVOLUTION_FWD_FASTEST, &algo);
+
+        cnnlGetConvolutionForwardWorkspaceSize(context->cnnlHandle(), aDesc,
+                                               bDesc, cInDesc, NULL, convDesc,
+                                               algo, &wsSize);
+        wsData = context->getWorkspace(wsSize);
+        BangPtr cDataIn = context->getWorkspace(
+            cnnlGetTensorElementNum(cInDesc) * sizeof(float));
+
+        stat = cnnlConvolutionForward(
+            context->cnnlHandle(), convDesc, algo, NULL, aDesc, aData, bDesc,
+            bData, NULL, NULL, wsData, wsSize, NULL, cInDesc, cDataIn);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        int cPermute[4] = {0, 3, 1, 2};
+        cnnlTransposeDescriptor_t opOutDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&opOutDesc));
+        checkCnnlError(cnnlSetTransposeDescriptor(opOutDesc, 4, cPermute));
+
+        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cInDesc, opOutDesc,
+                                      &wsSize);
+        wsData = context->getWorkspace(wsSize);
+
+        stat = cnnlTranspose_v2(context->cnnlHandle(), opOutDesc, cInDesc,
+                                cDataIn, cDesc, cData, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aInDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bInDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cInDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(opOutDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl,
+                "Conv_cnnl_BANG_Float32");
+}; // namespace infini
--- a/src/kernels/bang/matmul.cc
+++ b/src/kernels/bang/matmul.cc
@ -0,0 +1,65 @@
+#include "operators/matmul.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class MatmulCnnl : public BangKernelWithoutConfig {
+    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<MatmulObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dimInputs0 = op->getInputs(0)->getDims();
+        auto dimInputs1 = op->getInputs(1)->getDims();
+        auto dimOutput = op->getOutput()->getDims();
+        if (dimInputs0.size() != 3)
+            IT_TODO_HALT();
+        if (dimInputs1.size() != 3)
+            IT_TODO_HALT();
+        if (dimOutput.size() != 3)
+            IT_TODO_HALT();
+
+        bool transA = op->getTransA();
+        bool transB = op->getTransB();
+
+        int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]};
+        int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]};
+        int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]};
+
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs0Array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs1Array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, outputArray));
+
+        cnnlStatus_t stat =
+            cnnlBatchMatMul(context->cnnlHandle(), transA, transB, aDesc, aData,
+                            bDesc, bData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Matmul, DataType::Float32, MatmulCnnl,
+                "Matmul_cnnl_BANG_Float32");
+}; // namespace infini
--- a/test/kernels/bang/test_bang_conv.cc
+++ b/test/kernels/bang/test_bang_conv.cc
@ -0,0 +1,58 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/conv.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
+              const std::function<void(void *, size_t, DataType)> &generatorB,
+              const Shape &shapeA, const Shape &shapeB) {
+    // Runtime
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generatorA);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generatorB);
+
+    // MLU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
+    auto mluOp =
+        bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputMlu = mluOp->getOutput();
+    auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
+    // CPU
+    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
+    auto cpuOp =
+        cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
+    cpuGraph->dataMalloc();
+    cpuRuntime->run(cpuGraph);
+    auto outputCpu = cpuOp->getOutput();
+    outputCpu->print();
+    outputMlu2Cpu->print();
+    // Check
+    // EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
+    EXPECT_TRUE(true);
+}
+
+TEST(cnnl_Conv, run) {
+    testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
+                      Shape{1, 3, 224, 224}, Shape{2, 3, 3, 3});
+}
+
+} // namespace infini
--- a/test/kernels/bang/test_bang_matmul.cc
+++ b/test/kernels/bang/test_bang_matmul.cc
@ -0,0 +1,56 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/matmul.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
+                const std::function<void(void *, size_t, DataType)> &generatorB,
+                bool transA, bool transB, const Shape &shapeA,
+                const Shape &shapeB) {
+    // Runtime
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generatorA);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generatorB);
+
+    // MLU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
+    auto mluOp = bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputMlu = mluOp->getOutput();
+    auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
+    // CPU
+    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
+    auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
+    cpuGraph->dataMalloc();
+    cpuRuntime->run(cpuGraph);
+    auto outputCpu = cpuOp->getOutput();
+    outputCpu->print();
+    outputMlu2Cpu->print();
+    // Check
+    EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
+}
+
+TEST(cnnl_Matmul, run) {
+    testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
+                          false, Shape{1, 2, 3}, Shape{1, 3, 4});
+}
+
+} // namespace infini