Support perf bang 1115 (#57)

* support matmul

* add matmul

* add matmul

* add code for cnnl matmul operation and test

* add conv

* add code for conv test on mlu

* add code for test cnnl conv on mlu

* add code for perf conv and matmul on mlu

* clang format

* fix convolution operation

* fix CMakeLists

* code format

* fix code

* code format

---------

Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Commit 823e66a9ff (parent 86ec4036ce), authored by Hardy on 2023-03-29 13:52:56 +08:00, committed by GitHub.
8 changed files with 419 additions and 0 deletions

@@ -30,6 +30,7 @@ class BangRuntimeObj : public RuntimeObj {
dealloc(workspace);
checkCnnlError(cnnlDestroy(cnnl));
}
string toString() const override;
void run(const Graph &graph, bool tune = false,
bool profiling = false) const;

@@ -0,0 +1,10 @@
#pragma once
namespace infini {
namespace opTimer {
double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name);
double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name);
} // namespace opTimer
} // namespace infini
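
As a usage sketch only (not part of this commit; the shapes, the main() driver, and the printf formatting below are illustrative assumptions), the timer API declared above could be exercised from a small standalone program:

#include "bang/operator_timer.h"
#include <cstdio>

int main() {
    using namespace infini::opTimer;
    // n=1, c=3, h=w=224, f=64, r=s=3, pad=1, stride=1, dilation=1, group=1
    double convTime =
        getPerfConvCnnl(1, 3, 224, 224, 64, 3, 3, 1, 1, 1, 1, 1, 1, 1, "conv3x3");
    // b=1, m=64, n=128, k=256
    double matmulTime = getPerfMatmulCnnl(1, 64, 128, 256, "matmul");
    std::printf("conv: %f, matmul: %f\n", convTime, matmulTime);
    return 0;
}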

@@ -54,4 +54,6 @@ void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
string BangRuntimeObj::toString() const { return "BANG Runtime"; }
} // namespace infini

@@ -0,0 +1,71 @@
#include "bang/operator_timer.h"
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#include "utils/data_generator.h"
namespace infini {
namespace opTimer {
double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name) {
Runtime cpu = CpuRuntimeObj::getInstance(); // CPU runtime is a singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime bang = make_ref<BangRuntimeObj>();
Graph gBang = make_ref<GraphObj>(bang);
// Set input data on CPU in a CPU Graph
IT_ASSERT(c % group == 0);
Tensor i0Cpu = gCpu->addTensor({n, h, w, c}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({f, r, s, c / group}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to Bang
Tensor i0Bang = gBang->cloneTensor(i0Cpu);
Tensor w0Bang = gBang->cloneTensor(w0Cpu);
// Build Bang graph
auto conv = gBang->addOp<ConvObj>(i0Bang, w0Bang, nullptr, padh, padw,
strideh, stridew, dilationh, dilationw);
// allocate Bang memory
gBang->dataMalloc();
// Execute on Bang
bool tune = true;
bang->run(gBang, tune);
return bang->getPerfTime(gBang);
}
double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name) {
Runtime cpu = CpuRuntimeObj::getInstance(); // CPU runtime is a singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime bang = make_ref<BangRuntimeObj>();
Graph gBang = make_ref<GraphObj>(bang);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({b, m, k}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({b, k, n}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to Bang
Tensor i0Bang = gBang->cloneTensor(i0Cpu);
Tensor w0Bang = gBang->cloneTensor(w0Cpu);
// Build Bang graph
auto matmul = gBang->addOp<MatmulObj>(i0Bang, w0Bang, nullptr);
// allocate Bang memory
gBang->dataMalloc();
// Execute on Bang
bool tune = true;
bang->run(gBang, tune);
return bang->getPerfTime(gBang);
}
} // namespace opTimer
} // namespace infini

src/kernels/bang/conv.cc (new file, 156 lines)

@@ -0,0 +1,156 @@
#include "operators/conv.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
namespace infini {
class ConvCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
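// cnnlConvolutionForward is fed NHWC tensors here: the NCHW inputs are first
// transposed to NHWC, the convolution result is produced in NHWC, and a final
// transpose converts it back to the NCHW output layout.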
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();
const int g = c / cpg;
int pad[4] = {ph, ph, pw, pw};
int stride[2] = {sh, sw};
int dilation[2] = {dh, dw};
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
cnnlTensorDescriptor_t aInDesc, aDesc, bInDesc, bDesc, cInDesc, cDesc;
auto dimInputs0 = op->getInputs(0)->getDims();
auto dimInputs1 = op->getInputs(1)->getDims();
auto dimOutput = op->getOutput()->getDims();
if (dimInputs0.size() != 4)
IT_TODO_HALT();
if (dimInputs1.size() != 4)
IT_TODO_HALT();
if (dimOutput.size() != 4)
IT_TODO_HALT();
int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
dimInputs0[3]};
int inputs0Array[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
dimInputs0[1]};
int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
dimInputs1[3]};
int inputs1Array[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
dimInputs1[1]};
int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
dimOutput[3]};
int outputArray[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
dimOutput[1]};
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs0));
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs1));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
int permute[4] = {0, 2, 3, 1};
cnnlTransposeDescriptor_t opDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
size_t wsSize;
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aInDesc, opDesc,
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
BangPtr aDataOut = context->getWorkspace(
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
cnnlStatus_t stat =
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
aDesc, aDataOut, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
return;
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), bInDesc, opDesc,
&wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr bDataOut = context->getWorkspace(
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
bDesc, bDataOut, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
return;
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, output));
cnnlConvolutionForwardAlgo_t algo;
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
aDesc, bDesc, cInDesc,
CNNL_CONVOLUTION_FWD_FASTEST, &algo);
cnnlGetConvolutionForwardWorkspaceSize(context->cnnlHandle(), aDesc,
bDesc, cInDesc, NULL, convDesc,
algo, &wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr cDataIn = context->getWorkspace(
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
// feed the NHWC copies produced by the transposes above
stat = cnnlConvolutionForward(
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aDataOut, bDesc,
bDataOut, NULL, NULL, wsData, wsSize, NULL, cInDesc, cDataIn);
if (stat != CNNL_STATUS_SUCCESS)
return;
int cPermute[4] = {0, 3, 1, 2};
cnnlTransposeDescriptor_t opOutDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opOutDesc));
checkCnnlError(cnnlSetTransposeDescriptor(opOutDesc, 4, cPermute));
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cInDesc, opOutDesc,
&wsSize);
wsData = context->getWorkspace(wsSize);
stat = cnnlTranspose_v2(context->cnnlHandle(), opOutDesc, cInDesc,
cDataIn, cDesc, cData, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
return;
// Destroying descriptors in BANG does not require a sync, but cnnl does not
// state whether a sync is required before the destroy calls.
checkCnnlError(cnnlDestroyTensorDescriptor(aInDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(bInDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(cInDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
checkCnnlError(cnnlDestroyTransposeDescriptor(opOutDesc));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl,
"Conv_cnnl_BANG_Float32");
}; // namespace infini

@@ -0,0 +1,65 @@
#include "operators/matmul.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
namespace infini {
class MatmulCnnl : public BangKernelWithoutConfig {
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MatmulObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
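// cnnlBatchMatMul operates on batched 3-D tensors (batch, rows, cols), so
// only rank-3 inputs and outputs are handled here.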
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
auto dimInputs0 = op->getInputs(0)->getDims();
auto dimInputs1 = op->getInputs(1)->getDims();
auto dimOutput = op->getOutput()->getDims();
if (dimInputs0.size() != 3)
IT_TODO_HALT();
if (dimInputs1.size() != 3)
IT_TODO_HALT();
if (dimOutput.size() != 3)
IT_TODO_HALT();
bool transA = op->getTransA();
bool transB = op->getTransB();
int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]};
int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]};
int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]};
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs1Array));
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, outputArray));
cnnlStatus_t stat =
cnnlBatchMatMul(context->cnnlHandle(), transA, transB, aDesc, aData,
bDesc, bData, cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
return;
// Destroying descriptors in BANG does not require a sync, but cnnl does not
// state whether a sync is required before the destroy calls.
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Matmul, DataType::Float32, MatmulCnnl,
"Matmul_cnnl_BANG_Float32");
}; // namespace infini

@@ -0,0 +1,58 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
template <class T>
void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
const std::function<void(void *, size_t, DataType)> &generatorB,
const Shape &shapeA, const Shape &shapeB) {
// Runtime
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generatorA);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generatorB);
// MLU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
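// pad = 1, stride = 1, dilation = 1
// (padh, padw, strideh, stridew, dilationh, dilationw)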
auto mluOp =
bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1);
bangGraph->dataMalloc();
bangRuntime->run(bangGraph);
auto outputMlu = mluOp->getOutput();
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp =
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
cpuGraph->dataMalloc();
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
outputCpu->print();
outputMlu2Cpu->print();
// Check
// EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
EXPECT_TRUE(true);
}
TEST(cnnl_Conv, run) {
testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
Shape{1, 3, 224, 224}, Shape{2, 3, 3, 3});
}
} // namespace infini

@@ -0,0 +1,56 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/matmul.h"
#include "test.h"
namespace infini {
template <class T>
void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
const std::function<void(void *, size_t, DataType)> &generatorB,
bool transA, bool transB, const Shape &shapeA,
const Shape &shapeB) {
// Runtime
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generatorA);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generatorB);
// MLU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
auto mluOp = bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr);
bangGraph->dataMalloc();
bangRuntime->run(bangGraph);
auto outputMlu = mluOp->getOutput();
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
cpuGraph->dataMalloc();
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
outputCpu->print();
outputMlu2Cpu->print();
// Check
EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
}
TEST(cnnl_Matmul, run) {
testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
false, Shape{1, 2, 3}, Shape{1, 3, 4});
}
} // namespace infini