forked from jiuyuan/InfiniTensor
feat: add Resize operator on Cambricon (BANG), fix formatting
parent d1799b67a3
commit 917e82e90c
@@ -0,0 +1,142 @@
#include "operators/resize.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include <iostream>
#include <numeric>

namespace infini {
class ResizeCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ResizeObj>(_op);
        IT_ASSERT(op->getDType() == DataType::Float32);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto nDims = op->getInputs(0)->getRank();
        if (nDims != 4) {
            IT_TODO_HALT();
        }
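        // cnnlCropAndResize is run on NHWC data below, so build the NHWC
        // ("trans") shapes for the staging copies of the input and output.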
        auto aDim = op->getInputs(0)->getDims();
        auto cDim = op->getOutput()->getDims();
        std::vector<int> aTransDim = {aDim[0], aDim[2], aDim[3], aDim[1]};
        std::vector<int> cTransDim = {cDim[0], cDim[2], cDim[3], cDim[1]};

        cnnlTensorDescriptor_t aDesc, cDesc, aTransDesc, cTransDesc;
        // input
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
            aDim.size(), aDim.data()));
        checkCnnlError(cnnlCreateTensorDescriptor(&aTransDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
            aTransDim.size(), aTransDim.data()));
        // output
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
            cDim.size(), cDim.data()));
        checkCnnlError(cnnlCreateTensorDescriptor(&cTransDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
            cTransDim.size(), cTransDim.data()));

        // Allocate NHWC staging buffers and transpose the input NCHW -> NHWC.
        BangPtr aTransData = context->getWorkspace(
            cnnlGetTensorElementNum(aTransDesc) * op->getDType().getSize());
        BangPtr cTransData = context->getWorkspace(
            cnnlGetTensorElementNum(cTransDesc) * op->getDType().getSize());

        int permuteIn[4] = {0, 2, 3, 1};
        cnnlTransposeDescriptor_t inDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&inDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(inDesc, 4, permuteIn));
        size_t wsSizeIn;
        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, inDesc,
                                      &wsSizeIn);
        BangPtr wsDataIn = context->getWorkspace(wsSizeIn);

        checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), inDesc, aDesc,
                                        aData, aTransDesc, aTransData, wsDataIn,
                                        wsSizeIn));

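        // One crop box per batch image: boxesDesc is {nBatch, 4} and
        // boxesIndex[i] = i maps box i to input image i.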
        cnnlTensorDescriptor_t boxesDesc, boxesIndexDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&boxesDesc));
        auto nBatch = aDim[0];
        std::vector<int> boxesDim = {nBatch, 4};
        checkCnnlError(cnnlSetTensorDescriptor(
            boxesDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
            boxesDim.size(), boxesDim.data()));

        checkCnnlError(cnnlCreateTensorDescriptor(&boxesIndexDesc));
        std::vector<int> boxesIndexDim = {nBatch};
        checkCnnlError(cnnlSetTensorDescriptor(
            boxesIndexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32,
            boxesIndexDim.size(), boxesIndexDim.data()));
        std::vector<int32_t> boxesIndex(nBatch);
        std::iota(boxesIndex.begin(), boxesIndex.end(), 0);
        BangPtr boxesIndexData =
            context->getWorkspace(nBatch * sizeof(int32_t));
        context->copyBlobFromCPU(boxesIndexData, boxesIndex.data(),
                                 nBatch * sizeof(int32_t));

        cnnlCropAndResizeMode_t mode;
        auto coefMode = op->getMode();
        if (coefMode == ResizeObj::ECoeffMode::nearest) {
            mode = CNNL_CROP_AND_RESIZE_NEAREST;
        } else if (coefMode == ResizeObj::ECoeffMode::linear) {
            mode = CNNL_CROP_AND_RESIZE_BILINEAR;
        } else {
            IT_TODO_HALT();
        }

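        // Crop box in normalized coordinates: with tfCropAndResize it comes
        // from the ROI's H/W start and end entries, otherwise the whole image
        // {0, 0, 1, 1} is resized.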
        std::vector<float> box;
        auto transMode = op->getCoordinateTransMode();
        if (transMode ==
            enum_to_underlying(
                ResizeObj::ECoordinateTransMode::tfCropAndResize)) {
            box = {op->getRoi(2), op->getRoi(3), op->getRoi(6), op->getRoi(7)};
        } else {
            box = {0, 0, 1.0, 1.0};
        }

        BangPtr boxesData =
            context->getWorkspace(nBatch * box.size() * sizeof(float));
        for (auto i = 0; i < nBatch; i++) {
            context->copyBlobFromCPU(boxesData + i * box.size() * sizeof(float),
                                     box.data(), box.size() * sizeof(float));
        }

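        // Run crop-and-resize on the NHWC staging buffers; the resized result
        // is written to cTransData.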
        checkCnnlError(cnnlCropAndResize(
            context->cnnlHandle(), aTransDesc, aTransData, boxesDesc, boxesData,
            boxesIndexDesc, boxesIndexData, mode, 0.0, cTransDesc, cTransData));

        // Transpose the NHWC result back into the NCHW output tensor.
        int permuteOut[4] = {0, 3, 1, 2};
        cnnlTransposeDescriptor_t outDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&outDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(outDesc, 4, permuteOut));
        size_t wsSizeOut;
        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cTransDesc,
                                      outDesc, &wsSizeOut);
        BangPtr wsDataOut = context->getWorkspace(wsSizeOut);

        checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), outDesc,
                                        cTransDesc, cTransData, cDesc, cData,
                                        wsDataOut, wsSizeOut));

        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(aTransDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cTransDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(boxesDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(boxesIndexDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(inDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(outDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Resize, ResizeCnnl, "Resize_cnnl_BANG");
}; // namespace infini

@@ -20,7 +20,7 @@ class BatchNormXdnn : public KUNLUNKernelWithoutConfig {
         auto dims = op->getInputs(0)->getDims();

         int n, c, h, w;
-        if (dims.size() != 4){
+        if (dims.size() != 4) {
             h = 1;
             w = 1;
         }

@@ -572,7 +572,8 @@ class ATanhXdnn : public KUNLUNKernelWithoutConfig {
 };

 REGISTER_KERNEL(Device::KUNLUN, OpType::Relu, ReluXdnn, "Relu_xdnn_KUNLUN");
-REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn, "LeakyRelu_xdnn_KUNLUN");
+REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn,
+                "LeakyRelu_xdnn_KUNLUN");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Sigmoid, SigmoidXdnn,
                 "Sigmoid_xdnn_KUNLUN");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Tanh, TanhXdnn, "Tanh_xdnn_KUNLUN");

@@ -0,0 +1,65 @@
#include "bang/bang_runtime.h"
#include <cmath>
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/resize.h"
#include "test.h"
namespace infini {
TEST(Resize, Bang_downsample_sizes_nearest) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
    scales->copyin(vector<float>{1, 1, 0.6, 0.6});

    auto bangRuntime = make_ref<BangRuntimeObj>();
    Graph gMlu = make_ref<GraphObj>(bangRuntime);

    auto inputMlu = gMlu->cloneTensor(input);
    auto scalesMlu = gMlu->cloneTensor(scales);
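    // Scales-only resize: the output, axes, sizes and roi arguments are left
    // unset below.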
    auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
                                     scalesMlu, nullptr);
    gMlu->dataMalloc();
    inputMlu->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
    scalesMlu->copyin(vector<float>{1, 1, 0.6, 0.6});

    bangRuntime->run(gMlu);

    // copy output from MLU to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
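    // scales {1, 1, 0.6, 0.6} shrink the 1x1x2x4 input to 1x1x1x2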
    EXPECT_TRUE(oCpu->equalData(vector<float>{5, 8}));
}

TEST(Resize, Bang_upsample_sizes_nearest) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyin(vector<float>{1, 2, 3, 4});
    scales->copyin(vector<float>{1, 1, 2, 3});

    auto bangRuntime = make_ref<BangRuntimeObj>();
    Graph gMlu = make_ref<GraphObj>(bangRuntime);

    auto inputMlu = gMlu->cloneTensor(input);
    auto scalesMlu = gMlu->cloneTensor(scales);
    auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
                                     scalesMlu, nullptr);
    gMlu->dataMalloc();
    inputMlu->copyin(vector<float>{1, 2, 3, 4});
    scalesMlu->copyin(vector<float>{1, 1, 2, 3});

    bangRuntime->run(gMlu);

    // copy output from MLU to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
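    // nearest-neighbor upsample of the 1x1x2x2 input with scales {1, 1, 2, 3}
    // gives a 1x1x4x6 output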
    EXPECT_TRUE(
        oCpu->equalData(vector<float>{1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
                                      3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4}));
}
} // namespace infini