From 917e82e90ccc4014b1b17db8b18e73aed3274d92 Mon Sep 17 00:00:00 2001
From: Zhang Bolun
Date: Mon, 6 May 2024 16:45:01 +0800
Subject: [PATCH] feat: add resize operator on Cambricon (BANG), fix formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/kernels/bang/resize.cc            | 142 ++++++++++++++++++++++++++
 src/kernels/kunlun/batch_norm.cc      |   2 +-
 src/kernels/kunlun/unary.cc           |   3 +-
 test/kernels/bang/test_bang_resize.cc |  65 ++++++++++++
 4 files changed, 210 insertions(+), 2 deletions(-)
 create mode 100644 src/kernels/bang/resize.cc
 create mode 100644 test/kernels/bang/test_bang_resize.cc

diff --git a/src/kernels/bang/resize.cc b/src/kernels/bang/resize.cc
new file mode 100644
index 00000000..fbe47b9b
--- /dev/null
+++ b/src/kernels/bang/resize.cc
@@ -0,0 +1,142 @@
+#include "operators/resize.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include <numeric>
+
+namespace infini {
+class ResizeCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ResizeObj>(_op);
+        IT_ASSERT(op->getDType() == DataType::Float32);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        auto nDims = op->getInputs(0)->getRank();
+        if (nDims != 4) {
+            IT_TODO_HALT();
+        }
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
+        std::vector<int> aTransDim = {aDim[0], aDim[2], aDim[3], aDim[1]};
+        std::vector<int> cTransDim = {cDim[0], cDim[2], cDim[3], cDim[1]};
+
+        cnnlTensorDescriptor_t aDesc, cDesc, aTransDesc, cTransDesc;
+        // input
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
+            aDim.size(), aDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&aTransDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
+            aTransDim.size(), aTransDim.data()));
+        // output
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
+            cDim.size(), cDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&cTransDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
+            cTransDim.size(), cTransDim.data()));
+
+        // transpose input NCHW -> NHWC, since the crop-and-resize path
+        // operates on NHWC data
+        BangPtr aTransData = context->getWorkspace(
+            cnnlGetTensorElementNum(aTransDesc) * op->getDType().getSize());
+        BangPtr cTransData = context->getWorkspace(
+            cnnlGetTensorElementNum(cTransDesc) * op->getDType().getSize());
+
+        int permuteIn[4] = {0, 2, 3, 1};
+        cnnlTransposeDescriptor_t inDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&inDesc));
+        checkCnnlError(cnnlSetTransposeDescriptor(inDesc, 4, permuteIn));
+        size_t wsSizeIn;
+        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, inDesc,
+                                      &wsSizeIn);
+        BangPtr wsDataIn = context->getWorkspace(wsSizeIn);
+
+        checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), inDesc, aDesc,
+                                        aData, aTransDesc, aTransData, wsDataIn,
+                                        wsSizeIn));
+
+        cnnlTensorDescriptor_t boxesDesc, boxesIndexDesc;
+        checkCnnlError(cnnlCreateTensorDescriptor(&boxesDesc));
+        auto nBatch = aDim[0];
+        std::vector<int> boxesDim = {nBatch, 4};
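+        // One crop box per batch image: the boxes tensor is {nBatch, 4},
+        // where each box is assumed to be normalized [y1, x1, y2, x2]
+        // coordinates, mirroring TF-style crop_and_resize semantics.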
+        checkCnnlError(cnnlSetTensorDescriptor(
+            boxesDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
+            boxesDim.size(), boxesDim.data()));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&boxesIndexDesc));
+        std::vector<int> boxesIndexDim = {nBatch};
+        checkCnnlError(cnnlSetTensorDescriptor(
+            boxesIndexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32,
+            boxesIndexDim.size(), boxesIndexDim.data()));
+        std::vector<int32_t> boxesIndex(nBatch);
+        std::iota(boxesIndex.begin(), boxesIndex.end(), 0);
+        BangPtr boxesIndexData =
+            context->getWorkspace(nBatch * sizeof(int32_t));
+        context->copyBlobFromCPU(boxesIndexData, boxesIndex.data(),
+                                 nBatch * sizeof(int32_t));
+
+        cnnlCropAndResizeMode_t mode;
+        auto coefMode = op->getMode();
+        if (coefMode == ResizeObj::ECoeffMode::nearest) {
+            mode = CNNL_CROP_AND_RESIZE_NEAREST;
+        } else if (coefMode == ResizeObj::ECoeffMode::linear) {
+            mode = CNNL_CROP_AND_RESIZE_BILINEAR;
+        } else {
+            IT_TODO_HALT();
+        }
+
+        std::vector<float> box;
+        auto transMode = op->getCoordinateTransMode();
+        if (transMode ==
+            enum_to_underlying(
+                ResizeObj::ECoordinateTransMode::tfCropAndResize)) {
+            box = {op->getRoi(2), op->getRoi(3), op->getRoi(6), op->getRoi(7)};
+        } else {
+            // no ROI given: crop the whole image, i.e. a plain resize
+            box = {0, 0, 1.0, 1.0};
+        }
+
+        BangPtr boxesData =
+            context->getWorkspace(nBatch * box.size() * sizeof(float));
+        for (auto i = 0; i < nBatch; i++) {
+            context->copyBlobFromCPU(boxesData + i * box.size() * sizeof(float),
+                                     box.data(), box.size() * sizeof(float));
+        }
+
+        checkCnnlError(cnnlCropAndResize(
+            context->cnnlHandle(), aTransDesc, aTransData, boxesDesc, boxesData,
+            boxesIndexDesc, boxesIndexData, mode, 0.0, cTransDesc, cTransData));
+
+        // transpose result NHWC -> NCHW back into the output tensor
+        int permuteOut[4] = {0, 3, 1, 2};
+        cnnlTransposeDescriptor_t outDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&outDesc));
+        checkCnnlError(cnnlSetTransposeDescriptor(outDesc, 4, permuteOut));
+        size_t wsSizeOut;
+        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cTransDesc,
+                                      outDesc, &wsSizeOut);
+        BangPtr wsDataOut = context->getWorkspace(wsSizeOut);
+
+        checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), outDesc,
+                                        cTransDesc, cTransData, cDesc, cData,
+                                        wsDataOut, wsSizeOut));
+
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(aTransDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cTransDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(boxesDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(boxesIndexDesc));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(inDesc));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(outDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Resize, ResizeCnnl, "Resize_cnnl_BANG");
+}; // namespace infini
diff --git a/src/kernels/kunlun/batch_norm.cc b/src/kernels/kunlun/batch_norm.cc
index 47ea325a..36847549 100644
--- a/src/kernels/kunlun/batch_norm.cc
+++ b/src/kernels/kunlun/batch_norm.cc
@@ -20,7 +20,7 @@ class BatchNormXdnn : public KUNLUNKernelWithoutConfig {
         auto dims = op->getInputs(0)->getDims();
 
         int n, c, h, w;
-        if (dims.size() != 4){
+        if (dims.size() != 4) {
             h = 1;
             w = 1;
         }
diff --git a/src/kernels/kunlun/unary.cc b/src/kernels/kunlun/unary.cc
index 2a7a28b9..e7180875 100755
--- a/src/kernels/kunlun/unary.cc
+++ b/src/kernels/kunlun/unary.cc
@@ -572,7 +572,8 @@ class ATanhXdnn : public KUNLUNKernelWithoutConfig {
 };
 
 REGISTER_KERNEL(Device::KUNLUN, OpType::Relu, ReluXdnn, "Relu_xdnn_KUNLUN");
-REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn, "LeakyRelu_xdnn_KUNLUN");
+REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn,
+                "LeakyRelu_xdnn_KUNLUN");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Sigmoid, SigmoidXdnn,
                 "Sigmoid_xdnn_KUNLUN");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Tanh, TanhXdnn, "Tanh_xdnn_KUNLUN");
diff --git a/test/kernels/bang/test_bang_resize.cc b/test/kernels/bang/test_bang_resize.cc
new file mode 100644
index 00000000..8e622c91
--- /dev/null
+++ b/test/kernels/bang/test_bang_resize.cc
@@ -0,0 +1,65 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/resize.h"
+#include "test.h"
+#include <cmath>
+
+namespace infini {
+TEST(Resize, Bang_downsample_sizes_nearest) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32);
+    auto scales = gCpu->addTensor({4}, DataType::Float32);
+    gCpu->dataMalloc();
+    input->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
+    scales->copyin(vector<float>{1, 1, 0.6, 0.6});
+
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+    Graph gMlu = make_ref<GraphObj>(bangRuntime);
+
+    auto inputMlu = gMlu->cloneTensor(input);
+    auto scalesMlu = gMlu->cloneTensor(scales);
+    auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
+                                     scalesMlu, nullptr);
+    gMlu->dataMalloc();
+    inputMlu->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
+    scalesMlu->copyin(vector<float>{1, 1, 0.6, 0.6});
+
+    bangRuntime->run(gMlu);
+
+    // copy output from BANG to CPU
+    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
+    EXPECT_TRUE(oCpu->equalData(vector<float>{5, 8}));
+}
+
+TEST(Resize, Bang_upsample_sizes_nearest) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
+    auto scales = gCpu->addTensor({4}, DataType::Float32);
+    gCpu->dataMalloc();
+    input->copyin(vector<float>{1, 2, 3, 4});
+    scales->copyin(vector<float>{1, 1, 2, 3});
+
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+    Graph gMlu = make_ref<GraphObj>(bangRuntime);
+
+    auto inputMlu = gMlu->cloneTensor(input);
+    auto scalesMlu = gMlu->cloneTensor(scales);
+    auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
+                                     scalesMlu, nullptr);
+    gMlu->dataMalloc();
+    inputMlu->copyin(vector<float>{1, 2, 3, 4});
+    scalesMlu->copyin(vector<float>{1, 1, 2, 3});
+
+    bangRuntime->run(gMlu);
+
+    // copy output from BANG to CPU
+    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
+    EXPECT_TRUE(
+        oCpu->equalData(vector<float>{1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
+                                      3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4}));
+}
+} // namespace infini
\ No newline at end of file