From 917e82e90ccc4014b1b17db8b18e73aed3274d92 Mon Sep 17 00:00:00 2001
From: Zhang Bolun
Date: Mon, 6 May 2024 16:45:01 +0800
Subject: [PATCH] feat: add resize operator on Cambricon (BANG), fix formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/kernels/bang/resize.cc            | 142 ++++++++++++++++++++++++++
 src/kernels/kunlun/batch_norm.cc      |   2 +-
 src/kernels/kunlun/unary.cc           |   3 +-
 test/kernels/bang/test_bang_resize.cc |  65 ++++++++++++
 4 files changed, 210 insertions(+), 2 deletions(-)
 create mode 100644 src/kernels/bang/resize.cc
 create mode 100644 test/kernels/bang/test_bang_resize.cc

diff --git a/src/kernels/bang/resize.cc b/src/kernels/bang/resize.cc
new file mode 100644
index 00000000..fbe47b9b
--- /dev/null
+++ b/src/kernels/bang/resize.cc
@@ -0,0 +1,142 @@
+#include "operators/resize.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include <numeric>
+
+namespace infini {
+class ResizeCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ResizeObj>(_op);
+        IT_ASSERT(op->getDType() == DataType::Float32);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        auto nDims = op->getInputs(0)->getRank();
+        if (nDims != 4) {
+            IT_TODO_HALT();
+        }
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
+        std::vector<int> aTransDim = {aDim[0], aDim[2], aDim[3], aDim[1]};
+        std::vector<int> cTransDim = {cDim[0], cDim[2], cDim[3], cDim[1]};
+
+        cnnlTensorDescriptor_t aDesc, cDesc, aTransDesc, cTransDesc;
+        // input
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
+            aDim.size(), aDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&aTransDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
+            aTransDim.size(), aTransDim.data()));
+        // output
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
+            cDim.size(), cDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&cTransDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
+            cTransDim.size(), cTransDim.data()));
+
+        // transpose input NCHW -> NHWC, since the crop-and-resize path
+        // operates on NHWC data
+        BangPtr aTransData = context->getWorkspace(
+            cnnlGetTensorElementNum(aTransDesc) * op->getDType().getSize());
+        BangPtr cTransData = context->getWorkspace(
+            cnnlGetTensorElementNum(cTransDesc) * op->getDType().getSize());
+
+        int permuteIn[4] = {0, 2, 3, 1};
+        cnnlTransposeDescriptor_t inDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&inDesc));
+        checkCnnlError(cnnlSetTransposeDescriptor(inDesc, 4, permuteIn));
+        size_t wsSizeIn;
+        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, inDesc,
+                                      &wsSizeIn);
+        BangPtr wsDataIn = context->getWorkspace(wsSizeIn);
+
+        checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), inDesc, aDesc,
+                                        aData, aTransDesc, aTransData, wsDataIn,
+                                        wsSizeIn));
+
+        cnnlTensorDescriptor_t boxesDesc, boxesIndexDesc;
+        checkCnnlError(cnnlCreateTensorDescriptor(&boxesDesc));
+        auto nBatch = aDim[0];
+        std::vector<int> boxesDim = {nBatch, 4};
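+        // One crop box per batch image: the boxes tensor is {nBatch, 4},
+        // where each box is assumed to be normalized [y1, x1, y2, x2]
+        // coordinates, mirroring TF-style crop_and_resize semantics.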
+        checkCnnlError(cnnlSetTensorDescriptor(
+            boxesDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
+            boxesDim.size(), boxesDim.data()));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&boxesIndexDesc));
+        std::vector<int> boxesIndexDim = {nBatch};
+        checkCnnlError(cnnlSetTensorDescriptor(
+            boxesIndexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32,
+            boxesIndexDim.size(), boxesIndexDim.data()));
+        std::vector<int32_t> boxesIndex(nBatch);
+        std::iota(boxesIndex.begin(), boxesIndex.end(), 0);
+        BangPtr boxesIndexData =
+            context->getWorkspace(nBatch * sizeof(int32_t));
+        context->copyBlobFromCPU(boxesIndexData, boxesIndex.data(),
+                                 nBatch * sizeof(int32_t));
+
+        cnnlCropAndResizeMode_t mode;
+        auto coefMode = op->getMode();
+        if (coefMode == ResizeObj::ECoeffMode::nearest) {
+            mode = CNNL_CROP_AND_RESIZE_NEAREST;
+        } else if (coefMode == ResizeObj::ECoeffMode::linear) {
+            mode = CNNL_CROP_AND_RESIZE_BILINEAR;
+        } else {
+            IT_TODO_HALT();
+        }
+
+        std::vector<float> box;
+        auto transMode = op->getCoordinateTransMode();
+        if (transMode ==
+            enum_to_underlying(
+                ResizeObj::ECoordinateTransMode::tfCropAndResize)) {
+            box = {op->getRoi(2), op->getRoi(3), op->getRoi(6), op->getRoi(7)};
+        } else {
+            // no ROI given: crop the whole image, i.e. a plain resize
+            box = {0, 0, 1.0, 1.0};
+        }
+
+        BangPtr boxesData =
+            context->getWorkspace(nBatch * box.size() * sizeof(float));
+        for (auto i = 0; i < nBatch; i++) {
+            context->copyBlobFromCPU(boxesData + i * box.size() * sizeof(float),
+                                     box.data(), box.size() * sizeof(float));
+        }
+
+        checkCnnlError(cnnlCropAndResize(
+            context->cnnlHandle(), aTransDesc, aTransData, boxesDesc, boxesData,
+            boxesIndexDesc, boxesIndexData, mode, 0.0, cTransDesc, cTransData));
+
+        // transpose result NHWC -> NCHW back into the output tensor
+        int permuteOut[4] = {0, 3, 1, 2};
+        cnnlTransposeDescriptor_t outDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&outDesc));
+        checkCnnlError(cnnlSetTransposeDescriptor(outDesc, 4, permuteOut));
+        size_t wsSizeOut;
+        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cTransDesc,
+                                      outDesc, &wsSizeOut);
+        BangPtr wsDataOut = context->getWorkspace(wsSizeOut);
+
+        checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), outDesc,
+                                        cTransDesc, cTransData, cDesc, cData,
+                                        wsDataOut, wsSizeOut));
+
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(aTransDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cTransDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(boxesDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(boxesIndexDesc));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(inDesc));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(outDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Resize, ResizeCnnl, "Resize_cnnl_BANG");
+}; // namespace infini
diff --git a/src/kernels/kunlun/batch_norm.cc b/src/kernels/kunlun/batch_norm.cc
index 47ea325a..36847549 100644
--- a/src/kernels/kunlun/batch_norm.cc
+++ b/src/kernels/kunlun/batch_norm.cc
@@ -20,7 +20,7 @@ class BatchNormXdnn : public KUNLUNKernelWithoutConfig {
         auto dims = op->getInputs(0)->getDims();
 
         int n, c, h, w;
-        if (dims.size() != 4){
+        if (dims.size() != 4) {
             h = 1;
             w = 1;
         }
diff --git a/src/kernels/kunlun/unary.cc b/src/kernels/kunlun/unary.cc
index 2a7a28b9..e7180875 100755
--- a/src/kernels/kunlun/unary.cc
+++ b/src/kernels/kunlun/unary.cc
@@ -572,7 +572,8 @@ class ATanhXdnn : public KUNLUNKernelWithoutConfig {
 };
 
 REGISTER_KERNEL(Device::KUNLUN, OpType::Relu, ReluXdnn, "Relu_xdnn_KUNLUN");
-REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn, "LeakyRelu_xdnn_KUNLUN");
+REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn,
+                "LeakyRelu_xdnn_KUNLUN");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Sigmoid, SigmoidXdnn,
                 "Sigmoid_xdnn_KUNLUN");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Tanh, TanhXdnn, "Tanh_xdnn_KUNLUN");
diff --git a/test/kernels/bang/test_bang_resize.cc b/test/kernels/bang/test_bang_resize.cc
new file mode 100644
index 00000000..8e622c91
--- /dev/null
+++ b/test/kernels/bang/test_bang_resize.cc
@@ -0,0 +1,65 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/resize.h"
+#include "test.h"
+#include <cmath>
+
+namespace infini {
+TEST(Resize, Bang_downsample_sizes_nearest) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32);
+    auto scales = gCpu->addTensor({4}, DataType::Float32);
+    gCpu->dataMalloc();
+    input->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
+    scales->copyin(vector<float>{1, 1, 0.6, 0.6});
+
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+    Graph gMlu = make_ref<GraphObj>(bangRuntime);
+
+    auto inputMlu = gMlu->cloneTensor(input);
+    auto scalesMlu = gMlu->cloneTensor(scales);
+    auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
+                                     scalesMlu, nullptr);
+    gMlu->dataMalloc();
+    inputMlu->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
+    scalesMlu->copyin(vector<float>{1, 1, 0.6, 0.6});
+
+    bangRuntime->run(gMlu);
+
+    // copy output from BANG to CPU
+    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
+    EXPECT_TRUE(oCpu->equalData(vector<float>{5, 8}));
+}
+
+TEST(Resize, Bang_upsample_sizes_nearest) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
+    auto scales = gCpu->addTensor({4}, DataType::Float32);
+    gCpu->dataMalloc();
+    input->copyin(vector<float>{1, 2, 3, 4});
+    scales->copyin(vector<float>{1, 1, 2, 3});
+
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+    Graph gMlu = make_ref<GraphObj>(bangRuntime);
+
+    auto inputMlu = gMlu->cloneTensor(input);
+    auto scalesMlu = gMlu->cloneTensor(scales);
+    auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
+                                     scalesMlu, nullptr);
+    gMlu->dataMalloc();
+    inputMlu->copyin(vector<float>{1, 2, 3, 4});
+    scalesMlu->copyin(vector<float>{1, 1, 2, 3});
+
+    bangRuntime->run(gMlu);
+
+    // copy output from BANG to CPU
+    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
+    EXPECT_TRUE(
+        oCpu->equalData(vector<float>{1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
+                                      3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4}));
+}
+} // namespace infini
\ No newline at end of file