feat: add Resize operator for Cambricon (BANG), fix formatting

Zhang Bolun 2024-05-06 16:45:01 +08:00
parent d1799b67a3
commit 917e82e90c
4 changed files with 210 additions and 2 deletions

src/kernels/bang/resize.cc (new file, 142 additions)

@@ -0,0 +1,142 @@
#include "operators/resize.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include <iostream>
namespace infini {
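// Resize is implemented via cnnlCropAndResize, which operates on NHWC
// tensors: the NCHW input is transposed to NHWC, each batch image is
// resized against a per-batch crop box, and the result is transposed back.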
class ResizeCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ResizeObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto nDims = op->getInputs(0)->getRank();
if (nDims != 4) {
IT_TODO_HALT();
}
auto aDim = op->getInputs(0)->getDims();
auto cDim = op->getOutput()->getDims();
std::vector<int> aTransDim = {aDim[0], aDim[2], aDim[3], aDim[1]};
std::vector<int> cTransDim = {cDim[0], cDim[2], cDim[3], cDim[1]};
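// NCHW descriptors describe the real input/output tensors; the NHWC
// descriptors describe the transposed scratch buffers used by the CNNL op.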
cnnlTensorDescriptor_t aDesc, cDesc, aTransDesc, cTransDesc;
// input
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&aTransDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
aTransDim.size(), aTransDim.data()));
// output
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cTransDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
cTransDim.size(), cTransDim.data()));
// transpose input NCHW -> NHWC
BangPtr aTransData = context->getWorkspace(
cnnlGetTensorElementNum(aTransDesc) * op->getDType().getSize());
BangPtr cTransData = context->getWorkspace(
cnnlGetTensorElementNum(cTransDesc) * op->getDType().getSize());
int permuteIn[4] = {0, 2, 3, 1};
cnnlTransposeDescriptor_t inDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&inDesc));
checkCnnlError(cnnlSetTransposeDescriptor(inDesc, 4, permuteIn));
size_t wsSizeIn;
checkCnnlError(cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc,
inDesc, &wsSizeIn));
BangPtr wsDataIn = context->getWorkspace(wsSizeIn);
checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), inDesc, aDesc,
aData, aTransDesc, aTransData, wsDataIn,
wsSizeIn));
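// cnnlCropAndResize takes one normalized [y1, x1, y2, x2] box per output
// image plus an index tensor mapping each box to its input batch entry.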
cnnlTensorDescriptor_t boxesDesc, boxesIndexDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&boxesDesc));
auto nBatch = aDim[0];
std::vector<int> boxesDim = {nBatch, 4};
checkCnnlError(cnnlSetTensorDescriptor(
boxesDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
boxesDim.size(), boxesDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&boxesIndexDesc));
std::vector<int> boxesIndexDim = {nBatch};
checkCnnlError(cnnlSetTensorDescriptor(
boxesIndexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32,
boxesIndexDim.size(), boxesIndexDim.data()));
std::vector<int32_t> boxesIndex(nBatch);
std::iota(boxesIndex.begin(), boxesIndex.end(), 0);
BangPtr boxesIndexData =
context->getWorkspace(nBatch * sizeof(int32_t));
context->copyBlobFromCPU(boxesIndexData, boxesIndex.data(),
nBatch * sizeof(int32_t));
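// map the resize coefficient mode onto CNNL's crop-and-resize modes;
// only nearest and (bi)linear interpolation are supported here.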
cnnlCropAndResizeMode_t mode;
auto coefMode = op->getMode();
if (coefMode == ResizeObj::ECoeffMode::nearest) {
mode = CNNL_CROP_AND_RESIZE_NEAREST;
} else if (coefMode == ResizeObj::ECoeffMode::linear) {
mode = CNNL_CROP_AND_RESIZE_BILINEAR;
} else {
IT_TODO_HALT();
}
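// tfCropAndResize carries an explicit ROI (start_h, start_w, end_h,
// end_w); every other coordinate transform mode resizes the whole image.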
std::vector<float> box;
auto transMode = op->getCoordinateTransMode();
if (transMode ==
enum_to_underlying(
ResizeObj::ECoordinateTransMode::tfCropAndResize)) {
box = {op->getRoi(2), op->getRoi(3), op->getRoi(6), op->getRoi(7)};
} else {
box = {0, 0, 1.0, 1.0};
}
BangPtr boxesData =
context->getWorkspace(nBatch * box.size() * sizeof(float));
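// replicate the same box for every batch entry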
for (auto i = 0; i < nBatch; i++) {
context->copyBlobFromCPU(boxesData + i * box.size() * sizeof(float),
box.data(), box.size() * sizeof(float));
}
checkCnnlError(cnnlCropAndResize(
context->cnnlHandle(), aTransDesc, aTransData, boxesDesc, boxesData,
boxesIndexDesc, boxesIndexData, mode, 0.0, cTransDesc, cTransData));
// transpose result NHWC -> NCHW
int permuteOut[4] = {0, 3, 1, 2};
cnnlTransposeDescriptor_t outDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&outDesc));
checkCnnlError(cnnlSetTransposeDescriptor(outDesc, 4, permuteOut));
size_t wsSizeOut;
checkCnnlError(cnnlGetTransposeWorkspaceSize(context->cnnlHandle(),
cTransDesc, outDesc, &wsSizeOut));
BangPtr wsDataOut = context->getWorkspace(wsSizeOut);
checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), outDesc,
cTransDesc, cTransData, cDesc, cData,
wsDataOut, wsSizeOut));
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(aTransDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(cTransDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(boxesDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(boxesIndexDesc));
checkCnnlError(cnnlDestroyTransposeDescriptor(inDesc));
checkCnnlError(cnnlDestroyTransposeDescriptor(outDesc));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Resize, ResizeCnnl, "Resize_cnnl_BANG");
}; // namespace infini

@@ -572,7 +572,8 @@ class ATanhXdnn : public KUNLUNKernelWithoutConfig {
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Relu, ReluXdnn, "Relu_xdnn_KUNLUN");
-REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn, "LeakyRelu_xdnn_KUNLUN");
+REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn,
+                "LeakyRelu_xdnn_KUNLUN");
REGISTER_KERNEL(Device::KUNLUN, OpType::Sigmoid, SigmoidXdnn,
"Sigmoid_xdnn_KUNLUN");
REGISTER_KERNEL(Device::KUNLUN, OpType::Tanh, TanhXdnn, "Tanh_xdnn_KUNLUN");

@@ -0,0 +1,65 @@
#include "bang/bang_runtime.h"
#include "cmath"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/resize.h"
#include "test.h"
namespace infini {
TEST(Resize, Bang_downsample_sizes_nearest) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32);
auto scales = gCpu->addTensor({4}, DataType::Float32);
gCpu->dataMalloc();
input->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
scales->copyin(vector<float>{1, 1, 0.6, 0.6});
auto bangRuntime = make_ref<BangRuntimeObj>();
Graph gMlu = make_ref<GraphObj>(bangRuntime);
auto inputMlu = gMlu->cloneTensor(input);
auto scalesMlu = gMlu->cloneTensor(scales);
auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
scalesMlu, nullptr);
gMlu->dataMalloc();
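// dataMalloc allocates fresh device memory, so inputs must be copied in
// again on the MLU side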
inputMlu->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
scalesMlu->copyin(vector<float>{1, 1, 0.6, 0.6});
bangRuntime->run(gMlu);
// copy output from MLU to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
EXPECT_TRUE(oCpu->equalData(vector<float>{5, 8}));
}
TEST(Resize, Bang_upsample_sizes_nearest) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
auto scales = gCpu->addTensor({4}, DataType::Float32);
gCpu->dataMalloc();
input->copyin(vector<float>{1, 2, 3, 4});
scales->copyin(vector<float>{1, 1, 2, 3});
auto bangRuntime = make_ref<BangRuntimeObj>();
Graph gMlu = make_ref<GraphObj>(bangRuntime);
auto inputMlu = gMlu->cloneTensor(input);
auto scalesMlu = gMlu->cloneTensor(scales);
auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
scalesMlu, nullptr);
gMlu->dataMalloc();
inputMlu->copyin(vector<float>{1, 2, 3, 4});
scalesMlu->copyin(vector<float>{1, 1, 2, 3});
bangRuntime->run(gMlu);
// copy output from MLU to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
EXPECT_TRUE(
oCpu->equalData(vector<float>{1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4}));
}
} // namespace infini