forked from jiuyuan/InfiniTensor
feat: add Resize operator on Cambricon (BANG), fix formatting
parent d1799b67a3
commit 917e82e90c
@@ -0,0 +1,142 @@
#include "operators/resize.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include <iostream>
#include <numeric>

namespace infini {
class ResizeCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ResizeObj>(_op);
        IT_ASSERT(op->getDType() == DataType::Float32);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto nDims = op->getInputs(0)->getRank();
        if (nDims != 4) {
            IT_TODO_HALT();
        }
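        // cnnlCropAndResize is run on NHWC data below, so build the NHWC
        // ("trans") shapes for the staging copies of the input and output.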
        auto aDim = op->getInputs(0)->getDims();
        auto cDim = op->getOutput()->getDims();
        std::vector<int> aTransDim = {aDim[0], aDim[2], aDim[3], aDim[1]};
        std::vector<int> cTransDim = {cDim[0], cDim[2], cDim[3], cDim[1]};

        cnnlTensorDescriptor_t aDesc, cDesc, aTransDesc, cTransDesc;
        // input
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
            aDim.size(), aDim.data()));
        checkCnnlError(cnnlCreateTensorDescriptor(&aTransDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
            aTransDim.size(), aTransDim.data()));
        // output
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
            cDim.size(), cDim.data()));
        checkCnnlError(cnnlCreateTensorDescriptor(&cTransDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cTransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
            cTransDim.size(), cTransDim.data()));

        // Allocate NHWC staging buffers and transpose the input NCHW -> NHWC.
        BangPtr aTransData = context->getWorkspace(
            cnnlGetTensorElementNum(aTransDesc) * op->getDType().getSize());
        BangPtr cTransData = context->getWorkspace(
            cnnlGetTensorElementNum(cTransDesc) * op->getDType().getSize());

        int permuteIn[4] = {0, 2, 3, 1};
        cnnlTransposeDescriptor_t inDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&inDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(inDesc, 4, permuteIn));
        size_t wsSizeIn;
        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, inDesc,
                                      &wsSizeIn);
        BangPtr wsDataIn = context->getWorkspace(wsSizeIn);

        checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), inDesc, aDesc,
                                        aData, aTransDesc, aTransData, wsDataIn,
                                        wsSizeIn));

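        // One crop box per batch image: boxesDesc is {nBatch, 4} and
        // boxesIndex[i] = i maps box i to input image i.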
        cnnlTensorDescriptor_t boxesDesc, boxesIndexDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&boxesDesc));
        auto nBatch = aDim[0];
        std::vector<int> boxesDim = {nBatch, 4};
        checkCnnlError(cnnlSetTensorDescriptor(
            boxesDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
            boxesDim.size(), boxesDim.data()));

        checkCnnlError(cnnlCreateTensorDescriptor(&boxesIndexDesc));
        std::vector<int> boxesIndexDim = {nBatch};
        checkCnnlError(cnnlSetTensorDescriptor(
            boxesIndexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32,
            boxesIndexDim.size(), boxesIndexDim.data()));
        std::vector<int32_t> boxesIndex(nBatch);
        std::iota(boxesIndex.begin(), boxesIndex.end(), 0);
        BangPtr boxesIndexData =
            context->getWorkspace(nBatch * sizeof(int32_t));
        context->copyBlobFromCPU(boxesIndexData, boxesIndex.data(),
                                 nBatch * sizeof(int32_t));

        cnnlCropAndResizeMode_t mode;
        auto coefMode = op->getMode();
        if (coefMode == ResizeObj::ECoeffMode::nearest) {
            mode = CNNL_CROP_AND_RESIZE_NEAREST;
        } else if (coefMode == ResizeObj::ECoeffMode::linear) {
            mode = CNNL_CROP_AND_RESIZE_BILINEAR;
        } else {
            IT_TODO_HALT();
        }

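        // Crop box in normalized coordinates: with tfCropAndResize it comes
        // from the ROI's H/W start and end entries, otherwise the whole image
        // {0, 0, 1, 1} is resized.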
        std::vector<float> box;
        auto transMode = op->getCoordinateTransMode();
        if (transMode ==
            enum_to_underlying(
                ResizeObj::ECoordinateTransMode::tfCropAndResize)) {
            box = {op->getRoi(2), op->getRoi(3), op->getRoi(6), op->getRoi(7)};
        } else {
            box = {0, 0, 1.0, 1.0};
        }

        BangPtr boxesData =
            context->getWorkspace(nBatch * box.size() * sizeof(float));
        for (auto i = 0; i < nBatch; i++) {
            context->copyBlobFromCPU(boxesData + i * box.size() * sizeof(float),
                                     box.data(), box.size() * sizeof(float));
        }

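        // Run crop-and-resize on the NHWC staging buffers; the resized result
        // is written to cTransData.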
        checkCnnlError(cnnlCropAndResize(
            context->cnnlHandle(), aTransDesc, aTransData, boxesDesc, boxesData,
            boxesIndexDesc, boxesIndexData, mode, 0.0, cTransDesc, cTransData));

        // Transpose the NHWC result back into the NCHW output tensor.
        int permuteOut[4] = {0, 3, 1, 2};
        cnnlTransposeDescriptor_t outDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&outDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(outDesc, 4, permuteOut));
        size_t wsSizeOut;
        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cTransDesc,
                                      outDesc, &wsSizeOut);
        BangPtr wsDataOut = context->getWorkspace(wsSizeOut);

        checkCnnlError(cnnlTranspose_v2(context->cnnlHandle(), outDesc,
                                        cTransDesc, cTransData, cDesc, cData,
                                        wsDataOut, wsSizeOut));

        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(aTransDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cTransDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(boxesDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(boxesIndexDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(inDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(outDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Resize, ResizeCnnl, "Resize_cnnl_BANG");
}; // namespace infini

@@ -20,7 +20,7 @@ class BatchNormXdnn : public KUNLUNKernelWithoutConfig {
         auto dims = op->getInputs(0)->getDims();

         int n, c, h, w;
-        if (dims.size() != 4){
+        if (dims.size() != 4) {
             h = 1;
             w = 1;
         }

@@ -572,7 +572,8 @@ class ATanhXdnn : public KUNLUNKernelWithoutConfig {
 };

 REGISTER_KERNEL(Device::KUNLUN, OpType::Relu, ReluXdnn, "Relu_xdnn_KUNLUN");
-REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn, "LeakyRelu_xdnn_KUNLUN");
+REGISTER_KERNEL(Device::KUNLUN, OpType::LeakyRelu, LeakyReluXdnn,
+                "LeakyRelu_xdnn_KUNLUN");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Sigmoid, SigmoidXdnn,
                 "Sigmoid_xdnn_KUNLUN");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Tanh, TanhXdnn, "Tanh_xdnn_KUNLUN");

@@ -0,0 +1,65 @@
#include "bang/bang_runtime.h"
#include <cmath>
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/resize.h"
#include "test.h"
namespace infini {
TEST(Resize, Bang_downsample_sizes_nearest) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
    scales->copyin(vector<float>{1, 1, 0.6, 0.6});

    auto bangRuntime = make_ref<BangRuntimeObj>();
    Graph gMlu = make_ref<GraphObj>(bangRuntime);

    auto inputMlu = gMlu->cloneTensor(input);
    auto scalesMlu = gMlu->cloneTensor(scales);
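    // Scales-only resize: the output, axes, sizes and roi arguments are left
    // unset below.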
    auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
                                     scalesMlu, nullptr);
    gMlu->dataMalloc();
    inputMlu->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
    scalesMlu->copyin(vector<float>{1, 1, 0.6, 0.6});

    bangRuntime->run(gMlu);

    // copy output from MLU to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
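    // scales {1, 1, 0.6, 0.6} shrink the 1x1x2x4 input to 1x1x1x2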
    EXPECT_TRUE(oCpu->equalData(vector<float>{5, 8}));
}

TEST(Resize, Bang_upsample_sizes_nearest) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyin(vector<float>{1, 2, 3, 4});
    scales->copyin(vector<float>{1, 1, 2, 3});

    auto bangRuntime = make_ref<BangRuntimeObj>();
    Graph gMlu = make_ref<GraphObj>(bangRuntime);

    auto inputMlu = gMlu->cloneTensor(input);
    auto scalesMlu = gMlu->cloneTensor(scales);
    auto op = gMlu->addOp<ResizeObj>(inputMlu, nullptr, std::nullopt, nullptr,
                                     scalesMlu, nullptr);
    gMlu->dataMalloc();
    inputMlu->copyin(vector<float>{1, 2, 3, 4});
    scalesMlu->copyin(vector<float>{1, 1, 2, 3});

    bangRuntime->run(gMlu);

    // copy output from MLU to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
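    // nearest-neighbor upsample of the 1x1x2x2 input with scales {1, 1, 2, 3}
    // gives a 1x1x4x6 output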
    EXPECT_TRUE(
        oCpu->equalData(vector<float>{1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
                                      3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4}));
}
} // namespace infini