From fc4b62a88c4dc3fde0172077df99b8d2db06f34a Mon Sep 17 00:00:00 2001 From: OdinaryWord Date: Wed, 13 Mar 2024 17:25:15 +0800 Subject: [PATCH] add maxpooling & flatten --- src/kernels/ascend/pooling.cc | 116 +++++++++++++++++- src/kernels/ascend/reshape.cc | 4 + test/kernels/ascend/test_ascend_pooling.cc | 4 +- ..._ascend_copy.cc => test_ascend_reshape.cc} | 30 +++++ 4 files changed, 149 insertions(+), 5 deletions(-) rename test/kernels/ascend/{test_ascend_copy.cc => test_ascend_reshape.cc} (61%) diff --git a/src/kernels/ascend/pooling.cc b/src/kernels/ascend/pooling.cc index d639321b..3bbc1c1e 100644 --- a/src/kernels/ascend/pooling.cc +++ b/src/kernels/ascend/pooling.cc @@ -1,4 +1,5 @@ #include "operators/pooling.h" +#include "aclnnop/level2/aclnn_adaptive_max_pool2d.h" #include "aclnnop/level2/aclnn_avgpool2d.h" #include "ascend/ascend_kernel_without_config.h" #include "ascend/ascend_runtime.h" @@ -22,7 +23,7 @@ class AvgPooling : public ASCENDKernelWithoutConfig { std::vector stride = {sh, sw}; std::vector pad = {ph, pw}; - int64_t divisorOverride = kh * kw; + int64_t divisorOverride = 0; auto selfD = op->getInputs(0)->getDims(); auto selfS = op->getInputs(0)->getStride(); @@ -51,12 +52,14 @@ class AvgPooling : public ASCENDKernelWithoutConfig { auto ret = aclnnAvgPool2dGetWorkspaceSize( selfTensor, kernelSize, strides, paddings, false, true, - divisorOverride, 1, outputTensor, &workspaceSize, &executor); + divisorOverride, 0, outputTensor, &workspaceSize, &executor); + assert(ret == ACL_SUCCESS); + void *workspaceAddr = nullptr; if (workspaceSize > 0) { workspaceAddr = context->getWorkspace(workspaceSize); } - assert(ret == ACL_SUCCESS); + ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, context->ASCENDHandle()); assert(ret == ACL_SUCCESS); @@ -71,6 +74,113 @@ class AvgPooling : public ASCENDKernelWithoutConfig { } }; +class MaxPooling : public ASCENDKernelWithoutConfig { + // Only adaptiveMaxPool2d was found in the ACLNN doc. + int64_t GetShapeSize(const std::vector &shape) { + int64_t shapeSize = 1; + for (auto i : shape) { + shapeSize *= i; + } + return shapeSize; + } + template + int CreateAclTensor(const std::vector &hostData, + const std::vector &shape, void **deviceAddr, + aclDataType dataType, aclTensor **tensor) { + auto size = GetShapeSize(shape) * sizeof(T); + // 调用aclrtMalloc申请device侧内存 + auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST); + assert(ret == ACL_SUCCESS); + // 调用aclrtMemcpy将host侧数据拷贝到device侧内存上 + ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, + ACL_MEMCPY_HOST_TO_DEVICE); + assert(ret == ACL_SUCCESS); + + // 计算连续tensor的strides + std::vector strides(shape.size(), 1); + for (int64_t i = shape.size() - 2; i >= 0; i--) { + strides[i] = shape[i + 1] * strides[i + 1]; + } + + // 调用aclCreateTensor接口创建aclTensor + *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, + strides.data(), 0, aclFormat::ACL_FORMAT_NCHW, + shape.data(), shape.size(), *deviceAddr); + return 0; + } + + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + auto selfD = op->getInputs(0)->getDims(); + auto selfS = op->getInputs(0)->getStride(); + auto outD = op->getOutput()->getDims(); + auto outS = op->getOutput()->getStride(); + + std::vector selfDim = castTo64(selfD); + std::vector selfStride = castTo64(selfS); + std::vector outputDim = castTo64(outD); + std::vector outputStride = castTo64(outS); + + std::vector outputHW(2, 1); + outputHW[0] = outputDim[outputDim.size() - 2]; + outputHW[1] = outputDim[outputDim.size() - 1]; + + int64_t indicesOutSize = 1; + for (auto i : outputDim) { + indicesOutSize *= i; + } + void *indicesOutDeviceAddr = nullptr; + aclrtMalloc(&indicesOutDeviceAddr, indicesOutSize, + ACL_MEM_MALLOC_HUGE_FIRST); + + aclIntArray *outputsize = + aclCreateIntArray(outputHW.data(), outputHW.size()); + auto selfTensor = aclCreateTensor( + selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0, + aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData); + auto outputTensor = + aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT, + outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW, + outputDim.data(), outputDim.size(), cData); + auto indicesOutTensor = aclCreateTensor( + outputDim.data(), outputDim.size(), ACL_INT64, outputStride.data(), + 0, aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(), + indicesOutDeviceAddr); + + uint64_t workspaceSize = 0; + aclOpExecutor *executor; + auto ret = aclnnAdaptiveMaxPool2dGetWorkspaceSize( + selfTensor, outputsize, outputTensor, indicesOutTensor, + &workspaceSize, &executor); + assert(ret == ACL_SUCCESS); + + void *workspaceAddr = nullptr; + if (workspaceSize > 0) { + workspaceAddr = context->getWorkspace(workspaceSize); + } + + ret = aclnnAdaptiveMaxPool2d(workspaceAddr, workspaceSize, executor, + context->ASCENDHandle()); + assert(ret == ACL_SUCCESS); + + ret = aclrtSynchronizeStream(context->ASCENDHandle()); + assert(ret == ACL_SUCCESS); + + aclDestroyTensor(indicesOutTensor); + + return; + } +}; + +REGISTER_KERNEL(Device::ASCEND, OpType::MaxPool, MaxPooling, + "maxpooling_ASCEND_float"); + REGISTER_KERNEL(Device::ASCEND, OpType::AveragePool, AvgPooling, "avgpooling_ASCEND_float"); }; // namespace infini diff --git a/src/kernels/ascend/reshape.cc b/src/kernels/ascend/reshape.cc index 3b596f94..586adda4 100644 --- a/src/kernels/ascend/reshape.cc +++ b/src/kernels/ascend/reshape.cc @@ -52,4 +52,8 @@ REGISTER_KERNEL(Device::ASCEND, OpType::Unsqueeze, CopyAclnn, "unsqueeze_ASCEND_float"); REGISTER_KERNEL(Device::ASCEND, OpType::Squeeze, CopyAclnn, "squeeze_ASCEND_float"); +REGISTER_KERNEL(Device::ASCEND, OpType::Flatten, CopyAclnn, + "Flatten_ASCEND_float"); +REGISTER_KERNEL(Device::ASCEND, OpType::Identity, CopyAclnn, + "Identity_ASCEND_float"); }; // namespace infini diff --git a/test/kernels/ascend/test_ascend_pooling.cc b/test/kernels/ascend/test_ascend_pooling.cc index 44d8f504..7fedc41e 100644 --- a/test/kernels/ascend/test_ascend_pooling.cc +++ b/test/kernels/ascend/test_ascend_pooling.cc @@ -39,8 +39,8 @@ void testPooling(const std::function &generator, TEST(cnnl_Pooling, run) { aclInit(nullptr); - // testPooling(IncrementalGenerator(), Shape{1, 1, 5, 5}); - testPooling(IncrementalGenerator(), Shape{1, 1, 5, 5}); + testPooling(IncrementalGenerator(), Shape{1, 2, 5, 5}); + testPooling(IncrementalGenerator(), Shape{1, 2, 5, 5}); aclFinalize(); } diff --git a/test/kernels/ascend/test_ascend_copy.cc b/test/kernels/ascend/test_ascend_reshape.cc similarity index 61% rename from test/kernels/ascend/test_ascend_copy.cc rename to test/kernels/ascend/test_ascend_reshape.cc index 0558095b..6e873db0 100644 --- a/test/kernels/ascend/test_ascend_copy.cc +++ b/test/kernels/ascend/test_ascend_reshape.cc @@ -40,6 +40,35 @@ void testReshape(const std::function &generator, EXPECT_TRUE(inputCpu->equalData(outputNpu2Cpu, 1e-3)); } +void testFlatten(const std::function &generator, + const Shape &shape, int axis) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto npuRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // NPU + Graph npuGraph = make_ref(npuRuntime); + auto inputNpu = npuGraph->cloneTensor(inputCpu); + auto npuOp = npuGraph->addOp(inputNpu, nullptr, axis); + npuGraph->dataMalloc(); + inputNpu->setData(generator); + npuRuntime->run(npuGraph); + auto outputNpu = npuOp->getOutput(); + auto outputNpu2Cpu = outputNpu->clone(cpuRuntime); + + // Check + inputCpu->print(); + inputCpu->printData(); + outputNpu2Cpu->print(); + outputNpu2Cpu->printData(); + EXPECT_TRUE(inputCpu->equalData(outputNpu2Cpu, 1e-3)); +} + TEST(ascend_Unary, run) { aclInit(nullptr); testReshape(IncrementalGenerator(), Shape{1, 2, 2, 3}, @@ -48,6 +77,7 @@ TEST(ascend_Unary, run) { Shape{0}); testReshape(IncrementalGenerator(), Shape{1, 2, 2, 3}, Shape{4}); + testFlatten(IncrementalGenerator(), Shape{1, 2, 2, 3}, 2); aclFinalize(); }