add maxpooling & flatten

OdinaryWord 2024-03-13 17:25:15 +08:00
parent 36e0840f2f
commit fc4b62a88c
4 changed files with 149 additions and 5 deletions

View File

@@ -1,4 +1,5 @@
#include "operators/pooling.h"
#include "aclnnop/level2/aclnn_adaptive_max_pool2d.h"
#include "aclnnop/level2/aclnn_avgpool2d.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
@@ -22,7 +23,7 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
std::vector<int64_t> stride = {sh, sw};
std::vector<int64_t> pad = {ph, pw};
- int64_t divisorOverride = kh * kw;
+ int64_t divisorOverride = 0;
auto selfD = op->getInputs(0)->getDims();
auto selfS = op->getInputs(0)->getStride();
@@ -51,12 +52,14 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
auto ret = aclnnAvgPool2dGetWorkspaceSize(
selfTensor, kernelSize, strides, paddings, false, true,
- divisorOverride, 1, outputTensor, &workspaceSize, &executor);
+ divisorOverride, 0, outputTensor, &workspaceSize, &executor);
+ assert(ret == ACL_SUCCESS);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
- assert(ret == ACL_SUCCESS);
ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
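
A note on the divisorOverride change above: in aclnnAvgPool2d a non-zero
divisorOverride divides every window's sum by that fixed value, while 0 lets
the kernel derive the divisor itself from countIncludePad, mirroring
PyTorch's avg_pool2d semantics. Since this call enables countIncludePad, the
derived divisor is kh * kw at every position anyway, so 0 is the safer
spelling. A minimal sketch of the selection logic; the names are
illustrative, not part of the ACLNN API:

#include <cstdint>

// validElems: number of non-padding input elements under this window.
int64_t avgPoolDivisor(int64_t validElems, int64_t kh, int64_t kw,
                       bool countIncludePad, int64_t divisorOverride) {
    if (divisorOverride != 0)
        return divisorOverride;          // fixed divisor for every window
    return countIncludePad ? kh * kw     // padded zeros count toward the mean
                           : validElems; // only real elements count
}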
@@ -71,6 +74,113 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
}
};
class MaxPooling : public ASCENDKernelWithoutConfig {
// Only adaptiveMaxPool2d was found in the ACLNN doc.
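// The two helpers below follow the common ACLNN sample pattern for staging
// host data into a device-side aclTensor; compute() below does not
// currently call CreateAclTensor.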
int64_t GetShapeSize(const std::vector<int64_t> &shape) {
int64_t shapeSize = 1;
for (auto i : shape) {
shapeSize *= i;
}
return shapeSize;
}
template <typename T>
int CreateAclTensor(const std::vector<T> &hostData,
const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor) {
auto size = GetShapeSize(shape) * sizeof(T);
// Call aclrtMalloc to allocate device-side memory
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
assert(ret == ACL_SUCCESS);
// Call aclrtMemcpy to copy the host data to device memory
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size,
ACL_MEMCPY_HOST_TO_DEVICE);
assert(ret == ACL_SUCCESS);
// Compute the strides of a contiguous tensor
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = static_cast<int64_t>(shape.size()) - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// Call aclCreateTensor to create the aclTensor
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType,
strides.data(), 0, aclFormat::ACL_FORMAT_NCHW,
shape.data(), shape.size(), *deviceAddr);
return 0;
}
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PoolingObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto selfD = op->getInputs(0)->getDims();
auto selfS = op->getInputs(0)->getStride();
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
std::vector<int64_t> selfDim = castTo64(selfD);
std::vector<int64_t> selfStride = castTo64(selfS);
std::vector<int64_t> outputDim = castTo64(outD);
std::vector<int64_t> outputStride = castTo64(outS);
std::vector<int64_t> outputHW(2, 1);
outputHW[0] = outputDim[outputDim.size() - 2];
outputHW[1] = outputDim[outputDim.size() - 1];
int64_t indicesOutSize = 1;
for (auto i : outputDim) {
indicesOutSize *= i;
}
// The indices tensor stores one int64 per output element, so the
// allocation must be sized in bytes, not element counts.
void *indicesOutDeviceAddr = nullptr;
auto mallocRet =
aclrtMalloc(&indicesOutDeviceAddr, indicesOutSize * sizeof(int64_t),
ACL_MEM_MALLOC_HUGE_FIRST);
assert(mallocRet == ACL_SUCCESS);
aclIntArray *outputsize =
aclCreateIntArray(outputHW.data(), outputHW.size());
auto selfTensor = aclCreateTensor(
selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData);
auto outputTensor =
aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
outputDim.data(), outputDim.size(), cData);
auto indicesOutTensor = aclCreateTensor(
outputDim.data(), outputDim.size(), ACL_INT64, outputStride.data(),
0, aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(),
indicesOutDeviceAddr);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnAdaptiveMaxPool2dGetWorkspaceSize(
selfTensor, outputsize, outputTensor, indicesOutTensor,
&workspaceSize, &executor);
assert(ret == ACL_SUCCESS);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
ret = aclnnAdaptiveMaxPool2d(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
aclDestroyTensor(selfTensor);
aclDestroyTensor(outputTensor);
aclDestroyTensor(indicesOutTensor);
aclrtFree(indicesOutDeviceAddr);
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::MaxPool, MaxPooling,
"maxpooling_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::AveragePool, AvgPooling,
"avgpooling_ASCEND_float");
}; // namespace infini
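
A note on the adaptive mapping in MaxPooling: aclnnAdaptiveMaxPool2d derives
its windows from the input/output ratio rather than from explicit kernel,
stride, and padding, so feeding it the operator's own output H and W only
reproduces a plain max pool when those attributes agree with the adaptive
window formula. A sketch of the usual adaptive-pooling arithmetic (PyTorch
convention; the helpers are illustrative, not ACLNN calls):

#include <cstdint>

// Half-open window [start, end) that adaptive pooling assigns to output
// index i along one axis.
int64_t adaptiveStart(int64_t i, int64_t in, int64_t out) {
    return (i * in) / out;                 // floor(i * in / out)
}
int64_t adaptiveEnd(int64_t i, int64_t in, int64_t out) {
    return ((i + 1) * in + out - 1) / out; // ceil((i + 1) * in / out)
}

For example, in = 5 and out = 2 give windows [0, 3) and [2, 5), which is
exactly a kernel of 3 with stride 2 and no padding.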

View File

@@ -52,4 +52,8 @@ REGISTER_KERNEL(Device::ASCEND, OpType::Unsqueeze, CopyAclnn,
"unsqueeze_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Squeeze, CopyAclnn,
"squeeze_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Flatten, CopyAclnn,
"Flatten_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Identity, CopyAclnn,
"Identity_ASCEND_float");
}; // namespace infini
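
Registering Flatten on CopyAclnn, like Reshape, Squeeze, and Unsqueeze,
works because flatten only reinterprets the shape: the output is the same
contiguous buffer viewed as 2-D, so a raw device-to-device copy suffices.
Per the ONNX definition, Flatten(axis) maps [d0, ..., dn-1] to the 2-D shape
[d0 * ... * d(axis-1), d(axis) * ... * d(n-1)]; a quick sketch of that
arithmetic:

#include <cstdint>
#include <vector>

// Collapse dims before `axis` into rows and the rest into cols.
std::vector<int64_t> flattenShape(const std::vector<int64_t> &dims, int axis) {
    int64_t rows = 1, cols = 1;
    for (int i = 0; i < static_cast<int>(dims.size()); ++i)
        (i < axis ? rows : cols) *= dims[i];
    return {rows, cols};
}

// e.g. flattenShape({1, 2, 2, 3}, 2) == {2, 6}, matching the testFlatten
// case added below.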

View File

@@ -39,8 +39,8 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
TEST(ascend_Pooling, run) {
aclInit(nullptr);
- // testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
- testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
+ testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
+ testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
aclFinalize();
}

View File

@@ -40,6 +40,35 @@ void testReshape(const std::function<void(void *, size_t, DataType)> &generator,
EXPECT_TRUE(inputCpu->equalData(outputNpu2Cpu, 1e-3));
}
void testFlatten(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape, int axis) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto npuOp = npuGraph->addOp<FlattenObj>(inputNpu, nullptr, axis);
npuGraph->dataMalloc();
inputNpu->setData(generator);
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check
inputCpu->print();
inputCpu->printData();
outputNpu2Cpu->print();
outputNpu2Cpu->printData();
EXPECT_TRUE(inputCpu->equalData(outputNpu2Cpu, 1e-3));
}
TEST(ascend_Unary, run) {
aclInit(nullptr);
testReshape<ReshapeObj>(IncrementalGenerator(), Shape{1, 2, 2, 3},
@@ -48,6 +77,7 @@ TEST(ascend_Unary, run) {
Shape{0});
testReshape<UnsqueezeObj>(IncrementalGenerator(), Shape{1, 2, 2, 3},
Shape{4});
testFlatten(IncrementalGenerator(), Shape{1, 2, 2, 3}, 2);
aclFinalize();
}