add conv_transpose&&native maxpooling

This commit is contained in:
OdinaryWord 2024-04-01 16:01:36 +08:00
parent fc4b62a88c
commit a5ccf06551
11 changed files with 120 additions and 104 deletions

View File

@@ -20,6 +20,7 @@ endif()
 include(CMakeDependentOption)
 project(InfiniTensor C CXX)
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)

View File

@@ -39,7 +39,7 @@ endif
 build:
 	mkdir -p build/$(TYPE)
-	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
+	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j
 clean:
 	rm -rf build

View File

@@ -20,21 +20,16 @@ class ASCENDRuntimeObj : public RuntimeObj {
   private:
     aclrtContext context;
     aclrtStream stream;
+    std::unique_ptr<CommunicatorObj> comm;
     ASCENDPtr workspace = nullptr;
     size_t workspaceSize;

   public:
     ASCENDRuntimeObj(int deviceId = 0) : RuntimeObj(Device::ASCEND, deviceId) {
-        // #ifndef _ACL_INIT
-        // #define _ACL_INIT
-        //         aclInit(nullptr);
-        //         // auto ret_init =
-        //         // CHECK_RET(ret == ACL_SUCCESS,
-        //         // LOG_PRINT("aclInit failed. ERROR: %d\n",
-        //         ret));
-        // #endif
-        aclInit(nullptr);
-        auto ret = aclrtSetDevice(deviceId);
+        auto ret = aclInit(nullptr);
+        CHECK_RET(ret == ACL_SUCCESS,
+                  LOG_PRINT("aclInit failed. ERROR: %d\n", ret));
+        ret = aclrtSetDevice(deviceId);
         CHECK_RET(ret == ACL_SUCCESS,
                   LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
         ret = aclrtCreateContext(&context, deviceId);
@@ -49,7 +44,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
         // 10GB for Longformer
         // size_t longformerNum = 3lu * (1 << 30);
-        workspaceSize = 3ll << 30; // 3 GB
+        workspaceSize = 3ll << 33; // 24 GB
         // std::cout<<workspaceSize/1024/1024/1024<< std::endl;
         // std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
         workspace = alloc(workspaceSize);
@@ -99,9 +94,9 @@ class ASCENDRuntimeObj : public RuntimeObj {
                           ACL_MEMCPY_DEVICE_TO_DEVICE);
     }

-    void initComm(const string &, int, int) override { IT_TODO_HALT(); }
+    void initComm(const string &name, int worldSize, int rank) final;

-    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
+    CommunicatorObj &getCommunicator() const override { return *comm; }

   private:
     void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
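
Note on the workspace change above: the size is built with a left shift, so the comment must track the shift amount. 3ll << 30 is 3 GiB, while the new 3ll << 33 is 24 GiB (each extra shift bit doubles the size). A quick sanity check of the arithmetic, as a plain Python sketch (illustration only):

    GiB = 1 << 30  # bytes in one gibibyte

    old_size = 3 << 30  # previous workspace: 3 GiB
    new_size = 3 << 33  # new workspace: shifted 3 more bits, so 8x larger

    print(old_size // GiB)  # 3
    print(new_size // GiB)  # 24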

View File

@@ -184,7 +184,7 @@ class OnnxStub:
                     node,
                     {
                         "dilations": [1, 1],
-                        "pads": [0, 0],
+                        "pads": [0, 0, 0, 0],
                         "strides": [1, 1],
                         "output_padding": [0, 0],
                     },
@@ -193,19 +193,63 @@ class OnnxStub:
                     attributes[name]
                     for name in ["dilations", "pads", "strides", "output_padding"]
                 )
-                tensors[node.output[0]] = self.handler.convTransposed2d(
-                    tensors[node.input[0]],
-                    tensors[node.input[1]],
-                    tensors.get(node.output[0]),
-                    p[0],
-                    p[1],
-                    s[0],
-                    s[1],
-                    d[0],
-                    d[1],
-                    op[0],
-                    op[1],
-                )
+                if p[0] != p[2] or p[1] != p[3]:
+                    adapt = "{}-adapt".format(node.output[0])
+                    tensors[adapt] = self.handler.pad(
+                        tensors[node.input[0]], None, p, [-2, -1]
+                    )
+                    p = [0, 0, 0, 0]
+                else:
+                    adapt = node.input[0]
+
+                if len(node.input) > 2:
+                    bias = "{}-bias".format(node.output[0])
+                    reshape = "{}-reshape".format(node.output[0])
+                    tensors[bias] = self.handler.convTransposed2d(
+                        tensors[adapt],
+                        tensors[node.input[1]],
+                        None,
+                        p[0],
+                        p[1],
+                        s[0],
+                        s[1],
+                        d[0],
+                        d[1],
+                        op[0],
+                        op[1],
+                    )
+                    tensors[reshape] = self.handler.reshape(
+                        tensors[node.input[2]],
+                        None,
+                        [
+                            1,
+                            reduce(
+                                lambda acc, x: acc * x,
+                                tensors[node.input[2]].shape(),
+                            ),
+                            1,
+                            1,
+                        ],
+                    )
+                    tensors[node.output[0]] = self.handler.add(
+                        tensors[bias],
+                        tensors[reshape],
+                        tensors.get(node.output[0]),
+                    )
+                else:
+                    tensors[node.output[0]] = self.handler.convTransposed2d(
+                        tensors[adapt],
+                        tensors[node.input[1]],
+                        tensors.get(node.output[0]),
+                        p[0],
+                        p[1],
+                        s[0],
+                        s[1],
+                        d[0],
+                        d[1],
+                        op[0],
+                        op[1],
+                    )
             elif node.op_type == "MatMul":
                 tensors[node.output[0]] = self.handler.matmul(
                     tensors[node.input[0]],
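
The ConvTranspose lowering above decomposes the ONNX node in two ways: asymmetric spatial pads (p[0] != p[2] or p[1] != p[3]) are peeled off into an explicit Pad over the last two axes before the convolution, and an optional bias input is applied as a Reshape to [1, C, 1, 1] followed by a broadcast Add after a bias-free convTransposed2d. A minimal numpy sketch of the bias half (conv_out stands in for the transposed-convolution result; the data here is made up):

    import numpy as np

    n, c, h, w = 2, 4, 5, 5
    conv_out = np.random.rand(n, c, h, w).astype(np.float32)  # stand-in result
    bias = np.random.rand(c).astype(np.float32)               # ONNX bias input B

    # What the frontend now emits: Reshape(B, [1, C, 1, 1]) followed by Add.
    decomposed = conv_out + bias.reshape(1, c, 1, 1)

    # ONNX semantics: the bias is added once per output channel.
    reference = conv_out + bias[:, None, None]
    assert np.allclose(decomposed, reference)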

View File

@@ -56,4 +56,15 @@ void ASCENDRuntimeObj::sync() const { ; }

 string ASCENDRuntimeObj::toString() const { return "ASCEND Runtime"; }

+void ASCENDRuntimeObj::initComm(const string &name, int worldSize, int rank) {
+    IT_ASSERT(worldSize > 0);
+    IT_ASSERT(rank >= 0);
+    IT_ASSERT(rank < worldSize);
+    IT_ASSERT(!comm) << "communicator is already initialized.";
+#ifdef INFINI_USE_HCCL
+    comm = std::make_unique<HcclCommunicatorObj>(name, worldSize, rank);
+#else
+    IT_TODO_HALT_MSG("Not compiled with HCCL.");
+#endif
+}
+
 } // namespace infini

View File

@@ -13,9 +13,9 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
         auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

         const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
-        // const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
-        // const int cpg = op->getChannelPerGroup();
-        // const int g = c / cpg;
+        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        const int cpg = op->getChannelPerGroup();
+        const int g = c / cpg;

         std::vector<int64_t> pads = {ph, pw};
         // std::vector<int64_t> ksize = {r, s};
@@ -67,8 +67,8 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
         auto ret = aclnnConvolutionGetWorkspaceSize(
             inputTensor, weightTensor, nullptr, convstride, convpads,
-            convdilation, false, convOutputpadding, 1, outputTensor, 1,
-            &workspaceSize, &executor);
+            convdilation, false, convOutputpadding, int64_t(g), outputTensor,
+            int8_t(1), &workspaceSize, &executor);

         void *workspaceAddr = nullptr;
         if (workspaceSize > 0) {
             workspaceAddr = context->getWorkspace(workspaceSize);
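
The conv kernel change above un-comments the group bookkeeping and forwards it to ACLNN: the group argument becomes g = c / cpg instead of a hard-coded 1, and the trailing flag is passed as an explicit int8_t. The group arithmetic itself, as a sketch with hypothetical channel counts:

    def conv_groups(in_channels: int, channels_per_group: int) -> int:
        # Mirrors g = c / cpg in the kernel: channels must divide evenly.
        assert in_channels % channels_per_group == 0
        return in_channels // channels_per_group

    print(conv_groups(8, 8))  # 1: dense convolution, all channels in one group
    print(conv_groups(8, 1))  # 8: depthwise, one input channel per group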

View File

@@ -1,6 +1,6 @@
 #include "operators/pooling.h"
-#include "aclnnop/level2/aclnn_adaptive_max_pool2d.h"
 #include "aclnnop/level2/aclnn_avgpool2d.h"
+#include "aclnnop/level2/aclnn_max_pool.h"
 #include "ascend/ascend_kernel_without_config.h"
 #include "ascend/ascend_runtime.h"
@@ -75,40 +75,6 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
 };

 class MaxPooling : public ASCENDKernelWithoutConfig {
-    // Only adaptiveMaxPool2d was found in the ACLNN doc.
-    int64_t GetShapeSize(const std::vector<int64_t> &shape) {
-        int64_t shapeSize = 1;
-        for (auto i : shape) {
-            shapeSize *= i;
-        }
-        return shapeSize;
-    }
-
-    template <typename T>
-    int CreateAclTensor(const std::vector<T> &hostData,
-                        const std::vector<int64_t> &shape, void **deviceAddr,
-                        aclDataType dataType, aclTensor **tensor) {
-        auto size = GetShapeSize(shape) * sizeof(T);
-        // Allocate device-side memory via aclrtMalloc
-        auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
-        assert(ret == ACL_SUCCESS);
-        // Copy the host-side data into device memory via aclrtMemcpy
-        ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size,
-                          ACL_MEMCPY_HOST_TO_DEVICE);
-        assert(ret == ACL_SUCCESS);
-        // Compute the strides of a contiguous tensor
-        std::vector<int64_t> strides(shape.size(), 1);
-        for (int64_t i = shape.size() - 2; i >= 0; i--) {
-            strides[i] = shape[i + 1] * strides[i + 1];
-        }
-        // Create the aclTensor via aclCreateTensor
-        *tensor = aclCreateTensor(shape.data(), shape.size(), dataType,
-                                  strides.data(), 0, aclFormat::ACL_FORMAT_NCHW,
-                                  shape.data(), shape.size(), *deviceAddr);
-        return 0;
-    }
-
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
         auto op = as<PoolingObj>(_op);
@@ -117,6 +83,15 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());

+        auto [n, c, h, w, kh, kw] = op->getNCHWRS();
+        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+        int64_t ceilMode = int64_t(op->getCeilMode());
+
+        std::vector<int64_t> ksize = {kh, kw};
+        std::vector<int64_t> stride = {sh, sw};
+        std::vector<int64_t> pad = {ph, pw};
+        std::vector<int64_t> dilation = {dh, dw};
+
         auto selfD = op->getInputs(0)->getDims();
         auto selfS = op->getInputs(0)->getStride();
         auto outD = op->getOutput()->getDims();
@@ -127,20 +102,12 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
         std::vector<int64_t> outputDim = castTo64(outD);
         std::vector<int64_t> outputStride = castTo64(outS);

-        std::vector<int64_t> outputHW(2, 1);
-        outputHW[0] = outputDim[outputDim.size() - 2];
-        outputHW[1] = outputDim[outputDim.size() - 1];
-
-        int64_t indicesOutSize = 1;
-        for (auto i : outputDim) {
-            indicesOutSize *= i;
-        }
-        void *indicesOutDeviceAddr = nullptr;
-        aclrtMalloc(&indicesOutDeviceAddr, indicesOutSize,
-                    ACL_MEM_MALLOC_HUGE_FIRST);
-        aclIntArray *outputsize =
-            aclCreateIntArray(outputHW.data(), outputHW.size());
+        aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
+        aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
+        aclIntArray *paddings = aclCreateIntArray(pad.data(), pad.size());
+        aclIntArray *dilations =
+            aclCreateIntArray(dilation.data(), dilation.size());

         auto selfTensor = aclCreateTensor(
             selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
             aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData);
@@ -148,16 +115,12 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
             aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                             outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                             outputDim.data(), outputDim.size(), cData);
-        auto indicesOutTensor = aclCreateTensor(
-            outputDim.data(), outputDim.size(), ACL_INT64, outputStride.data(),
-            0, aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(),
-            indicesOutDeviceAddr);

         uint64_t workspaceSize = 0;
         aclOpExecutor *executor;

-        auto ret = aclnnAdaptiveMaxPool2dGetWorkspaceSize(
-            selfTensor, outputsize, outputTensor, indicesOutTensor,
-            &workspaceSize, &executor);
+        auto ret = aclnnMaxPoolGetWorkspaceSize(
+            selfTensor, kernelSize, strides, 0, paddings, dilations, ceilMode,
+            outputTensor, &workspaceSize, &executor);
         assert(ret == ACL_SUCCESS);

         void *workspaceAddr = nullptr;
@@ -165,15 +128,13 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
             workspaceAddr = context->getWorkspace(workspaceSize);
         }

-        ret = aclnnAdaptiveMaxPool2d(workspaceAddr, workspaceSize, executor,
-                                     context->ASCENDHandle());
+        ret = aclnnMaxPool(workspaceAddr, workspaceSize, executor,
+                           context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

-        aclDestroyTensor(indicesOutTensor);
-
         return;
     }
 };
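
The MaxPooling rewrite above replaces aclnnAdaptiveMaxPool2d, which derives its window from a requested output size and returns an extra indices tensor, with the native aclnnMaxPool, which takes kernel, stride, padding, dilation and ceil mode directly and needs no indices buffer. A plain-Python reference of the native semantics (a sketch only, not the ACLNN implementation):

    import math
    import numpy as np

    def max_pool2d(x, kh, kw, sh, sw, ph, pw, dh=1, dw=1, ceil_mode=False):
        # Native max pooling over an NCHW tensor with explicit parameters.
        n, c, h, w = x.shape
        rnd = math.ceil if ceil_mode else math.floor
        oh = rnd((h + 2 * ph - dh * (kh - 1) - 1) / sh) + 1
        ow = rnd((w + 2 * pw - dw * (kw - 1) - 1) / sw) + 1
        out = np.full((n, c, oh, ow), -np.inf, dtype=x.dtype)
        for i in range(oh):
            for j in range(ow):
                for ki in range(kh):
                    for kj in range(kw):
                        si = i * sh - ph + ki * dh  # source row, pad-adjusted
                        sj = j * sw - pw + kj * dw  # source col, pad-adjusted
                        if 0 <= si < h and 0 <= sj < w:
                            out[:, :, i, j] = np.maximum(out[:, :, i, j],
                                                         x[:, :, si, sj])
        return out

    # Same configuration as the pooling test below: 3x3 kernel, pad 1, stride 2.
    x = np.arange(2 * 5 * 5, dtype=np.float32).reshape(1, 2, 5, 5)
    print(max_pool2d(x, 3, 3, 2, 2, 1, 1).shape)  # (1, 2, 3, 3)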

View File

@@ -45,14 +45,16 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
     cpuRuntime->run(cpuGraph);
     auto outputCpu = cpuOp->getOutput();
     // Check
-    EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
+    // outputCpu->printData();
+    // outputNpu2Cpu->printData();
+    EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu, 1e-3));
 }

 TEST(ascend_Conv, run) {
-    aclInit(nullptr);
+    // aclInit(nullptr);
     testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
-                      Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
+                      Shape{1, 3, 128, 128}, Shape{2, 3, 3, 3});
-    aclFinalize();
+    // aclFinalize();
 }
 } // namespace infini

View File

@@ -53,9 +53,10 @@ TEST(ascend_ElementWise, run) {
     // aclInit(nullptr);
     // testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
     // testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    // testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    testElementWise<DivObj>(IncrementalGenerator(), Shape{1},
-                            Shape{1, 2, 2, 3});
+    testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 1, 48, 48},
+                            Shape{1, 1, 1, 1});
+    // testElementWise<DivObj>(IncrementalGenerator(), Shape{1}, Shape{1, 2, 2,
+    // 3});
     // testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
     // aclFinalize();
 }

View File

@@ -50,10 +50,10 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
 }

 TEST(ascend_Matmul, run) {
-    aclInit(nullptr);
+    // aclInit(nullptr);
     testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
                           false, Shape{1, 2, 3}, Shape{1, 3, 4});
-    aclFinalize();
+    // aclFinalize();
 }
 } // namespace infini

View File

@@ -26,6 +26,7 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
     auto inputNpu = npuGraph->cloneTensor(inputCpu);
     auto npuOp =
         npuGraph->addOp<T>(inputNpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
+    // npuGraph->addOp<T>(inputNpu, nullptr, 2, 2, 1, 1, 0, 0, 1, 1, 0);
     npuGraph->dataMalloc();
     inputNpu->setData(generator);
     npuRuntime->run(npuGraph);
@@ -38,10 +39,10 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
 }

 TEST(cnnl_Pooling, run) {
-    aclInit(nullptr);
+    // aclInit(nullptr);
-    testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
+    // testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
     testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
-    aclFinalize();
+    // aclFinalize();
 }
 } // namespace infini