add conv_transpose&&native maxpooling

This commit is contained in:
OdinaryWord 2024-04-01 16:01:36 +08:00
parent fc4b62a88c
commit a5ccf06551
11 changed files with 120 additions and 104 deletions

View File

@ -20,6 +20,7 @@ endif()
include(CMakeDependentOption)
project(InfiniTensor C CXX)
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)

View File

@ -39,7 +39,7 @@ endif
build:
mkdir -p build/$(TYPE)
cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j
clean:
rm -rf build

View File

@ -20,21 +20,16 @@ class ASCENDRuntimeObj : public RuntimeObj {
private:
aclrtContext context;
aclrtStream stream;
std::unique_ptr<CommunicatorObj> comm;
ASCENDPtr workspace = nullptr;
size_t workspaceSize;
public:
ASCENDRuntimeObj(int deviceId = 0) : RuntimeObj(Device::ASCEND, deviceId) {
// #ifndef _ACL_INIT
// #define _ACL_INIT
// aclInit(nullptr);
// // auto ret_init =
// // CHECK_RET(ret == ACL_SUCCESS,
// // LOG_PRINT("aclInit failed. ERROR: %d\n",
// ret));
// #endif
aclInit(nullptr);
auto ret = aclrtSetDevice(deviceId);
auto ret = aclInit(nullptr);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclInit failed. ERROR: %d\n", ret));
ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
ret = aclrtCreateContext(&context, deviceId);
@ -49,7 +44,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 3ll << 30; // 3 GB
workspaceSize = 3ll << 33; // 3 GB
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
workspace = alloc(workspaceSize);
@ -99,9 +94,9 @@ class ASCENDRuntimeObj : public RuntimeObj {
ACL_MEMCPY_DEVICE_TO_DEVICE);
}
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
void initComm(const string &name, int worldSize, int rank) final;
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const override { return *comm; }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;

View File

@ -184,7 +184,7 @@ class OnnxStub:
node,
{
"dilations": [1, 1],
"pads": [0, 0],
"pads": [0, 0, 0, 0],
"strides": [1, 1],
"output_padding": [0, 0],
},
@ -193,19 +193,63 @@ class OnnxStub:
attributes[name]
for name in ["dilations", "pads", "strides", "output_padding"]
)
tensors[node.output[0]] = self.handler.convTransposed2d(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
p[0],
p[1],
s[0],
s[1],
d[0],
d[1],
op[0],
op[1],
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[node.input[0]], None, p, [-2, -1]
)
p = [0, 0, 0, 0]
else:
adapt = node.input[0]
if len(node.input) > 2:
bias = "{}-bias".format(node.output[0])
reshape = "{}-reshape".format(node.output[0])
tensors[bias] = self.handler.convTransposed2d(
tensors[adapt],
tensors[node.input[1]],
None,
p[0],
p[1],
s[0],
s[1],
d[0],
d[1],
op[0],
op[1],
)
tensors[reshape] = self.handler.reshape(
tensors[node.input[2]],
None,
[
1,
reduce(
lambda acc, x: acc * x,
tensors[node.input[2]].shape(),
),
1,
1,
],
)
tensors[node.output[0]] = self.handler.add(
tensors[bias],
tensors[reshape],
tensors.get(node.output[0]),
)
else:
tensors[node.output[0]] = self.handler.convTransposed2d(
tensors[adapt],
tensors[node.input[1]],
tensors.get(node.output[0]),
p[0],
p[1],
s[0],
s[1],
d[0],
d[1],
op[0],
op[1],
)
elif node.op_type == "MatMul":
tensors[node.output[0]] = self.handler.matmul(
tensors[node.input[0]],

View File

@ -56,4 +56,15 @@ void ASCENDRuntimeObj::sync() const { ; }
string ASCENDRuntimeObj::toString() const { return "ASCEND Runtime"; }
void ASCENDRuntimeObj::initComm(const string &name, int worldSize, int rank) {
IT_ASSERT(worldSize > 0);
IT_ASSERT(rank >= 0);
IT_ASSERT(rank < worldSize);
IT_ASSERT(!comm) << "communicator is already initialized.";
#ifdef INFINI_USE_HCCL
comm = std::make_unique<HcclCommunicatorObj>(name, worldSize, rank);
#else
IT_TODO_HALT_MSG("Not compiled with CNCL.");
#endif
}
} // namespace infini

View File

@ -13,9 +13,9 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
// const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
// const int cpg = op->getChannelPerGroup();
// const int g = c / cpg;
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();
const int g = c / cpg;
std::vector<int64_t> pads = {ph, pw};
// std::vector<int64_t> ksize = {r, s};
@ -67,8 +67,8 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
auto ret = aclnnConvolutionGetWorkspaceSize(
inputTensor, weightTensor, nullptr, convstride, convpads,
convdilation, false, convOutputpadding, 1, outputTensor, 1,
&workspaceSize, &executor);
convdilation, false, convOutputpadding, int64_t(g), outputTensor,
int8_t(1), &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);

View File

@ -1,6 +1,6 @@
#include "operators/pooling.h"
#include "aclnnop/level2/aclnn_adaptive_max_pool2d.h"
#include "aclnnop/level2/aclnn_avgpool2d.h"
#include "aclnnop/level2/aclnn_max_pool.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
@ -75,40 +75,6 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
};
class MaxPooling : public ASCENDKernelWithoutConfig {
// Only adaptiveMaxPool2d was found in the ACLNN doc.
int64_t GetShapeSize(const std::vector<int64_t> &shape) {
int64_t shapeSize = 1;
for (auto i : shape) {
shapeSize *= i;
}
return shapeSize;
}
template <typename T>
int CreateAclTensor(const std::vector<T> &hostData,
const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor) {
auto size = GetShapeSize(shape) * sizeof(T);
// 调用aclrtMalloc申请device侧内存
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
assert(ret == ACL_SUCCESS);
// 调用aclrtMemcpy将host侧数据拷贝到device侧内存上
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size,
ACL_MEMCPY_HOST_TO_DEVICE);
assert(ret == ACL_SUCCESS);
// 计算连续tensor的strides
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// 调用aclCreateTensor接口创建aclTensor
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType,
strides.data(), 0, aclFormat::ACL_FORMAT_NCHW,
shape.data(), shape.size(), *deviceAddr);
return 0;
}
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PoolingObj>(_op);
@ -117,6 +83,15 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto [n, c, h, w, kh, kw] = op->getNCHWRS();
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
int64_t ceilMode = int64_t(op->getCeilMode());
std::vector<int64_t> ksize = {kh, kw};
std::vector<int64_t> stride = {sh, sw};
std::vector<int64_t> pad = {ph, pw};
std::vector<int64_t> dilation = {dh, dw};
auto selfD = op->getInputs(0)->getDims();
auto selfS = op->getInputs(0)->getStride();
auto outD = op->getOutput()->getDims();
@ -127,20 +102,12 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
std::vector<int64_t> outputDim = castTo64(outD);
std::vector<int64_t> outputStride = castTo64(outS);
std::vector<int64_t> outputHW(2, 1);
outputHW[0] = outputDim[outputDim.size() - 2];
outputHW[1] = outputDim[outputDim.size() - 1];
aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
aclIntArray *paddings = aclCreateIntArray(pad.data(), pad.size());
aclIntArray *dilations =
aclCreateIntArray(dilation.data(), dilation.size());
int64_t indicesOutSize = 1;
for (auto i : outputDim) {
indicesOutSize *= i;
}
void *indicesOutDeviceAddr = nullptr;
aclrtMalloc(&indicesOutDeviceAddr, indicesOutSize,
ACL_MEM_MALLOC_HUGE_FIRST);
aclIntArray *outputsize =
aclCreateIntArray(outputHW.data(), outputHW.size());
auto selfTensor = aclCreateTensor(
selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData);
@ -148,16 +115,12 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
outputDim.data(), outputDim.size(), cData);
auto indicesOutTensor = aclCreateTensor(
outputDim.data(), outputDim.size(), ACL_INT64, outputStride.data(),
0, aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(),
indicesOutDeviceAddr);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnAdaptiveMaxPool2dGetWorkspaceSize(
selfTensor, outputsize, outputTensor, indicesOutTensor,
&workspaceSize, &executor);
auto ret = aclnnMaxPoolGetWorkspaceSize(
selfTensor, kernelSize, strides, 0, paddings, dilations, ceilMode,
outputTensor, &workspaceSize, &executor);
assert(ret == ACL_SUCCESS);
void *workspaceAddr = nullptr;
@ -165,15 +128,13 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
workspaceAddr = context->getWorkspace(workspaceSize);
}
ret = aclnnAdaptiveMaxPool2d(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
ret = aclnnMaxPool(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
aclDestroyTensor(indicesOutTensor);
return;
}
};

View File

@ -45,14 +45,16 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
// outputCpu->printData();
// outputNpu2Cpu->printData();
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu, 1e-3));
}
TEST(ascend_Conv, run) {
aclInit(nullptr);
// aclInit(nullptr);
testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
aclFinalize();
Shape{1, 3, 128, 128}, Shape{2, 3, 3, 3});
// aclFinalize();
}
} // namespace infini

View File

@ -53,9 +53,10 @@ TEST(ascend_ElementWise, run) {
// aclInit(nullptr);
// testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
// testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
// testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testElementWise<DivObj>(IncrementalGenerator(), Shape{1},
Shape{1, 2, 2, 3});
testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 1, 48, 48},
Shape{1, 1, 1, 1});
// testElementWise<DivObj>(IncrementalGenerator(), Shape{1}, Shape{1, 2, 2,
// 3});
// testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
// aclFinalize();
}

View File

@ -50,10 +50,10 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
}
TEST(ascend_Matmul, run) {
aclInit(nullptr);
// aclInit(nullptr);
testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
false, Shape{1, 2, 3}, Shape{1, 3, 4});
aclFinalize();
// aclFinalize();
}
} // namespace infini

View File

@ -26,6 +26,7 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto npuOp =
npuGraph->addOp<T>(inputNpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
// npuGraph->addOp<T>(inputNpu, nullptr, 2, 2, 1, 1, 0, 0, 1, 1, 0);
npuGraph->dataMalloc();
inputNpu->setData(generator);
npuRuntime->run(npuGraph);
@ -38,10 +39,10 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
}
TEST(cnnl_Pooling, run) {
aclInit(nullptr);
testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
// aclInit(nullptr);
// testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
aclFinalize();
// aclFinalize();
}
} // namespace infini