forked from jiuyuan/InfiniTensor
Add conv_transpose and native max pooling
This commit is contained in:
parent
fc4b62a88c
commit
a5ccf06551
|
@ -20,6 +20,7 @@ endif()
|
|||
include(CMakeDependentOption)
|
||||
project(InfiniTensor C CXX)
|
||||
|
||||
|
||||
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
|
||||
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
|
||||
|
||||
|
|
2
Makefile
2
Makefile
|
@ -39,7 +39,7 @@ endif
|
|||
|
||||
build:
|
||||
mkdir -p build/$(TYPE)
|
||||
cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
|
||||
cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j
|
||||
|
||||
clean:
|
||||
rm -rf build
|
||||
|
|
|
@ -20,21 +20,16 @@ class ASCENDRuntimeObj : public RuntimeObj {
|
|||
private:
|
||||
aclrtContext context;
|
||||
aclrtStream stream;
|
||||
std::unique_ptr<CommunicatorObj> comm;
|
||||
ASCENDPtr workspace = nullptr;
|
||||
size_t workspaceSize;
|
||||
|
||||
public:
|
||||
ASCENDRuntimeObj(int deviceId = 0) : RuntimeObj(Device::ASCEND, deviceId) {
|
||||
// #ifndef _ACL_INIT
|
||||
// #define _ACL_INIT
|
||||
// aclInit(nullptr);
|
||||
// // auto ret_init =
|
||||
// // CHECK_RET(ret == ACL_SUCCESS,
|
||||
// // LOG_PRINT("aclInit failed. ERROR: %d\n",
|
||||
// ret));
|
||||
// #endif
|
||||
aclInit(nullptr);
|
||||
auto ret = aclrtSetDevice(deviceId);
|
||||
auto ret = aclInit(nullptr);
|
||||
CHECK_RET(ret == ACL_SUCCESS,
|
||||
LOG_PRINT("aclInit failed. ERROR: %d\n", ret));
|
||||
ret = aclrtSetDevice(deviceId);
|
||||
CHECK_RET(ret == ACL_SUCCESS,
|
||||
LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
|
||||
ret = aclrtCreateContext(&context, deviceId);
|
||||
|
@ -49,7 +44,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
|
|||
|
||||
// 10GB for Longformer
|
||||
// size_t longformerNum = 3lu * (1 << 30);
|
||||
workspaceSize = 3ll << 30; // 3 GB
|
||||
workspaceSize = 3ll << 33; // 24 GiB (3 * 2^33 bytes)
|
||||
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
|
||||
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
|
||||
workspace = alloc(workspaceSize);
|
||||
|
@ -99,9 +94,9 @@ class ASCENDRuntimeObj : public RuntimeObj {
|
|||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
}
|
||||
|
||||
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
|
||||
void initComm(const string &name, int worldSize, int rank) final;
|
||||
|
||||
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
|
||||
CommunicatorObj &getCommunicator() const override { return *comm; }
|
||||
|
||||
private:
|
||||
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
||||
|
|
|
@ -184,7 +184,7 @@ class OnnxStub:
|
|||
node,
|
||||
{
|
||||
"dilations": [1, 1],
|
||||
"pads": [0, 0],
|
||||
"pads": [0, 0, 0, 0],
|
||||
"strides": [1, 1],
|
||||
"output_padding": [0, 0],
|
||||
},
|
||||
|
@ -193,8 +193,52 @@ class OnnxStub:
|
|||
attributes[name]
|
||||
for name in ["dilations", "pads", "strides", "output_padding"]
|
||||
)
|
||||
if p[0] != p[2] or p[1] != p[3]:
|
||||
adapt = "{}-adapt".format(node.output[0])
|
||||
tensors[adapt] = self.handler.pad(
|
||||
tensors[node.input[0]], None, p, [-2, -1]
|
||||
)
|
||||
p = [0, 0, 0, 0]
|
||||
else:
|
||||
adapt = node.input[0]
|
||||
|
||||
if len(node.input) > 2:
|
||||
bias = "{}-bias".format(node.output[0])
|
||||
reshape = "{}-reshape".format(node.output[0])
|
||||
tensors[bias] = self.handler.convTransposed2d(
|
||||
tensors[adapt],
|
||||
tensors[node.input[1]],
|
||||
None,
|
||||
p[0],
|
||||
p[1],
|
||||
s[0],
|
||||
s[1],
|
||||
d[0],
|
||||
d[1],
|
||||
op[0],
|
||||
op[1],
|
||||
)
|
||||
tensors[reshape] = self.handler.reshape(
|
||||
tensors[node.input[2]],
|
||||
None,
|
||||
[
|
||||
1,
|
||||
reduce(
|
||||
lambda acc, x: acc * x,
|
||||
tensors[node.input[2]].shape(),
|
||||
),
|
||||
1,
|
||||
1,
|
||||
],
|
||||
)
|
||||
tensors[node.output[0]] = self.handler.add(
|
||||
tensors[bias],
|
||||
tensors[reshape],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
else:
|
||||
tensors[node.output[0]] = self.handler.convTransposed2d(
|
||||
tensors[node.input[0]],
|
||||
tensors[adapt],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
p[0],
|
||||
|
|
|
@ -56,4 +56,15 @@ void ASCENDRuntimeObj::sync() const { ; }
|
|||
|
||||
string ASCENDRuntimeObj::toString() const { return "ASCEND Runtime"; }
|
||||
|
||||
/// Creates the communicator used by this Ascend runtime.
///
/// @param name      Communicator name, forwarded to HcclCommunicatorObj.
/// @param worldSize Total number of ranks; asserted to be positive.
/// @param rank      Rank of this runtime; asserted to lie in [0, worldSize).
///
/// Asserts that no communicator has been created yet. When the build does not
/// define INFINI_USE_HCCL, the call halts through IT_TODO_HALT_MSG instead of
/// creating a communicator.
void ASCENDRuntimeObj::initComm(const string &name, int worldSize, int rank) {
    IT_ASSERT(worldSize > 0);
    IT_ASSERT(rank >= 0);
    IT_ASSERT(rank < worldSize);
    IT_ASSERT(!comm) << "communicator is already initialized.";
#ifdef INFINI_USE_HCCL
    comm = std::make_unique<HcclCommunicatorObj>(name, worldSize, rank);
#else
    // The guard is INFINI_USE_HCCL, so the missing backend to report is HCCL
    // (the previous message named CNCL).
    IT_TODO_HALT_MSG("Not compiled with HCCL.");
#endif
}
|
||||
} // namespace infini
|
||||
|
|
|
@ -13,9 +13,9 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
|
|||
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
|
||||
|
||||
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
// const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
// const int cpg = op->getChannelPerGroup();
|
||||
// const int g = c / cpg;
|
||||
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
const int cpg = op->getChannelPerGroup();
|
||||
const int g = c / cpg;
|
||||
|
||||
std::vector<int64_t> pads = {ph, pw};
|
||||
// std::vector<int64_t> ksize = {r, s};
|
||||
|
@ -67,8 +67,8 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
|
|||
|
||||
auto ret = aclnnConvolutionGetWorkspaceSize(
|
||||
inputTensor, weightTensor, nullptr, convstride, convpads,
|
||||
convdilation, false, convOutputpadding, 1, outputTensor, 1,
|
||||
&workspaceSize, &executor);
|
||||
convdilation, false, convOutputpadding, int64_t(g), outputTensor,
|
||||
int8_t(1), &workspaceSize, &executor);
|
||||
void *workspaceAddr = nullptr;
|
||||
if (workspaceSize > 0) {
|
||||
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#include "operators/pooling.h"
|
||||
#include "aclnnop/level2/aclnn_adaptive_max_pool2d.h"
|
||||
#include "aclnnop/level2/aclnn_avgpool2d.h"
|
||||
#include "aclnnop/level2/aclnn_max_pool.h"
|
||||
#include "ascend/ascend_kernel_without_config.h"
|
||||
#include "ascend/ascend_runtime.h"
|
||||
|
||||
|
@ -75,40 +75,6 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
|
|||
};
|
||||
|
||||
class MaxPooling : public ASCENDKernelWithoutConfig {
|
||||
// Only adaptiveMaxPool2d was found in the ACLNN doc.
|
||||
int64_t GetShapeSize(const std::vector<int64_t> &shape) {
|
||||
int64_t shapeSize = 1;
|
||||
for (auto i : shape) {
|
||||
shapeSize *= i;
|
||||
}
|
||||
return shapeSize;
|
||||
}
|
||||
template <typename T>
|
||||
int CreateAclTensor(const std::vector<T> &hostData,
|
||||
const std::vector<int64_t> &shape, void **deviceAddr,
|
||||
aclDataType dataType, aclTensor **tensor) {
|
||||
auto size = GetShapeSize(shape) * sizeof(T);
|
||||
// 调用aclrtMalloc申请device侧内存
|
||||
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
|
||||
assert(ret == ACL_SUCCESS);
|
||||
// 调用aclrtMemcpy将host侧数据拷贝到device侧内存上
|
||||
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size,
|
||||
ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
// 计算连续tensor的strides
|
||||
std::vector<int64_t> strides(shape.size(), 1);
|
||||
for (int64_t i = shape.size() - 2; i >= 0; i--) {
|
||||
strides[i] = shape[i + 1] * strides[i + 1];
|
||||
}
|
||||
|
||||
// 调用aclCreateTensor接口创建aclTensor
|
||||
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType,
|
||||
strides.data(), 0, aclFormat::ACL_FORMAT_NCHW,
|
||||
shape.data(), shape.size(), *deviceAddr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<PoolingObj>(_op);
|
||||
|
@ -117,6 +83,15 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
|
|||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
auto [n, c, h, w, kh, kw] = op->getNCHWRS();
|
||||
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
int64_t ceilMode = int64_t(op->getCeilMode());
|
||||
|
||||
std::vector<int64_t> ksize = {kh, kw};
|
||||
std::vector<int64_t> stride = {sh, sw};
|
||||
std::vector<int64_t> pad = {ph, pw};
|
||||
std::vector<int64_t> dilation = {dh, dw};
|
||||
|
||||
auto selfD = op->getInputs(0)->getDims();
|
||||
auto selfS = op->getInputs(0)->getStride();
|
||||
auto outD = op->getOutput()->getDims();
|
||||
|
@ -127,20 +102,12 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
|
|||
std::vector<int64_t> outputDim = castTo64(outD);
|
||||
std::vector<int64_t> outputStride = castTo64(outS);
|
||||
|
||||
std::vector<int64_t> outputHW(2, 1);
|
||||
outputHW[0] = outputDim[outputDim.size() - 2];
|
||||
outputHW[1] = outputDim[outputDim.size() - 1];
|
||||
aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
|
||||
aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
|
||||
aclIntArray *paddings = aclCreateIntArray(pad.data(), pad.size());
|
||||
aclIntArray *dilations =
|
||||
aclCreateIntArray(dilation.data(), dilation.size());
|
||||
|
||||
int64_t indicesOutSize = 1;
|
||||
for (auto i : outputDim) {
|
||||
indicesOutSize *= i;
|
||||
}
|
||||
void *indicesOutDeviceAddr = nullptr;
|
||||
aclrtMalloc(&indicesOutDeviceAddr, indicesOutSize,
|
||||
ACL_MEM_MALLOC_HUGE_FIRST);
|
||||
|
||||
aclIntArray *outputsize =
|
||||
aclCreateIntArray(outputHW.data(), outputHW.size());
|
||||
auto selfTensor = aclCreateTensor(
|
||||
selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData);
|
||||
|
@ -148,16 +115,12 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
|
|||
aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
|
||||
outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
|
||||
outputDim.data(), outputDim.size(), cData);
|
||||
auto indicesOutTensor = aclCreateTensor(
|
||||
outputDim.data(), outputDim.size(), ACL_INT64, outputStride.data(),
|
||||
0, aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(),
|
||||
indicesOutDeviceAddr);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
auto ret = aclnnAdaptiveMaxPool2dGetWorkspaceSize(
|
||||
selfTensor, outputsize, outputTensor, indicesOutTensor,
|
||||
&workspaceSize, &executor);
|
||||
auto ret = aclnnMaxPoolGetWorkspaceSize(
|
||||
selfTensor, kernelSize, strides, 0, paddings, dilations, ceilMode,
|
||||
outputTensor, &workspaceSize, &executor);
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
void *workspaceAddr = nullptr;
|
||||
|
@ -165,15 +128,13 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
|
|||
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||
}
|
||||
|
||||
ret = aclnnAdaptiveMaxPool2d(workspaceAddr, workspaceSize, executor,
|
||||
ret = aclnnMaxPool(workspaceAddr, workspaceSize, executor,
|
||||
context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
aclDestroyTensor(indicesOutTensor);
|
||||
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -45,14 +45,16 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
|
|||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// Check
|
||||
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
|
||||
// outputCpu->printData();
|
||||
// outputNpu2Cpu->printData();
|
||||
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu, 1e-3));
|
||||
}
|
||||
|
||||
TEST(ascend_Conv, run) {
|
||||
aclInit(nullptr);
|
||||
// aclInit(nullptr);
|
||||
testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
|
||||
Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
|
||||
aclFinalize();
|
||||
Shape{1, 3, 128, 128}, Shape{2, 3, 3, 3});
|
||||
// aclFinalize();
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -53,9 +53,10 @@ TEST(ascend_ElementWise, run) {
|
|||
// aclInit(nullptr);
|
||||
// testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
|
||||
// testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
|
||||
// testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
|
||||
testElementWise<DivObj>(IncrementalGenerator(), Shape{1},
|
||||
Shape{1, 2, 2, 3});
|
||||
testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 1, 48, 48},
|
||||
Shape{1, 1, 1, 1});
|
||||
// testElementWise<DivObj>(IncrementalGenerator(), Shape{1}, Shape{1, 2, 2,
|
||||
// 3});
|
||||
// testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
|
||||
// aclFinalize();
|
||||
}
|
||||
|
|
|
@ -50,10 +50,10 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
|
|||
}
|
||||
|
||||
TEST(ascend_Matmul, run) {
|
||||
aclInit(nullptr);
|
||||
// aclInit(nullptr);
|
||||
testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
|
||||
false, Shape{1, 2, 3}, Shape{1, 3, 4});
|
||||
aclFinalize();
|
||||
// aclFinalize();
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -26,6 +26,7 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
|
|||
auto inputNpu = npuGraph->cloneTensor(inputCpu);
|
||||
auto npuOp =
|
||||
npuGraph->addOp<T>(inputNpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
|
||||
// npuGraph->addOp<T>(inputNpu, nullptr, 2, 2, 1, 1, 0, 0, 1, 1, 0);
|
||||
npuGraph->dataMalloc();
|
||||
inputNpu->setData(generator);
|
||||
npuRuntime->run(npuGraph);
|
||||
|
@ -38,10 +39,10 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
|
|||
}
|
||||
|
||||
TEST(cnnl_Pooling, run) {
|
||||
aclInit(nullptr);
|
||||
testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
|
||||
// aclInit(nullptr);
|
||||
// testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
|
||||
testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5});
|
||||
aclFinalize();
|
||||
// aclFinalize();
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
Loading…
Reference in New Issue