diff --git a/CMakeLists.txt b/CMakeLists.txt
index f9fdb679..7bc077ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ endif()
 include(CMakeDependentOption)
 project(InfiniTensor C CXX)
 
+cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
diff --git a/Makefile b/Makefile
index 8a18e24f..bc8b1f3d 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,7 @@ endif
 
 build:
 	mkdir -p build/$(TYPE)
-	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
+	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j
 
 clean:
 	rm -rf build
diff --git a/include/ascend/ascend_runtime.h b/include/ascend/ascend_runtime.h
index 7ec6bdfb..3a916606 100644
--- a/include/ascend/ascend_runtime.h
+++ b/include/ascend/ascend_runtime.h
@@ -20,21 +20,16 @@ class ASCENDRuntimeObj : public RuntimeObj {
   private:
    aclrtContext context;
    aclrtStream stream;
+    std::unique_ptr<CommunicatorObj> comm;
    ASCENDPtr workspace = nullptr;
    size_t workspaceSize;
 
  public:
    ASCENDRuntimeObj(int deviceId = 0) : RuntimeObj(Device::ASCEND, deviceId) {
-        // #ifndef _ACL_INIT
-        // #define _ACL_INIT
-        //         aclInit(nullptr);
-        //         // auto ret_init =
-        //         //     CHECK_RET(ret == ACL_SUCCESS,
-        //         //               LOG_PRINT("aclInit failed. ERROR: %d\n",
-        //         ret));
-        // #endif
-        aclInit(nullptr);
-        auto ret = aclrtSetDevice(deviceId);
+        auto ret = aclInit(nullptr);
+        CHECK_RET(ret == ACL_SUCCESS,
+                  LOG_PRINT("aclInit failed. ERROR: %d\n", ret));
+        ret = aclrtSetDevice(deviceId);
         CHECK_RET(ret == ACL_SUCCESS,
                   LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
         ret = aclrtCreateContext(&context, deviceId);
@@ -49,7 +44,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
 
         // 10GB for Longformer
         // size_t longformerNum = 3lu * (1 << 30);
-        workspaceSize = 3ll << 30; // 3 GB
+        workspaceSize = 3ll << 33; // 24 GB
         // std::cout << static_cast<double>(workspaceSize) << std::endl;
         workspace = alloc(workspaceSize);
@@ -99,9 +94,9 @@ class ASCENDRuntimeObj : public RuntimeObj {
                      ACL_MEMCPY_DEVICE_TO_DEVICE);
     }
 
-    void initComm(const string &, int, int) override { IT_TODO_HALT(); }
+    void initComm(const string &name, int worldSize, int rank) final;
 
-    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
+    CommunicatorObj &getCommunicator() const override { return *comm; }
 
   private:
     void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py
index 79abb7f4..e8090515 100644
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@@ -184,7 +184,7 @@ class OnnxStub:
                     node,
                     {
                         "dilations": [1, 1],
-                        "pads": [0, 0],
+                        "pads": [0, 0, 0, 0],
                         "strides": [1, 1],
                         "output_padding": [0, 0],
                     },
@@ -193,19 +193,66 @@ class OnnxStub:
                     attributes[name]
                     for name in ["dilations", "pads", "strides", "output_padding"]
                 )
-                tensors[node.output[0]] = self.handler.convTransposed2d(
-                    tensors[node.input[0]],
-                    tensors[node.input[1]],
-                    tensors.get(node.output[0]),
-                    p[0],
-                    p[1],
-                    s[0],
-                    s[1],
-                    d[0],
-                    d[1],
-                    op[0],
-                    op[1],
-                )
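+                # ONNX pads are [begin_h, begin_w, end_h, end_w], while the
+                # backend convTransposed2d only takes one (ph, pw) pair, so
+                # asymmetric padding is lowered to an explicit Pad on H and W.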
+                if p[0] != p[2] or p[1] != p[3]:
+                    adapt = "{}-adapt".format(node.output[0])
+                    tensors[adapt] = self.handler.pad(
+                        tensors[node.input[0]], None, p, [-2, -1]
+                    )
+                    p = [0, 0, 0, 0]
+                else:
+                    adapt = node.input[0]
+
+                if len(node.input) > 2:
+                    bias = "{}-bias".format(node.output[0])
+                    reshape = "{}-reshape".format(node.output[0])
+                    tensors[bias] = self.handler.convTransposed2d(
+                        tensors[adapt],
+                        tensors[node.input[1]],
+                        None,
+                        p[0],
+                        p[1],
+                        s[0],
+                        s[1],
+                        d[0],
+                        d[1],
+                        op[0],
+                        op[1],
+                    )
+                    tensors[reshape] = self.handler.reshape(
+                        tensors[node.input[2]],
+                        None,
+                        [
+                            1,
+                            reduce(
+                                lambda acc, x: acc * x,
+                                tensors[node.input[2]].shape(),
+                            ),
+                            1,
+                            1,
+                        ],
+                    )
+                    tensors[node.output[0]] = self.handler.add(
+                        tensors[bias],
+                        tensors[reshape],
+                        tensors.get(node.output[0]),
+                    )
+                else:
+                    tensors[node.output[0]] = self.handler.convTransposed2d(
+                        tensors[adapt],
+                        tensors[node.input[1]],
+                        tensors.get(node.output[0]),
+                        p[0],
+                        p[1],
+                        s[0],
+                        s[1],
+                        d[0],
+                        d[1],
+                        op[0],
+                        op[1],
+                    )
             elif node.op_type == "MatMul":
                 tensors[node.output[0]] = self.handler.matmul(
                     tensors[node.input[0]],
diff --git a/src/ascend/ascend_runtime.cc b/src/ascend/ascend_runtime.cc
index 228af2c5..221499bd 100644
--- a/src/ascend/ascend_runtime.cc
+++ b/src/ascend/ascend_runtime.cc
@@ -56,4 +56,15 @@ void ASCENDRuntimeObj::sync() const { ; }
 
 string ASCENDRuntimeObj::toString() const { return "ASCEND Runtime"; }
 
+void ASCENDRuntimeObj::initComm(const string &name, int worldSize, int rank) {
+    IT_ASSERT(worldSize > 0);
+    IT_ASSERT(rank >= 0);
+    IT_ASSERT(rank < worldSize);
+    IT_ASSERT(!comm) << "communicator is already initialized.";
+#ifdef INFINI_USE_HCCL
+    comm = std::make_unique<HcclCommunicatorObj>(name, worldSize, rank);
+#else
+    IT_TODO_HALT_MSG("Not compiled with HCCL.");
+#endif
+}
 } // namespace infini
diff --git a/src/kernels/ascend/conv.cc b/src/kernels/ascend/conv.cc
index 41e31556..35db301e 100644
--- a/src/kernels/ascend/conv.cc
+++ b/src/kernels/ascend/conv.cc
@@ -13,9 +13,9 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
         auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
 
         const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
-        // const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
-        // const int cpg = op->getChannelPerGroup();
-        // const int g = c / cpg;
+        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        const int cpg = op->getChannelPerGroup();
+        const int g = c / cpg;
 
         std::vector<int64_t> pads = {ph, pw};
         // std::vector<int64_t> ksize = {r, s};
@@ -67,8 +67,10 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
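+        // Pass the real group count g instead of the former hard-coded 1;
+        // the trailing int8_t(1) maps to aclnnConvolution's cubeMathType.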
         auto ret = aclnnConvolutionGetWorkspaceSize(
             inputTensor, weightTensor, nullptr, convstride, convpads,
-            convdilation, false, convOutputpadding, 1, outputTensor, 1,
-            &workspaceSize, &executor);
+            convdilation, false, convOutputpadding, int64_t(g), outputTensor,
+            int8_t(1), &workspaceSize, &executor);
         void *workspaceAddr = nullptr;
         if (workspaceSize > 0) {
             workspaceAddr = context->getWorkspace(workspaceSize);
         }
diff --git a/src/kernels/ascend/pooling.cc b/src/kernels/ascend/pooling.cc
index 3bbc1c1e..29b36b0e 100644
--- a/src/kernels/ascend/pooling.cc
+++ b/src/kernels/ascend/pooling.cc
@@ -1,6 +1,6 @@
 #include "operators/pooling.h"
-#include "aclnnop/level2/aclnn_adaptive_max_pool2d.h"
 #include "aclnnop/level2/aclnn_avgpool2d.h"
+#include "aclnnop/level2/aclnn_max_pool.h"
 #include "ascend/ascend_kernel_without_config.h"
 #include "ascend/ascend_runtime.h"
 
@@ -75,40 +75,6 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
 };
 
 class MaxPooling : public ASCENDKernelWithoutConfig {
-    // Only adaptiveMaxPool2d was found in the ACLNN doc.
-    int64_t GetShapeSize(const std::vector<int64_t> &shape) {
-        int64_t shapeSize = 1;
-        for (auto i : shape) {
-            shapeSize *= i;
-        }
-        return shapeSize;
-    }
-
-    template <typename T>
-    int CreateAclTensor(const std::vector<T> &hostData,
-                        const std::vector<int64_t> &shape, void **deviceAddr,
-                        aclDataType dataType, aclTensor **tensor) {
-        auto size = GetShapeSize(shape) * sizeof(T);
-        // Allocate device-side memory with aclrtMalloc
-        auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
-        assert(ret == ACL_SUCCESS);
-        // Copy the host-side data to device memory with aclrtMemcpy
-        ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size,
-                          ACL_MEMCPY_HOST_TO_DEVICE);
-        assert(ret == ACL_SUCCESS);
-
-        // Compute the strides of a contiguous tensor
-        std::vector<int64_t> strides(shape.size(), 1);
-        for (int64_t i = shape.size() - 2; i >= 0; i--) {
-            strides[i] = shape[i + 1] * strides[i + 1];
-        }
-
-        // Create the aclTensor via aclCreateTensor
-        *tensor = aclCreateTensor(shape.data(), shape.size(), dataType,
-                                  strides.data(), 0, aclFormat::ACL_FORMAT_NCHW,
-                                  shape.data(), shape.size(), *deviceAddr);
-        return 0;
-    }
-
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
         auto op = as<PoolingObj>(_op);
@@ -117,6 +83,15 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
 
+        auto [n, c, h, w, kh, kw] = op->getNCHWRS();
+        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+        int64_t ceilMode = int64_t(op->getCeilMode());
+
+        std::vector<int64_t> ksize = {kh, kw};
+        std::vector<int64_t> stride = {sh, sw};
+        std::vector<int64_t> pad = {ph, pw};
+        std::vector<int64_t> dilation = {dh, dw};
+
         auto selfD = op->getInputs(0)->getDims();
         auto selfS = op->getInputs(0)->getStride();
         auto outD = op->getOutput()->getDims();
         auto outS = op->getOutput()->getStride();
@@ -127,20 +102,12 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
         std::vector<int64_t> outputDim = castTo64(outD);
         std::vector<int64_t> outputStride = castTo64(outS);
 
-        std::vector<int64_t> outputHW(2, 1);
-        outputHW[0] = outputDim[outputDim.size() - 2];
-        outputHW[1] = outputDim[outputDim.size() - 1];
+        aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
+        aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
+        aclIntArray *paddings = aclCreateIntArray(pad.data(), pad.size());
+        aclIntArray *dilations =
+            aclCreateIntArray(dilation.data(), dilation.size());
 
-        int64_t indicesOutSize = 1;
-        for (auto i : outputDim) {
-            indicesOutSize *= i;
-        }
-        void *indicesOutDeviceAddr = nullptr;
-        aclrtMalloc(&indicesOutDeviceAddr, indicesOutSize,
-                    ACL_MEM_MALLOC_HUGE_FIRST);
-
-        aclIntArray *outputsize =
-            aclCreateIntArray(outputHW.data(), outputHW.size());
         auto selfTensor = aclCreateTensor(
             selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
             aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData);
         auto outputTensor =
             aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                             outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                             outputDim.data(), outputDim.size(), cData);
-        auto indicesOutTensor = aclCreateTensor(
-            outputDim.data(), outputDim.size(), ACL_INT64, outputStride.data(),
-            0, aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(),
-            indicesOutDeviceAddr);
 
         uint64_t workspaceSize = 0;
         aclOpExecutor *executor;
-        auto ret = aclnnAdaptiveMaxPool2dGetWorkspaceSize(
-            selfTensor, outputsize, outputTensor, indicesOutTensor,
-            &workspaceSize, &executor);
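+        // aclnnMaxPool takes explicit kernel/stride/pad/dilation arrays and
+        // ceilMode (the literal 0 is presumably its autoPad argument), and
+        // unlike the adaptive variant it needs no indices output tensor.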
+        auto ret = aclnnMaxPoolGetWorkspaceSize(
+            selfTensor, kernelSize, strides, 0, paddings, dilations, ceilMode,
+            outputTensor, &workspaceSize, &executor);
         assert(ret == ACL_SUCCESS);
 
         void *workspaceAddr = nullptr;
@@ -165,15 +128,13 @@ class MaxPooling : public ASCENDKernelWithoutConfig {
             workspaceAddr = context->getWorkspace(workspaceSize);
         }
 
-        ret = aclnnAdaptiveMaxPool2d(workspaceAddr, workspaceSize, executor,
-                                     context->ASCENDHandle());
+        ret = aclnnMaxPool(workspaceAddr, workspaceSize, executor,
+                           context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);
 
         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);
 
-        aclDestroyTensor(indicesOutTensor);
-
         return;
     }
 };
diff --git a/test/kernels/ascend/test_ascend_conv.cc b/test/kernels/ascend/test_ascend_conv.cc
index 69ab1cd1..55c7025a 100644
--- a/test/kernels/ascend/test_ascend_conv.cc
+++ b/test/kernels/ascend/test_ascend_conv.cc
@@ -45,14 +45,16 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
     cpuRuntime->run(cpuGraph);
     auto outputCpu = cpuOp->getOutput();
     // Check
-    EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
+    // outputCpu->printData();
+    // outputNpu2Cpu->printData();
+    EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu, 1e-3));
 }
 
 TEST(ascend_Conv, run) {
-    aclInit(nullptr);
+    // aclInit(nullptr);
     testConv(IncrementalGenerator(), IncrementalGenerator(),
-             Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
-    aclFinalize();
+             Shape{1, 3, 128, 128}, Shape{2, 3, 3, 3});
+    // aclFinalize();
 }
 
 } // namespace infini
diff --git a/test/kernels/ascend/test_ascend_element_wise.cc b/test/kernels/ascend/test_ascend_element_wise.cc
index 9d3f06c3..34484612 100644
--- a/test/kernels/ascend/test_ascend_element_wise.cc
+++ b/test/kernels/ascend/test_ascend_element_wise.cc
@@ -53,9 +53,10 @@ TEST(ascend_ElementWise, run) {
     // aclInit(nullptr);
     // testElementWise(IncrementalGenerator(), Shape{1, 2, 2, 3});
     // testElementWise(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    // testElementWise(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    testElementWise(IncrementalGenerator(), Shape{1},
-                    Shape{1, 2, 2, 3});
+    testElementWise(IncrementalGenerator(), Shape{1, 1, 48, 48},
+                    Shape{1, 1, 1, 1});
+    // testElementWise(IncrementalGenerator(), Shape{1}, Shape{1, 2, 2,
+    // 3});
     // testElementWise(IncrementalGenerator(), Shape{1, 2, 2, 3});
     // aclFinalize();
 }
diff --git a/test/kernels/ascend/test_ascend_matmul.cc b/test/kernels/ascend/test_ascend_matmul.cc
index 247e6fc5..4e1eb56f 100644
--- a/test/kernels/ascend/test_ascend_matmul.cc
+++ b/test/kernels/ascend/test_ascend_matmul.cc
@@ -50,10 +50,10 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
 }
 
 TEST(ascend_Matmul, run) {
-    aclInit(nullptr);
+    // aclInit(nullptr);
     testMatmul(IncrementalGenerator(), IncrementalGenerator(), false, false,
                Shape{1, 2, 3}, Shape{1, 3, 4});
-    aclFinalize();
+    // aclFinalize();
 }
 
 } // namespace infini
diff --git a/test/kernels/ascend/test_ascend_pooling.cc b/test/kernels/ascend/test_ascend_pooling.cc
index 7fedc41e..74e1f0a9 100644
--- a/test/kernels/ascend/test_ascend_pooling.cc
+++ b/test/kernels/ascend/test_ascend_pooling.cc
@@ -26,6 +26,7 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
     auto inputNpu = npuGraph->cloneTensor(inputCpu);
     auto npuOp =
         npuGraph->addOp<T>(inputNpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
+    // npuGraph->addOp<T>(inputNpu, nullptr, 2, 2, 1, 1, 0, 0, 1, 1, 0);
     npuGraph->dataMalloc();
     inputNpu->setData(generator);
     npuRuntime->run(npuGraph);
@@ -38,10 +39,13 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
 }
 
 TEST(cnnl_Pooling, run) {
-    aclInit(nullptr);
-    testPooling(IncrementalGenerator(), Shape{1, 2, 5, 5});
+    // aclInit(nullptr);
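+    // ACL is initialized by the ASCENDRuntimeObj constructor now (see
+    // include/ascend/ascend_runtime.h), so the explicit aclInit/aclFinalize
+    // calls in the tests are disabled.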
+    // testPooling(IncrementalGenerator(), Shape{1, 2, 5, 5});
     testPooling(IncrementalGenerator(), Shape{1, 2, 5, 5});
-    aclFinalize();
+    // aclFinalize();
 }
 
 } // namespace infini