add kernels

This commit is contained in:
OdinaryWord 2023-11-03 14:43:21 +08:00
parent a9bd73528d
commit 39484e0cc4
13 changed files with 791 additions and 270 deletions

View File

@ -4,6 +4,7 @@
namespace infini {
class ASCENDKernelWithoutConfig : public Kernel {
public:
virtual void compute(const Operator &op, const PerfRecord &record,
@ -19,6 +20,14 @@ class ASCENDKernelWithoutConfig : public Kernel {
return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
[&]() { context->sync(); }));
}
// Widen a vector<int> of dims/strides to the vector<int64_t> form that the
// aclnn tensor APIs expect.
std::vector<int64_t> MycastTo64(const std::vector<int> &v32) const {
return std::vector<int64_t>(v32.begin(), v32.end());
}
};
} // namespace infini
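Every kernel added in this commit repeats the same aclnn protocol: query the workspace size, allocate the workspace, launch the op, then synchronize the stream. A minimal sketch of that pattern as a reusable helper follows; runAclnn is an illustrative name, not part of this commit, and the exact aclnn header locations depend on the CANN release.

#include <cassert>
#include <acl/acl.h>        // aclrtMalloc / aclrtFree / aclrtSynchronizeStream
#include <aclnn/acl_meta.h> // aclOpExecutor (header path may vary by CANN release)

// Illustrative helper: `getWs` wraps an aclnnXxxGetWorkspaceSize call and
// `launch` wraps the matching aclnnXxx call.
template <typename GetWs, typename Launch>
void runAclnn(GetWs getWs, Launch launch, aclrtStream stream) {
    uint64_t workspaceSize = 0;
    aclOpExecutor *executor = nullptr;
    auto ret = getWs(&workspaceSize, &executor);
    assert(ret == ACL_SUCCESS);
    void *workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                          ACL_MEM_MALLOC_HUGE_FIRST);
        assert(ret == ACL_SUCCESS);
    }
    ret = launch(workspaceAddr, workspaceSize, executor, stream);
    assert(ret == ACL_SUCCESS);
    ret = aclrtSynchronizeStream(stream);
    assert(ret == ACL_SUCCESS);
    if (workspaceAddr != nullptr)
        aclrtFree(workspaceAddr); // the workspace is not reused across launches
}

A kernel would then call, for example, runAclnn([&](uint64_t *ws, aclOpExecutor **ex) { return aclnnAbsGetWorkspaceSize(input, output, ws, ex); }, aclnnAbs, context->ASCENDHandle());.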

View File

@ -0,0 +1,109 @@
#include "operators/batch_norm.h"
#include "aclnnop/level2/aclnn_batch_norm.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class BatchNormAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<BatchNormObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
void *const meanData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const varData = (op->getInputs(2)->getRawDataPtr<void *>());
void *const scaleData = (op->getInputs(3)->getRawDataPtr<void *>());
void *const biasData = (op->getInputs(4)->getRawDataPtr<void *>());
auto inD = op->getInputs(0)->getDims();
auto inS = op->getInputs(0)->getStride();
auto paraD = op->getInputs(1)->getDims();
auto paraS = op->getInputs(1)->getStride();
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
std::vector<int64_t> inputDim = MycastTo64(inD);
std::vector<int64_t> inputStride = MycastTo64(inS);
std::vector<int64_t> paraDim = MycastTo64(paraD);
std::vector<int64_t> paraStride = MycastTo64(paraS);
std::vector<int64_t> outputDim = MycastTo64(outD);
std::vector<int64_t> outputStride = MycastTo64(outS);
auto inputTensor = aclCreateTensor(
inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, inputDim.data(), inputDim.size(), inData);
auto outputTensor = aclCreateTensor(
outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(), outData);
auto meanTensor = aclCreateTensor(
paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), meanData);
auto varTensor = aclCreateTensor(
paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), varData);
auto scaleTensor = aclCreateTensor(
paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), scaleData);
auto biasTensor = aclCreateTensor(
paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), biasData);
// training=false below, so aclnnBatchNorm does not write saveMean/saveInvstd;
// the scale/bias buffers are passed here only as placeholders.
auto savemeanTensor = aclCreateTensor(
paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), scaleData);
auto saveinvstdTensor = aclCreateTensor(
paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), biasData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnBatchNormGetWorkspaceSize(
inputTensor, scaleTensor, biasTensor, meanTensor, varTensor, false,
op->getMomentum(), op->getEps(), outputTensor, savemeanTensor,
saveinvstdTensor, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
}
assert(ret == ACL_SUCCESS);
ret = aclnnBatchNorm(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
if (workspaceAddr != nullptr)
aclrtFree(workspaceAddr); // release the aclnn workspace
aclDestroyTensor(inputTensor);
aclDestroyTensor(outputTensor);
aclDestroyTensor(meanTensor);
aclDestroyTensor(varTensor);
aclDestroyTensor(scaleTensor);
aclDestroyTensor(biasTensor);
aclDestroyTensor(savemeanTensor);
aclDestroyTensor(saveinvstdTensor);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::BatchNormalization, DataType::Float32, BatchNormAclnn,
"batchnorm_ASCEND_float");
}; // namespace infini
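Note that savemeanTensor and saveinvstdTensor above alias the scale and bias buffers; with training=false they are placeholders only. If this kernel is ever switched to training mode, they would need dedicated buffers. A hedged fragment reusing the kernel's paraDim (saveMeanData/saveInvstdData are illustrative names):

// Dedicated scratch buffers for the save outputs (one float per channel).
void *saveMeanData = nullptr, *saveInvstdData = nullptr;
size_t saveBytes = paraDim[0] * sizeof(float);
auto rc = aclrtMalloc(&saveMeanData, saveBytes, ACL_MEM_MALLOC_HUGE_FIRST);
assert(rc == ACL_SUCCESS);
rc = aclrtMalloc(&saveInvstdData, saveBytes, ACL_MEM_MALLOC_HUGE_FIRST);
assert(rc == ACL_SUCCESS);
// ... create savemeanTensor / saveinvstdTensor over these buffers instead,
// run the op, then aclrtFree both buffers after synchronization.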

View File

@ -0,0 +1,100 @@
#include "operators/concat.h"
#include "aclnnop/level2/aclnn_cat.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class ConcatAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConcatObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
int dim = op->getDim();
// NOTE: only the first two inputs are concatenated for now; an N-input
// variant is sketched after this file.
std::vector<aclTensor *> inputsData{};
auto inD0 = op->getInputs(0)->getDims();
auto inS0 = op->getInputs(0)->getStride();
std::vector<int64_t> inputDim0 = MycastTo64(inD0);
std::vector<int64_t> inputStride0 = MycastTo64(inS0);
void *const inData0 = (op->getInputs(0)->getRawDataPtr<void *>());
auto tmpTensor0 = aclCreateTensor(
inputDim0.data(), inputDim0.size(), ACL_FLOAT, inputStride0.data(), 0,
aclFormat::ACL_FORMAT_ND, inputDim0.data(), inputDim0.size(), inData0);
inputsData.push_back(tmpTensor0);
auto inD = op->getInputs(1)->getDims();
auto inS = op->getInputs(1)->getStride();
std::vector<int64_t> inputDim = MycastTo64(inD);
std::vector<int64_t> inputStride = MycastTo64(inS);
void *const inData = (op->getInputs(1)->getRawDataPtr<void *>());
auto tmpTensor = aclCreateTensor(
inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(), 0,
aclFormat::ACL_FORMAT_ND, inputDim.data(), inputDim.size(), inData);
inputsData.push_back(tmpTensor);
aclTensorList* tensorList = aclCreateTensorList(inputsData.data(), inputsData.size());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
std::vector<int64_t> outputDim = MycastTo64(outD);
std::vector<int64_t> outputStride = MycastTo64(outS);
auto outputTensor = aclCreateTensor(
outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
aclFormat::ACL_FORMAT_ND, outputDim.data(), outputDim.size(), outData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret =
aclnnCatGetWorkspaceSize(tensorList, int64_t(dim), outputTensor, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
}
assert(ret == ACL_SUCCESS);
ret = aclnnCat(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
if (workspaceAddr != nullptr)
aclrtFree(workspaceAddr); // release the aclnn workspace
aclDestroyTensorList(tensorList);
aclDestroyTensor(outputTensor);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::Concat, DataType::Float32, ConcatAclnn,
"concat_ASCEND_float");
}; // namespace infini
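This kernel currently unrolls exactly two inputs. A hedged sketch of an N-input variant, in the kernel's scope, that keeps each input's 64-bit dims/strides alive until aclnnCat has been launched (dimHolder/strideHolder are illustrative names):

// Per-input dims/strides live in holders so their data() pointers stay
// valid through the aclnnCat launch (moving a std::vector preserves its
// heap buffer, so outer reallocation is safe).
std::vector<std::vector<int64_t>> dimHolder, strideHolder;
std::vector<aclTensor *> inputsData;
for (int i = 0; i < op->numInputs(); ++i) {
    dimHolder.push_back(MycastTo64(op->getInputs(i)->getDims()));
    strideHolder.push_back(MycastTo64(op->getInputs(i)->getStride()));
    void *const data = op->getInputs(i)->getRawDataPtr<void *>();
    inputsData.push_back(aclCreateTensor(
        dimHolder[i].data(), dimHolder[i].size(), ACL_FLOAT,
        strideHolder[i].data(), 0, aclFormat::ACL_FORMAT_ND,
        dimHolder[i].data(), dimHolder[i].size(), data));
}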

View File

@ -0,0 +1,92 @@
#include "operators/conv.h"
#include "aclnnop/level2/aclnn_convolution.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class ConvAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
// NOTE: groups is hardcoded to 1 in the aclnnConvolution call below;
// grouped convolution is not handled yet.
std::vector<int64_t> pads = {ph, pw};
//std::vector<int64_t> ksize = {r, s};
std::vector<int64_t> stride = {sh, sw};
std::vector<int64_t> dilation = {dh, dw};
std::vector<int64_t> outputPadding = {sh - 1, sw - 1}; // only used by transposed conv (transposed=false below)
aclIntArray *convpads = aclCreateIntArray(pads.data(), pads.size());
aclIntArray *convstride = aclCreateIntArray(stride.data(), stride.size());
aclIntArray *convdilation = aclCreateIntArray(dilation.data(), dilation.size());
aclIntArray *convOutputpadding = aclCreateIntArray(outputPadding.data(), outputPadding.size());
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto inputD = op->getInputs(0)->getDims();
auto inputS = op->getInputs(0)->getStride();
auto weightD = op->getInputs(1)->getDims();
auto weightS = op->getInputs(1)->getStride();
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
std::vector<int64_t> inputDim = MycastTo64(inputD);
std::vector<int64_t> inputStride = MycastTo64(inputS);
std::vector<int64_t> weightDim = MycastTo64(weightD);
std::vector<int64_t> weightStride = MycastTo64(weightS);
std::vector<int64_t> outputDim = MycastTo64(outD);
std::vector<int64_t> outputStride = MycastTo64(outS);
auto inputTensor = aclCreateTensor(
inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, inputDim.data(), inputDim.size(), aData);
auto weightTensor = aclCreateTensor(
weightDim.data(), weightDim.size(), ACL_FLOAT, weightStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, weightDim.data(), weightDim.size(), bData);
auto outputTensor = aclCreateTensor(
outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnConvolutionGetWorkspaceSize(
inputTensor, weightTensor, nullptr, convstride, convpads, convdilation,
false, convOutputpadding, 1, outputTensor, 1, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
}
assert(ret == ACL_SUCCESS);
ret = aclnnConvolution(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
if (workspaceAddr != nullptr)
aclrtFree(workspaceAddr); // release the aclnn workspace
aclDestroyTensor(inputTensor);
aclDestroyTensor(weightTensor);
aclDestroyTensor(outputTensor);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::Conv, DataType::Float32, ConvAclnn,
"conv_ASCEND_float");
}; // namespace infini
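For reference, the output spatial size that the dims above encode follows standard convolution arithmetic; a small self-contained helper (convOutDim is an illustrative name):

// out = (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1
inline int convOutDim(int in, int pad, int dilation, int kernel, int stride) {
    return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}
// e.g. convOutDim(32, 1, 1, 3, 1) == 32, matching the conv test in this commit.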

View File

@ -0,0 +1,77 @@
#include "operators/matmul.h"
#include "aclnnop/level2/aclnn_matmul.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class MatmulAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MatmulObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto selfD = op->getInputs(0)->getDims();
auto selfS = op->getInputs(0)->getStride();
auto matD = op->getInputs(1)->getDims();
auto matS = op->getInputs(1)->getStride();
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
std::vector<int64_t> selfDim = MycastTo64(selfD);
std::vector<int64_t> selfStride = MycastTo64(selfS);
std::vector<int64_t> matDim = MycastTo64(matD);
std::vector<int64_t> matStride = MycastTo64(matS);
std::vector<int64_t> outputDim = MycastTo64(outD);
std::vector<int64_t> outputStride = MycastTo64(outS);
auto selfTensor = aclCreateTensor(
selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
aclFormat::ACL_FORMAT_ND, selfDim.data(), selfDim.size(), aData);
auto matTensor = aclCreateTensor(
matDim.data(), matDim.size(), ACL_FLOAT, matStride.data(), 0,
aclFormat::ACL_FORMAT_ND, matDim.data(), matDim.size(), bData);
auto outputTensor = aclCreateTensor(
outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
aclFormat::ACL_FORMAT_ND, outputDim.data(), outputDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
// The trailing 1 before the workspace arguments is the cubeMathType flag.
auto ret = aclnnMatmulGetWorkspaceSize(selfTensor, matTensor, outputTensor,
1, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
}
assert(ret == ACL_SUCCESS);
ret = aclnnMatmul(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
if (workspaceAddr != nullptr)
aclrtFree(workspaceAddr); // release the aclnn workspace
aclDestroyTensor(selfTensor);
aclDestroyTensor(matTensor);
aclDestroyTensor(outputTensor);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::MatMul, DataType::Float32, MatmulAclnn,
"matmul_ASCEND_float");
}; // namespace infini
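The shapes fed to aclnnMatmul follow the batched rule [b, m, k] x [b, k, n] -> [b, m, n]; a self-contained sanity check (matmulOutShape is an illustrative name):

#include <cassert>
#include <vector>

std::vector<int> matmulOutShape(const std::vector<int> &a,
                                const std::vector<int> &b) {
    assert(a.size() == 3 && b.size() == 3);
    assert(a[0] == b[0] && a[2] == b[1]); // batch and inner dims must agree
    return {a[0], a[1], b[2]};
}
// e.g. {1, 2, 3} x {1, 3, 4} -> {1, 2, 4}, as in the matmul test below.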

View File

@ -0,0 +1,82 @@
#include "operators/pooling.h"
#include "aclnnop/level2/aclnn_avgpool2d.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class AvgPooling : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PoolingObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto [n, c, h, w, kh, kw] = op->getNCHWRS();
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
std::vector<int64_t> ksize = {kh, kw};
std::vector<int64_t> stride = {sh, sw};
std::vector<int64_t> pad = {ph, pw};
int64_t divisorOverride = kh * kw; // divide every window by the full kernel area
auto selfD = op->getInputs(0)->getDims();
auto selfS = op->getInputs(0)->getStride();
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
std::vector<int64_t> selfDim = MycastTo64(selfD);
std::vector<int64_t> selfStride = MycastTo64(selfS);
std::vector<int64_t> outputDim = MycastTo64(outD);
std::vector<int64_t> outputStride = MycastTo64(outS);
aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
aclIntArray *paddings = aclCreateIntArray(pad.data(), pad.size());
auto selfTensor = aclCreateTensor(
selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData);
auto outputTensor = aclCreateTensor(
outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnAvgPool2dGetWorkspaceSize(
selfTensor, kernelSize, strides, paddings, false, true, divisorOverride,
1, outputTensor, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
}
assert(ret == ACL_SUCCESS);
ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
if (workspaceAddr != nullptr)
aclrtFree(workspaceAddr); // release the aclnn workspace
aclDestroyTensor(selfTensor);
aclDestroyTensor(outputTensor);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::AveragePool, DataType::Float32, AvgPooling,
"avgpooling_ASCEND_float");
}; // namespace infini
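With countIncludePad=true and divisorOverride = kh*kw, every output cell is the window sum divided by the full kernel area, padded zeros included. A naive single-cell reference (avgPoolCell is an illustrative name):

float avgPoolCell(const float *in, int h, int w, int oy, int ox,
                  int kh, int kw, int sh, int sw, int ph, int pw) {
    float sum = 0.f;
    for (int i = 0; i < kh; ++i)
        for (int j = 0; j < kw; ++j) {
            int y = oy * sh - ph + i, x = ox * sw - pw + j;
            if (y >= 0 && y < h && x >= 0 && x < w)
                sum += in[y * w + x]; // out-of-range taps read as 0 (padding)
        }
    return sum / float(kh * kw); // fixed divisor, pads included
}
// For the 5x5 incremental input of the pooling test (k=3, pad=1, stride=2),
// the top-left cell is (0 + 1 + 5 + 6) / 9.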

View File

@ -8,6 +8,14 @@
#include "aclnnop/level2/aclnn_sin.h"
#include "aclnnop/level2/aclnn_cos.h"
#include "aclnnop/level2/aclnn_acos.h"
#include "aclnnop/level2/aclnn_atan.h"
#include "aclnnop/level2/aclnn_ceil.h"
#include "aclnnop/level2/aclnn_floor.h"
#include "aclnnop/level2/aclnn_exp.h"
#include "aclnnop/level2/aclnn_neg.h"
#include "aclnnop/level2/aclnn_reciprocal.h"
#include "aclnnop/level2/aclnn_sqrt.h"
#include "aclnnop/level2/aclnn_round.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
@ -77,262 +85,6 @@ class ReluAclnn : public ASCENDKernelWithoutConfig {
}
};
class AbsAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto a = op->getInputs(0)->getDims();
std::vector<int64_t> aDim(a.size(), 1);
for (size_t i = 0; i < a.size(); ++i) {
aDim[i] = int64_t(a[i]);
}
auto aS = op->getInputs(0)->getStride();
std::vector<int64_t> aStride(aS.size(), 1);
for (size_t i = 0; i < aS.size(); ++i) {
aStride[i] = int64_t(aS[i]);
}
auto c = op->getOutput()->getDims();
std::vector<int64_t> cDim(c.size(), 1);
for (size_t i = 0; i < c.size(); ++i) {
cDim[i] = int64_t(c[i]);
}
auto cS = op->getOutput()->getStride();
std::vector<int64_t> cStride(cS.size(), 1);
for (size_t i = 0; i < cS.size(); ++i) {
cStride[i] = int64_t(cS[i]);
}
auto input = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
auto output = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret =
aclnnAbsGetWorkspaceSize(input, output, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
}
assert(ret == ACL_SUCCESS);
ret = aclnnAbs(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
aclDestroyTensor(input);
aclDestroyTensor(output);
return;
}
};
class SigmoidAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto a = op->getInputs(0)->getDims();
std::vector<int64_t> aDim(a.size(), 1);
for (size_t i = 0; i < a.size(); ++i) {
aDim[i] = int64_t(a[i]);
}
auto aS = op->getInputs(0)->getStride();
std::vector<int64_t> aStride(aS.size(), 1);
for (size_t i = 0; i < aS.size(); ++i) {
aStride[i] = int64_t(aS[i]);
}
auto c = op->getOutput()->getDims();
std::vector<int64_t> cDim(c.size(), 1);
for (size_t i = 0; i < c.size(); ++i) {
cDim[i] = int64_t(c[i]);
}
auto cS = op->getOutput()->getStride();
std::vector<int64_t> cStride(cS.size(), 1);
for (size_t i = 0; i < cS.size(); ++i) {
cStride[i] = int64_t(cS[i]);
}
auto input = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
auto output = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret =
aclnnSigmoidGetWorkspaceSize(input, output, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
}
assert(ret == ACL_SUCCESS);
ret = aclnnSigmoid(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
aclDestroyTensor(input);
aclDestroyTensor(output);
return;
}
};
class HardswishAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto a = op->getInputs(0)->getDims();
std::vector<int64_t> aDim(a.size(), 1);
for (size_t i = 0; i < a.size(); ++i) {
aDim[i] = int64_t(a[i]);
}
auto aS = op->getInputs(0)->getStride();
std::vector<int64_t> aStride(aS.size(), 1);
for (size_t i = 0; i < aS.size(); ++i) {
aStride[i] = int64_t(aS[i]);
}
auto c = op->getOutput()->getDims();
std::vector<int64_t> cDim(c.size(), 1);
for (size_t i = 0; i < c.size(); ++i) {
cDim[i] = int64_t(c[i]);
}
auto cS = op->getOutput()->getStride();
std::vector<int64_t> cStride(cS.size(), 1);
for (size_t i = 0; i < cS.size(); ++i) {
cStride[i] = int64_t(cS[i]);
}
auto input = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
auto output = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret =
aclnnHardswishGetWorkspaceSize(input, output, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
}
assert(ret == ACL_SUCCESS);
ret = aclnnHardswish(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
aclDestroyTensor(input);
aclDestroyTensor(output);
return;
}
};
#define DEFINE_UNARY_Aclnn(prefix) \
class prefix##Aclnn : public ASCENDKernelWithoutConfig { \
@ -392,12 +144,25 @@ class HardswishAclnn : public ASCENDKernelWithoutConfig {
} \
};
DEFINE_UNARY_Aclnn(Abs)
DEFINE_UNARY_Aclnn(Sigmoid)
DEFINE_UNARY_Aclnn(Hardswish)
DEFINE_UNARY_Aclnn(Gelu)
DEFINE_UNARY_Aclnn(Tanh)
DEFINE_UNARY_Aclnn(Sin)
DEFINE_UNARY_Aclnn(Cos)
//DEFINE_UNARY_Aclnn(Tan)
DEFINE_UNARY_Aclnn(Acos)
DEFINE_UNARY_Aclnn(Atan)
DEFINE_UNARY_Aclnn(Ceil)
DEFINE_UNARY_Aclnn(Floor)
DEFINE_UNARY_Aclnn(Exp)
DEFINE_UNARY_Aclnn(Neg)
DEFINE_UNARY_Aclnn(Reciprocal)
DEFINE_UNARY_Aclnn(Sqrt)
DEFINE_UNARY_Aclnn(Round)
REGISTER_KERNEL(Device::ASCEND, OpType::Relu, DataType::Float32, ReluAclnn,
"relu_ASCEND_float");
@ -415,8 +180,22 @@ REGISTER_KERNEL(Device::ASCEND, OpType::Sin, DataType::Float32, SinAclnn,
"sin_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Cos, DataType::Float32, CosAclnn,
"cos_ASCEND_float");
//REGISTER_KERNEL(Device::ASCEND, OpType::Tan, DataType::Float32, TanAclnn,
// "tan_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Acos, DataType::Float32, AcosAclnn,
"acos_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Atan, DataType::Float32, AtanAclnn,
"atan_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Neg, DataType::Float32, NegAclnn,
"neg_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Ceil, DataType::Float32, CeilAclnn,
"ceil_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Floor, DataType::Float32, FloorAclnn,
"floor_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Exp, DataType::Float32, ExpAclnn,
"exp_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Reciprocal, DataType::Float32, ReciprocalAclnn,
"reciprocal_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sqrt, DataType::Float32, SqrtAclnn,
"sqrt_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Round, DataType::Float32, RoundAclnn,
"round_ASCEND_float");
}; // namespace infini

View File

@ -0,0 +1,55 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/batch_norm.h"
#include "test.h"
namespace infini {
TEST(ascend_BatchNorm, run) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build CPU graph
Graph gCpu = make_ref<GraphObj>(cpuRuntime);
auto iCpu = gCpu->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
auto meanCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto varCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto scaleCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto biasCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
// Build input data on CPU
gCpu->dataMalloc();
iCpu->setData(IncrementalGenerator());
meanCpu->copyin(vector<float>{1, 6, 9});
varCpu->copyin(vector<float>{4, 1, 9});
scaleCpu->setData(OneGenerator());
biasCpu->setData(ZeroGenerator());
// Build NPU graph
Graph g = make_ref<GraphObj>(npuRuntime);
auto i = g->cloneTensor(iCpu);
auto mean = g->cloneTensor(meanCpu);
auto var = g->cloneTensor(varCpu);
auto scale = g->cloneTensor(scaleCpu);
auto bias = g->cloneTensor(biasCpu);
auto op =
g->addOp<BatchNormObj>(i, nullptr, mean, var, scale, bias, 0.9, 0);
// allocate NPU memory
g->dataMalloc();
// Execute on NPU
npuRuntime->run(g);
// clone NPU output to CPU
auto o = op->getOutput();
auto ocpu = o->clone(cpuRuntime);
// check results on CPU
EXPECT_TRUE(ocpu->equalData(vector<float>{
-0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.333333, 0.666667}));
}
} // namespace infini
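The expected values in this test follow y = scale * (x - mean) / sqrt(var + eps) + bias with scale=1, bias=0, and eps=0; a minimal reference (bnRef is an illustrative name):

#include <cmath>

inline float bnRef(float x, float mean, float var, float eps = 0.f) {
    return (x - mean) / std::sqrt(var + eps);
}
// Channel 0 (mean=1, var=4): inputs {0, 1, 2, 3} give {-0.5, 0, 0.5, 1},
// the first four expected outputs above.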

View File

@ -0,0 +1,52 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/concat.h"
#include "test.h"
namespace infini {
template <class T>
void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generator);
Tensor inputCpu2 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generator);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
auto npuOp =
npuGraph->addOp<T>(TensorVec{inputNpu1, inputNpu2}, nullptr, 2);
npuGraph->dataMalloc();
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check (printout only for now)
inputCpu1->print();
inputCpu1->printData();
inputCpu2->print();
inputCpu2->printData();
outputNpu2Cpu->print();
outputNpu2Cpu->printData();
EXPECT_TRUE(1); // TODO: compare against a reference result
}
TEST(ascend_Concat, run) {
testConcat<ConcatObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini

View File

@ -0,0 +1,57 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "ascend/ascend_runtime.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
template <class T>
void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
const std::function<void(void *, size_t, DataType)> &generatorB,
const Shape &shapeA, const Shape &shapeB) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
auto npuOp =
npuGraph->addOp<T>(inputNpu1, inputNpu2, nullptr, 1, 1, 1, 1, 1, 1);
npuGraph->dataMalloc();
inputNpu1->setData(generatorA);
inputNpu2->setData(generatorB);
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
auto cpuOp =
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
cpuGraph->dataMalloc();
inputCpu1->setData(generatorA);
inputCpu2->setData(generatorB);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
}
TEST(ascend_Conv, run) {
testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
}
} // namespace infini

View File

@ -0,0 +1,58 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/matmul.h"
#include "test.h"
namespace infini {
template <class T>
void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
const std::function<void(void *, size_t, DataType)> &generatorB,
bool transA, bool transB, const Shape &shapeA,
const Shape &shapeB) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
auto npuOp = npuGraph->addOp<T>(inputNpu1, inputNpu2, nullptr, transA, transB);
npuGraph->dataMalloc();
inputNpu1->setData(generatorA);
inputNpu2->setData(generatorB);
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, transA, transB);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
cpuGraph->dataMalloc();
inputCpu1->setData(generatorA);
inputCpu2->setData(generatorB);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
outputCpu->print();
outputNpu2Cpu->print();
// Check
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
}
TEST(ascend_Matmul, run) {
testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
false, Shape{1, 2, 3}, Shape{1, 3, 4});
}
} // namespace infini

View File

@ -0,0 +1,43 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/pooling.h"
#include "test.h"
namespace infini {
template <class T, typename std::enable_if<std::is_base_of<PoolingObj, T>{},
int>::type = 0>
void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto npuOp =
npuGraph->addOp<T>(inputNpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
npuGraph->dataMalloc();
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
inputCpu->printData();
outputNpu2Cpu->printData();
EXPECT_TRUE(1); // TODO: compare against a reference result
}
TEST(ascend_Pooling, run) {
//testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
}
} // namespace infini

View File

@ -40,15 +40,23 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
}
TEST(ascend_Unary, run) {
testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<AbsObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<HardSwishObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<SinObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<GeluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<CosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<ACosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<ATanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
//testUnary<CeilObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
//testUnary<FloorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
//testUnary<ExpObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<NegObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
//testUnary<ReciprocalObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<SqrtObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
//testUnary<RoundObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini