forked from jiuyuan/InfiniTensor
add kernels
This commit is contained in:
parent a9bd73528d
commit 39484e0cc4
@@ -4,6 +4,7 @@

namespace infini {

class ASCENDKernelWithoutConfig : public Kernel {
  public:
    virtual void compute(const Operator &op, const PerfRecord &record,
@@ -19,6 +20,14 @@ class ASCENDKernelWithoutConfig : public Kernel {
        return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
                                              [&]() { context->sync(); }));
    }

    // Transform vector<int> to vector<int64_t>; the aclnn APIs take
    // int64_t shape and stride arrays.
    std::vector<int64_t> MycastTo64(const std::vector<int> &v32) const {
        std::vector<int64_t> v64(v32.size(), 1);
        for (size_t i = 0; i < v32.size(); ++i) {
            v64[i] = int64_t(v32[i]);
        }
        return v64;
    }
};

} // namespace infini
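Every kernel in this commit repeats the same GetWorkspaceSize, aclrtMalloc, launch, aclrtSynchronizeStream sequence, and none of them frees the workspace afterwards. A minimal sketch of how that boilerplate could be factored out; mallocWorkspace/freeWorkspace are hypothetical helpers, not part of this commit (they assume the ACL runtime headers are already included):

    // Hypothetical helper, not in this commit: allocate the workspace that an
    // aclnnXxxGetWorkspaceSize call reported.
    inline void *mallocWorkspace(uint64_t workspaceSize) {
        void *addr = nullptr;
        if (workspaceSize > 0) {
            auto ret = aclrtMalloc(&addr, workspaceSize,
                                   ACL_MEM_MALLOC_HUGE_FIRST);
            assert(ret == ACL_SUCCESS);
        }
        return addr;
    }

    // Hypothetical helper: release the workspace after the launch has been
    // synchronized. The kernels in this commit never free their workspace.
    inline void freeWorkspace(void *addr) {
        if (addr != nullptr) {
            auto ret = aclrtFree(addr);
            assert(ret == ACL_SUCCESS);
        }
    }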
@@ -0,0 +1,109 @@
#include "operators/batch_norm.h"
#include "aclnnop/level2/aclnn_batch_norm.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class BatchNormAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<BatchNormObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outData = (op->getOutput()->getRawDataPtr<void *>());
        void *const meanData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const varData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const scaleData = (op->getInputs(3)->getRawDataPtr<void *>());
        void *const biasData = (op->getInputs(4)->getRawDataPtr<void *>());

        auto inD = op->getInputs(0)->getDims();
        auto inS = op->getInputs(0)->getStride();
        auto paraD = op->getInputs(1)->getDims();
        auto paraS = op->getInputs(1)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        std::vector<int64_t> inputDim = MycastTo64(inD);
        std::vector<int64_t> inputStride = MycastTo64(inS);
        std::vector<int64_t> paraDim = MycastTo64(paraD);
        std::vector<int64_t> paraStride = MycastTo64(paraS);
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        auto inputTensor = aclCreateTensor(
            inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(),
            0, aclFormat::ACL_FORMAT_NCHW, inputDim.data(), inputDim.size(),
            inData);
        auto outputTensor = aclCreateTensor(
            outputDim.data(), outputDim.size(), ACL_FLOAT,
            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
            outputDim.data(), outputDim.size(), outData);
        auto meanTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(),
            meanData);
        auto varTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(),
            varData);
        auto scaleTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(),
            scaleData);
        auto biasTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(),
            biasData);
        // NOTE: saveMean/saveInvstd alias the scale/bias buffers. With
        // training=false they should not be written, but they need their own
        // buffers if training mode is ever enabled.
        auto savemeanTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(),
            scaleData);
        auto saveinvstdTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(),
            biasData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnBatchNormGetWorkspaceSize(
            inputTensor, scaleTensor, biasTensor, meanTensor, varTensor,
            false, op->getMomentum(), op->getEps(), outputTensor,
            savemeanTensor, saveinvstdTensor, &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                              ACL_MEM_MALLOC_HUGE_FIRST);
            assert(ret == ACL_SUCCESS);
        }
        ret = aclnnBatchNorm(workspaceAddr, workspaceSize, executor,
                             context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        aclDestroyTensor(inputTensor);
        aclDestroyTensor(outputTensor);
        aclDestroyTensor(meanTensor);
        aclDestroyTensor(varTensor);
        aclDestroyTensor(scaleTensor);
        aclDestroyTensor(biasTensor);
        aclDestroyTensor(savemeanTensor);
        aclDestroyTensor(saveinvstdTensor);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::BatchNormalization, DataType::Float32,
                BatchNormAclnn, "batchnorm_ASCEND_float");
}; // namespace infini
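For reference, inference-mode batch norm computes y = scale * (x - mean) / sqrt(var + eps) + bias per channel. A scalar sketch of what the kernel above asks the NPU to do (reference only, not part of this commit):

    // Reference-only sketch: inference batch norm over a contiguous NCHW
    // buffer, y = scale*(x-mean)/sqrt(var+eps)+bias applied per channel.
    #include <cmath>
    #include <cstddef>
    void batchNormRef(const float *x, float *y, const float *mean,
                      const float *var, const float *scale, const float *bias,
                      size_t n, size_t c, size_t hw, float eps) {
        for (size_t i = 0; i < n; ++i)
            for (size_t j = 0; j < c; ++j) {
                const float inv = 1.0f / std::sqrt(var[j] + eps);
                for (size_t k = 0; k < hw; ++k) {
                    const size_t idx = (i * c + j) * hw + k;
                    y[idx] = scale[j] * (x[idx] - mean[j]) * inv + bias[j];
                }
            }
    }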
@@ -0,0 +1,100 @@
#include "operators/concat.h"
#include "aclnnop/level2/aclnn_cat.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class ConcatAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConcatObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
        int dim = op->getDim();

        // FIXME: only the two-input case is handled here; see the sketch
        // after this file for the general op->numInputs() loop.
        std::vector<aclTensor *> inputsData{};

        auto inD0 = op->getInputs(0)->getDims();
        auto inS0 = op->getInputs(0)->getStride();
        std::vector<int64_t> inputDim0 = MycastTo64(inD0);
        std::vector<int64_t> inputStride0 = MycastTo64(inS0);

        void *const inData0 = (op->getInputs(0)->getRawDataPtr<void *>());
        auto tmpTensor0 = aclCreateTensor(
            inputDim0.data(), inputDim0.size(), ACL_FLOAT,
            inputStride0.data(), 0, aclFormat::ACL_FORMAT_ND,
            inputDim0.data(), inputDim0.size(), inData0);
        inputsData.push_back(tmpTensor0);

        auto inD = op->getInputs(1)->getDims();
        auto inS = op->getInputs(1)->getStride();
        std::vector<int64_t> inputDim = MycastTo64(inD);
        std::vector<int64_t> inputStride = MycastTo64(inS);

        void *const inData = (op->getInputs(1)->getRawDataPtr<void *>());
        auto tmpTensor = aclCreateTensor(
            inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(),
            0, aclFormat::ACL_FORMAT_ND, inputDim.data(), inputDim.size(),
            inData);
        inputsData.push_back(tmpTensor);

        aclTensorList *tensorList =
            aclCreateTensorList(inputsData.data(), inputsData.size());

        void *const outData = (op->getOutput()->getRawDataPtr<void *>());
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        auto outputTensor = aclCreateTensor(
            outputDim.data(), outputDim.size(), ACL_FLOAT,
            outputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
            outputDim.data(), outputDim.size(), outData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnCatGetWorkspaceSize(
            tensorList, int64_t(dim), outputTensor, &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                              ACL_MEM_MALLOC_HUGE_FIRST);
            assert(ret == ACL_SUCCESS);
        }
        ret = aclnnCat(workspaceAddr, workspaceSize, executor,
                       context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        aclDestroyTensorList(tensorList);
        aclDestroyTensor(outputTensor);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::Concat, DataType::Float32,
                ConcatAclnn, "concat_ASCEND_float");
}; // namespace infini
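The commented-out loop this commit carried can be made live. A sketch of the general N-input tensor-list construction, under the assumption (consistent with the two-input code above) that aclCreateTensor copies the dim/stride arrays it is given:

    // Sketch (not in this commit): build the aclTensorList over all inputs
    // instead of hard-coding two. Assumes aclCreateTensor copies dims/strides.
    std::vector<aclTensor *> inputsData;
    for (int i = 0; i < op->numInputs(); ++i) {
        auto inD = op->getInputs(i)->getDims();
        auto inS = op->getInputs(i)->getStride();
        std::vector<int64_t> inputDim = MycastTo64(inD);
        std::vector<int64_t> inputStride = MycastTo64(inS);
        void *const inData = (op->getInputs(i)->getRawDataPtr<void *>());
        inputsData.push_back(aclCreateTensor(
            inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(),
            0, aclFormat::ACL_FORMAT_ND, inputDim.data(), inputDim.size(),
            inData));
    }
    aclTensorList *tensorList =
        aclCreateTensorList(inputsData.data(), inputsData.size());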
@@ -0,0 +1,92 @@
#include "operators/conv.h"
#include "aclnnop/level2/aclnn_convolution.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class ConvAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();

        std::vector<int64_t> pads = {ph, pw};
        std::vector<int64_t> stride = {sh, sw};
        std::vector<int64_t> dilation = {dh, dw};
        // Output padding only matters for transposed convolution; this
        // kernel passes transposed=false below.
        std::vector<int64_t> outputPadding = {sh - 1, sw - 1};

        aclIntArray *convpads = aclCreateIntArray(pads.data(), pads.size());
        aclIntArray *convstride =
            aclCreateIntArray(stride.data(), stride.size());
        aclIntArray *convdilation =
            aclCreateIntArray(dilation.data(), dilation.size());
        aclIntArray *convOutputpadding =
            aclCreateIntArray(outputPadding.data(), outputPadding.size());

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto inputD = op->getInputs(0)->getDims();
        auto inputS = op->getInputs(0)->getStride();
        auto weightD = op->getInputs(1)->getDims();
        auto weightS = op->getInputs(1)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        std::vector<int64_t> inputDim = MycastTo64(inputD);
        std::vector<int64_t> inputStride = MycastTo64(inputS);
        std::vector<int64_t> weightDim = MycastTo64(weightD);
        std::vector<int64_t> weightStride = MycastTo64(weightS);
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        auto inputTensor = aclCreateTensor(
            inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(),
            0, aclFormat::ACL_FORMAT_NCHW, inputDim.data(), inputDim.size(),
            aData);
        auto weightTensor = aclCreateTensor(
            weightDim.data(), weightDim.size(), ACL_FLOAT,
            weightStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
            weightDim.data(), weightDim.size(), bData);
        auto outputTensor = aclCreateTensor(
            outputDim.data(), outputDim.size(), ACL_FLOAT,
            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
            outputDim.data(), outputDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        // FIXME: groups is hard-coded to 1; a grouped convolution would pass
        // g = c / op->getChannelPerGroup() instead.
        auto ret = aclnnConvolutionGetWorkspaceSize(
            inputTensor, weightTensor, nullptr, convstride, convpads,
            convdilation, false, convOutputpadding, 1, outputTensor, 1,
            &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                              ACL_MEM_MALLOC_HUGE_FIRST);
            assert(ret == ACL_SUCCESS);
        }
        ret = aclnnConvolution(workspaceAddr, workspaceSize, executor,
                               context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        aclDestroyTensor(inputTensor);
        aclDestroyTensor(weightTensor);
        aclDestroyTensor(outputTensor);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::Conv, DataType::Float32, ConvAclnn,
                "conv_ASCEND_float");
}; // namespace infini
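For sanity-checking shapes, the standard convolution output size is Hout = floor((H + 2*ph - dh*(kh - 1) - 1) / sh) + 1. A tiny sketch (hypothetical helper, not part of this commit):

    // Hypothetical helper: expected spatial output size of the convolution
    // above. For the test further down (H=32, k=3, pad=1, stride=1,
    // dilation=1) this gives (32 + 2 - 2 - 1) / 1 + 1 = 32.
    inline int convOutSize(int h, int k, int pad, int stride, int dilation) {
        return (h + 2 * pad - dilation * (k - 1) - 1) / stride + 1;
    }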
@@ -0,0 +1,77 @@
#include "operators/matmul.h"
#include "aclnnop/level2/aclnn_matmul.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class MatmulAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<MatmulObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto selfD = op->getInputs(0)->getDims();
        auto selfS = op->getInputs(0)->getStride();
        auto matD = op->getInputs(1)->getDims();
        auto matS = op->getInputs(1)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        std::vector<int64_t> selfDim = MycastTo64(selfD);
        std::vector<int64_t> selfStride = MycastTo64(selfS);
        std::vector<int64_t> matDim = MycastTo64(matD);
        std::vector<int64_t> matStride = MycastTo64(matS);
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        auto selfTensor = aclCreateTensor(
            selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, selfDim.data(), selfDim.size(), aData);
        auto matTensor = aclCreateTensor(
            matDim.data(), matDim.size(), ACL_FLOAT, matStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, matDim.data(), matDim.size(), bData);
        auto outputTensor = aclCreateTensor(
            outputDim.data(), outputDim.size(), ACL_FLOAT,
            outputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
            outputDim.data(), outputDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        // NOTE: MatmulObj's transA/transB flags are not handled here.
        auto ret = aclnnMatmulGetWorkspaceSize(
            selfTensor, matTensor, outputTensor, 1, &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                              ACL_MEM_MALLOC_HUGE_FIRST);
            assert(ret == ACL_SUCCESS);
        }
        ret = aclnnMatmul(workspaceAddr, workspaceSize, executor,
                          context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        aclDestroyTensor(selfTensor);
        aclDestroyTensor(matTensor);
        aclDestroyTensor(outputTensor);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::MatMul, DataType::Float32,
                MatmulAclnn, "matmul_ASCEND_float");
}; // namespace infini
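As a reference point for the matmul test further down, a naive batched matmul in plain C++ (sketch only, assuming contiguous row-major buffers):

    // Reference-only sketch: C[b] = A[b] * B[b] for row-major contiguous
    // buffers, shapes A = (batch, m, k), B = (batch, k, n), C = (batch, m, n).
    void matmulRef(const float *A, const float *B, float *C, int batch,
                   int m, int k, int n) {
        for (int b = 0; b < batch; ++b)
            for (int i = 0; i < m; ++i)
                for (int j = 0; j < n; ++j) {
                    float acc = 0.0f;
                    for (int p = 0; p < k; ++p)
                        acc += A[(b * m + i) * k + p] * B[(b * k + p) * n + j];
                    C[(b * m + i) * n + j] = acc;
                }
    }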
@@ -0,0 +1,82 @@
#include "operators/pooling.h"
#include "aclnnop/level2/aclnn_avgpool2d.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class AvgPooling : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PoolingObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto [n, c, h, w, kh, kw] = op->getNCHWRS();
        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();

        std::vector<int64_t> ksize = {kh, kw};
        std::vector<int64_t> stride = {sh, sw};
        std::vector<int64_t> pad = {ph, pw};

        // Divide every window by the full kernel area kh*kw, i.e. padded
        // positions count toward the average.
        int64_t divisorOverride = kh * kw;

        auto selfD = op->getInputs(0)->getDims();
        auto selfS = op->getInputs(0)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        std::vector<int64_t> selfDim = MycastTo64(selfD);
        std::vector<int64_t> selfStride = MycastTo64(selfS);
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
        aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
        aclIntArray *paddings = aclCreateIntArray(pad.data(), pad.size());

        auto selfTensor = aclCreateTensor(
            selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
            aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(),
            aData);
        auto outputTensor = aclCreateTensor(
            outputDim.data(), outputDim.size(), ACL_FLOAT,
            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
            outputDim.data(), outputDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnAvgPool2dGetWorkspaceSize(
            selfTensor, kernelSize, strides, paddings, false, true,
            divisorOverride, 1, outputTensor, &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                              ACL_MEM_MALLOC_HUGE_FIRST);
            assert(ret == ACL_SUCCESS);
        }
        ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor,
                             context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        aclDestroyTensor(selfTensor);
        aclDestroyTensor(outputTensor);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::AveragePool, DataType::Float32,
                AvgPooling, "avgpooling_ASCEND_float");
}; // namespace infini
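With countIncludePad=true and divisorOverride=kh*kw, every window is divided by the full kernel area even where it overlaps the padding. A scalar sketch of that semantics (reference only, not part of this commit):

    // Reference-only sketch: one output tap of 2D average pooling over a
    // single channel, where out-of-bounds taps contribute zero but still
    // count in the divisor (divisorOverride = kh*kw).
    float avgPoolAt(const float *x, int H, int W, int oy, int ox, int kh,
                    int kw, int ph, int pw, int sh, int sw) {
        float sum = 0.0f;
        for (int i = 0; i < kh; ++i)
            for (int j = 0; j < kw; ++j) {
                const int y = oy * sh - ph + i;
                const int xc = ox * sw - pw + j;
                if (y >= 0 && y < H && xc >= 0 && xc < W)
                    sum += x[y * W + xc];
            }
        return sum / float(kh * kw);
    }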
@@ -8,6 +8,14 @@
#include "aclnnop/level2/aclnn_sin.h"
#include "aclnnop/level2/aclnn_cos.h"
#include "aclnnop/level2/aclnn_acos.h"
#include "aclnnop/level2/aclnn_atan.h"
#include "aclnnop/level2/aclnn_ceil.h"
#include "aclnnop/level2/aclnn_floor.h"
#include "aclnnop/level2/aclnn_exp.h"
#include "aclnnop/level2/aclnn_neg.h"
#include "aclnnop/level2/aclnn_reciprocal.h"
#include "aclnnop/level2/aclnn_sqrt.h"
#include "aclnnop/level2/aclnn_round.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
@@ -77,262 +85,6 @@ class ReluAclnn : public ASCENDKernelWithoutConfig {
    }
};

class AbsAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto a = op->getInputs(0)->getDims();
        std::vector<int64_t> aDim(a.size(), 1);
        for (size_t i = 0; i < a.size(); ++i) {
            aDim[i] = int64_t(a[i]);
        }
        auto aS = op->getInputs(0)->getStride();
        std::vector<int64_t> aStride(aS.size(), 1);
        for (size_t i = 0; i < aS.size(); ++i) {
            aStride[i] = int64_t(aS[i]);
        }
        auto c = op->getInputs(0)->getDims();
        std::vector<int64_t> cDim(c.size(), 1);
        for (size_t i = 0; i < c.size(); ++i) {
            cDim[i] = int64_t(c[i]);
        }
        auto cS = op->getInputs(0)->getStride();
        std::vector<int64_t> cStride(cS.size(), 1);
        for (size_t i = 0; i < cS.size(); ++i) {
            cStride[i] = int64_t(cS[i]);
        }

        auto input = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
        auto output = aclCreateTensor(
            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret =
            aclnnAbsGetWorkspaceSize(input, output, &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                              ACL_MEM_MALLOC_HUGE_FIRST);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnAbs(workspaceAddr, workspaceSize, executor,
                       context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        //ret = aclDestroyTensor(input);
        //assert(ret == ACL_SUCCESS);
        //ret = aclDestroyTensor(output);
        //assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

class SigmoidAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto a = op->getInputs(0)->getDims();
        std::vector<int64_t> aDim(a.size(), 1);
        for (size_t i = 0; i < a.size(); ++i) {
            aDim[i] = int64_t(a[i]);
        }
        auto aS = op->getInputs(0)->getStride();
        std::vector<int64_t> aStride(aS.size(), 1);
        for (size_t i = 0; i < aS.size(); ++i) {
            aStride[i] = int64_t(aS[i]);
        }
        auto c = op->getInputs(0)->getDims();
        std::vector<int64_t> cDim(c.size(), 1);
        for (size_t i = 0; i < c.size(); ++i) {
            cDim[i] = int64_t(c[i]);
        }
        auto cS = op->getInputs(0)->getStride();
        std::vector<int64_t> cStride(cS.size(), 1);
        for (size_t i = 0; i < cS.size(); ++i) {
            cStride[i] = int64_t(cS[i]);
        }

        auto input = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
        auto output = aclCreateTensor(
            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnSigmoidGetWorkspaceSize(input, output, &workspaceSize,
                                                &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                              ACL_MEM_MALLOC_HUGE_FIRST);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnSigmoid(workspaceAddr, workspaceSize, executor,
                           context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        //ret = aclDestroyTensor(input);
        //assert(ret == ACL_SUCCESS);
        //ret = aclDestroyTensor(output);
        //assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

class HardswishAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto a = op->getInputs(0)->getDims();
        std::vector<int64_t> aDim(a.size(), 1);
        for (size_t i = 0; i < a.size(); ++i) {
            aDim[i] = int64_t(a[i]);
        }
        auto aS = op->getInputs(0)->getStride();
        std::vector<int64_t> aStride(aS.size(), 1);
        for (size_t i = 0; i < aS.size(); ++i) {
            aStride[i] = int64_t(aS[i]);
        }
        auto c = op->getInputs(0)->getDims();
        std::vector<int64_t> cDim(c.size(), 1);
        for (size_t i = 0; i < c.size(); ++i) {
            cDim[i] = int64_t(c[i]);
        }
        auto cS = op->getInputs(0)->getStride();
        std::vector<int64_t> cStride(cS.size(), 1);
        for (size_t i = 0; i < cS.size(); ++i) {
            cStride[i] = int64_t(cS[i]);
        }

        auto input = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
        auto output = aclCreateTensor(
            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnHardswishGetWorkspaceSize(input, output,
                                                  &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
                              ACL_MEM_MALLOC_HUGE_FIRST);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnHardswish(workspaceAddr, workspaceSize, executor,
                             context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        //ret = aclDestroyTensor(input);
        //assert(ret == ACL_SUCCESS);
        //ret = aclDestroyTensor(output);
        //assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

//class TanhAclnn : public ASCENDKernelWithoutConfig {
//    void compute(const Operator &_op,
//                 const RuntimeObj *_context) const override {
//        auto op = as<UnaryObj>(_op);
//        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
//
//        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
//        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
//
//        auto a = op->getInputs(0)->getDims();
//        std::vector<int64_t> aDim(a.size(), 1);
//        for (size_t i = 0; i < a.size(); ++i) {
//            aDim[i] = int64_t(a[i]);
//        }
//        auto aS = op->getInputs(0)->getStride();
//        std::vector<int64_t> aStride(aS.size(), 1);
//        for (size_t i = 0; i < aS.size(); ++i) {
//            aStride[i] = int64_t(aS[i]);
//        }
//        auto c = op->getInputs(0)->getDims();
//        std::vector<int64_t> cDim(c.size(), 1);
//        for (size_t i = 0; i < c.size(); ++i) {
//            cDim[i] = int64_t(c[i]);
//        }
//        auto cS = op->getInputs(0)->getStride();
//        std::vector<int64_t> cStride(cS.size(), 1);
//        for (size_t i = 0; i < cS.size(); ++i) {
//            cStride[i] = int64_t(cS[i]);
//        }
//
//        auto input = aclCreateTensor(
//            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
//            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
//        auto output = aclCreateTensor(
//            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
//            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
//
//        uint64_t workspaceSize = 0;
//        aclOpExecutor *executor;
//
//        auto ret = aclnnTanhGetWorkspaceSize(input, output, &workspaceSize,
//                                             &executor);
//        void *workspaceAddr = nullptr;
//        if (workspaceSize > 0) {
//            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
//                              ACL_MEM_MALLOC_HUGE_FIRST);
//        }
//        assert(ret == ACL_SUCCESS);
//        ret = aclnnTanh(workspaceAddr, workspaceSize, executor,
//                        context->ASCENDHandle());
//        assert(ret == ACL_SUCCESS);
//
//        //ret = aclDestroyTensor(input);
//        //assert(ret == ACL_SUCCESS);
//        //ret = aclDestroyTensor(output);
//        //assert(ret == ACL_SUCCESS);
//
//        ret = aclrtSynchronizeStream(context->ASCENDHandle());
//        assert(ret == ACL_SUCCESS);
//
//        return;
//    }
//};

#define DEFINE_UNARY_Aclnn(prefix)                                             \
    class prefix##Aclnn : public ASCENDKernelWithoutConfig {                   \
@@ -392,12 +144,25 @@ class HardswishAclnn : public ASCENDKernelWithoutConfig {
        }                                                                      \
    };

DEFINE_UNARY_Aclnn(Abs)
DEFINE_UNARY_Aclnn(Sigmoid)
DEFINE_UNARY_Aclnn(Hardswish)
DEFINE_UNARY_Aclnn(Gelu)

DEFINE_UNARY_Aclnn(Tanh)
DEFINE_UNARY_Aclnn(Sin)
DEFINE_UNARY_Aclnn(Cos)
//DEFINE_UNARY_Aclnn(ACos)
//DEFINE_UNARY_Aclnn(Tan)
DEFINE_UNARY_Aclnn(Acos)
DEFINE_UNARY_Aclnn(Atan)

DEFINE_UNARY_Aclnn(Ceil)
DEFINE_UNARY_Aclnn(Floor)
DEFINE_UNARY_Aclnn(Exp)
DEFINE_UNARY_Aclnn(Neg)
DEFINE_UNARY_Aclnn(Reciprocal)
DEFINE_UNARY_Aclnn(Sqrt)
DEFINE_UNARY_Aclnn(Round)

REGISTER_KERNEL(Device::ASCEND, OpType::Relu, DataType::Float32, ReluAclnn,
                "relu_ASCEND_float");
@@ -415,8 +180,22 @@ REGISTER_KERNEL(Device::ASCEND, OpType::Sin, DataType::Float32, SinAclnn,
                "sin_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Cos, DataType::Float32, CosAclnn,
                "cos_ASCEND_float");
//REGISTER_KERNEL(Device::ASCEND, OpType::ACos, DataType::Float32, ACosAclnn,
//                "acos_ASCEND_float");
//REGISTER_KERNEL(Device::ASCEND, OpType::Tan, DataType::Float32, TanAclnn,
//                "tan_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Acos, DataType::Float32, AcosAclnn,
                "acos_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Atan, DataType::Float32, AtanAclnn,
                "atan_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Neg, DataType::Float32, NegAclnn,
                "neg_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Ceil, DataType::Float32, CeilAclnn,
                "ceil_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Floor, DataType::Float32, FloorAclnn,
                "floor_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Exp, DataType::Float32, ExpAclnn,
                "exp_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Reciprocal, DataType::Float32,
                ReciprocalAclnn, "reciprocal_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sqrt, DataType::Float32, SqrtAclnn,
                "sqrt_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Round, DataType::Float32, RoundAclnn,
                "round_ASCEND_float");
}; // namespace infini
@@ -0,0 +1,55 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/batch_norm.h"

#include "test.h"

namespace infini {

TEST(ascend_BatchNorm, run) {
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build cpu graph
    Graph gCpu = make_ref<GraphObj>(cpuRuntime);
    auto iCpu = gCpu->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
    auto meanCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
    auto varCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
    auto scaleCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
    auto biasCpu = gCpu->addTensor(Shape{3}, DataType::Float32);

    // Build input data on CPU
    gCpu->dataMalloc();
    iCpu->setData(IncrementalGenerator());
    meanCpu->copyin(vector<float>{1, 6, 9});
    varCpu->copyin(vector<float>{4, 1, 9});
    scaleCpu->setData(OneGenerator());
    biasCpu->setData(ZeroGenerator());

    // Build NPU graph
    Graph g = make_ref<GraphObj>(npuRuntime);
    auto i = g->cloneTensor(iCpu);
    auto mean = g->cloneTensor(meanCpu);
    auto var = g->cloneTensor(varCpu);
    auto scale = g->cloneTensor(scaleCpu);
    auto bias = g->cloneTensor(biasCpu);
    auto op =
        g->addOp<BatchNormObj>(i, nullptr, mean, var, scale, bias, 0.9, 0);

    // allocate NPU memory
    g->dataMalloc();

    // Execute on NPU
    npuRuntime->run(g);

    // clone NPU output to CPU
    auto o = op->getOutput();
    auto ocpu = o->clone(cpuRuntime);

    // check results on CPU
    EXPECT_TRUE(ocpu->equalData(vector<float>{
        -0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.333333, 0.666667}));
}
} // namespace infini
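The expected vector comes straight from y = (x - mean)/sqrt(var) with scale = 1, bias = 0, eps = 0: channel 0 maps x = 0..3 through (x - 1)/2, giving -0.5, 0, 0.5, 1; channel 1 maps x = 4..7 through (x - 6)/1, giving -2, -1, 0, 1; and channel 2 maps x = 8..11 through (x - 9)/3, giving -0.333333, 0, 0.333333, 0.666667.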
@@ -0,0 +1,52 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/concat.h"

#include "test.h"

namespace infini {

template <class T>
void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
                const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
    auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
    auto npuOp =
        npuGraph->addOp<T>(TensorVec{inputNpu1, inputNpu2}, nullptr, 2);
    npuGraph->dataMalloc();
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
    // Check (smoke test only: prints the tensors, asserts nothing about the
    // values)
    inputCpu1->print();
    inputCpu1->printData();
    inputCpu2->print();
    inputCpu2->printData();
    outputNpu2Cpu->print();
    outputNpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(ascend_Concat, run) {
    testConcat<ConcatObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
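A sketch of the real value check this smoke test could grow into, mirroring the conv and matmul tests below; it assumes the native CPU runtime has a Concat kernel registered:

    // Sketch only, not in this commit: run the same concat on the CPU
    // runtime and compare it against the NPU result.
    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
    cpuGraph->addTensor(inputCpu1);
    cpuGraph->addTensor(inputCpu2);
    auto cpuOp =
        cpuGraph->addOp<ConcatObj>(TensorVec{inputCpu1, inputCpu2}, nullptr, 2);
    cpuGraph->dataMalloc();
    inputCpu1->setData(generator);
    inputCpu2->setData(generator);
    cpuRuntime->run(cpuGraph);
    EXPECT_TRUE(cpuOp->getOutput()->equalData(outputNpu2Cpu));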
@@ -0,0 +1,57 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "ascend/ascend_runtime.h"
#include "operators/conv.h"

#include "test.h"

namespace infini {

template <class T>
void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
              const std::function<void(void *, size_t, DataType)> &generatorB,
              const Shape &shapeA, const Shape &shapeB) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
    auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
    auto npuOp =
        npuGraph->addOp<T>(inputNpu1, inputNpu2, nullptr, 1, 1, 1, 1, 1, 1);
    npuGraph->dataMalloc();
    inputNpu1->setData(generatorA);
    inputNpu2->setData(generatorB);
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
    // CPU
    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
    cpuGraph->addTensor(inputCpu1);
    cpuGraph->addTensor(inputCpu2);
    auto cpuOp =
        cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
    cpuGraph->dataMalloc();
    inputCpu1->setData(generatorA);
    inputCpu2->setData(generatorB);
    cpuRuntime->run(cpuGraph);
    auto outputCpu = cpuOp->getOutput();
    // Check
    EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
}

TEST(ascend_Conv, run) {
    testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
                      Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
}

} // namespace infini
@@ -0,0 +1,58 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/matmul.h"

#include "test.h"

namespace infini {

template <class T>
void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
                const std::function<void(void *, size_t, DataType)> &generatorB,
                bool transA, bool transB, const Shape &shapeA,
                const Shape &shapeB) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
    auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
    // NOTE: transA/transB are accepted by this helper but not forwarded to
    // the op; only the untransposed case is exercised.
    auto npuOp = npuGraph->addOp<T>(inputNpu1, inputNpu2, nullptr);
    npuGraph->dataMalloc();
    inputNpu1->setData(generatorA);
    inputNpu2->setData(generatorB);
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
    // CPU
    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
    auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
    cpuGraph->addTensor(inputCpu1);
    cpuGraph->addTensor(inputCpu2);
    cpuGraph->dataMalloc();
    inputCpu1->setData(generatorA);
    inputCpu2->setData(generatorB);
    cpuRuntime->run(cpuGraph);
    auto outputCpu = cpuOp->getOutput();
    outputCpu->print();
    outputNpu2Cpu->print();
    // Check
    EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
}

TEST(ascend_Matmul, run) {
    testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
                          false, Shape{1, 2, 3}, Shape{1, 3, 4});
}

} // namespace infini
@@ -0,0 +1,43 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/pooling.h"

#include "test.h"

namespace infini {

template <class T, typename std::enable_if<std::is_base_of<PoolingObj, T>{},
                                           int>::type = 0>
void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu = npuGraph->cloneTensor(inputCpu);
    auto npuOp =
        npuGraph->addOp<T>(inputNpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
    npuGraph->dataMalloc();
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
    // Smoke test only: print the data, assert nothing about the values.
    inputCpu->printData();
    outputNpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(ascend_Pooling, run) {
    //testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
    testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
}

} // namespace infini
@@ -40,15 +40,23 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
}

TEST(ascend_Unary, run) {
    //testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<AbsObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<HardSwishObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<SinObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<GeluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<AbsObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<HardSwishObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<SinObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<GeluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<CosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<ACosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<ACosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<ATanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<CeilObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<FloorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<ExpObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<NegObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<ReciprocalObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<SqrtObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    //testUnary<RoundObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini