Support for LLaMA

OdinaryWord 2024-02-29 14:29:28 +08:00
parent 9db6703b58
commit 36e0840f2f
24 changed files with 884 additions and 177 deletions

View File

@@ -20,7 +20,7 @@ class ASCENDKernelWithoutConfig : public Kernel {
[&]() { context->sync(); }));
}
// transform vector<int> to vector<int64_t>
- std::vector<int64_t> MycastTo64(std::vector<int> const &v32) const {
+ std::vector<int64_t> castTo64(std::vector<int> const &v32) const {
std::vector<int64_t> v64(v32.size(), 1);
for (size_t i = 0; i < v32.size(); ++i) {
v64[i] = int64_t(v32[i]);
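The hunk cuts off before the end of the helper; for reference, the renamed function presumably reads as follows in full (a sketch — only the closing return and braces are reconstructed):

    std::vector<int64_t> castTo64(std::vector<int> const &v32) const {
        std::vector<int64_t> v64(v32.size(), 1);
        for (size_t i = 0; i < v32.size(); ++i) {
            v64[i] = int64_t(v32[i]);
        }
        return v64;
    }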

View File

@@ -33,6 +33,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
// // LOG_PRINT("aclInit failed. ERROR: %d\n",
// ret));
// #endif
+ aclInit(nullptr);
auto ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
@@ -58,7 +59,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
aclrtDestroyStream(stream);
aclrtDestroyContext(context);
aclrtResetDevice(deviceId);
- // aclFinalize();
+ aclFinalize();
}
string toString() const override;
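With this change the runtime object owns the whole ACL lifecycle: aclInit moves into the constructor and aclFinalize is re-enabled in the destructor. Both calls are process-wide, so this presumably assumes a single runtime object per process. A minimal sketch of the implied ordering, using the standard ACL runtime API (the context/stream creation is implied by the destructor shown above; error handling elided):

    #include <acl/acl.h>

    int main() {
        int32_t deviceId = 0;
        aclrtContext context;
        aclrtStream stream;

        // Constructor order:
        aclInit(nullptr);                        // once per process
        aclrtSetDevice(deviceId);
        aclrtCreateContext(&context, deviceId);
        aclrtCreateStream(&stream);

        // ... enqueue work on `stream`, then aclrtSynchronizeStream(stream) ...

        // Destructor order (reverse):
        aclrtDestroyStream(stream);
        aclrtDestroyContext(context);
        aclrtResetDevice(deviceId);
        aclFinalize();                           // once per process
        return 0;
    }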

View File

@@ -7,7 +7,9 @@ namespace infini {
*
*/
class SliceObj : public OperatorObj {
- template <class T> struct range_t { T start, end, step; };
+ template <class T> struct range_t {
+ T start, end, step;
+ };
vector<range_t<int>> axes;
public:

View File

@@ -26,12 +26,12 @@ class BatchNormAclnn : public ASCENDKernelWithoutConfig {
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
- std::vector<int64_t> inputDim = MycastTo64(inD);
- std::vector<int64_t> inputStride = MycastTo64(inS);
- std::vector<int64_t> paraDim = MycastTo64(paraD);
- std::vector<int64_t> paraStride = MycastTo64(paraS);
- std::vector<int64_t> outputDim = MycastTo64(outD);
- std::vector<int64_t> outputStride = MycastTo64(outS);
+ std::vector<int64_t> inputDim = castTo64(inD);
+ std::vector<int64_t> inputStride = castTo64(inS);
+ std::vector<int64_t> paraDim = castTo64(paraD);
+ std::vector<int64_t> paraStride = castTo64(paraS);
+ std::vector<int64_t> outputDim = castTo64(outD);
+ std::vector<int64_t> outputStride = castTo64(outS);
auto inputTensor =
aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,

View File

@@ -19,8 +19,8 @@ class ConcatAclnn : public ASCENDKernelWithoutConfig {
for (int i = 0; i < num; ++i) {
auto inD = op->getInputs(i)->getDims();
auto inS = op->getInputs(i)->getStride();
- std::vector<int64_t> inputDim = MycastTo64(inD);
- std::vector<int64_t> inputStride = MycastTo64(inS);
+ std::vector<int64_t> inputDim = castTo64(inD);
+ std::vector<int64_t> inputStride = castTo64(inS);
void *const inData = (op->getInputs(i)->getRawDataPtr<void *>());
auto tmpTensor =
@@ -36,8 +36,8 @@ class ConcatAclnn : public ASCENDKernelWithoutConfig {
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
- std::vector<int64_t> outputDim = MycastTo64(outD);
- std::vector<int64_t> outputStride = MycastTo64(outS);
+ std::vector<int64_t> outputDim = castTo64(outD);
+ std::vector<int64_t> outputStride = castTo64(outS);
auto outputTensor =
aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,

View File

@@ -42,12 +42,12 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
- std::vector<int64_t> inputDim = MycastTo64(inputD);
- std::vector<int64_t> inputStride = MycastTo64(inputS);
- std::vector<int64_t> weightDim = MycastTo64(weightD);
- std::vector<int64_t> weightStride = MycastTo64(weightS);
- std::vector<int64_t> outputDim = MycastTo64(outD);
- std::vector<int64_t> outputStride = MycastTo64(outS);
+ std::vector<int64_t> inputDim = castTo64(inputD);
+ std::vector<int64_t> inputStride = castTo64(inputS);
+ std::vector<int64_t> weightDim = castTo64(weightD);
+ std::vector<int64_t> weightStride = castTo64(weightS);
+ std::vector<int64_t> outputDim = castTo64(outD);
+ std::vector<int64_t> outputStride = castTo64(outS);
auto inputTensor =
aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,

View File

@@ -9,67 +9,6 @@
namespace infini {
- /*
- class PowAclnn : public ASCENDKernelWithoutConfig {
- void compute(const Operator &_op,
- const RuntimeObj *_context) const override {
- auto op = as<ElementWiseObj>(_op);
- auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
- void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
- void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
- void *const cData = (op->getOutput()->getRawDataPtr<void *>());
- auto a = op->getInputs(0)->getDims();
- auto aS = op->getInputs(0)->getStride();
- auto b = op->getInputs(1)->getDims();
- auto bS = op->getInputs(1)->getStride();
- auto c = op->getInputs(0)->getDims();
- auto cS = op->getInputs(0)->getStride();
- std::vector<int64_t> aDim = MycastTo64(a);
- std::vector<int64_t> aStride = MycastTo64(aS);
- std::vector<int64_t> bDim = MycastTo64(b);
- std::vector<int64_t> bStride = MycastTo64(bS);
- std::vector<int64_t> cDim = MycastTo64(c);
- std::vector<int64_t> cStride = MycastTo64(cS);
- auto inputA = aclCreateTensor(
- aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
- aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
- auto inputB = aclCreateTensor(
- bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,
- aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);
- auto output = aclCreateTensor(
- cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
- aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
- uint64_t workspaceSize = 0;
- aclOpExecutor *executor;
- auto ret = aclnnPowTensorTensorGetWorkspaceSize(
- inputA, inputB, output, &workspaceSize, &executor);
- void *workspaceAddr = nullptr;
- if (workspaceSize > 0) {
- workspaceAddr = context->getWorkspace(workspaceSize);
- }
- assert(ret == ACL_SUCCESS);
- ret = aclnnPowTensorTensor(workspaceAddr, workspaceSize, executor,
- context->ASCENDHandle());
- assert(ret == ACL_SUCCESS);
- ret = aclrtSynchronizeStream(context->ASCENDHandle());
- assert(ret == ACL_SUCCESS);
- ret = aclDestroyTensor(inputA);
- ret = aclDestroyTensor(inputB);
- ret = aclDestroyTensor(output);
- return;
- }
- };
- */
#define DEFINE_ELEMENT_WISE_Aclnn(prefix) \
class prefix##Aclnn : public ASCENDKernelWithoutConfig { \
void compute(const Operator &_op, \
@@ -85,15 +24,15 @@ class PowAclnn : public ASCENDKernelWithoutConfig {
auto aS = op->getInputs(0) -> getStride(); \
auto b = op->getInputs(1) -> getDims(); \
auto bS = op->getInputs(1) -> getStride(); \
- auto c = op->getInputs(0) -> getDims(); \
- auto cS = op->getInputs(0) -> getStride(); \
+ auto c = op->getOutput() -> getDims(); \
+ auto cS = op->getOutput() -> getStride(); \
\
- std::vector<int64_t> aDim = MycastTo64(a); \
- std::vector<int64_t> aStride = MycastTo64(aS); \
- std::vector<int64_t> bDim = MycastTo64(b); \
- std::vector<int64_t> bStride = MycastTo64(bS); \
- std::vector<int64_t> cDim = MycastTo64(c); \
- std::vector<int64_t> cStride = MycastTo64(cS); \
+ std::vector<int64_t> aDim = castTo64(a); \
+ std::vector<int64_t> aStride = castTo64(aS); \
+ std::vector<int64_t> bDim = castTo64(b); \
+ std::vector<int64_t> bStride = castTo64(bS); \
+ std::vector<int64_t> cDim = castTo64(c); \
+ std::vector<int64_t> cStride = castTo64(cS); \
\
auto inputA = aclCreateTensor( \
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0, \
@@ -147,15 +86,15 @@ class AddAclnn : public ASCENDKernelWithoutConfig {
auto aS = op->getInputs(0)->getStride();
auto b = op->getInputs(1)->getDims();
auto bS = op->getInputs(1)->getStride();
- auto c = op->getInputs(0)->getDims();
- auto cS = op->getInputs(0)->getStride();
+ auto c = op->getOutput()->getDims();
+ auto cS = op->getOutput()->getStride();
- std::vector<int64_t> aDim = MycastTo64(a);
- std::vector<int64_t> aStride = MycastTo64(aS);
- std::vector<int64_t> bDim = MycastTo64(b);
- std::vector<int64_t> bStride = MycastTo64(bS);
- std::vector<int64_t> cDim = MycastTo64(c);
- std::vector<int64_t> cStride = MycastTo64(cS);
+ std::vector<int64_t> aDim = castTo64(a);
+ std::vector<int64_t> aStride = castTo64(aS);
+ std::vector<int64_t> bDim = castTo64(b);
+ std::vector<int64_t> bStride = castTo64(bS);
+ std::vector<int64_t> cDim = castTo64(c);
+ std::vector<int64_t> cStride = castTo64(cS);
auto inputA = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
@@ -187,11 +126,6 @@ class AddAclnn : public ASCENDKernelWithoutConfig {
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
- // ret = aclDestroyTensor(inputA);
- // ret = aclDestroyTensor(inputB);
- // ret = aclDestroyScalar(alpha);
- // ret = aclDestroyTensor(output);
return;
}
};
@@ -213,15 +147,15 @@ class SubAclnn : public ASCENDKernelWithoutConfig {
auto aS = op->getInputs(0)->getStride();
auto b = op->getInputs(1)->getDims();
auto bS = op->getInputs(1)->getStride();
- auto c = op->getInputs(0)->getDims();
- auto cS = op->getInputs(0)->getStride();
+ auto c = op->getOutput()->getDims();
+ auto cS = op->getOutput()->getStride();
- std::vector<int64_t> aDim = MycastTo64(a);
- std::vector<int64_t> aStride = MycastTo64(aS);
- std::vector<int64_t> bDim = MycastTo64(b);
- std::vector<int64_t> bStride = MycastTo64(bS);
- std::vector<int64_t> cDim = MycastTo64(c);
- std::vector<int64_t> cStride = MycastTo64(cS);
+ std::vector<int64_t> aDim = castTo64(a);
+ std::vector<int64_t> aStride = castTo64(aS);
+ std::vector<int64_t> bDim = castTo64(b);
+ std::vector<int64_t> bStride = castTo64(bS);
+ std::vector<int64_t> cDim = castTo64(c);
+ std::vector<int64_t> cStride = castTo64(cS);
auto inputA = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
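All of the aclnn kernels in this commit, including the Add/Sub/Div/Mul variants above, share one two-phase calling convention: query the workspace size, obtain workspace from the runtime, launch, then synchronize. Distilled as a sketch using the Add kernel's entry points (names like context and the tensors follow the kernels above; illustrative, not a drop-in helper):

    uint64_t workspaceSize = 0;
    aclOpExecutor *executor = nullptr;

    // Phase 1: plan the op and query its scratch-memory requirement.
    auto ret = aclnnAddGetWorkspaceSize(inputA, inputB, alpha, output,
                                        &workspaceSize, &executor);
    assert(ret == ACL_SUCCESS);

    // Phase 2: hand over workspace and launch on the runtime's stream.
    void *workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        workspaceAddr = context->getWorkspace(workspaceSize);
    }
    ret = aclnnAdd(workspaceAddr, workspaceSize, executor,
                   context->ASCENDHandle());
    assert(ret == ACL_SUCCESS);

    // These kernels run synchronously: block until the stream drains.
    ret = aclrtSynchronizeStream(context->ASCENDHandle());
    assert(ret == ACL_SUCCESS);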

View File

@@ -0,0 +1,74 @@
#include "operators/gather.h"
#include "aclnnop/level2/aclnn_gather_v2.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class GatherAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<GatherObj>(_op);
IT_ASSERT(op->getInputs(1)->getDType() == DataType::Int32 ||
op->getInputs(1)->getDType() == DataType::Int64);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
int64_t axis = int64_t(op->getAxis());
auto a = op->getInputs(0)->getDims();
auto aS = op->getInputs(0)->getStride();
auto b = op->getInputs(1)->getDims();
auto bS = op->getInputs(1)->getStride();
auto c = op->getOutput()->getDims();
auto cS = op->getOutput()->getStride();
std::vector<int64_t> aDim = castTo64(a);
std::vector<int64_t> aStride = castTo64(aS);
std::vector<int64_t> bDim = castTo64(b);
std::vector<int64_t> bStride = castTo64(bS);
std::vector<int64_t> cDim = castTo64(c);
std::vector<int64_t> cStride = castTo64(cS);
auto inputA = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
auto inputB = aclCreateTensor(
bDim.data(), bDim.size(),
op->getInputs(1)->getDType() == DataType::Int32 ? ACL_INT32
: ACL_INT64,
bStride.data(), 0, aclFormat::ACL_FORMAT_ND, bDim.data(),
bDim.size(), bData);
auto output = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnGatherV2GetWorkspaceSize(inputA, axis, inputB, output,
&workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
assert(ret == ACL_SUCCESS);
ret = aclnnGatherV2(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::Gather, GatherAclnn,
"gather_ASCEND_float");
}; // namespace infini
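For reference, the semantics expected of aclnnGatherV2 here: each index selects a whole slice of the input along axis. A plain-C++ sketch of the axis-0, 2-D case exercised by the tests later in this commit (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // in: [rows][cols] flattened; out: [index.size()][cols] flattened.
    // Each index picks an entire row: out[i] = in[index[i]].
    std::vector<float> gatherAxis0(const std::vector<float> &in, size_t cols,
                                   const std::vector<int64_t> &index) {
        std::vector<float> out(index.size() * cols);
        for (size_t i = 0; i < index.size(); ++i)
            for (size_t j = 0; j < cols; ++j)
                out[i * cols + j] = in[size_t(index[i]) * cols + j];
        return out;
    }

With in = {1, 2, 3, 4, 5, 6} (3x2) and index = {0, 1, 1, 2}, this yields {1, 2, 3, 4, 3, 4, 5, 6}, matching the first gather test below.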

View File

@@ -23,12 +23,12 @@ class MatmulAclnn : public ASCENDKernelWithoutConfig {
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
- std::vector<int64_t> selfDim = MycastTo64(selfD);
- std::vector<int64_t> selfStride = MycastTo64(selfS);
- std::vector<int64_t> matDim = MycastTo64(matD);
- std::vector<int64_t> matStride = MycastTo64(matS);
- std::vector<int64_t> outputDim = MycastTo64(outD);
- std::vector<int64_t> outputStride = MycastTo64(outS);
+ std::vector<int64_t> selfDim = castTo64(selfD);
+ std::vector<int64_t> selfStride = castTo64(selfS);
+ std::vector<int64_t> matDim = castTo64(matD);
+ std::vector<int64_t> matStride = castTo64(matS);
+ std::vector<int64_t> outputDim = castTo64(outD);
+ std::vector<int64_t> outputStride = castTo64(outS);
auto selfTensor = aclCreateTensor(
selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,

View File

@@ -29,10 +29,10 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
auto outD = op->getOutput()->getDims();
auto outS = op->getOutput()->getStride();
- std::vector<int64_t> selfDim = MycastTo64(selfD);
- std::vector<int64_t> selfStride = MycastTo64(selfS);
- std::vector<int64_t> outputDim = MycastTo64(outD);
- std::vector<int64_t> outputStride = MycastTo64(outS);
+ std::vector<int64_t> selfDim = castTo64(selfD);
+ std::vector<int64_t> selfStride = castTo64(selfS);
+ std::vector<int64_t> outputDim = castTo64(outD);
+ std::vector<int64_t> outputStride = castTo64(outS);
aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());

View File

@@ -0,0 +1,127 @@
#include "operators/reduce.h"
#include "aclnnop/aclnn_mean.h"
#include "aclnnop/aclnn_reduce_sum.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class MeanAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ReduceBaseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto axes_set = op->getAxes();
std::vector<int> axes;
axes.assign(axes_set.begin(), axes_set.end());
bool KeepDim = op->getKeepDims();
auto a = op->getInputs(0)->getDims();
auto aS = op->getInputs(0)->getStride();
auto c = op->getOutput()->getDims();
auto cS = op->getOutput()->getStride();
std::vector<int64_t> aDim = castTo64(a);
std::vector<int64_t> aStride = castTo64(aS);
std::vector<int64_t> cDim = castTo64(c);
std::vector<int64_t> cStride = castTo64(cS);
std::vector<int64_t> axes_64 = castTo64(axes);
auto inputA = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
auto output = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
aclIntArray *dim = aclCreateIntArray(axes_64.data(), axes_64.size());
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnMeanV2GetWorkspaceSize(
inputA, dim, KeepDim, true, output, &workspaceSize, &executor);
assert(ret == ACL_SUCCESS);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
assert(ret == ACL_SUCCESS);
ret = aclnnMeanV2(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
return;
}
};
class ReduceSumAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ReduceBaseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto axes_set = op->getAxes();
std::vector<int> axes;
axes.assign(axes_set.begin(), axes_set.end());
bool KeepDim = op->getKeepDims();
auto a = op->getInputs(0)->getDims();
auto aS = op->getInputs(0)->getStride();
auto c = op->getOutput()->getDims();
auto cS = op->getOutput()->getStride();
std::vector<int64_t> aDim = castTo64(a);
std::vector<int64_t> aStride = castTo64(aS);
std::vector<int64_t> cDim = castTo64(c);
std::vector<int64_t> cStride = castTo64(cS);
std::vector<int64_t> axes_64 = castTo64(axes);
auto inputA = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
auto output = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
aclIntArray *dim = aclCreateIntArray(axes_64.data(), axes_64.size());
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnReduceSumGetWorkspaceSize(
inputA, dim, KeepDim, ACL_FLOAT, output, &workspaceSize, &executor);
assert(ret == ACL_SUCCESS);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
assert(ret == ACL_SUCCESS);
ret = aclnnReduceSum(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::ReduceMean, MeanAclnn,
"reduceMean_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::ReduceSum, ReduceSumAclnn,
"reduceSum_ASCEND_float");
}; // namespace infini

View File

@@ -4,68 +4,52 @@
#include "ascend/ascend_runtime.h"
namespace infini {
class CopyAclnn : public ASCENDKernelWithoutConfig {
- void compute(const Operator &_op,
+ void compute(const Operator &op,
const RuntimeObj *_context) const override {
- auto op = as<MatmulObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
- void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
- auto selfD = op->getInputs(0)->getDims();
- auto selfS = op->getInputs(0)->getStride();
- auto matD = op->getInputs(1)->getDims();
- auto matS = op->getInputs(1)->getStride();
- auto outD = op->getOutput()->getDims();
- auto outS = op->getOutput()->getStride();
+ auto aD = op->getInputs(0)->getDims();
+ auto aS = op->getInputs(0)->getStride();
- std::vector<int64_t> selfDim = MycastTo64(selfD);
- std::vector<int64_t> selfStride = MycastTo64(selfS);
- std::vector<int64_t> matDim = MycastTo64(matD);
- std::vector<int64_t> matStride = MycastTo64(matS);
- std::vector<int64_t> outputDim = MycastTo64(outD);
- std::vector<int64_t> outputStride = MycastTo64(outS);
+ std::vector<int64_t> aDim = castTo64(aD);
+ std::vector<int64_t> aStride = castTo64(aS);
- auto selfTensor = aclCreateTensor(
- selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
- aclFormat::ACL_FORMAT_ND, selfDim.data(), selfDim.size(), aData);
- auto matTensor = aclCreateTensor(
- matDim.data(), matDim.size(), ACL_FLOAT, matStride.data(), 0,
- aclFormat::ACL_FORMAT_ND, matDim.data(), matDim.size(), bData);
- auto outputTensor =
- aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
- outputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
- outputDim.data(), outputDim.size(), cData);
+ auto srcTensor = aclCreateTensor(
+ aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+ aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
+ auto outputTensor = aclCreateTensor(
+ aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+ aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
- auto ret = aclnnMatmulGetWorkspaceSize(
- selfTensor, matTensor, outputTensor, 1, &workspaceSize, &executor);
+ auto ret = aclnnInplaceCopyGetWorkspaceSize(outputTensor, srcTensor,
+ &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
assert(ret == ACL_SUCCESS);
- ret = aclnnMatmul(workspaceAddr, workspaceSize, executor,
- context->ASCENDHandle());
+ ret = aclnnInplaceCopy(workspaceAddr, workspaceSize, executor,
+ context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
- // aclDestroyTensor(selfTensor);
- // aclDestroyTensor(matTensor);
- // aclDestroyTensor(outputTensor);
return;
}
};
- REGISTER_KERNEL(Device::ASCEND, OpType::MatMul, MatmulAclnn,
- "matmul_ASCEND_float");
+ REGISTER_KERNEL(Device::ASCEND, OpType::Reshape, CopyAclnn,
+ "reshape_ASCEND_float");
+ REGISTER_KERNEL(Device::ASCEND, OpType::Unsqueeze, CopyAclnn,
+ "unsqueeze_ASCEND_float");
+ REGISTER_KERNEL(Device::ASCEND, OpType::Squeeze, CopyAclnn,
+ "squeeze_ASCEND_float");
}; // namespace infini
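Reshape, Unsqueeze, and Squeeze change only tensor metadata, so all three can share one kernel that copies the input buffer to the output buffer verbatim; that is also why both ACL tensors above are created with the input's dims, since aclnnInplaceCopy expects matching shapes and the flat layout is identical. A small sketch of the invariant this relies on (hypothetical guard, not part of the commit):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // A metadata-only op must preserve the flattened element count.
    size_t numel(const std::vector<int64_t> &dims) {
        size_t n = 1;
        for (int64_t d : dims)
            n *= size_t(d);
        return n;
    }

    // Hypothetical use before launching the copy:
    //   assert(numel(aDim) == numel(castTo64(op->getOutput()->getDims())));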

View File

@@ -0,0 +1,77 @@
#include "operators/slice.h"
#include "aclnnop/aclnn_slice_v2.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class SliceAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SliceObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto starts_32 = op->getStarts();
auto ends_32 = op->getEnds();
auto steps_32 = op->getSteps();
auto a = op->getInputs(0)->getDims();
auto aS = op->getInputs(0)->getStride();
auto c = op->getOutput()->getDims();
auto cS = op->getOutput()->getStride();
std::vector<int64_t> aDim = castTo64(a);
std::vector<int64_t> aStride = castTo64(aS);
std::vector<int64_t> cDim = castTo64(c);
std::vector<int64_t> cStride = castTo64(cS);
std::vector<int64_t> starts_64 = castTo64(starts_32);
std::vector<int64_t> ends_64 = castTo64(ends_32);
std::vector<int64_t> steps_64 = castTo64(steps_32);
vector<int64_t> axes_64 = vector<int64_t>(starts_32.size(), 0);
for (int i = 0; i < int(starts_32.size()); i++) {
axes_64[i] = i;
}
auto inputA = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
auto output = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
aclIntArray *starts =
aclCreateIntArray(starts_64.data(), starts_64.size());
aclIntArray *ends = aclCreateIntArray(ends_64.data(), ends_64.size());
aclIntArray *steps =
aclCreateIntArray(steps_64.data(), steps_64.size());
aclIntArray *axes = aclCreateIntArray(axes_64.data(), axes_64.size());
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret =
aclnnSliceV2GetWorkspaceSize(inputA, starts, ends, axes, steps,
output, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
assert(ret == ACL_SUCCESS);
ret = aclnnSliceV2(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::Slice, SliceAclnn,
"slice_ASCEND_float");
}; // namespace infini

View File

@@ -1,4 +1,3 @@
#include "operators/softmax.h"
#include "aclnnop/level2/aclnn_softmax.h"
#include "ascend/ascend_kernel_without_config.h"
@@ -18,13 +17,13 @@ class SoftmaxAclnn : public ASCENDKernelWithoutConfig {
auto a = op->getInputs(0)->getDims();
auto aS = op->getInputs(0)->getStride();
- auto c = op->getInputs(0)->getDims();
- auto cS = op->getInputs(0)->getStride();
+ auto c = op->getOutput()->getDims();
+ auto cS = op->getOutput()->getStride();
- std::vector<int64_t> aDim = MycastTo64(a);
- std::vector<int64_t> aStride = MycastTo64(aS);
- std::vector<int64_t> cDim = MycastTo64(c);
- std::vector<int64_t> cStride = MycastTo64(cS);
+ std::vector<int64_t> aDim = castTo64(a);
+ std::vector<int64_t> aStride = castTo64(aS);
+ std::vector<int64_t> cDim = castTo64(c);
+ std::vector<int64_t> cStride = castTo64(cS);
auto input = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,

View File

@@ -0,0 +1,72 @@
#include "operators/split.h"
#include "aclnnop/aclnn_split_tensor.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class SplitAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SplitObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
auto a = op->getInputs(0)->getDims();
auto aS = op->getInputs(0)->getStride();
std::vector<int64_t> aDim = castTo64(a);
std::vector<int64_t> aStride = castTo64(aS);
int64_t dim = op->getDim();
int num = op->numOutputs();
int dimSize = a.at(op->getDim());
uint64_t splitSections = dimSize / num;
auto inputA = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
std::vector<aclTensor *> outputsData{};
for (int i = 0; i < num; ++i) {
auto c = op->getOutput(i)->getDims();
auto cS = op->getOutput(i)->getStride();
std::vector<int64_t> cDim = castTo64(c);
std::vector<int64_t> cStride = castTo64(cS);
void *const cData = (op->getOutput(i)->getRawDataPtr<void *>());
aclTensor *tmpTensor = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
outputsData.push_back(tmpTensor);
}
aclTensorList *tensorList =
aclCreateTensorList(outputsData.data(), outputsData.size());
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnSplitTensorGetWorkspaceSize(
inputA, splitSections, dim, tensorList, &workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
assert(ret == ACL_SUCCESS);
ret = aclnnSplitTensor(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::Split, SplitAclnn,
"split_ASCEND_float");
}; // namespace infini
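One caveat worth noting: splitSections = dimSize / num is integer division, so the kernel implicitly assumes the split axis divides evenly across the outputs (aclnnSplitTensor cuts equal chunks). A hedged sketch of the guard one might add (not in the commit):

    #include <cassert>
    #include <cstdint>

    // dimSize: extent of the split axis; num: number of outputs.
    uint64_t computeSplitSections(int dimSize, int num) {
        assert(num > 0 && dimSize % num == 0); // equal chunks only
        return uint64_t(dimSize) / uint64_t(num);
    }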

View File

@@ -0,0 +1,61 @@
#include "operators/transpose.h"
#include "aclnnop/level2/aclnn_permute.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
namespace infini {
class PermuteAclnn : public ASCENDKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<TransposeObj>(_op);
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto a = op->getInputs(0)->getDims();
auto aS = op->getInputs(0)->getStride();
auto c = op->getOutput()->getDims();
auto cS = op->getOutput()->getStride();
std::vector<int64_t> aDim = castTo64(a);
std::vector<int64_t> aStride = castTo64(aS);
std::vector<int64_t> cDim = castTo64(c);
std::vector<int64_t> cStride = castTo64(cS);
auto _permute = op->getPermute();
std::vector<int64_t> permute = castTo64(_permute);
auto inputA = aclCreateTensor(
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
aclIntArray *dims = aclCreateIntArray(permute.data(), permute.size());
auto output = aclCreateTensor(
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
auto ret = aclnnPermuteGetWorkspaceSize(inputA, dims, output,
&workspaceSize, &executor);
void *workspaceAddr = nullptr;
if (workspaceSize > 0) {
workspaceAddr = context->getWorkspace(workspaceSize);
}
assert(ret == ACL_SUCCESS);
ret = aclnnPermute(workspaceAddr, workspaceSize, executor,
context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
ret = aclrtSynchronizeStream(context->ASCENDHandle());
assert(ret == ACL_SUCCESS);
return;
}
};
REGISTER_KERNEL(Device::ASCEND, OpType::Transpose, PermuteAclnn,
"transpose_ASCEND_float");
}; // namespace infini

View File

@@ -0,0 +1,54 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/reshape.h"
#include "operators/squeeze.h"
#include "operators/unsqueeze.h"
#include "test.h"
namespace infini {
template <class T>
void testReshape(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape, const Shape &outputShape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto npuOp = npuGraph->addOp<T>(inputNpu, nullptr, outputShape);
npuGraph->dataMalloc();
inputNpu->setData(generator);
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check
inputCpu->print();
inputCpu->printData();
outputNpu2Cpu->print();
outputNpu2Cpu->printData();
EXPECT_TRUE(inputCpu->equalData(outputNpu2Cpu, 1e-3));
}
TEST(ascend_Unary, run) {
aclInit(nullptr);
testReshape<ReshapeObj>(IncrementalGenerator(), Shape{1, 2, 2, 3},
Shape{1, 2, 6});
testReshape<SqueezeObj>(IncrementalGenerator(), Shape{1, 2, 2, 3},
Shape{0});
testReshape<UnsqueezeObj>(IncrementalGenerator(), Shape{1, 2, 2, 3},
Shape{4});
aclFinalize();
}
} // namespace infini

View File

@@ -11,14 +11,14 @@ namespace infini {
template <class T>
void testElementWise(
const std::function<void(void *, size_t, DataType)> &generator,
- const Shape &shape) {
+ const Shape &shape0, const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
- make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+ make_ref<TensorObj>(shape0, DataType::Float32, cpuRuntime);
Tensor inputCpu2 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
@@ -26,6 +26,10 @@ void testElementWise(
inputCpu1->setData(generator);
inputCpu2->setData(generator);
+ inputCpu1->print();
+ inputCpu1->printData();
+ inputCpu2->print();
+ inputCpu2->printData();
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
@@ -39,23 +43,21 @@ void testElementWise(
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check
- inputCpu1->print();
- inputCpu1->printData();
- inputCpu2->print();
- inputCpu2->printData();
outputNpu2Cpu->print();
outputNpu2Cpu->printData();
EXPECT_TRUE(1);
}
TEST(ascend_ElementWise, run) {
- aclInit(nullptr);
- testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
- testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
- testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
- testElementWise<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
- testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
- aclFinalize();
+ // aclInit(nullptr);
+ // testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+ // testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+ // testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+ testElementWise<DivObj>(IncrementalGenerator(), Shape{1},
+ Shape{1, 2, 2, 3});
+ // testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+ // aclFinalize();
}
} // namespace infini

View File

@@ -0,0 +1,96 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/gather.h"
#include "test.h"
namespace infini {
TEST(ascend_Unary, run) {
aclInit(nullptr);
{
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{3, 2}, DataType::Float32, cpuRuntime);
Tensor indexCpu =
make_ref<TensorObj>(Shape{2, 2}, DataType::Int32, cpuRuntime);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto indexNpu = npuGraph->cloneTensor(indexCpu);
auto npuOp = npuGraph->addOp<GatherObj>(inputNpu, indexNpu, nullptr, 0);
npuGraph->dataMalloc();
inputNpu->copyin(vector<float>{1, 2, 3, 4, 5, 6});
indexNpu->copyin(vector<int>{0, 1, 1, 2});
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(
outputNpu2Cpu->equalData(vector<float>{1, 2, 3, 4, 3, 4, 5, 6}));
}
{
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{3, 3}, DataType::Float32, cpuRuntime);
Tensor indexCpu =
make_ref<TensorObj>(Shape{1, 2}, DataType::Int32, cpuRuntime);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto indexNpu = npuGraph->cloneTensor(indexCpu);
auto npuOp = npuGraph->addOp<GatherObj>(inputNpu, indexNpu, nullptr, 1);
npuGraph->dataMalloc();
inputNpu->setData(IncrementalGenerator());
indexNpu->copyin(vector<int>{0, 2});
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(outputNpu2Cpu->equalData(vector<float>{0, 2, 3, 5, 6, 8}));
}
{
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{3, 2}, DataType::Float32, cpuRuntime);
Tensor indexCpu =
make_ref<TensorObj>(Shape{2, 2}, DataType::Int64, cpuRuntime);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto indexNpu = npuGraph->cloneTensor(indexCpu);
auto npuOp = npuGraph->addOp<GatherObj>(inputNpu, indexNpu, nullptr, 0);
npuGraph->dataMalloc();
inputNpu->copyin(std::vector<float>{1.0, 1.2, 2.3, 3.4, 4.5, 5.7});
indexNpu->copyin(vector<int64_t>{0, 1, 1, 2});
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(outputNpu2Cpu->equalData(
vector<float>{1.0, 1.2, 2.3, 3.4, 2.3, 3.4, 4.5, 5.7}));
}
aclFinalize();
}
} // namespace infini

View File

@@ -0,0 +1,84 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/reduce.h"
#include "test.h"
namespace infini {
template <typename ReduceObjT>
void test_reduce(const Shape &shape, const vector<float> &data,
const optional<const vector<int>> &axes, bool keepDims,
const vector<float> &ExpectData) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
// Build NPU graph
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto op = npuGraph->addOp<ReduceObjT>(inputNpu, nullptr, axes, keepDims);
// allocate NPU memory
npuGraph->dataMalloc();
inputNpu->copyin(data);
// Execute on NPU
npuRuntime->run(npuGraph);
// clone NPU output to CPU
auto outputNpu = op->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// check results on CPU
EXPECT_TRUE(outputNpu2Cpu->equalData(ExpectData));
}
TEST(ascend_ReduceMean, run) {
aclInit(nullptr);
test_reduce<ReduceMeanObj>(
Shape{3, 2, 2}, vector<float>{5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2},
std::nullopt, true, vector<float>{18.25});
test_reduce<ReduceMeanObj>(
Shape{1, 3, 2, 2, 1},
vector<float>{5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2}, std::nullopt,
false, vector<float>{18.25});
test_reduce<ReduceMeanObj>(
Shape{2, 3, 2, 2},
vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
vector<int>{1, 2}, false, vector<float>{5, 6, 17, 18});
test_reduce<ReduceMeanObj>(
Shape{2, 3, 2, 2, 1},
vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
vector<int>{1, 2}, true, vector<float>{5, 6, 17, 18});
aclFinalize();
}
TEST(ascend_ReduceSum, run) {
test_reduce<ReduceSumObj>(Shape{3, 2, 2},
vector<float>{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
std::nullopt, true, vector<float>{12});
test_reduce<ReduceSumObj>(Shape{1, 3, 2, 2, 1},
vector<float>{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
std::nullopt, false, vector<float>{12});
test_reduce<ReduceSumObj>(
Shape{2, 3, 2, 2},
vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
vector<int>{1, 2}, false, vector<float>{30, 36, 102, 108});
test_reduce<ReduceSumObj>(
Shape{2, 3, 2, 2, 1},
vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
vector<int>{1, 2}, true, vector<float>{30, 36, 102, 108});
}
} // namespace infini

View File

@@ -0,0 +1,41 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/slice.h"
#include "test.h"
namespace infini {
TEST(ascend_Unary, run) {
aclInit(nullptr);
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{3, 2, 1, 5}, DataType::Float32, cpuRuntime);
// inputCpu->dataMalloc();
// inputCpu->setData(IncrementalGenerator());
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto npuOp = npuGraph->addOp<SliceObj>(inputNpu, nullptr, vector<int>{1, 1},
vector<int>{2, 5}, vector<int>{0, 3},
std::nullopt);
npuGraph->dataMalloc();
inputNpu->setData(IncrementalGenerator());
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(outputNpu2Cpu->equalData(
vector<float>{11, 12, 13, 14, 16, 17, 18, 19}));
aclFinalize();
}
} // namespace infini

View File

@@ -0,0 +1,50 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/split.h"
#include "test.h"
namespace infini {
template <class T>
void testSplit(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto gpuOp = npuGraph->addOp<T>(inputNpu, std::nullopt, 3, 3);
npuGraph->dataMalloc();
inputNpu->setData(generator);
npuRuntime->run(npuGraph);
auto o0Cpu = gpuOp->getOutput(0)->clone(cpuRuntime);
auto o1Cpu = gpuOp->getOutput(1)->clone(cpuRuntime);
auto o2Cpu = gpuOp->getOutput(2)->clone(cpuRuntime);
// Check
inputCpu->print();
inputCpu->printData();
o0Cpu->print();
o0Cpu->printData();
o1Cpu->print();
o1Cpu->printData();
o2Cpu->print();
o2Cpu->printData();
EXPECT_TRUE(1);
}
TEST(ascend_Split, run) {
aclInit(nullptr);
testSplit<SplitObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
aclFinalize();
}
} // namespace infini

View File

@@ -0,0 +1,49 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/transpose.h"
#include "test.h"
namespace infini {
template <class T>
void testTranspose(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape, const Shape &permute) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto npuOp = npuGraph->addOp<T>(inputNpu, nullptr, permute);
npuGraph->dataMalloc();
inputNpu->setData(generator);
npuRuntime->run(npuGraph);
auto outputNpu = npuOp->getOutput();
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
// Check
inputCpu->print();
inputCpu->printData();
outputNpu2Cpu->print();
outputNpu2Cpu->printData();
EXPECT_TRUE(1);
}
TEST(ascend_Unary, run) {
aclInit(nullptr);
testTranspose<TransposeObj>(IncrementalGenerator(), Shape{1, 1, 2, 3},
vector<int>{0, 1, 3, 2});
aclFinalize();
}
} // namespace infini

View File

@@ -18,7 +18,7 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
- // GPU
+ // NPU
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
auto inputNpu = npuGraph->cloneTensor(inputCpu);
auto npuOp = npuGraph->addOp<T>(inputNpu, nullptr);