forked from jiuyuan/InfiniTensor
support for llama
This commit is contained in:
parent 9db6703b58
commit 36e0840f2f
@@ -20,7 +20,7 @@ class ASCENDKernelWithoutConfig : public Kernel {
                             [&]() { context->sync(); }));
     }

     // transform vector<int> to vector<int64_t>
-    std::vector<int64_t> MycastTo64(std::vector<int> const &v32) const {
+    std::vector<int64_t> castTo64(std::vector<int> const &v32) const {
         std::vector<int64_t> v64(v32.size(), 1);
         for (size_t i = 0; i < v32.size(); ++i) {
             v64[i] = int64_t(v32[i]);
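For context, a minimal sketch of how this helper is typically used when filling ACL tensor descriptors (the aclCreateTensor calls later in this commit take int64_t dimension and stride arrays); the local names dims/strides below are illustrative only, not part of the diff:

    // Sketch only: convert InfiniTensor's int shape/stride vectors to the
    // int64_t arrays that aclCreateTensor expects.
    std::vector<int> dims = {1, 2, 2, 3};
    std::vector<int> strides = {12, 6, 3, 1};
    std::vector<int64_t> dims64 = castTo64(dims);
    std::vector<int64_t> strides64 = castTo64(strides);
    // dims64.data() / strides64.data() can then be passed to aclCreateTensor.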
@@ -33,6 +33,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
         // // LOG_PRINT("aclInit failed. ERROR: %d\n",
         // ret));
         // #endif
+        aclInit(nullptr);
         auto ret = aclrtSetDevice(deviceId);
         CHECK_RET(ret == ACL_SUCCESS,
                   LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
@@ -58,7 +59,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
         aclrtDestroyStream(stream);
         aclrtDestroyContext(context);
         aclrtResetDevice(deviceId);
-        // aclFinalize();
+        aclFinalize();
     }
     string toString() const override;
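With these two hunks the runtime now owns ACL initialization and teardown. A rough sketch of the lifecycle implied by the diff, assuming the constructor/destructor members shown above (error handling and the context/stream creation elided):

    // aclInit must precede any other ACL call; aclFinalize must come last.
    ASCENDRuntimeObj() {
        aclInit(nullptr);         // added by this commit
        aclrtSetDevice(deviceId); // bind this runtime to one NPU
        // ... create context and stream as in the surrounding code ...
    }
    ~ASCENDRuntimeObj() {
        aclrtDestroyStream(stream);
        aclrtDestroyContext(context);
        aclrtResetDevice(deviceId);
        aclFinalize();            // previously commented out
    }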
@@ -7,7 +7,9 @@ namespace infini {
 *
 */
 class SliceObj : public OperatorObj {
-    template <class T> struct range_t { T start, end, step; };
+    template <class T> struct range_t {
+        T start, end, step;
+    };
     vector<range_t<int>> axes;

   public:
@@ -26,12 +26,12 @@ class BatchNormAclnn : public ASCENDKernelWithoutConfig {
         auto outD = op->getOutput()->getDims();
         auto outS = op->getOutput()->getStride();

-        std::vector<int64_t> inputDim = MycastTo64(inD);
-        std::vector<int64_t> inputStride = MycastTo64(inS);
-        std::vector<int64_t> paraDim = MycastTo64(paraD);
-        std::vector<int64_t> paraStride = MycastTo64(paraS);
-        std::vector<int64_t> outputDim = MycastTo64(outD);
-        std::vector<int64_t> outputStride = MycastTo64(outS);
+        std::vector<int64_t> inputDim = castTo64(inD);
+        std::vector<int64_t> inputStride = castTo64(inS);
+        std::vector<int64_t> paraDim = castTo64(paraD);
+        std::vector<int64_t> paraStride = castTo64(paraS);
+        std::vector<int64_t> outputDim = castTo64(outD);
+        std::vector<int64_t> outputStride = castTo64(outS);

         auto inputTensor =
             aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
@@ -19,8 +19,8 @@ class ConcatAclnn : public ASCENDKernelWithoutConfig {
         for (int i = 0; i < num; ++i) {
             auto inD = op->getInputs(i)->getDims();
             auto inS = op->getInputs(i)->getStride();
-            std::vector<int64_t> inputDim = MycastTo64(inD);
-            std::vector<int64_t> inputStride = MycastTo64(inS);
+            std::vector<int64_t> inputDim = castTo64(inD);
+            std::vector<int64_t> inputStride = castTo64(inS);

             void *const inData = (op->getInputs(i)->getRawDataPtr<void *>());
             auto tmpTensor =
@@ -36,8 +36,8 @@ class ConcatAclnn : public ASCENDKernelWithoutConfig {
         void *const outData = (op->getOutput()->getRawDataPtr<void *>());
         auto outD = op->getOutput()->getDims();
         auto outS = op->getOutput()->getStride();
-        std::vector<int64_t> outputDim = MycastTo64(outD);
-        std::vector<int64_t> outputStride = MycastTo64(outS);
+        std::vector<int64_t> outputDim = castTo64(outD);
+        std::vector<int64_t> outputStride = castTo64(outS);

         auto outputTensor =
             aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
@@ -42,12 +42,12 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
         auto outD = op->getOutput()->getDims();
         auto outS = op->getOutput()->getStride();

-        std::vector<int64_t> inputDim = MycastTo64(inputD);
-        std::vector<int64_t> inputStride = MycastTo64(inputS);
-        std::vector<int64_t> weightDim = MycastTo64(weightD);
-        std::vector<int64_t> weightStride = MycastTo64(weightS);
-        std::vector<int64_t> outputDim = MycastTo64(outD);
-        std::vector<int64_t> outputStride = MycastTo64(outS);
+        std::vector<int64_t> inputDim = castTo64(inputD);
+        std::vector<int64_t> inputStride = castTo64(inputS);
+        std::vector<int64_t> weightDim = castTo64(weightD);
+        std::vector<int64_t> weightStride = castTo64(weightS);
+        std::vector<int64_t> outputDim = castTo64(outD);
+        std::vector<int64_t> outputStride = castTo64(outS);

         auto inputTensor =
             aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
@@ -9,67 +9,6 @@

 namespace infini {

-/*
-class PowAclnn : public ASCENDKernelWithoutConfig {
-    void compute(const Operator &_op,
-                 const RuntimeObj *_context) const override {
-        auto op = as<ElementWiseObj>(_op);
-        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
-
-        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
-        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
-        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
-
-        auto a = op->getInputs(0)->getDims();
-        auto aS = op->getInputs(0)->getStride();
-        auto b = op->getInputs(1)->getDims();
-        auto bS = op->getInputs(1)->getStride();
-        auto c = op->getInputs(0)->getDims();
-        auto cS = op->getInputs(0)->getStride();
-
-        std::vector<int64_t> aDim = MycastTo64(a);
-        std::vector<int64_t> aStride = MycastTo64(aS);
-        std::vector<int64_t> bDim = MycastTo64(b);
-        std::vector<int64_t> bStride = MycastTo64(bS);
-        std::vector<int64_t> cDim = MycastTo64(c);
-        std::vector<int64_t> cStride = MycastTo64(cS);
-
-        auto inputA = aclCreateTensor(
-            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
-        auto inputB = aclCreateTensor(
-            bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);
-        auto output = aclCreateTensor(
-            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
-
-        uint64_t workspaceSize = 0;
-        aclOpExecutor *executor;
-
-        auto ret = aclnnPowTensorTensorGetWorkspaceSize(
-            inputA, inputB, output, &workspaceSize, &executor);
-        void *workspaceAddr = nullptr;
-        if (workspaceSize > 0) {
-            workspaceAddr = context->getWorkspace(workspaceSize);
-        }
-        assert(ret == ACL_SUCCESS);
-        ret = aclnnPowTensorTensor(workspaceAddr, workspaceSize, executor,
-                                   context->ASCENDHandle());
-        assert(ret == ACL_SUCCESS);
-
-        ret = aclrtSynchronizeStream(context->ASCENDHandle());
-        assert(ret == ACL_SUCCESS);
-
-        ret = aclDestroyTensor(inputA);
-        ret = aclDestroyTensor(inputB);
-        ret = aclDestroyTensor(output);
-
-        return;
-    }
-};
-*/
-
 #define DEFINE_ELEMENT_WISE_Aclnn(prefix)                                  \
     class prefix##Aclnn : public ASCENDKernelWithoutConfig {               \
         void compute(const Operator &_op,                                  \
@@ -85,15 +24,15 @@ class PowAclnn : public ASCENDKernelWithoutConfig {
         auto aS = op->getInputs(0) -> getStride();                         \
         auto b = op->getInputs(1) -> getDims();                            \
         auto bS = op->getInputs(1) -> getStride();                         \
-        auto c = op->getInputs(0) -> getDims();                            \
-        auto cS = op->getInputs(0) -> getStride();                         \
+        auto c = op->getOutput() -> getDims();                             \
+        auto cS = op->getOutput() -> getStride();                          \
                                                                            \
-        std::vector<int64_t> aDim = MycastTo64(a);                         \
-        std::vector<int64_t> aStride = MycastTo64(aS);                     \
-        std::vector<int64_t> bDim = MycastTo64(b);                         \
-        std::vector<int64_t> bStride = MycastTo64(bS);                     \
-        std::vector<int64_t> cDim = MycastTo64(c);                         \
-        std::vector<int64_t> cStride = MycastTo64(cS);                     \
+        std::vector<int64_t> aDim = castTo64(a);                           \
+        std::vector<int64_t> aStride = castTo64(aS);                       \
+        std::vector<int64_t> bDim = castTo64(b);                           \
+        std::vector<int64_t> bStride = castTo64(bS);                       \
+        std::vector<int64_t> cDim = castTo64(c);                           \
+        std::vector<int64_t> cStride = castTo64(cS);                       \
                                                                            \
         auto inputA = aclCreateTensor(                                     \
             aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,        \
@@ -147,15 +86,15 @@ class AddAclnn : public ASCENDKernelWithoutConfig {
         auto aS = op->getInputs(0)->getStride();
         auto b = op->getInputs(1)->getDims();
         auto bS = op->getInputs(1)->getStride();
-        auto c = op->getInputs(0)->getDims();
-        auto cS = op->getInputs(0)->getStride();
+        auto c = op->getOutput()->getDims();
+        auto cS = op->getOutput()->getStride();

-        std::vector<int64_t> aDim = MycastTo64(a);
-        std::vector<int64_t> aStride = MycastTo64(aS);
-        std::vector<int64_t> bDim = MycastTo64(b);
-        std::vector<int64_t> bStride = MycastTo64(bS);
-        std::vector<int64_t> cDim = MycastTo64(c);
-        std::vector<int64_t> cStride = MycastTo64(cS);
+        std::vector<int64_t> aDim = castTo64(a);
+        std::vector<int64_t> aStride = castTo64(aS);
+        std::vector<int64_t> bDim = castTo64(b);
+        std::vector<int64_t> bStride = castTo64(bS);
+        std::vector<int64_t> cDim = castTo64(c);
+        std::vector<int64_t> cStride = castTo64(cS);

         auto inputA = aclCreateTensor(
             aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
@@ -187,11 +126,6 @@ class AddAclnn : public ASCENDKernelWithoutConfig {
         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

-        // ret = aclDestroyTensor(inputA);
-        // ret = aclDestroyTensor(inputB);
-        // ret = aclDestroyScalar(alpha);
-        // ret = aclDestroyTensor(output);
-
         return;
     }
 };
@@ -213,15 +147,15 @@ class SubAclnn : public ASCENDKernelWithoutConfig {
         auto aS = op->getInputs(0)->getStride();
         auto b = op->getInputs(1)->getDims();
         auto bS = op->getInputs(1)->getStride();
-        auto c = op->getInputs(0)->getDims();
-        auto cS = op->getInputs(0)->getStride();
+        auto c = op->getOutput()->getDims();
+        auto cS = op->getOutput()->getStride();

-        std::vector<int64_t> aDim = MycastTo64(a);
-        std::vector<int64_t> aStride = MycastTo64(aS);
-        std::vector<int64_t> bDim = MycastTo64(b);
-        std::vector<int64_t> bStride = MycastTo64(bS);
-        std::vector<int64_t> cDim = MycastTo64(c);
-        std::vector<int64_t> cStride = MycastTo64(cS);
+        std::vector<int64_t> aDim = castTo64(a);
+        std::vector<int64_t> aStride = castTo64(aS);
+        std::vector<int64_t> bDim = castTo64(b);
+        std::vector<int64_t> bStride = castTo64(bS);
+        std::vector<int64_t> cDim = castTo64(c);
+        std::vector<int64_t> cStride = castTo64(cS);

         auto inputA = aclCreateTensor(
             aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
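The recurring change in these element-wise kernels is that the output descriptor is now built from getOutput() rather than from getInputs(0), which matters once the two operands broadcast to a larger shape. A small illustration, with shapes taken from the Div test later in this commit (only the output-side lines are shown; the rest of the kernel is unchanged):

    // With broadcasting, the output shape differs from input 0's shape:
    //   a : {1}           (getInputs(0))
    //   b : {1, 2, 2, 3}  (getInputs(1))
    //   c : {1, 2, 2, 3}  (getOutput())
    // Building the ACL output tensor from getInputs(0) would describe a
    // 1-element buffer and misreport the result; getOutput() is correct.
    auto c = op->getOutput()->getDims();      // {1, 2, 2, 3}
    auto cS = op->getOutput()->getStride();
    std::vector<int64_t> cDim = castTo64(c);
    std::vector<int64_t> cStride = castTo64(cS);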
@@ -0,0 +1,74 @@
#include "operators/gather.h"
#include "aclnnop/level2/aclnn_gather_v2.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class GatherAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<GatherObj>(_op);
        IT_ASSERT(op->getInputs(1)->getDType() == DataType::Int32 ||
                  op->getInputs(1)->getDType() == DataType::Int64);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        int64_t axis = int64_t(op->getAxis());

        auto a = op->getInputs(0)->getDims();
        auto aS = op->getInputs(0)->getStride();
        auto b = op->getInputs(1)->getDims();
        auto bS = op->getInputs(1)->getStride();
        auto c = op->getOutput()->getDims();
        auto cS = op->getOutput()->getStride();

        std::vector<int64_t> aDim = castTo64(a);
        std::vector<int64_t> aStride = castTo64(aS);
        std::vector<int64_t> bDim = castTo64(b);
        std::vector<int64_t> bStride = castTo64(bS);
        std::vector<int64_t> cDim = castTo64(c);
        std::vector<int64_t> cStride = castTo64(cS);

        auto inputA = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);

        auto inputB = aclCreateTensor(
            bDim.data(), bDim.size(),
            op->getInputs(1)->getDType() == DataType::Int32 ? ACL_INT32
                                                            : ACL_INT64,
            bStride.data(), 0, aclFormat::ACL_FORMAT_ND, bDim.data(),
            bDim.size(), bData);

        auto output = aclCreateTensor(
            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnGatherV2GetWorkspaceSize(inputA, axis, inputB, output,
                                                 &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnGatherV2(workspaceAddr, workspaceSize, executor,
                            context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::Gather, GatherAclnn,
                "gather_ASCEND_float");
}; // namespace infini
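All of the new kernels in this commit follow the same two-phase aclnn calling convention; a condensed sketch of the pattern, using the Gather calls above (operator-specific tensor arguments otherwise omitted):

    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;
    // 1) query the workspace the operator needs and obtain an executor
    auto ret = aclnnGatherV2GetWorkspaceSize(inputA, axis, inputB, output,
                                             &workspaceSize, &executor);
    assert(ret == ACL_SUCCESS);
    // 2) obtain workspace from the runtime (if any is needed), then launch
    void *workspaceAddr =
        workspaceSize > 0 ? context->getWorkspace(workspaceSize) : nullptr;
    ret = aclnnGatherV2(workspaceAddr, workspaceSize, executor,
                        context->ASCENDHandle());
    assert(ret == ACL_SUCCESS);
    // 3) block on the stream before the host reads the result back
    ret = aclrtSynchronizeStream(context->ASCENDHandle());
    assert(ret == ACL_SUCCESS);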
@@ -23,12 +23,12 @@ class MatmulAclnn : public ASCENDKernelWithoutConfig {
         auto outD = op->getOutput()->getDims();
         auto outS = op->getOutput()->getStride();

-        std::vector<int64_t> selfDim = MycastTo64(selfD);
-        std::vector<int64_t> selfStride = MycastTo64(selfS);
-        std::vector<int64_t> matDim = MycastTo64(matD);
-        std::vector<int64_t> matStride = MycastTo64(matS);
-        std::vector<int64_t> outputDim = MycastTo64(outD);
-        std::vector<int64_t> outputStride = MycastTo64(outS);
+        std::vector<int64_t> selfDim = castTo64(selfD);
+        std::vector<int64_t> selfStride = castTo64(selfS);
+        std::vector<int64_t> matDim = castTo64(matD);
+        std::vector<int64_t> matStride = castTo64(matS);
+        std::vector<int64_t> outputDim = castTo64(outD);
+        std::vector<int64_t> outputStride = castTo64(outS);

         auto selfTensor = aclCreateTensor(
             selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
@@ -29,10 +29,10 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
         auto outD = op->getOutput()->getDims();
         auto outS = op->getOutput()->getStride();

-        std::vector<int64_t> selfDim = MycastTo64(selfD);
-        std::vector<int64_t> selfStride = MycastTo64(selfS);
-        std::vector<int64_t> outputDim = MycastTo64(outD);
-        std::vector<int64_t> outputStride = MycastTo64(outS);
+        std::vector<int64_t> selfDim = castTo64(selfD);
+        std::vector<int64_t> selfStride = castTo64(selfS);
+        std::vector<int64_t> outputDim = castTo64(outD);
+        std::vector<int64_t> outputStride = castTo64(outS);

         aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
         aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
@@ -0,0 +1,127 @@
#include "operators/reduce.h"
#include "aclnnop/aclnn_mean.h"
#include "aclnnop/aclnn_reduce_sum.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class MeanAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ReduceBaseObj>(_op);
        IT_ASSERT(op->getDType() == DataType::Float32);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto axes_set = op->getAxes();
        std::vector<int> axes;
        axes.assign(axes_set.begin(), axes_set.end());

        bool KeepDim = op->getKeepDims();

        auto a = op->getInputs(0)->getDims();
        auto aS = op->getInputs(0)->getStride();
        auto c = op->getOutput()->getDims();
        auto cS = op->getOutput()->getStride();

        std::vector<int64_t> aDim = castTo64(a);
        std::vector<int64_t> aStride = castTo64(aS);
        std::vector<int64_t> cDim = castTo64(c);
        std::vector<int64_t> cStride = castTo64(cS);
        std::vector<int64_t> axes_64 = castTo64(axes);

        auto inputA = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
        auto output = aclCreateTensor(
            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
        aclIntArray *dim = aclCreateIntArray(axes_64.data(), axes_64.size());

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnMeanV2GetWorkspaceSize(
            inputA, dim, KeepDim, true, output, &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnMeanV2(workspaceAddr, workspaceSize, executor,
                          context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

class ReduceSumAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ReduceBaseObj>(_op);
        IT_ASSERT(op->getDType() == DataType::Float32);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto axes_set = op->getAxes();
        std::vector<int> axes;
        axes.assign(axes_set.begin(), axes_set.end());

        bool KeepDim = op->getKeepDims();

        auto a = op->getInputs(0)->getDims();
        auto aS = op->getInputs(0)->getStride();
        auto c = op->getOutput()->getDims();
        auto cS = op->getOutput()->getStride();

        std::vector<int64_t> aDim = castTo64(a);
        std::vector<int64_t> aStride = castTo64(aS);
        std::vector<int64_t> cDim = castTo64(c);
        std::vector<int64_t> cStride = castTo64(cS);
        std::vector<int64_t> axes_64 = castTo64(axes);

        auto inputA = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
        auto output = aclCreateTensor(
            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
        aclIntArray *dim = aclCreateIntArray(axes_64.data(), axes_64.size());

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnReduceSumGetWorkspaceSize(
            inputA, dim, KeepDim, ACL_FLOAT, output, &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnReduceSum(workspaceAddr, workspaceSize, executor,
                             context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::ReduceMean, MeanAclnn,
                "reduceMean_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::ReduceSum, ReduceSumAclnn,
                "reduceSum_ASCEND_float");
}; // namespace infini
@@ -4,68 +4,52 @@
 #include "ascend/ascend_runtime.h"

 namespace infini {

 class CopyAclnn : public ASCENDKernelWithoutConfig {

-    void compute(const Operator &_op,
+    void compute(const Operator &op,
                  const RuntimeObj *_context) const override {
-        auto op = as<MatmulObj>(_op);
         auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
-        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());

-        auto selfD = op->getInputs(0)->getDims();
-        auto selfS = op->getInputs(0)->getStride();
-        auto matD = op->getInputs(1)->getDims();
-        auto matS = op->getInputs(1)->getStride();
-        auto outD = op->getOutput()->getDims();
-        auto outS = op->getOutput()->getStride();
+        auto aD = op->getInputs(0)->getDims();
+        auto aS = op->getInputs(0)->getStride();

-        std::vector<int64_t> selfDim = MycastTo64(selfD);
-        std::vector<int64_t> selfStride = MycastTo64(selfS);
-        std::vector<int64_t> matDim = MycastTo64(matD);
-        std::vector<int64_t> matStride = MycastTo64(matS);
-        std::vector<int64_t> outputDim = MycastTo64(outD);
-        std::vector<int64_t> outputStride = MycastTo64(outS);
+        std::vector<int64_t> aDim = castTo64(aD);
+        std::vector<int64_t> aStride = castTo64(aS);

-        auto selfTensor = aclCreateTensor(
-            selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, selfDim.data(), selfDim.size(), aData);
-        auto matTensor = aclCreateTensor(
-            matDim.data(), matDim.size(), ACL_FLOAT, matStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, matDim.data(), matDim.size(), bData);
-        auto outputTensor =
-            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
-                            outputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
-                            outputDim.data(), outputDim.size(), cData);
+        auto srcTensor = aclCreateTensor(
+            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
+        auto outputTensor = aclCreateTensor(
+            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), cData);

         uint64_t workspaceSize = 0;
         aclOpExecutor *executor;

-        auto ret = aclnnMatmulGetWorkspaceSize(
-            selfTensor, matTensor, outputTensor, 1, &workspaceSize, &executor);
+        auto ret = aclnnInplaceCopyGetWorkspaceSize(outputTensor, srcTensor,
+                                                    &workspaceSize, &executor);
         void *workspaceAddr = nullptr;
         if (workspaceSize > 0) {
             workspaceAddr = context->getWorkspace(workspaceSize);
         }
         assert(ret == ACL_SUCCESS);
-        ret = aclnnMatmul(workspaceAddr, workspaceSize, executor,
-                          context->ASCENDHandle());
+        ret = aclnnInplaceCopy(workspaceAddr, workspaceSize, executor,
+                               context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

         // aclDestroyTensor(selfTensor);
         // aclDestroyTensor(matTensor);
         // aclDestroyTensor(outputTensor);

         return;
     }
 };

-REGISTER_KERNEL(Device::ASCEND, OpType::MatMul, MatmulAclnn,
-                "matmul_ASCEND_float");
+REGISTER_KERNEL(Device::ASCEND, OpType::Reshape, CopyAclnn,
+                "reshape_ASCEND_float");
+REGISTER_KERNEL(Device::ASCEND, OpType::Unsqueeze, CopyAclnn,
+                "unsqueeze_ASCEND_float");
+REGISTER_KERNEL(Device::ASCEND, OpType::Squeeze, CopyAclnn,
+                "squeeze_ASCEND_float");
 }; // namespace infini
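Reshape, Squeeze and Unsqueeze all preserve the element count and, for contiguous tensors, the linear memory layout, which is why the single CopyAclnn kernel above can back all three registrations. A brief, hedged illustration of that assumption (shapes are examples, not taken from the diff):

    // Only metadata changes between these operators; the buffer is identical.
    //   Reshape   {1, 2, 2, 3} -> {1, 2, 6}     (12 floats either way)
    //   Squeeze   {1, 2, 2, 3} -> {2, 2, 3}
    //   Unsqueeze {2, 2, 3}    -> {1, 2, 2, 3}
    // The kernel therefore describes both source and destination with the
    // input dims/strides (aDim/aStride) and lets aclnnInplaceCopy move the
    // bytes from aData into cData.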
@@ -0,0 +1,77 @@
#include "operators/slice.h"
#include "aclnnop/aclnn_slice_v2.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class SliceAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<SliceObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto starts_32 = op->getStarts();
        auto ends_32 = op->getEnds();
        auto steps_32 = op->getSteps();

        auto a = op->getInputs(0)->getDims();
        auto aS = op->getInputs(0)->getStride();
        auto c = op->getOutput()->getDims();
        auto cS = op->getOutput()->getStride();

        std::vector<int64_t> aDim = castTo64(a);
        std::vector<int64_t> aStride = castTo64(aS);
        std::vector<int64_t> cDim = castTo64(c);
        std::vector<int64_t> cStride = castTo64(cS);

        std::vector<int64_t> starts_64 = castTo64(starts_32);
        std::vector<int64_t> ends_64 = castTo64(ends_32);
        std::vector<int64_t> steps_64 = castTo64(steps_32);

        vector<int64_t> axes_64 = vector<int64_t>(starts_32.size(), 0);
        for (int i = 0; i < int(starts_32.size()); i++) {
            axes_64[i] = i;
        }

        auto inputA = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
        auto output = aclCreateTensor(
            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
        aclIntArray *starts =
            aclCreateIntArray(starts_64.data(), starts_64.size());
        aclIntArray *ends = aclCreateIntArray(ends_64.data(), ends_64.size());
        aclIntArray *steps =
            aclCreateIntArray(steps_64.data(), steps_64.size());
        aclIntArray *axes = aclCreateIntArray(axes_64.data(), axes_64.size());

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret =
            aclnnSliceV2GetWorkspaceSize(inputA, starts, ends, axes, steps,
                                         output, &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnSliceV2(workspaceAddr, workspaceSize, executor,
                           context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::Slice, SliceAclnn,
                "slice_ASCEND_float");
}; // namespace infini
@@ -1,4 +1,3 @@
-
 #include "operators/softmax.h"
 #include "aclnnop/level2/aclnn_softmax.h"
 #include "ascend/ascend_kernel_without_config.h"
@@ -18,13 +17,13 @@ class SoftmaxAclnn : public ASCENDKernelWithoutConfig {

         auto a = op->getInputs(0)->getDims();
         auto aS = op->getInputs(0)->getStride();
-        auto c = op->getInputs(0)->getDims();
-        auto cS = op->getInputs(0)->getStride();
+        auto c = op->getOutput()->getDims();
+        auto cS = op->getOutput()->getStride();

-        std::vector<int64_t> aDim = MycastTo64(a);
-        std::vector<int64_t> aStride = MycastTo64(aS);
-        std::vector<int64_t> cDim = MycastTo64(c);
-        std::vector<int64_t> cStride = MycastTo64(cS);
+        std::vector<int64_t> aDim = castTo64(a);
+        std::vector<int64_t> aStride = castTo64(aS);
+        std::vector<int64_t> cDim = castTo64(c);
+        std::vector<int64_t> cStride = castTo64(cS);

         auto input = aclCreateTensor(
             aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
@@ -0,0 +1,72 @@
#include "operators/split.h"
#include "aclnnop/aclnn_split_tensor.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class SplitAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<SplitObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());

        auto a = op->getInputs(0)->getDims();
        auto aS = op->getInputs(0)->getStride();
        std::vector<int64_t> aDim = castTo64(a);
        std::vector<int64_t> aStride = castTo64(aS);

        int64_t dim = op->getDim();
        int num = op->numOutputs();
        int dimSize = a.at(op->getDim());
        uint64_t splitSections = dimSize / num;

        auto inputA = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);

        std::vector<aclTensor *> outputsData{};
        for (int i = 0; i < num; ++i) {
            auto c = op->getOutput(i)->getDims();
            auto cS = op->getOutput(i)->getStride();

            std::vector<int64_t> cDim = castTo64(c);
            std::vector<int64_t> cStride = castTo64(cS);

            void *const cData = (op->getOutput(i)->getRawDataPtr<void *>());

            aclTensor *tmpTensor = aclCreateTensor(
                cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
                aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);

            outputsData.push_back(tmpTensor);
        }
        aclTensorList *tensorList =
            aclCreateTensorList(outputsData.data(), outputsData.size());

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnSplitTensorGetWorkspaceSize(
            inputA, splitSections, dim, tensorList, &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnSplitTensor(workspaceAddr, workspaceSize, executor,
                               context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::Split, SplitAclnn,
                "split_ASCEND_float");
}; // namespace infini
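Note that the kernel above computes splitSections as dimSize / num, i.e. it assumes the split axis divides evenly among the outputs; a quick illustration of what that covers (integer values are hypothetical, chosen to match the split test later in this commit):

    // splitSections = dimSize / num (integer division) — an even split is assumed.
    //   dims {1, 2, 2, 3}, split on axis 3 into num = 3 -> splitSections = 1  (OK)
    //   dims {1, 2, 2, 3}, split on axis 3 into num = 2 -> 3 / 2 = 1; the last
    //   element along the axis would not be covered, so uneven splits are not
    //   handled by this kernel as written.
    int dimSize = 3, num = 3;
    uint64_t splitSections = dimSize / num; // 1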
@@ -0,0 +1,61 @@
#include "operators/transpose.h"
#include "aclnnop/level2/aclnn_permute.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

class PermuteAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<TransposeObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto a = op->getInputs(0)->getDims();
        auto aS = op->getInputs(0)->getStride();
        auto c = op->getOutput()->getDims();
        auto cS = op->getOutput()->getStride();

        std::vector<int64_t> aDim = castTo64(a);
        std::vector<int64_t> aStride = castTo64(aS);
        std::vector<int64_t> cDim = castTo64(c);
        std::vector<int64_t> cStride = castTo64(cS);

        auto _permute = op->getPermute();
        std::vector<int64_t> permute = castTo64(_permute);

        auto inputA = aclCreateTensor(
            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
        aclIntArray *dims = aclCreateIntArray(permute.data(), permute.size());
        auto output = aclCreateTensor(
            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnPermuteGetWorkspaceSize(inputA, dims, output,
                                                &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnPermute(workspaceAddr, workspaceSize, executor,
                           context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::Transpose, PermuteAclnn,
                "transpose_ASCEND_float");
}; // namespace infini
@@ -0,0 +1,54 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/reshape.h"
#include "operators/squeeze.h"
#include "operators/unsqueeze.h"

#include "test.h"

namespace infini {

template <class T>
void testReshape(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape, const Shape &outputShape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu = npuGraph->cloneTensor(inputCpu);
    auto npuOp = npuGraph->addOp<T>(inputNpu, nullptr, outputShape);
    npuGraph->dataMalloc();
    inputNpu->setData(generator);
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

    // Check
    inputCpu->print();
    inputCpu->printData();
    outputNpu2Cpu->print();
    outputNpu2Cpu->printData();
    EXPECT_TRUE(inputCpu->equalData(outputNpu2Cpu, 1e-3));
}

TEST(ascend_Unary, run) {
    aclInit(nullptr);
    testReshape<ReshapeObj>(IncrementalGenerator(), Shape{1, 2, 2, 3},
                            Shape{1, 2, 6});
    testReshape<SqueezeObj>(IncrementalGenerator(), Shape{1, 2, 2, 3},
                            Shape{0});
    testReshape<UnsqueezeObj>(IncrementalGenerator(), Shape{1, 2, 2, 3},
                              Shape{4});
    aclFinalize();
}

} // namespace infini
@@ -11,14 +11,14 @@ namespace infini {
 template <class T>
 void testElementWise(
     const std::function<void(void *, size_t, DataType)> &generator,
-    const Shape &shape) {
+    const Shape &shape0, const Shape &shape) {
     // Runtime
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     auto npuRuntime = make_ref<ASCENDRuntimeObj>();

     // Build input data on CPU
     Tensor inputCpu1 =
-        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+        make_ref<TensorObj>(shape0, DataType::Float32, cpuRuntime);
     Tensor inputCpu2 =
         make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
     inputCpu1->dataMalloc();
@@ -26,6 +26,10 @@ void testElementWise(
     inputCpu1->setData(generator);
     inputCpu2->setData(generator);

+    inputCpu1->print();
+    inputCpu1->printData();
+    inputCpu2->print();
+    inputCpu2->printData();
     // NPU
     Graph npuGraph = make_ref<GraphObj>(npuRuntime);
     auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
@@ -39,23 +43,21 @@ void testElementWise(
     auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

     // Check
-    inputCpu1->print();
-    inputCpu1->printData();
-    inputCpu2->print();
-    inputCpu2->printData();
-
     outputNpu2Cpu->print();
     outputNpu2Cpu->printData();
     EXPECT_TRUE(1);
 }

 TEST(ascend_ElementWise, run) {
-    aclInit(nullptr);
-    testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    testElementWise<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    aclFinalize();
+    // aclInit(nullptr);
+    // testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    // testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    // testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testElementWise<DivObj>(IncrementalGenerator(), Shape{1},
+                            Shape{1, 2, 2, 3});
+    // testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    // aclFinalize();
 }

 } // namespace infini
@@ -0,0 +1,96 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/gather.h"

#include "test.h"

namespace infini {

TEST(ascend_Unary, run) {
    aclInit(nullptr);
    {
        // Runtime
        Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
        auto npuRuntime = make_ref<ASCENDRuntimeObj>();

        // Build input data on CPU
        Tensor inputCpu =
            make_ref<TensorObj>(Shape{3, 2}, DataType::Float32, cpuRuntime);
        Tensor indexCpu =
            make_ref<TensorObj>(Shape{2, 2}, DataType::Int32, cpuRuntime);

        // NPU
        Graph npuGraph = make_ref<GraphObj>(npuRuntime);
        auto inputNpu = npuGraph->cloneTensor(inputCpu);
        auto indexNpu = npuGraph->cloneTensor(indexCpu);
        auto npuOp = npuGraph->addOp<GatherObj>(inputNpu, indexNpu, nullptr, 0);
        npuGraph->dataMalloc();
        inputNpu->copyin(vector<float>{1, 2, 3, 4, 5, 6});
        indexNpu->copyin(vector<int>{0, 1, 1, 2});
        npuRuntime->run(npuGraph);
        auto outputNpu = npuOp->getOutput();
        auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

        // Check
        EXPECT_TRUE(
            outputNpu2Cpu->equalData(vector<float>{1, 2, 3, 4, 3, 4, 5, 6}));
    }
    {
        // Runtime
        Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
        auto npuRuntime = make_ref<ASCENDRuntimeObj>();

        // Build input data on CPU
        Tensor inputCpu =
            make_ref<TensorObj>(Shape{3, 3}, DataType::Float32, cpuRuntime);
        Tensor indexCpu =
            make_ref<TensorObj>(Shape{1, 2}, DataType::Int32, cpuRuntime);

        // NPU
        Graph npuGraph = make_ref<GraphObj>(npuRuntime);
        auto inputNpu = npuGraph->cloneTensor(inputCpu);
        auto indexNpu = npuGraph->cloneTensor(indexCpu);
        auto npuOp = npuGraph->addOp<GatherObj>(inputNpu, indexNpu, nullptr, 1);
        npuGraph->dataMalloc();
        inputNpu->setData(IncrementalGenerator());
        indexNpu->copyin(vector<int>{0, 2});
        npuRuntime->run(npuGraph);
        auto outputNpu = npuOp->getOutput();
        auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

        // Check
        EXPECT_TRUE(outputNpu2Cpu->equalData(vector<float>{0, 2, 3, 5, 6, 8}));
    }
    {
        // Runtime
        Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
        auto npuRuntime = make_ref<ASCENDRuntimeObj>();

        // Build input data on CPU
        Tensor inputCpu =
            make_ref<TensorObj>(Shape{3, 2}, DataType::Float32, cpuRuntime);
        Tensor indexCpu =
            make_ref<TensorObj>(Shape{2, 2}, DataType::Int64, cpuRuntime);

        // NPU
        Graph npuGraph = make_ref<GraphObj>(npuRuntime);
        auto inputNpu = npuGraph->cloneTensor(inputCpu);
        auto indexNpu = npuGraph->cloneTensor(indexCpu);
        auto npuOp = npuGraph->addOp<GatherObj>(inputNpu, indexNpu, nullptr, 0);
        npuGraph->dataMalloc();
        inputNpu->copyin(std::vector<float>{1.0, 1.2, 2.3, 3.4, 4.5, 5.7});
        indexNpu->copyin(vector<int64_t>{0, 1, 1, 2});
        npuRuntime->run(npuGraph);
        auto outputNpu = npuOp->getOutput();
        auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

        // Check
        EXPECT_TRUE(outputNpu2Cpu->equalData(
            vector<float>{1.0, 1.2, 2.3, 3.4, 2.3, 3.4, 4.5, 5.7}));
    }
    aclFinalize();
}

} // namespace infini
@@ -0,0 +1,84 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/reduce.h"

#include "test.h"

namespace infini {

template <typename ReduceObjT>
void test_reduce(const Shape &shape, const vector<float> &data,
                 const optional<const vector<int>> &axes, bool keepDims,
                 const vector<float> &ExpectData) {
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);

    // Build NPU graph
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu = npuGraph->cloneTensor(inputCpu);
    auto op = npuGraph->addOp<ReduceObjT>(inputNpu, nullptr, axes, keepDims);

    // allocate NPU memory
    npuGraph->dataMalloc();
    inputNpu->copyin(data);

    // Execute on NPU
    npuRuntime->run(npuGraph);

    // clone NPU output to CPU
    auto outputNpu = op->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

    // check results on CPU
    EXPECT_TRUE(outputNpu2Cpu->equalData(ExpectData));
}

TEST(ascend_ReduceMean, run) {
    aclInit(nullptr);
    test_reduce<ReduceMeanObj>(
        Shape{3, 2, 2}, vector<float>{5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2},
        std::nullopt, true, vector<float>{18.25});
    test_reduce<ReduceMeanObj>(
        Shape{1, 3, 2, 2, 1},
        vector<float>{5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2}, std::nullopt,
        false, vector<float>{18.25});

    test_reduce<ReduceMeanObj>(
        Shape{2, 3, 2, 2},
        vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                      12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
        vector<int>{1, 2}, false, vector<float>{5, 6, 17, 18});
    test_reduce<ReduceMeanObj>(
        Shape{2, 3, 2, 2, 1},
        vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                      12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
        vector<int>{1, 2}, true, vector<float>{5, 6, 17, 18});
    aclFinalize();
}

TEST(ascend_ReduceSum, run) {
    test_reduce<ReduceSumObj>(Shape{3, 2, 2},
                              vector<float>{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                              std::nullopt, true, vector<float>{12});
    test_reduce<ReduceSumObj>(Shape{1, 3, 2, 2, 1},
                              vector<float>{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                              std::nullopt, false, vector<float>{12});

    test_reduce<ReduceSumObj>(
        Shape{2, 3, 2, 2},
        vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                      12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
        vector<int>{1, 2}, false, vector<float>{30, 36, 102, 108});
    test_reduce<ReduceSumObj>(
        Shape{2, 3, 2, 2, 1},
        vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                      12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
        vector<int>{1, 2}, true, vector<float>{30, 36, 102, 108});
}

} // namespace infini
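The expected values in these reduce tests can be checked by hand; for instance, for the full-tensor mean and the axes {1, 2} case:

    // (5+1+20+2+30+1+40+2+55+1+60+2) = 219, and 219 / 12 = 18.25, matching
    // vector<float>{18.25} above. For axes {1,2} on Shape{2,3,2,2}, each of
    // the 4 outputs averages 6 inputs, e.g. (0+2+4+6+8+10)/6 = 5.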
@@ -0,0 +1,41 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/slice.h"

#include "test.h"

namespace infini {

TEST(ascend_Unary, run) {
    aclInit(nullptr);
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu =
        make_ref<TensorObj>(Shape{3, 2, 1, 5}, DataType::Float32, cpuRuntime);
    // inputCpu->dataMalloc();
    // inputCpu->setData(IncrementalGenerator());

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu = npuGraph->cloneTensor(inputCpu);
    auto npuOp = npuGraph->addOp<SliceObj>(inputNpu, nullptr, vector<int>{1, 1},
                                           vector<int>{2, 5}, vector<int>{0, 3},
                                           std::nullopt);
    npuGraph->dataMalloc();
    inputNpu->setData(IncrementalGenerator());
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

    // Check
    EXPECT_TRUE(outputNpu2Cpu->equalData(
        vector<float>{11, 12, 13, 14, 16, 17, 18, 19}));
    aclFinalize();
}

} // namespace infini
@@ -0,0 +1,50 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/split.h"

#include "test.h"

namespace infini {

template <class T>
void testSplit(const std::function<void(void *, size_t, DataType)> &generator,
               const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);
    // GPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu = npuGraph->cloneTensor(inputCpu);
    auto gpuOp = npuGraph->addOp<T>(inputNpu, std::nullopt, 3, 3);
    npuGraph->dataMalloc();
    inputNpu->setData(generator);
    npuRuntime->run(npuGraph);
    auto o0Cpu = gpuOp->getOutput(0)->clone(cpuRuntime);
    auto o1Cpu = gpuOp->getOutput(1)->clone(cpuRuntime);
    auto o2Cpu = gpuOp->getOutput(2)->clone(cpuRuntime);
    // Check
    inputCpu->print();
    inputCpu->printData();
    o0Cpu->print();
    o0Cpu->printData();
    o1Cpu->print();
    o1Cpu->printData();
    o2Cpu->print();
    o2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(ascend_Split, run) {
    aclInit(nullptr);
    testSplit<SplitObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    aclFinalize();
}

} // namespace infini
@@ -0,0 +1,49 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/transpose.h"

#include "test.h"

namespace infini {

template <class T>
void testTranspose(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape, const Shape &permute) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu = npuGraph->cloneTensor(inputCpu);
    auto npuOp = npuGraph->addOp<T>(inputNpu, nullptr, permute);
    npuGraph->dataMalloc();
    inputNpu->setData(generator);
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

    // Check
    inputCpu->print();
    inputCpu->printData();
    outputNpu2Cpu->print();
    outputNpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(ascend_Unary, run) {
    aclInit(nullptr);
    testTranspose<TransposeObj>(IncrementalGenerator(), Shape{1, 1, 2, 3},
                                vector<int>{0, 1, 3, 2});
    aclFinalize();
}

} // namespace infini
@@ -18,7 +18,7 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
     // Build input data on CPU
     Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);

-    // GPU
+    // NPU
     Graph npuGraph = make_ref<GraphObj>(npuRuntime);
     auto inputNpu = npuGraph->cloneTensor(inputCpu);
     auto npuOp = npuGraph->addOp<T>(inputNpu, nullptr);