forked from jiuyuan/InfiniTensor

add softmax/element_wise kernel

parent c970c93ba1
commit f6176124ec
@@ -1 +1 @@
-Subproject commit b896cec2dba5b8522b141ac4f89eb43074ee1b98
+Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77
@ -18,28 +18,34 @@ namespace infini {
|
||||||
|
|
||||||
class ASCENDRuntimeObj : public RuntimeObj {
|
class ASCENDRuntimeObj : public RuntimeObj {
|
||||||
private:
|
private:
|
||||||
aclrtContext aclnn;
|
aclrtContext context;
|
||||||
aclrtStream stream;
|
aclrtStream stream;
|
||||||
ASCENDPtr workspace;
|
ASCENDPtr workspace = nullptr;
|
||||||
size_t workspaceSize;
|
size_t workspaceSize;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ASCENDRuntimeObj(int deviceId = 0) : RuntimeObj(Device::ASCEND, deviceId) {
|
ASCENDRuntimeObj(int deviceId = 0) : RuntimeObj(Device::ASCEND, deviceId) {
|
||||||
|
// #ifndef _ACL_INIT
|
||||||
|
// #define _ACL_INIT
|
||||||
|
// aclInit(nullptr);
|
||||||
|
// // auto ret_init =
|
||||||
|
// // CHECK_RET(ret == ACL_SUCCESS,
|
||||||
|
// // LOG_PRINT("aclInit failed. ERROR: %d\n",
|
||||||
|
// ret));
|
||||||
|
// #endif
|
||||||
auto ret = aclrtSetDevice(deviceId);
|
auto ret = aclrtSetDevice(deviceId);
|
||||||
CHECK_RET(ret == ACL_SUCCESS,
|
CHECK_RET(ret == ACL_SUCCESS,
|
||||||
LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
|
LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
|
||||||
ret = aclrtCreateContext(&aclnn, deviceId);
|
ret = aclrtCreateContext(&context, deviceId);
|
||||||
CHECK_RET(ret == ACL_SUCCESS,
|
CHECK_RET(ret == ACL_SUCCESS,
|
||||||
LOG_PRINT("aclrtCreateContext failed. ERROR: %d\n", ret));
|
LOG_PRINT("aclrtCreateContext failed. ERROR: %d\n", ret));
|
||||||
ret = aclrtSetCurrentContext(aclnn);
|
ret = aclrtSetCurrentContext(context);
|
||||||
CHECK_RET(ret == ACL_SUCCESS,
|
CHECK_RET(ret == ACL_SUCCESS,
|
||||||
LOG_PRINT("aclrtSetCurrentContext failed. ERROR: %d\n", ret));
|
LOG_PRINT("aclrtSetCurrentContext failed. ERROR: %d\n", ret));
|
||||||
ret = aclrtCreateStream(&stream);
|
ret = aclrtCreateStream(&stream);
|
||||||
CHECK_RET(ret == ACL_SUCCESS,
|
CHECK_RET(ret == ACL_SUCCESS,
|
||||||
LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret));
|
LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret));
|
||||||
ret = aclInit(nullptr);
|
|
||||||
CHECK_RET(ret == ACL_SUCCESS,
|
|
||||||
LOG_PRINT("aclInit failed. ERROR: %d\n", ret));
|
|
||||||
// 10GB for Longformer
|
// 10GB for Longformer
|
||||||
// size_t longformerNum = 3lu * (1 << 30);
|
// size_t longformerNum = 3lu * (1 << 30);
|
||||||
workspaceSize = 3ll << 30; // 3 GB
|
workspaceSize = 3ll << 30; // 3 GB
|
||||||
|
@ -50,9 +56,9 @@ class ASCENDRuntimeObj : public RuntimeObj {
|
||||||
virtual ~ASCENDRuntimeObj() {
|
virtual ~ASCENDRuntimeObj() {
|
||||||
dealloc(workspace);
|
dealloc(workspace);
|
||||||
aclrtDestroyStream(stream);
|
aclrtDestroyStream(stream);
|
||||||
aclrtDestroyContext(aclnn);
|
aclrtDestroyContext(context);
|
||||||
aclrtResetDevice(deviceId);
|
aclrtResetDevice(deviceId);
|
||||||
aclFinalize();
|
// aclFinalize();
|
||||||
}
|
}
|
||||||
string toString() const override;
|
string toString() const override;
|
||||||
|
|
||||||
|
@ -68,7 +74,7 @@ class ASCENDRuntimeObj : public RuntimeObj {
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void dealloc(void *ptr) override { aclrtFree(ptr); }
|
void dealloc(void *ptr) override { aclrtFree(ptr); }
|
||||||
aclrtContext *ASCENDHandle() const { return nullptr; }
|
aclrtStream ASCENDHandle() const { return stream; }
|
||||||
ASCENDPtr getWorkspace(size_t size) const {
|
ASCENDPtr getWorkspace(size_t size) const {
|
||||||
IT_ASSERT(size <= workspaceSize);
|
IT_ASSERT(size <= workspaceSize);
|
||||||
return workspace;
|
return workspace;
|
||||||
|
@ -76,19 +82,19 @@ class ASCENDRuntimeObj : public RuntimeObj {
|
||||||
|
|
||||||
void copyBlobFromCPU(void *dst, const void *src,
|
void copyBlobFromCPU(void *dst, const void *src,
|
||||||
size_t bytes) const override {
|
size_t bytes) const override {
|
||||||
aclrtMemcpy(dst, 1024 * 1024 * 1024, const_cast<void *>(src), bytes,
|
aclrtMemcpy(dst, bytes, const_cast<void *>(src), bytes,
|
||||||
ACL_MEMCPY_HOST_TO_DEVICE);
|
ACL_MEMCPY_HOST_TO_DEVICE);
|
||||||
}
|
}
|
||||||
|
|
||||||
void copyBlobToCPU(void *dst, const void *src,
|
void copyBlobToCPU(void *dst, const void *src,
|
||||||
size_t bytes) const override {
|
size_t bytes) const override {
|
||||||
aclrtMemcpy(dst, 1024 * 1024 * 1024, const_cast<void *>(src), bytes,
|
aclrtMemcpy(dst, bytes, const_cast<void *>(src), bytes,
|
||||||
ACL_MEMCPY_DEVICE_TO_HOST);
|
ACL_MEMCPY_DEVICE_TO_HOST);
|
||||||
}
|
}
|
||||||
|
|
||||||
void copyBlobInsideRuntime(void *dst, const void *src,
|
void copyBlobInsideRuntime(void *dst, const void *src,
|
||||||
size_t bytes) const override {
|
size_t bytes) const override {
|
||||||
aclrtMemcpy(dst, 1024 * 1024 * 1024, const_cast<void *>(src), bytes,
|
aclrtMemcpy(dst, bytes, const_cast<void *>(src), bytes,
|
||||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
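Two things change in this header: aclrtMemcpy now receives the real destination size instead of a hard-coded 1 GiB (its second parameter is the destination capacity, so the old constant silently misreported large buffers), and the runtime now hands kernels a slice of one preallocated workspace instead of letting them malloc per call. A minimal, self-contained sketch of that workspace-reuse pattern, with plain malloc standing in for device allocation:

    // Sketch of the workspace-reuse pattern introduced above, with the ACL
    // runtime stubbed out: the runtime allocates one large buffer up front,
    // and every kernel borrows a prefix of it instead of calling
    // aclrtMalloc/aclrtFree on each invocation.
    #include <cassert>
    #include <cstdlib>

    class WorkspaceRuntime {
      private:
        void *workspace = nullptr; // preallocated once, owned by the runtime
        size_t workspaceSize;

      public:
        explicit WorkspaceRuntime(size_t bytes) : workspaceSize(bytes) {
            workspace = std::malloc(workspaceSize); // stand-in for device alloc
        }
        ~WorkspaceRuntime() { std::free(workspace); }

        // Mirrors ASCENDRuntimeObj::getWorkspace: no allocation, just a
        // bounds-checked view into the shared buffer.
        void *getWorkspace(size_t size) const {
            assert(size <= workspaceSize);
            return workspace;
        }
    };

    int main() {
        WorkspaceRuntime rt(1 << 20); // 1 MiB is enough for the demo
        // Two "kernels" reuse the same scratch memory sequentially.
        void *w1 = rt.getWorkspace(4096);
        void *w2 = rt.getWorkspace(65536);
        assert(w1 == w2); // same underlying buffer, no per-call malloc
        return 0;
    }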
@@ -13,8 +13,7 @@ void ASCENDRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
     std::map<OpType, int> opCnt;
     for (auto &op : graph->getOperators()) {
         // HACK: set correct data type
-        auto kernelAttrs =
-            KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
+        auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
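The KernelAttrs change above matches the REGISTER_KERNEL edits in the kernel files that follow: the lookup key shrinks from (device, opType, dtype) to (device, opType). A sketch of such a keyed registry with hypothetical stand-in types (the real KernelAttrs/KernelRegistry live elsewhere in InfiniTensor and are not part of this diff):

    // Hypothetical stand-ins for the registry keying; the real KernelAttrs /
    // KernelRegistry definitions are not reproduced here.
    #include <cassert>
    #include <functional>
    #include <map>
    #include <tuple>

    using Device = int;
    using OpTypeId = int;
    using Kernel = std::function<void()>;

    // After this commit the key is (device, opType); dtype no longer
    // participates in lookup, so one registered kernel serves whichever
    // dtypes it chooses to handle internally.
    using KernelAttrs = std::tuple<Device, OpTypeId>;

    int main() {
        std::map<KernelAttrs, Kernel> registry;
        registry[{/*ASCEND*/ 3, /*Softmax*/ 42}] = [] { /* launch kernel */ };
        // Lookup no longer needs op->getDType():
        auto it = registry.find({3, 42});
        assert(it != registry.end());
        it->second();
        return 0;
    }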
@@ -5,10 +5,8 @@

 namespace infini {

-
 class BatchNormAclnn : public ASCENDKernelWithoutConfig {

-
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
         auto op = as<BatchNormObj>(_op);
@@ -35,36 +33,31 @@ class BatchNormAclnn : public ASCENDKernelWithoutConfig {
         std::vector<int64_t> outputDim = MycastTo64(outD);
         std::vector<int64_t> outputStride = MycastTo64(outS);

-        //std::vector<int64_t> inputDim(in.size(), 1);
-        //for (size_t i = 0; i < a.size(); ++i) {
-        //    inputDim[i] = int64_t(in[i]);
-        //}
-        //std::vector<int64_t> inputStride(inS.size(), 1);
-        //for (size_t i = 0; i < inS.size(); ++i) {
-        //    inputStride[i] = int64_t(inS[i]);
-        //}
+        auto inputTensor =
+            aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
+                            inputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            inputDim.data(), inputDim.size(), inData);
+        auto outputTensor =
+            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
+                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            outputDim.data(), outputDim.size(), outData);

-        auto inputTensor = aclCreateTensor(
-            inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(), 0,
-            aclFormat::ACL_FORMAT_NCHW, inputDim.data(), inputDim.size(), inData);
-        auto outputTensor = aclCreateTensor(
-            outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
-            aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(), outData);
         auto meanTensor = aclCreateTensor(
             paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
             aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), meanData);
         auto varTensor = aclCreateTensor(
             paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
             aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), varData);
-        auto scaleTensor = aclCreateTensor(
-            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), scaleData);
+        auto scaleTensor =
+            aclCreateTensor(paraDim.data(), paraDim.size(), ACL_FLOAT,
+                            paraStride.data(), 0, aclFormat::ACL_FORMAT_ND,
+                            paraDim.data(), paraDim.size(), scaleData);
         auto biasTensor = aclCreateTensor(
             paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
             aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), biasData);
-        auto savemeanTensor = aclCreateTensor(
-            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), scaleData);
+        auto savemeanTensor =
+            aclCreateTensor(paraDim.data(), paraDim.size(), ACL_FLOAT,
+                            paraStride.data(), 0, aclFormat::ACL_FORMAT_ND,
+                            paraDim.data(), paraDim.size(), scaleData);
         auto saveinvstdTensor = aclCreateTensor(
             paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
             aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), biasData);
@@ -72,38 +65,35 @@ class BatchNormAclnn : public ASCENDKernelWithoutConfig {
         uint64_t workspaceSize = 0;
         aclOpExecutor *executor;

-        auto ret =
-            aclnnBatchNormGetWorkspaceSize(inputTensor, scaleTensor, biasTensor, meanTensor, varTensor, false, op->getMomentum(), op->getEps(), outputTensor, savemeanTensor, saveinvstdTensor, &workspaceSize, &executor);
+        auto ret = aclnnBatchNormGetWorkspaceSize(
+            inputTensor, scaleTensor, biasTensor, meanTensor, varTensor, false,
+            op->getMomentum(), op->getEps(), outputTensor, savemeanTensor,
+            saveinvstdTensor, &workspaceSize, &executor);
         void *workspaceAddr = nullptr;
         if (workspaceSize > 0) {
-            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST);
+            workspaceAddr = context->getWorkspace(workspaceSize);
         }
         assert(ret == ACL_SUCCESS);
         ret = aclnnBatchNorm(workspaceAddr, workspaceSize, executor,
                              context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

-        aclDestroyTensor(inputTensor);
-        aclDestroyTensor(outputTensor);
-        aclDestroyTensor(meanTensor);
-        aclDestroyTensor(varTensor);
-        aclDestroyTensor(scaleTensor);
-        aclDestroyTensor(biasTensor);
-        aclDestroyTensor(savemeanTensor);
-        aclDestroyTensor(saveinvstdTensor);
+        // aclDestroyTensor(inputTensor);
+        // aclDestroyTensor(outputTensor);
+        // aclDestroyTensor(meanTensor);
+        // aclDestroyTensor(varTensor);
+        // aclDestroyTensor(scaleTensor);
+        // aclDestroyTensor(biasTensor);
+        // aclDestroyTensor(savemeanTensor);
+        // aclDestroyTensor(saveinvstdTensor);

         return;
     }
 };

-REGISTER_KERNEL(Device::ASCEND, OpType::BatchNormalization, DataType::Float32, BatchNormAclnn,
+REGISTER_KERNEL(Device::ASCEND, OpType::BatchNormalization, BatchNormAclnn,
                 "batchnorm_ASCEND_float");
 }; // namespace infini
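BatchNorm is the first of many kernels in this commit that follow the same two-phase aclnn calling convention: plan the op and query its scratch requirement, borrow workspace from the runtime, launch via the executor, then synchronize the stream. A stubbed skeleton of that control flow; the acl* symbols below are local stand-ins, not the real ACL API:

    // Skeleton of the two-phase aclnn calling convention every kernel in
    // this commit follows; all acl* symbols here are local stubs.
    #include <cassert>
    #include <cstdint>

    using aclError = int;
    constexpr aclError ACL_SUCCESS = 0;
    struct aclOpExecutor;

    aclError aclnnOpGetWorkspaceSize(uint64_t *size, aclOpExecutor **exec) {
        *size = 0;        // phase 1: plan the op, report scratch requirement
        *exec = nullptr;
        return ACL_SUCCESS;
    }
    aclError aclnnOp(void *, uint64_t, aclOpExecutor *, void * /*stream*/) {
        return ACL_SUCCESS; // phase 2: enqueue the planned op on the stream
    }
    aclError aclrtSynchronizeStream(void *) { return ACL_SUCCESS; }

    int main() {
        uint64_t workspaceSize = 0;
        aclOpExecutor *executor = nullptr;
        auto ret = aclnnOpGetWorkspaceSize(&workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);

        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            // In the real kernels this is borrowed from the runtime's
            // preallocated pool (ASCENDRuntimeObj::getWorkspace), never
            // malloc'd per call.
        }
        void *stream = nullptr; // context->ASCENDHandle() in the real code
        ret = aclnnOp(workspaceAddr, workspaceSize, executor, stream);
        assert(ret == ACL_SUCCESS);
        ret = aclrtSynchronizeStream(stream);
        assert(ret == ACL_SUCCESS);
        return 0;
    }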
@@ -5,56 +5,33 @@

 namespace infini {

-
 class ConcatAclnn : public ASCENDKernelWithoutConfig {

-
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
         auto op = as<ConcatObj>(_op);
         auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
         int dim = op->getDim();
-        //int num = op->numInputs();
+        int num = op->numInputs();

-        std::vector<aclTensor*> inputsData{};
-        auto inD0 = op->getInputs(0)->getDims();
-        auto inS0 = op->getInputs(0)->getStride();
-        std::vector<int64_t> inputDim0 = MycastTo64(inD0);
-        std::vector<int64_t> inputStride0 = MycastTo64(inS0);
-
-        void *const inData0 = (op->getInputs(0)->getRawDataPtr<void *>());
-        auto tmpTensor0 = aclCreateTensor(
-            inputDim0.data(), inputDim0.size(), ACL_FLOAT, inputStride0.data(), 0,
-            aclFormat::ACL_FORMAT_ND, inputDim0.data(), inputDim0.size(), inData0);
-
-        inputsData.push_back(tmpTensor0);
-        auto inD = op->getInputs(1)->getDims();
-        auto inS = op->getInputs(1)->getStride();
-        std::vector<int64_t> inputDim = MycastTo64(inD);
-        std::vector<int64_t> inputStride = MycastTo64(inS);
-
-        void *const inData = (op->getInputs(1)->getRawDataPtr<void *>());
-        auto tmpTensor = aclCreateTensor(
-            inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, inputDim.data(), inputDim.size(), inData);
-
-        inputsData.push_back(tmpTensor);
-        //for (int i = 0; i < num; ++i) {
-        //    auto inD = op->getInputs(i)->getDims();
-        //    auto inS = op->getInputs(i)->getStride();
-        //    std::vector<int64_t> inputDim = MycastTo64(inD);
-        //    std::vector<int64_t> inputStride = MycastTo64(inS);
-
-        //    void *const inData = (op->getInputs(i)->getRawDataPtr<void *>());
-        //    auto tmpTensor = aclCreateTensor(
-        //        inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(), 0,
-        //        aclFormat::ACL_FORMAT_ND, inputDim.data(), inputDim.size(), inData);
-
-        //    inputsData.push_back(tmpTensor);
-        //}
-        aclTensorList* tensorList = aclCreateTensorList(inputsData.data(), inputsData.size());
+        std::vector<aclTensor *> inputsData{};
+
+        for (int i = 0; i < num; ++i) {
+            auto inD = op->getInputs(i)->getDims();
+            auto inS = op->getInputs(i)->getStride();
+            std::vector<int64_t> inputDim = MycastTo64(inD);
+            std::vector<int64_t> inputStride = MycastTo64(inS);
+
+            void *const inData = (op->getInputs(i)->getRawDataPtr<void *>());
+            auto tmpTensor =
+                aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
+                                inputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
+                                inputDim.data(), inputDim.size(), inData);
+
+            inputsData.push_back(tmpTensor);
+        }
+        aclTensorList *tensorList =
+            aclCreateTensorList(inputsData.data(), inputsData.size());

         void *const outData = (op->getOutput()->getRawDataPtr<void *>());
         auto outD = op->getOutput()->getDims();
@@ -62,39 +39,35 @@ class ConcatAclnn : public ASCENDKernelWithoutConfig {
         std::vector<int64_t> outputDim = MycastTo64(outD);
         std::vector<int64_t> outputStride = MycastTo64(outS);

-        auto outputTensor = aclCreateTensor(
-            outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, outputDim.data(), outputDim.size(), outData);
+        auto outputTensor =
+            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
+                            outputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
+                            outputDim.data(), outputDim.size(), outData);

         uint64_t workspaceSize = 0;
         aclOpExecutor *executor;

-        auto ret =
-            aclnnCatGetWorkspaceSize(tensorList, int64_t(dim), outputTensor, &workspaceSize, &executor);
+        auto ret = aclnnCatGetWorkspaceSize(
+            tensorList, int64_t(dim), outputTensor, &workspaceSize, &executor);
         void *workspaceAddr = nullptr;
         if (workspaceSize > 0) {
-            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST);
+            workspaceAddr = context->getWorkspace(workspaceSize);
         }
         assert(ret == ACL_SUCCESS);
         ret = aclnnCat(workspaceAddr, workspaceSize, executor,
                        context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

-        aclDestroyTensorList(tensorList);
-        aclDestroyTensor(outputTensor);
+        // aclDestroyTensorList(tensorList);
+        // aclDestroyTensor(outputTensor);

         return;
     }
 };

-REGISTER_KERNEL(Device::ASCEND, OpType::Concat, DataType::Float32, ConcatAclnn,
+REGISTER_KERNEL(Device::ASCEND, OpType::Concat, ConcatAclnn,
                 "concat_ASCEND_float");
 }; // namespace infini
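Concat, like most kernels here, leans on a MycastTo64 helper that this diff never defines. Judging from the hand-written loops it replaces in the old unary macro (last file in this commit), it is presumably an element-wise widening cast from int dims/strides to the int64_t vectors aclCreateTensor expects. A sketch under that assumption:

    // Presumed shape of the MycastTo64 helper used throughout these kernels
    // (not shown in this diff); labeled an assumption, not the actual
    // InfiniTensor definition.
    #include <cstdint>
    #include <vector>

    template <typename T>
    std::vector<int64_t> MycastTo64(const std::vector<T> &v) {
        std::vector<int64_t> out(v.size());
        for (size_t i = 0; i < v.size(); ++i)
            out[i] = static_cast<int64_t>(v[i]); // element-wise widening cast
        return out;
    }

    int main() {
        std::vector<int> dims = {2, 3, 4};
        auto dims64 = MycastTo64(dims); // {2, 3, 4} as int64_t
        return dims64.size() == 3 ? 0 : 1;
    }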
@@ -5,7 +5,6 @@

 namespace infini {

-
 class ConvAclnn : public ASCENDKernelWithoutConfig {

     void compute(const Operator &_op,
@@ -14,20 +13,23 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
         auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

         const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
-        //const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
-        //const int cpg = op->getChannelPerGroup();
-        //const int g = c / cpg;
+        // const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        // const int cpg = op->getChannelPerGroup();
+        // const int g = c / cpg;

         std::vector<int64_t> pads = {ph, pw};
-        //std::vector<int64_t> ksize = {r, s};
+        // std::vector<int64_t> ksize = {r, s};
         std::vector<int64_t> stride = {sh, sw};
         std::vector<int64_t> dilation = {dh, dw};
-        std::vector<int64_t> outputPadding = {sh-1, sw-1};
+        std::vector<int64_t> outputPadding = {sh - 1, sw - 1};

         aclIntArray *convpads = aclCreateIntArray(pads.data(), pads.size());
-        aclIntArray *convstride = aclCreateIntArray(stride.data(), stride.size());
-        aclIntArray *convdilation = aclCreateIntArray(dilation.data(), dilation.size());
-        aclIntArray *convOutputpadding = aclCreateIntArray(outputPadding.data(), outputPadding.size());
+        aclIntArray *convstride =
+            aclCreateIntArray(stride.data(), stride.size());
+        aclIntArray *convdilation =
+            aclCreateIntArray(dilation.data(), dilation.size());
+        aclIntArray *convOutputpadding =
+            aclCreateIntArray(outputPadding.data(), outputPadding.size());

         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@@ -47,46 +49,45 @@ class ConvAclnn : public ASCENDKernelWithoutConfig {
         std::vector<int64_t> outputDim = MycastTo64(outD);
         std::vector<int64_t> outputStride = MycastTo64(outS);

-        auto inputTensor = aclCreateTensor(
-            inputDim.data(), inputDim.size(), ACL_FLOAT, inputStride.data(), 0,
-            aclFormat::ACL_FORMAT_NCHW, inputDim.data(), inputDim.size(), aData);
-        auto weightTensor = aclCreateTensor(
-            weightDim.data(), weightDim.size(), ACL_FLOAT, weightStride.data(), 0,
-            aclFormat::ACL_FORMAT_NCHW, weightDim.data(), weightDim.size(), bData);
-        auto outputTensor = aclCreateTensor(
-            outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
-            aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(), cData);
+        auto inputTensor =
+            aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
+                            inputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            inputDim.data(), inputDim.size(), aData);
+        auto weightTensor =
+            aclCreateTensor(weightDim.data(), weightDim.size(), ACL_FLOAT,
+                            weightStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            weightDim.data(), weightDim.size(), bData);
+        auto outputTensor =
+            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
+                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            outputDim.data(), outputDim.size(), cData);

         uint64_t workspaceSize = 0;
         aclOpExecutor *executor;

-        auto ret =
-            aclnnConvolutionGetWorkspaceSize(inputTensor, weightTensor, nullptr, convstride, convpads, convdilation, false, convOutputpadding, 1, outputTensor, 1, &workspaceSize, &executor);
+        auto ret = aclnnConvolutionGetWorkspaceSize(
+            inputTensor, weightTensor, nullptr, convstride, convpads,
+            convdilation, false, convOutputpadding, 1, outputTensor, 1,
+            &workspaceSize, &executor);
         void *workspaceAddr = nullptr;
         if (workspaceSize > 0) {
-            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST);
+            workspaceAddr = context->getWorkspace(workspaceSize);
         }
         assert(ret == ACL_SUCCESS);
         ret = aclnnConvolution(workspaceAddr, workspaceSize, executor,
                                context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

-        aclDestroyTensor(inputTensor);
-        aclDestroyTensor(weightTensor);
-        aclDestroyTensor(outputTensor);
+        // aclDestroyTensor(inputTensor);
+        // aclDestroyTensor(weightTensor);
+        // aclDestroyTensor(outputTensor);

         return;
     }
 };

-REGISTER_KERNEL(Device::ASCEND, OpType::Conv, DataType::Float32, ConvAclnn,
-                "conv_ASCEND_float");
+REGISTER_KERNEL(Device::ASCEND, OpType::Conv, ConvAclnn, "conv_ASCEND_float");
 }; // namespace infini
@@ -0,0 +1,278 @@
+#include "operators/element_wise.h"
+#include "aclnnop/level2/aclnn_add.h"
+#include "aclnnop/level2/aclnn_div.h"
+#include "aclnnop/level2/aclnn_mul.h"
+#include "aclnnop/level2/aclnn_pow_tensor_tensor.h"
+#include "aclnnop/level2/aclnn_sub.h"
+#include "ascend/ascend_kernel_without_config.h"
+#include "ascend/ascend_runtime.h"
+
+namespace infini {
+
+/*
+class PowAclnn : public ASCENDKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        auto a = op->getInputs(0)->getDims();
+        auto aS = op->getInputs(0)->getStride();
+        auto b = op->getInputs(1)->getDims();
+        auto bS = op->getInputs(1)->getStride();
+        auto c = op->getInputs(0)->getDims();
+        auto cS = op->getInputs(0)->getStride();
+
+        std::vector<int64_t> aDim = MycastTo64(a);
+        std::vector<int64_t> aStride = MycastTo64(aS);
+        std::vector<int64_t> bDim = MycastTo64(b);
+        std::vector<int64_t> bStride = MycastTo64(bS);
+        std::vector<int64_t> cDim = MycastTo64(c);
+        std::vector<int64_t> cStride = MycastTo64(cS);
+
+        auto inputA = aclCreateTensor(
+            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
+        auto inputB = aclCreateTensor(
+            bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);
+        auto output = aclCreateTensor(
+            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
+
+        uint64_t workspaceSize = 0;
+        aclOpExecutor *executor;
+
+        auto ret = aclnnPowTensorTensorGetWorkspaceSize(
+            inputA, inputB, output, &workspaceSize, &executor);
+        void *workspaceAddr = nullptr;
+        if (workspaceSize > 0) {
+            workspaceAddr = context->getWorkspace(workspaceSize);
+        }
+        assert(ret == ACL_SUCCESS);
+        ret = aclnnPowTensorTensor(workspaceAddr, workspaceSize, executor,
+                                   context->ASCENDHandle());
+        assert(ret == ACL_SUCCESS);
+
+        ret = aclrtSynchronizeStream(context->ASCENDHandle());
+        assert(ret == ACL_SUCCESS);
+
+        ret = aclDestroyTensor(inputA);
+        ret = aclDestroyTensor(inputB);
+        ret = aclDestroyTensor(output);
+
+        return;
+    }
+};
+*/
+
+#define DEFINE_ELEMENT_WISE_Aclnn(prefix)                                    \
+    class prefix##Aclnn : public ASCENDKernelWithoutConfig {                 \
+        void compute(const Operator &_op,                                    \
+                     const RuntimeObj *_context) const override {            \
+            auto op = as<ElementWiseObj>(_op);                               \
+            auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context); \
+                                                                             \
+            void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
+            void *const bData = (op->getInputs(1)->getRawDataPtr<void *>()); \
+            void *const cData = (op->getOutput()->getRawDataPtr<void *>());  \
+                                                                             \
+            auto a = op->getInputs(0) -> getDims();                          \
+            auto aS = op->getInputs(0) -> getStride();                       \
+            auto b = op->getInputs(1) -> getDims();                          \
+            auto bS = op->getInputs(1) -> getStride();                       \
+            auto c = op->getInputs(0) -> getDims();                          \
+            auto cS = op->getInputs(0) -> getStride();                       \
+                                                                             \
+            std::vector<int64_t> aDim = MycastTo64(a);                       \
+            std::vector<int64_t> aStride = MycastTo64(aS);                   \
+            std::vector<int64_t> bDim = MycastTo64(b);                       \
+            std::vector<int64_t> bStride = MycastTo64(bS);                   \
+            std::vector<int64_t> cDim = MycastTo64(c);                       \
+            std::vector<int64_t> cStride = MycastTo64(cS);                   \
+                                                                             \
+            auto inputA = aclCreateTensor(                                   \
+                aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,      \
+                aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);  \
+            auto inputB = aclCreateTensor(                                   \
+                bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,      \
+                aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);  \
+            auto output = aclCreateTensor(                                   \
+                cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,      \
+                aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);  \
+                                                                             \
+            uint64_t workspaceSize = 0;                                      \
+            aclOpExecutor *executor;                                         \
+                                                                             \
+            auto ret = aclnn##prefix##GetWorkspaceSize(                      \
+                inputA, inputB, output, &workspaceSize, &executor);          \
+            void *workspaceAddr = nullptr;                                   \
+            if (workspaceSize > 0) {                                         \
+                workspaceAddr = context->getWorkspace(workspaceSize);        \
+            }                                                                \
+            assert(ret == ACL_SUCCESS);                                      \
+            ret = aclnn##prefix(workspaceAddr, workspaceSize, executor,      \
+                                context->ASCENDHandle());                    \
+            assert(ret == ACL_SUCCESS);                                      \
+                                                                             \
+            ret = aclrtSynchronizeStream(context->ASCENDHandle());           \
+            assert(ret == ACL_SUCCESS);                                      \
+                                                                             \
+            ret = aclDestroyTensor(inputA);                                  \
+            ret = aclDestroyTensor(inputB);                                  \
+            ret = aclDestroyTensor(output);                                  \
+                                                                             \
+            return;                                                          \
+        }                                                                    \
+    };
+
+class AddAclnn : public ASCENDKernelWithoutConfig {
+    virtual tuple<float, float, float> getAlphBeta() const {
+        return {1.f, 1.f, 0.f};
+    }
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        auto a = op->getInputs(0)->getDims();
+        auto aS = op->getInputs(0)->getStride();
+        auto b = op->getInputs(1)->getDims();
+        auto bS = op->getInputs(1)->getStride();
+        auto c = op->getInputs(0)->getDims();
+        auto cS = op->getInputs(0)->getStride();
+
+        std::vector<int64_t> aDim = MycastTo64(a);
+        std::vector<int64_t> aStride = MycastTo64(aS);
+        std::vector<int64_t> bDim = MycastTo64(b);
+        std::vector<int64_t> bStride = MycastTo64(bS);
+        std::vector<int64_t> cDim = MycastTo64(c);
+        std::vector<int64_t> cStride = MycastTo64(cS);
+
+        auto inputA = aclCreateTensor(
+            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
+        auto inputB = aclCreateTensor(
+            bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);
+        auto output = aclCreateTensor(
+            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
+
+        auto [aAlpha, bAlpha, beta] = getAlphBeta();
+        auto alpha = aclCreateScalar(&bAlpha, ACL_FLOAT);
+
+        uint64_t workspaceSize = 0;
+        aclOpExecutor *executor;
+
+        auto ret = aclnnAddGetWorkspaceSize(inputA, inputB, alpha, output,
+                                            &workspaceSize, &executor);
+        void *workspaceAddr = nullptr;
+        if (workspaceSize > 0) {
+            workspaceAddr = context->getWorkspace(workspaceSize);
+        }
+        assert(ret == ACL_SUCCESS);
+        ret = aclnnAdd(workspaceAddr, workspaceSize, executor,
+                       context->ASCENDHandle());
+        assert(ret == ACL_SUCCESS);
+
+        ret = aclrtSynchronizeStream(context->ASCENDHandle());
+        assert(ret == ACL_SUCCESS);
+
+        // ret = aclDestroyTensor(inputA);
+        // ret = aclDestroyTensor(inputB);
+        // ret = aclDestroyScalar(alpha);
+        // ret = aclDestroyTensor(output);
+
+        return;
+    }
+};
+
+class SubAclnn : public ASCENDKernelWithoutConfig {
+    virtual tuple<float, float, float> getAlphBeta() const {
+        return {1.f, 1.f, 0.f};
+    }
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        auto a = op->getInputs(0)->getDims();
+        auto aS = op->getInputs(0)->getStride();
+        auto b = op->getInputs(1)->getDims();
+        auto bS = op->getInputs(1)->getStride();
+        auto c = op->getInputs(0)->getDims();
+        auto cS = op->getInputs(0)->getStride();
+
+        std::vector<int64_t> aDim = MycastTo64(a);
+        std::vector<int64_t> aStride = MycastTo64(aS);
+        std::vector<int64_t> bDim = MycastTo64(b);
+        std::vector<int64_t> bStride = MycastTo64(bS);
+        std::vector<int64_t> cDim = MycastTo64(c);
+        std::vector<int64_t> cStride = MycastTo64(cS);
+
+        auto inputA = aclCreateTensor(
+            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
+        auto inputB = aclCreateTensor(
+            bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);
+        auto output = aclCreateTensor(
+            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
+
+        auto [aAlpha, bAlpha, beta] = getAlphBeta();
+        auto alpha = aclCreateScalar(&bAlpha, ACL_FLOAT);
+
+        uint64_t workspaceSize = 0;
+        aclOpExecutor *executor;
+
+        auto ret = aclnnSubGetWorkspaceSize(inputA, inputB, alpha, output,
+                                            &workspaceSize, &executor);
+        void *workspaceAddr = nullptr;
+        if (workspaceSize > 0) {
+            workspaceAddr = context->getWorkspace(workspaceSize);
+        }
+        assert(ret == ACL_SUCCESS);
+        ret = aclnnSub(workspaceAddr, workspaceSize, executor,
+                       context->ASCENDHandle());
+        assert(ret == ACL_SUCCESS);
+
+        ret = aclrtSynchronizeStream(context->ASCENDHandle());
+        assert(ret == ACL_SUCCESS);
+
+        ret = aclDestroyTensor(inputA);
+        ret = aclDestroyTensor(inputB);
+        ret = aclDestroyScalar(alpha);
+        ret = aclDestroyTensor(output);
+
+        return;
+    }
+};
+
+DEFINE_ELEMENT_WISE_Aclnn(PowTensorTensor);
+DEFINE_ELEMENT_WISE_Aclnn(Div);
+DEFINE_ELEMENT_WISE_Aclnn(Mul);
+
+REGISTER_KERNEL(Device::ASCEND, OpType::Pow, PowTensorTensorAclnn,
+                "pow_ASCEND_float");
+REGISTER_KERNEL(Device::ASCEND, OpType::Div, DivAclnn, "div_ASCEND_float");
+REGISTER_KERNEL(Device::ASCEND, OpType::Mul, MulAclnn, "mul_ASCEND_float");
+
+REGISTER_KERNEL(Device::ASCEND, OpType::Add, AddAclnn, "add_ASCEND_float");
+REGISTER_KERNEL(Device::ASCEND, OpType::Sub, SubAclnn, "sub_ASCEND_float");
+// REGISTER_KERNEL(Device::ASCEND, OpType::Abs, AbsAclnn, "abs_ASCEND_float");
+
+}; // namespace infini
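The DEFINE_ELEMENT_WISE_Aclnn macro above relies on preprocessor token pasting: prefix##Aclnn names the kernel class and aclnn##prefix names the backend entry point, so one macro body serves Pow, Div, and Mul. A minimal, self-contained demo of that technique with stub functions (nothing here is the ACL API):

    // Minimal demo of the token-pasting pattern the macro above relies on.
    // Everything is a local stub.
    #include <cstdio>

    void aclnnDiv() { std::puts("aclnnDiv called"); }
    void aclnnMul() { std::puts("aclnnMul called"); }

    #define DEFINE_ELEMENT_WISE_STUB(prefix)                                  \
        struct prefix##Aclnn {                                                \
            void compute() const { aclnn##prefix(); /* paste-dispatched */ }  \
        };

    DEFINE_ELEMENT_WISE_STUB(Div)
    DEFINE_ELEMENT_WISE_STUB(Mul)

    int main() {
        DivAclnn{}.compute(); // prints "aclnnDiv called"
        MulAclnn{}.compute(); // prints "aclnnMul called"
        return 0;
    }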
@@ -5,10 +5,8 @@

 namespace infini {

-
 class MatmulAclnn : public ASCENDKernelWithoutConfig {

-
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
         auto op = as<MatmulObj>(_op);
@@ -38,40 +36,36 @@ class MatmulAclnn : public ASCENDKernelWithoutConfig {
         auto matTensor = aclCreateTensor(
             matDim.data(), matDim.size(), ACL_FLOAT, matStride.data(), 0,
             aclFormat::ACL_FORMAT_ND, matDim.data(), matDim.size(), bData);
-        auto outputTensor = aclCreateTensor(
-            outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
-            aclFormat::ACL_FORMAT_ND, outputDim.data(), outputDim.size(), cData);
+        auto outputTensor =
+            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
+                            outputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
+                            outputDim.data(), outputDim.size(), cData);

         uint64_t workspaceSize = 0;
         aclOpExecutor *executor;

-        auto ret =
-            aclnnMatmulGetWorkspaceSize(selfTensor, matTensor, outputTensor, 1, &workspaceSize, &executor);
+        auto ret = aclnnMatmulGetWorkspaceSize(
+            selfTensor, matTensor, outputTensor, 1, &workspaceSize, &executor);
         void *workspaceAddr = nullptr;
         if (workspaceSize > 0) {
-            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST);
+            workspaceAddr = context->getWorkspace(workspaceSize);
         }
         assert(ret == ACL_SUCCESS);
         ret = aclnnMatmul(workspaceAddr, workspaceSize, executor,
                           context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

-        aclDestroyTensor(selfTensor);
-        aclDestroyTensor(matTensor);
-        aclDestroyTensor(outputTensor);
+        // aclDestroyTensor(selfTensor);
+        // aclDestroyTensor(matTensor);
+        // aclDestroyTensor(outputTensor);

         return;
     }
 };

-REGISTER_KERNEL(Device::ASCEND, OpType::MatMul, DataType::Float32, MatmulAclnn,
+REGISTER_KERNEL(Device::ASCEND, OpType::MatMul, MatmulAclnn,
                 "matmul_ASCEND_float");
 }; // namespace infini
@@ -5,10 +5,8 @@

 namespace infini {

-
 class AvgPooling : public ASCENDKernelWithoutConfig {

-
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
         auto op = as<PoolingObj>(_op);
@@ -24,8 +22,7 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
         std::vector<int64_t> stride = {sh, sw};
         std::vector<int64_t> pad = {ph, pw};

-
         int64_t divisorOverride = kh * kw;

         auto selfD = op->getInputs(0)->getDims();
         auto selfS = op->getInputs(0)->getStride();
@@ -37,46 +34,43 @@ class AvgPooling : public ASCENDKernelWithoutConfig {
         std::vector<int64_t> outputDim = MycastTo64(outD);
         std::vector<int64_t> outputStride = MycastTo64(outS);

         aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
         aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
         aclIntArray *paddings = aclCreateIntArray(pad.data(), pad.size());

         auto selfTensor = aclCreateTensor(
             selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
             aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData);
-        auto outputTensor = aclCreateTensor(
-            outputDim.data(), outputDim.size(), ACL_FLOAT, outputStride.data(), 0,
-            aclFormat::ACL_FORMAT_NCHW, outputDim.data(), outputDim.size(), cData);
+        auto outputTensor =
+            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
+                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            outputDim.data(), outputDim.size(), cData);

         uint64_t workspaceSize = 0;
         aclOpExecutor *executor;

-        auto ret =
-            aclnnAvgPool2dGetWorkspaceSize(selfTensor, kernelSize, strides, paddings, false, true, divisorOverride, 1, outputTensor, &workspaceSize, &executor);
+        auto ret = aclnnAvgPool2dGetWorkspaceSize(
+            selfTensor, kernelSize, strides, paddings, false, true,
+            divisorOverride, 1, outputTensor, &workspaceSize, &executor);
         void *workspaceAddr = nullptr;
         if (workspaceSize > 0) {
-            ret = aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST);
+            workspaceAddr = context->getWorkspace(workspaceSize);
         }
         assert(ret == ACL_SUCCESS);
         ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor,
                              context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

         ret = aclrtSynchronizeStream(context->ASCENDHandle());
         assert(ret == ACL_SUCCESS);

-        aclDestroyTensor(selfTensor);
-        aclDestroyTensor(outputTensor);
+        // aclDestroyTensor(selfTensor);
+        // aclDestroyTensor(outputTensor);

         return;
     }
 };

-REGISTER_KERNEL(Device::ASCEND, OpType::AveragePool, DataType::Float32, AvgPooling,
+REGISTER_KERNEL(Device::ASCEND, OpType::AveragePool, AvgPooling,
                 "avgpooling_ASCEND_float");
 }; // namespace infini
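Note the divisorOverride = kh * kw argument above: every window divides by the full kernel area, so border windows that overlap padding are averaged as if the pads were real zeros (count-include-pad semantics). A tiny plain-C++ illustration of the difference:

    // Illustration of divisorOverride = kh * kw: border windows that overlap
    // padding still divide by the full kernel area, not by the number of
    // valid elements. Plain C++, no ACL involved.
    #include <cstdio>

    int main() {
        const int kh = 2, kw = 2;
        // A 2x2 window at a padded corner sees one real value (4.0) and
        // three zero pads.
        float windowSum = 4.0f;
        float fixedDivisor = float(kh * kw); // what divisorOverride forces
        float validDivisor = 1.0f;           // only one real element
        std::printf("fixed: %.2f, valid-only: %.2f\n",
                    windowSum / fixedDivisor,  // 1.00
                    windowSum / validDivisor); // 4.00
        return 0;
    }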
@@ -0,0 +1,62 @@
+#include "operators/softmax.h"
+#include "aclnnop/level2/aclnn_softmax.h"
+#include "ascend/ascend_kernel_without_config.h"
+#include "ascend/ascend_runtime.h"
+
+namespace infini {
+class SoftmaxAclnn : public ASCENDKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<SoftmaxObj>(_op);
+        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        int64_t axis = int64_t(op->getAxis());
+
+        auto a = op->getInputs(0)->getDims();
+        auto aS = op->getInputs(0)->getStride();
+        auto c = op->getInputs(0)->getDims();
+        auto cS = op->getInputs(0)->getStride();
+
+        std::vector<int64_t> aDim = MycastTo64(a);
+        std::vector<int64_t> aStride = MycastTo64(aS);
+        std::vector<int64_t> cDim = MycastTo64(c);
+        std::vector<int64_t> cStride = MycastTo64(cS);
+
+        auto input = aclCreateTensor(
+            aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
+        auto output = aclCreateTensor(
+            cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
+            aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
+
+        uint64_t workspaceSize = 0;
+        aclOpExecutor *executor;
+
+        auto ret = aclnnSoftmaxGetWorkspaceSize(input, axis, output,
+                                                &workspaceSize, &executor);
+        void *workspaceAddr = nullptr;
+        if (workspaceSize > 0) {
+            workspaceAddr = context->getWorkspace(workspaceSize);
+        }
+        assert(ret == ACL_SUCCESS);
+        ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor,
+                           context->ASCENDHandle());
+        assert(ret == ACL_SUCCESS);
+
+        ret = aclrtSynchronizeStream(context->ASCENDHandle());
+        assert(ret == ACL_SUCCESS);
+
+        // aclDestroyTensor(input);
+        // aclDestroyTensor(output);
+        return;
+    }
+};
+
+REGISTER_KERNEL(Device::ASCEND, OpType::Softmax, SoftmaxAclnn,
+                "softmax_ASCEND_float");
+
+}; // namespace infini
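For reference, the operation this kernel delegates to aclnnSoftmax: softmax along op->getAxis(), conventionally computed in the max-subtracted form for numerical stability. A plain C++ version over a single row:

    // Reference semantics for the kernel above: numerically stable softmax
    // on one row; the device kernel applies this along the chosen axis.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    std::vector<float> softmax(std::vector<float> x) {
        float m = *std::max_element(x.begin(), x.end());
        float sum = 0.f;
        for (float &v : x) {
            v = std::exp(v - m); // subtract the max first to avoid overflow
            sum += v;
        }
        for (float &v : x)
            v /= sum;
        return x;
    }

    int main() {
        for (float v : softmax({1.f, 2.f, 3.f}))
            std::printf("%.4f ", v); // 0.0900 0.2447 0.6652
        return 0;
    }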
@ -1,21 +1,21 @@
|
||||||
#include "operators/unary.h"
|
#include "operators/unary.h"
|
||||||
#include "aclnnop/level2/aclnn_relu.h"
|
|
||||||
#include "aclnnop/level2/aclnn_abs.h"
|
#include "aclnnop/level2/aclnn_abs.h"
|
||||||
#include "aclnnop/level2/aclnn_sigmoid.h"
|
|
||||||
#include "aclnnop/level2/aclnn_hardswish.h"
|
|
||||||
#include "aclnnop/level2/aclnn_tanh.h"
|
|
||||||
#include "aclnnop/level2/aclnn_gelu.h"
|
|
||||||
#include "aclnnop/level2/aclnn_sin.h"
|
|
||||||
#include "aclnnop/level2/aclnn_cos.h"
|
|
||||||
#include "aclnnop/level2/aclnn_acos.h"
|
#include "aclnnop/level2/aclnn_acos.h"
|
||||||
#include "aclnnop/level2/aclnn_atan.h"
|
#include "aclnnop/level2/aclnn_atan.h"
|
||||||
#include "aclnnop/level2/aclnn_ceil.h"
|
#include "aclnnop/level2/aclnn_ceil.h"
|
||||||
#include "aclnnop/level2/aclnn_floor.h"
|
#include "aclnnop/level2/aclnn_cos.h"
|
||||||
#include "aclnnop/level2/aclnn_exp.h"
|
#include "aclnnop/level2/aclnn_exp.h"
|
||||||
|
#include "aclnnop/level2/aclnn_floor.h"
|
||||||
|
#include "aclnnop/level2/aclnn_gelu.h"
|
||||||
|
#include "aclnnop/level2/aclnn_hardswish.h"
|
||||||
#include "aclnnop/level2/aclnn_neg.h"
|
#include "aclnnop/level2/aclnn_neg.h"
|
||||||
#include "aclnnop/level2/aclnn_reciprocal.h"
|
#include "aclnnop/level2/aclnn_reciprocal.h"
|
||||||
#include "aclnnop/level2/aclnn_sqrt.h"
|
#include "aclnnop/level2/aclnn_relu.h"
|
||||||
#include "aclnnop/level2/aclnn_round.h"
|
#include "aclnnop/level2/aclnn_round.h"
|
||||||
|
#include "aclnnop/level2/aclnn_sigmoid.h"
|
||||||
|
#include "aclnnop/level2/aclnn_sin.h"
|
||||||
|
#include "aclnnop/level2/aclnn_sqrt.h"
|
||||||
|
#include "aclnnop/level2/aclnn_tanh.h"
|
||||||
#include "ascend/ascend_kernel_without_config.h"
|
#include "ascend/ascend_kernel_without_config.h"
|
||||||
#include "ascend/ascend_runtime.h"
|
#include "ascend/ascend_runtime.h"
|
||||||
|
|
||||||
|
@ -64,138 +64,120 @@ class ReluAclnn : public ASCENDKernelWithoutConfig {
|
||||||
aclnnReluGetWorkspaceSize(input, output, &workspaceSize, &executor);
|
aclnnReluGetWorkspaceSize(input, output, &workspaceSize, &executor);
|
||||||
void *workspaceAddr = nullptr;
|
void *workspaceAddr = nullptr;
|
||||||
if (workspaceSize > 0) {
|
if (workspaceSize > 0) {
|
||||||
ret = aclrtMalloc(&workspaceAddr, workspaceSize,
|
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||||
ACL_MEM_MALLOC_HUGE_FIRST);
|
|
||||||
}
|
}
|
||||||
assert(ret == ACL_SUCCESS);
|
assert(ret == ACL_SUCCESS);
|
||||||
ret = aclnnRelu(workspaceAddr, workspaceSize, executor,
|
ret = aclnnRelu(workspaceAddr, workspaceSize, executor,
|
||||||
context->ASCENDHandle());
|
context->ASCENDHandle());
|
||||||
assert(ret == ACL_SUCCESS);
|
assert(ret == ACL_SUCCESS);
|
||||||
|
|
||||||
//ret = aclDestroyTensor(input);
|
// aclDestroyTensor(input);
|
||||||
//assert(ret == ACL_SUCCESS);
|
// aclDestroyTensor(output);
|
||||||
//ret = aclDestroyTensor(output);
|
|
||||||
//assert(ret == ACL_SUCCESS);
|
|
||||||
|
|
||||||
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||||
assert(ret == ACL_SUCCESS);
|
assert(ret == ACL_SUCCESS);
|
||||||
|
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define DEFINE_UNARY_Aclnn(prefix) \
|
||||||
|
class prefix##Aclnn : public ASCENDKernelWithoutConfig { \
|
||||||
|
void compute(const Operator &_op, \
|
||||||
|
const RuntimeObj *_context) const override { \
|
||||||
|
auto op = as<UnaryObj>(_op); \
|
||||||
|
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context); \
|
||||||
|
\
|
||||||
|
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
|
||||||
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
|
||||||
|
\
|
||||||
|
auto a = op->getInputs(0) -> getDims(); \
|
||||||
|
std::vector<int64_t> aDim(a.size(), 1); \
|
||||||
|
for (size_t i = 0; i < a.size(); ++i) { \
|
||||||
|
aDim[i] = int64_t(a[i]); \
|
||||||
|
} \
|
||||||
|
auto aS = op->getInputs(0) -> getStride(); \
|
||||||
|
std::vector<int64_t> aStride(aS.size(), 1); \
|
||||||
|
for (size_t i = 0; i < aS.size(); ++i) { \
|
||||||
|
aStride[i] = int64_t(aS[i]); \
|
||||||
|
} \
|
||||||
|
auto c = op->getInputs(0) -> getDims(); \
|
||||||
|
std::vector<int64_t> cDim(c.size(), 1); \
|
||||||
|
for (size_t i = 0; i < c.size(); ++i) { \
|
||||||
|
cDim[i] = int64_t(c[i]); \
|
||||||
|
} \
|
||||||
|
auto cS = op->getInputs(0) -> getStride(); \
|
||||||
|
std::vector<int64_t> cStride(cS.size(), 1); \
|
||||||
|
for (size_t i = 0; i < cS.size(); ++i) { \
|
||||||
|
cStride[i] = int64_t(cS[i]); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
auto input = aclCreateTensor( \
|
||||||
|
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0, \
|
||||||
|
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData); \
|
||||||
|
auto output = aclCreateTensor( \
|
||||||
|
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0, \
|
||||||
|
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData); \
|
||||||
|
\
|
||||||
|
uint64_t workspaceSize = 0; \
|
||||||
|
aclOpExecutor *executor; \
|
||||||
|
\
|
||||||
|
auto ret = aclnn##prefix##GetWorkspaceSize( \
|
||||||
|
input, output, &workspaceSize, &executor); \
|
||||||
|
void *workspaceAddr = nullptr; \
|
||||||
|
if (workspaceSize > 0) { \
|
||||||
|
workspaceAddr = context->getWorkspace(workspaceSize); \
|
||||||
|
} \
|
||||||
|
assert(ret == ACL_SUCCESS); \
|
||||||
|
ret = aclnn##prefix(workspaceAddr, workspaceSize, executor, \
|
||||||
|
context->ASCENDHandle()); \
|
||||||
|
assert(ret == ACL_SUCCESS); \
|
||||||
|
ret = aclrtSynchronizeStream(context->ASCENDHandle()); \
|
||||||
|
assert(ret == ACL_SUCCESS); \
|
||||||
|
\
|
||||||
|
return; \
|
||||||
|
} \
|
||||||
|
};
|
||||||
|
|
||||||
#define DEFINE_UNARY_Aclnn(prefix) \
|
DEFINE_UNARY_Aclnn(Abs);
|
||||||
class prefix##Aclnn : public ASCENDKernelWithoutConfig { \
|
DEFINE_UNARY_Aclnn(Sigmoid);
|
||||||
void compute(const Operator &_op, \
|
DEFINE_UNARY_Aclnn(Hardswish);
|
||||||
const RuntimeObj *_context) const override { \
|
DEFINE_UNARY_Aclnn(Gelu);
|
||||||
auto op = as<UnaryObj>(_op); \
|
|
||||||
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context); \
|
|
||||||
\
|
|
||||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
|
|
||||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
|
|
||||||
\
|
|
||||||
auto a = op->getInputs(0)->getDims(); \
|
|
||||||
std::vector<int64_t> aDim(a.size(), 1); \
|
|
||||||
for (size_t i = 0; i < a.size(); ++i) { \
|
|
||||||
aDim[i] = int64_t(a[i]); \
|
|
||||||
} \
|
|
||||||
auto aS = op->getInputs(0)->getStride(); \
|
|
||||||
std::vector<int64_t> aStride(aS.size(), 1); \
|
|
||||||
for (size_t i = 0; i < aS.size(); ++i) { \
|
|
||||||
aStride[i] = int64_t(aS[i]); \
|
|
||||||
} \
|
|
||||||
auto c = op->getInputs(0)->getDims(); \
|
|
||||||
std::vector<int64_t> cDim(c.size(), 1); \
|
|
||||||
for (size_t i = 0; i < c.size(); ++i) { \
|
|
||||||
cDim[i] = int64_t(c[i]); \
|
|
||||||
} \
|
|
||||||
auto cS = op->getInputs(0)->getStride(); \
|
|
||||||
std::vector<int64_t> cStride(cS.size(), 1); \
|
|
||||||
for (size_t i = 0; i < cS.size(); ++i) { \
|
|
||||||
cStride[i] = int64_t(cS[i]); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
auto input = aclCreateTensor( \
|
|
||||||
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0, \
|
|
||||||
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData); \
|
|
||||||
auto output = aclCreateTensor( \
|
|
||||||
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0, \
|
|
||||||
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData); \
|
|
||||||
\
|
|
||||||
uint64_t workspaceSize = 0; \
|
|
||||||
aclOpExecutor *executor; \
|
|
||||||
\
|
|
||||||
auto ret = aclnn##prefix##GetWorkspaceSize(input, output, &workspaceSize, &executor); \
|
|
||||||
void *workspaceAddr = nullptr; \
|
|
||||||
if (workspaceSize > 0) { \
|
|
||||||
ret = aclrtMalloc(&workspaceAddr, workspaceSize, \
|
|
||||||
ACL_MEM_MALLOC_HUGE_FIRST); \
|
|
||||||
} \
|
|
||||||
assert(ret == ACL_SUCCESS); \
|
|
||||||
ret = aclnn##prefix(workspaceAddr, workspaceSize, executor, \
|
|
||||||
context->ASCENDHandle()); \
|
|
||||||
assert(ret == ACL_SUCCESS); \
|
|
||||||
ret = aclrtSynchronizeStream(context->ASCENDHandle()); \
|
|
||||||
assert(ret == ACL_SUCCESS); \
|
|
||||||
\
|
|
||||||
return; \
|
|
||||||
} \
|
|
||||||
};
|
|
||||||
|
|
||||||
DEFINE_UNARY_Aclnn(Abs);
DEFINE_UNARY_Aclnn(Sigmoid);
DEFINE_UNARY_Aclnn(Hardswish);
DEFINE_UNARY_Aclnn(Gelu);
DEFINE_UNARY_Aclnn(Tanh);
DEFINE_UNARY_Aclnn(Sin);
DEFINE_UNARY_Aclnn(Cos);
DEFINE_UNARY_Aclnn(Acos);
DEFINE_UNARY_Aclnn(Atan);
DEFINE_UNARY_Aclnn(Ceil);
DEFINE_UNARY_Aclnn(Floor);
DEFINE_UNARY_Aclnn(Exp);
DEFINE_UNARY_Aclnn(Neg);
DEFINE_UNARY_Aclnn(Reciprocal);
DEFINE_UNARY_Aclnn(Sqrt);
DEFINE_UNARY_Aclnn(Round);
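// Each DEFINE_UNARY_Aclnn(Op) above stamps out an OpAclnn kernel class whose
// compute() binds the matching aclnnOpGetWorkspaceSize / aclnnOp pair via
// token pasting, so the per-op kernels differ only in the aclnn entry points
// they call. For example, DEFINE_UNARY_Aclnn(Sqrt) yields SqrtAclnn, which
// calls aclnnSqrtGetWorkspaceSize and aclnnSqrt.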
REGISTER_KERNEL(Device::ASCEND, OpType::Relu, ReluAclnn, "relu_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Abs, AbsAclnn, "abs_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sigmoid, SigmoidAclnn,
                "sigmoid_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::HardSwish, HardswishAclnn,
                "hardswish_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Tanh, TanhAclnn, "tanh_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Gelu, GeluAclnn, "gelu_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sin, SinAclnn, "sin_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Cos, CosAclnn, "cos_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Acos, AcosAclnn, "acos_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Atan, AtanAclnn, "atan_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Neg, NegAclnn, "neg_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Ceil, CeilAclnn, "ceil_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Floor, FloorAclnn,
                "floor_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Exp, ExpAclnn, "exp_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Reciprocal, ReciprocalAclnn,
                "reciprocal_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sqrt, SqrtAclnn, "sqrt_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Round, RoundAclnn,
                "round_ASCEND_float");

} // namespace infini
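// Registration note: REGISTER_KERNEL now keys each kernel by (Device,
// OpType) plus a human-readable tag; the DataType::Float32 argument of the
// old registrations is gone. The macro above hard-codes ACL_FLOAT when
// building its aclTensor views, so these kernels remain float-only for now.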
@ -9,6 +9,7 @@
namespace infini {

TEST(ascend_BatchNorm, run) {
    aclInit(nullptr);
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

@ -51,5 +52,7 @@ TEST(ascend_BatchNorm, run) {
    // check results on CPU
    EXPECT_TRUE(ocpu->equalData(vector<float>{
        -0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.333333, 0.666667}));

    aclFinalize();
}
} // namespace infini
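// Lifecycle note: the tests now bracket all NPU work between aclInit and
// aclFinalize. A minimal sketch of the required ordering (assuming one
// process-wide init, as in these tests):
//
//     aclInit(nullptr);               // once, before any other ACL call
//     {
//         auto npuRuntime = make_ref<ASCENDRuntimeObj>(); // device/ctx/stream
//         // ... build graphs, run kernels ...
//     }                               // runtime torn down here
//     aclFinalize();                  // once, after all NPU work is done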
@ -24,40 +24,42 @@ void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);
    Tensor inputCpu3 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu3->dataMalloc();
    inputCpu3->setData(generator);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
    auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
    auto inputNpu3 = npuGraph->cloneTensor(inputCpu3);
    auto npuOp = npuGraph->addOp<T>(TensorVec{inputNpu1, inputNpu2, inputNpu3},
                                    nullptr, 2);
    npuGraph->dataMalloc();
    inputNpu1->setData(generator);
    inputNpu2->setData(generator);
    inputNpu3->setData(generator);
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

    // Check
    inputCpu1->print();
    inputCpu1->printData();
    inputCpu2->print();
    inputCpu2->printData();
    inputCpu3->print();
    inputCpu3->printData();
    outputNpu2Cpu->print();
    outputNpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(ascend_Concat, run) {
    aclInit(nullptr);
    testConcat<ConcatObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    aclFinalize();
}

} // namespace infini
@ -1,12 +1,11 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"

#include "test.h"

namespace infini {

template <class T>
@ -50,8 +49,10 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
}

TEST(ascend_Conv, run) {
    aclInit(nullptr);
    testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
                      Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
    aclFinalize();
}

} // namespace infini
@ -0,0 +1,61 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testElementWise(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu2->dataMalloc();
    inputCpu1->setData(generator);
    inputCpu2->setData(generator);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
    auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
    auto npuOp = npuGraph->addOp<T>(inputNpu1, inputNpu2, nullptr);
    npuGraph->dataMalloc();
    inputNpu1->setData(generator);
    inputNpu2->setData(generator);
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

    // Check
    inputCpu1->print();
    inputCpu1->printData();
    inputCpu2->print();
    inputCpu2->printData();
    outputNpu2Cpu->print();
    outputNpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(ascend_ElementWise, run) {
    aclInit(nullptr);
    testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testElementWise<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    aclFinalize();
}

} // namespace infini
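// Coverage note: this element-wise test is a smoke test; it prints the
// inputs and the NPU result and asserts EXPECT_TRUE(1), relying on manual
// inspection rather than a comparison against the CPU reference kernels.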
@ -50,8 +50,10 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
}

TEST(ascend_Matmul, run) {
    aclInit(nullptr);
    testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
                          false, Shape{1, 2, 3}, Shape{1, 3, 4});
    aclFinalize();
}

} // namespace infini
@ -29,6 +29,7 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
    npuGraph->dataMalloc();
    inputNpu->setData(generator);
    npuRuntime->run(npuGraph);

    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
    inputCpu->printData();
@ -37,8 +38,10 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
}

TEST(cnnl_Pooling, run) {
    aclInit(nullptr);
    // testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
    testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
    aclFinalize();
}

} // namespace infini
@ -0,0 +1,55 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/softmax.h"

#include "test.h"

namespace infini {

template <class T>
void testSoftmax(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape, int axis, vector<float> Out) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    // inputCpu1->setData(generator);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
    auto npuOp = npuGraph->addOp<T>(inputNpu1, nullptr, axis);
    npuGraph->dataMalloc();
    inputNpu1->setData(generator);
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);

    // Check
    EXPECT_TRUE(outputNpu2Cpu->equalData(Out));
}

TEST(ascend_Softmax, run) {
    aclInit(nullptr);
    testSoftmax<SoftmaxObj>(
        IncrementalGenerator(), Shape{2, 2, 2, 2}, 1,
        vector<float>{0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138,
                      0.9820138, 0.9820138, 0.9820138, 0.0179862, 0.0179862,
                      0.0179862, 0.0179862, 0.9820138, 0.9820138, 0.9820138,
                      0.9820138});
    testSoftmax<SoftmaxObj>(
        IncrementalGenerator(), Shape{2, 2, 2, 2}, 3,
        vector<float>{0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
                      0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
                      0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
                      0.7310586});
    aclFinalize();
}

} // namespace infini
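// Expected-value check (a hand derivation, assuming IncrementalGenerator
// fills the tensor with 0, 1, 2, ..., 15 over Shape{2, 2, 2, 2}):
// along axis 3 each softmax group is an adjacent pair (x, x + 1), so
//     softmax(x, x + 1) = (1 / (1 + e), e / (1 + e))
//                       ≈ (0.2689414, 0.7310586);
// along axis 1 the paired elements differ by 4 (the axis-1 stride), giving
//     (1 / (1 + e^4), e^4 / (1 + e^4)) ≈ (0.0179862, 0.9820138).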
@ -13,20 +13,20 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
               const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);

    // NPU
    Graph npuGraph = make_ref<GraphObj>(npuRuntime);
    auto inputNpu = npuGraph->cloneTensor(inputCpu);
    auto npuOp = npuGraph->addOp<T>(inputNpu, nullptr);
    npuGraph->dataMalloc();
    inputNpu->setData(generator);
    npuRuntime->run(npuGraph);
    auto outputNpu = npuOp->getOutput();
    auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
    // CPU
    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
    auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
@ -36,10 +36,11 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
    cpuRuntime->run(cpuGraph);
    auto outputCpu = cpuOp->getOutput();
    // Check
    EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu, 1e-3));
}

TEST(ascend_Unary, run) {
    aclInit(nullptr);
    testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<AbsObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
@ -52,11 +53,12 @@ TEST(ascend_Unary, run) {
    testUnary<ATanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    // testUnary<CeilObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    // testUnary<FloorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    // testUnary<ExpObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<NegObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    // testUnary<ReciprocalObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<SqrtObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    // testUnary<RoundObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    aclFinalize();
}

} // namespace infini