forked from jiuyuan/InfiniTensor
Compare commits
15 Commits
Author | SHA1 | Date |
---|---|---|
![]() |
e7d34badfb | |
![]() |
f6176124ec | |
![]() |
c970c93ba1 | |
![]() |
dcbbc82d5b | |
![]() |
70950e3fbb | |
![]() |
39484e0cc4 | |
![]() |
a9bd73528d | |
![]() |
95ee579338 | |
![]() |
11e2b08be3 | |
![]() |
cc057bcf80 | |
![]() |
6b06ab0534 | |
![]() |
412f301323 | |
![]() |
b1bdbbf478 | |
![]() |
56634b3b19 | |
![]() |
b6ff4514fe |
|
@ -44,3 +44,5 @@ build_debug/
|
|||
*.onnx
|
||||
*.pb
|
||||
*.npy
|
||||
|
||||
*.swp
|
||||
|
|
|
@ -2,8 +2,9 @@
|
|||
option(USE_CUDA "Support CUDA GPU" OFF)
|
||||
option(USE_BANG "Support BANG MLU" OFF)
|
||||
option(USE_KUNLUN "Support KUNLUN XPU" OFF)
|
||||
option(USE_ASCEND "Support HUAWEI ASCEND" OFF)
|
||||
option(USE_INTELCPU "Support INTELCPU" OFF)
|
||||
option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
|
||||
option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" OFF)
|
||||
option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
|
||||
option(BUILD_NNET "Build nnet" OFF)
|
||||
option(BUILD_DIST "Build project for distributed running" OFF)
|
||||
|
@ -149,6 +150,11 @@ if(USE_KUNLUN)
|
|||
list (APPEND SRC ${SRC_KUNLUN})
|
||||
endif()
|
||||
|
||||
if(USE_ASCEND)
|
||||
file(GLOB_RECURSE SRC_ASCEND src/ascend/*.cc src/kernels/ascend/*.cc )
|
||||
list (APPEND SRC ${SRC_ASCEND})
|
||||
endif()
|
||||
|
||||
if(USE_INTELCPU)
|
||||
file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
|
||||
list (APPEND SRC ${SRC_INTELCPU})
|
||||
|
@ -286,7 +292,6 @@ if(USE_KUNLUN)
|
|||
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64")
|
||||
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/XTDK/shlib")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
|
||||
|
||||
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
|
||||
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
|
||||
|
@ -296,10 +301,40 @@ if(USE_KUNLUN)
|
|||
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
|
||||
endif()
|
||||
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
|
||||
|
||||
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
|
||||
endif()
|
||||
|
||||
if(USE_ASCEND)
|
||||
add_compile_definitions(USE_ASCEND=1)
|
||||
if ((NOT DEFINED ASCEND_HOME) AND (NOT DEFINED ENV{ASCEND_HOME}))
|
||||
message(FATAL_ERROR "ASCEND_HOME is not defined from cmake or env")
|
||||
elseif (DEFINED ASCEND_HOME)
|
||||
set(ASCEND_HOME ${ASCEND_HOME} CACHE STRING "ASCEND_HOME directory for Ascend development")
|
||||
else()
|
||||
set(ASCEND_HOME $ENV{ASCEND_HOME} CACHE STRING "ASCEND_HOME directory for Ascend development")
|
||||
endif()
|
||||
message(STATUS "ASCEND_HOME: ${ASCEND_HOME}")
|
||||
|
||||
include_directories("${ASCEND_HOME}/include/")
|
||||
include_directories("${ASCEND_HOME}/include/aclnn")
|
||||
find_library(ASCEND_CL libascendcl.so "${ASCEND_HOME}/lib64")
|
||||
find_library(ASCEND_BASE libnnopbase.so "${ASCEND_HOME}/lib64")
|
||||
find_library(ASCEND_DNN libopapi.so "${ASCEND_HOME}/lib64")
|
||||
find_library(ASCEND_HAL libascend_hal.so "${ASCEND_HOME}/../../driver/lib64/driver")
|
||||
# find_library(ASCEND_RT libruntime.so "${ASCEND_HOME}/lib64")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
|
||||
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
|
||||
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
|
||||
elseif(DEFINED TARGET_CPU_ARCH)
|
||||
set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
|
||||
else()
|
||||
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
|
||||
endif()
|
||||
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
|
||||
target_link_libraries(InfiniTensor ${ASCEND_HAL} ${ASCEND_CL} ${ASCEND_BASE} ${ASCEND_DNN} stdc++)
|
||||
endif()
|
||||
|
||||
# # Python bindings
|
||||
# pybind11_add_module(infini MODULE ${FFI})
|
||||
# target_link_libraries(infini PRIVATE infini_cpp)
|
||||
|
@ -336,6 +371,9 @@ if(BUILD_TEST)
|
|||
if (USE_KUNLUN)
|
||||
build_test(test/kernels/kunlun/*.cc)
|
||||
endif()
|
||||
if (USE_ASCEND)
|
||||
build_test(test/kernels/ascend/*.cc)
|
||||
endif()
|
||||
if (USE_INTELCPU)
|
||||
build_test(test/kernels/intelcpu/*.cc)
|
||||
endif()
|
||||
|
|
2
Makefile
2
Makefile
|
@ -4,6 +4,7 @@ TYPE ?= Release
|
|||
CUDA ?= OFF
|
||||
BANG ?= OFF
|
||||
KUNLUN ?= OFF
|
||||
ASCEND ?= OFF
|
||||
INTELCPU ?= off
|
||||
BACKTRACE ?= ON
|
||||
TEST ?= ON
|
||||
|
@ -27,6 +28,7 @@ CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
|
|||
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
|
||||
CMAKE_OPT += -DUSE_BANG=$(BANG)
|
||||
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
|
||||
CMAKE_OPT += -DUSE_ASCEND=$(ASCEND)
|
||||
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
|
||||
CMAKE_OPT += -DBUILD_TEST=$(TEST)
|
||||
CMAKE_OPT += -DBUILD_NNET=$(NNET)
|
||||
|
|
|
@ -140,6 +140,13 @@
|
|||
make install-python KUNLUN=ON
|
||||
```
|
||||
|
||||
编译 CPU 部分,同时编译华为 ASCEND 部分:
|
||||
|
||||
```bash
|
||||
export ASCEND_HOME=/path/to/your/ascend_home
|
||||
make install-python ASCEND=ON
|
||||
```
|
||||
|
||||
3. 使用方法
|
||||
|
||||
安装成功后,您就可以使用本项目的 Python 接口进行编码并运行。具体使用方式可以参考项目样例代码 example/Resnet/resnet.py 以及用户使用手册
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
- `CUDA`:是否编译 CUDA 后端,默认为 `OFF`,`ON` 打开
|
||||
- `BANG`:是否编译寒武纪后端,默认为 `OFF`,`ON` 打开
|
||||
- `KUNLUN`:是否编译昆仑后端,默认为 `OFF`,`ON` 打开
|
||||
- `ASCEND`:是否编译华为后端,默认为 `OFF`,`ON` 打开
|
||||
- `BACKTRACE`:是否启用栈回溯,默认为 `ON`,`OFF` 关闭,建议调试时打开
|
||||
- `TEST`:是否编译 `googletest`,默认为 `ON`,`OFF` 关闭,只有 `test-cpp` 时必要
|
||||
|
||||
|
|
15
env.sh
15
env.sh
|
@ -36,3 +36,18 @@ export LD_LIBRARY_PATH="${NEUWARE_HOME}/lib64:${LD_LIBRARY_PATH}"
|
|||
# ├── version
|
||||
# └── XTDK
|
||||
export KUNLUN_HOME=/usr/local/xpu
|
||||
|
||||
# 配置华为 ASCEND NPU 的 HOME 路径,请注意 /usr/local/Ascend 是华为 Ascend 软件栈(CANN 工具包)默认的安装路径。
|
||||
# 如若用户有其他的路径安装方式,请自行配置正确的路径。
|
||||
# 这里是 ascend 目录下一个可能的结构图,请参考。
|
||||
# .
|
||||
# ├── bin
|
||||
# ├── include
|
||||
# ├── lib64
|
||||
# ├── tools
|
||||
# ├── version
|
||||
# └── XTDK
|
||||
#export ASCEND_HOME=/usr/local/Ascend/ascend-toolkit/6.3
|
||||
export ASCEND_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
source /usr/local/Ascend/toolbox/set_env.sh
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit b896cec2dba5b8522b141ac4f89eb43074ee1b98
|
||||
Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77
|
|
@ -0,0 +1,20 @@
|
|||
#pragma once
#include "acl/acl.h"
#include "acl/acl_op.h"
#include "core/common.h"

// Abort-on-failure check for ACL runtime calls.
// Fixes vs. original:
//  - the error code is now actually printed (the old format string was
//    "... : .\n" with no value), which made failures undiagnosable;
//  - wrapped in do { } while (0) so the macro behaves as a single statement
//    and is safe inside an unbraced if/else.
#define checkASCENDError(call)                                                 \
    do {                                                                       \
        auto err = (call);                                                     \
        if (ACL_SUCCESS != err) {                                              \
            fprintf(stderr, "ASCEND error in %s:%i : %d.\n", __FILE__,         \
                    __LINE__, static_cast<int>(err));                          \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)

namespace infini {

// Opaque pointer to device (NPU) memory managed through ACL.
using ASCENDPtr = void *;

} // namespace infini
|
|
@ -0,0 +1,32 @@
|
|||
#pragma once
#include "ascend/ascend_runtime.h"
#include "core/kernel.h"

namespace infini {

// Base class for Ascend kernels that need no separate configuration record:
// the three-argument compute() simply discards the PerfRecord and delegates
// to the two-argument overload, which concrete kernels implement.
class ASCENDKernelWithoutConfig : public Kernel {
  public:
    virtual void compute(const Operator &op, const PerfRecord &record,
                         const RuntimeObj *context) const {
        compute(op, context);
    }
    virtual void compute(const Operator &op,
                         const RuntimeObj *context) const = 0;
    // Premise: op is idempotent since it is called multiple times.
    virtual PerfRecord tune(const Operator &op,
                            const RuntimeObj *_context) const {
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
        return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
                                              [&]() { context->sync(); }));
    }
    // Widen a vector<int> to vector<int64_t>, as required by the aclnn
    // tensor-descriptor APIs (dims/strides are int64_t there).
    std::vector<int64_t> MycastTo64(std::vector<int> const &v32) const {
        std::vector<int64_t> widened;
        widened.reserve(v32.size());
        for (const int value : v32) {
            widened.push_back(static_cast<int64_t>(value));
        }
        return widened;
    }
};

} // namespace infini
|
|
@ -0,0 +1,109 @@
|
|||
#pragma once
#include "ascend/ascend_common.h"
#include "core/runtime.h"

// Evaluate `return_expr` (typically a LOG_PRINT) when `cond` is false.
// NOTE: despite the name, this does NOT return from the caller unless
// `return_expr` itself contains a return statement — it only logs.
#define CHECK_RET(cond, return_expr)                                           \
    do {                                                                       \
        if (!(cond)) {                                                         \
            return_expr;                                                       \
        }                                                                      \
    } while (0)

#define LOG_PRINT(message, ...)                                                \
    do {                                                                       \
        printf(message, ##__VA_ARGS__);                                        \
    } while (0)

namespace infini {

// Runtime for a single Ascend NPU device. Owns the ACL context, the stream
// used by all kernels, and a fixed-size scratch workspace handed out via
// getWorkspace().
class ASCENDRuntimeObj : public RuntimeObj {
  private:
    aclrtContext context;
    aclrtStream stream;
    ASCENDPtr workspace = nullptr;
    size_t workspaceSize;

  public:
    ASCENDRuntimeObj(int deviceId = 0) : RuntimeObj(Device::ASCEND, deviceId) {
        // NOTE(review): aclInit() is assumed to have been called elsewhere
        // (a previous process-wide init was commented out here) — confirm
        // initialization order before creating multiple runtimes.
        auto ret = aclrtSetDevice(deviceId);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
        ret = aclrtCreateContext(&context, deviceId);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclrtCreateContext failed. ERROR: %d\n", ret));
        ret = aclrtSetCurrentContext(context);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclrtSetCurrentContext failed. ERROR: %d\n", ret));
        ret = aclrtCreateStream(&stream);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret));

        // Pre-allocate a 3 GB scratch workspace shared by all kernels.
        workspaceSize = 3ll << 30; // 3 GB
        workspace = alloc(workspaceSize);
    }
    virtual ~ASCENDRuntimeObj() {
        dealloc(workspace);
        aclrtDestroyStream(stream);
        aclrtDestroyContext(context);
        aclrtResetDevice(deviceId);
        // aclFinalize() intentionally not called here: other runtimes in the
        // process may still be using ACL.
    }
    string toString() const override;

    // Execute the graph; optionally tune kernels and collect profiling data.
    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
    void sync() const;
    ASCENDPtr alloc(size_t size) override {
        void *ptr;
        checkASCENDError(
            aclrtMalloc((void **)&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
        return ptr;
    }
    void dealloc(void *ptr) override { aclrtFree(ptr); }
    aclrtStream ASCENDHandle() const { return stream; }
    // Returns the shared scratch buffer; callers must not request more than
    // the pre-allocated workspaceSize.
    ASCENDPtr getWorkspace(size_t size) const {
        IT_ASSERT(size <= workspaceSize);
        return workspace;
    }

    // Fix: memcpy return codes were previously discarded; a failed transfer
    // now aborts via checkASCENDError, matching alloc()'s error handling.
    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override {
        checkASCENDError(aclrtMemcpy(dst, bytes, const_cast<void *>(src),
                                     bytes, ACL_MEMCPY_HOST_TO_DEVICE));
    }

    void copyBlobToCPU(void *dst, const void *src,
                       size_t bytes) const override {
        checkASCENDError(aclrtMemcpy(dst, bytes, const_cast<void *>(src),
                                     bytes, ACL_MEMCPY_DEVICE_TO_HOST));
    }

    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override {
        checkASCENDError(aclrtMemcpy(dst, bytes, const_cast<void *>(src),
                                     bytes, ACL_MEMCPY_DEVICE_TO_DEVICE));
    }

    // Distributed communication is not implemented for Ascend yet.
    void initComm(const string &, int, int) override { IT_TODO_HALT(); }

    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }

  private:
    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
};

} // namespace infini
|
|
@ -30,7 +30,7 @@ using OpLists = list<Operator>;
|
|||
|
||||
using VType = uint32_t;
|
||||
|
||||
enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN };
|
||||
enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN, ASCEND };
|
||||
/***************** Forward declaration end *****************/
|
||||
|
||||
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
||||
|
@ -73,6 +73,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
|||
bool isCuda() const { return device == Device::CUDA; }
|
||||
bool isBang() const { return device == Device::BANG; }
|
||||
bool isKUNLUN() const { return device == Device::KUNLUN; }
|
||||
bool isAscend() const { return device == Device::ASCEND; }
|
||||
void copyBlob(const TensorObj *dst, const TensorObj *src) const;
|
||||
// TODO: unify these copy APIs
|
||||
virtual void copyBlobFromCPU(void *dst, const void *src,
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/perf_engine.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void ASCENDRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
|
||||
bool profiling = false) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
double totalTime = 0;
|
||||
std::map<OpType, double> opTime;
|
||||
std::map<OpType, int> opCnt;
|
||||
for (auto &op : graph->getOperators()) {
|
||||
// HACK: set correct data type
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
if (!perfData && !tune) {
|
||||
kernel->compute(op, this);
|
||||
continue;
|
||||
}
|
||||
|
||||
PerfRecord record;
|
||||
if (!perfData) {
|
||||
record = kernel->tune(op, this);
|
||||
perfEngine.setPerfData(perfKey, record);
|
||||
} else
|
||||
record = perfData;
|
||||
|
||||
double t = record->time;
|
||||
totalTime += t;
|
||||
|
||||
if (profiling) {
|
||||
double t = timeit([&]() { kernel->compute(op, record, this); },
|
||||
[&]() { sync(); }, 1, 1);
|
||||
op->print();
|
||||
printf(" op_time on kunlun xpu %lf\n", t);
|
||||
totalTime += t;
|
||||
opTime[op->getOpType()] += t;
|
||||
opCnt[op->getOpType()]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Execute the whole graph, then synchronize. Profiling is not wired up at
// this level yet and aborts via IT_TODO_HALT.
void ASCENDRuntimeObj::run(const Graph &graph, bool tune,
                           bool profiling) const {
    if (profiling)
        IT_TODO_HALT();
    runWithoutSync(graph, tune, profiling);
    sync();
}

// NOTE(review): deliberately a no-op for now — each Ascend kernel in
// src/kernels/ascend/* already calls aclrtSynchronizeStream after launch,
// so there is nothing left to wait on. Revisit (e.g. call
// aclrtSynchronizeStream here) before making kernel launches asynchronous.
void ASCENDRuntimeObj::sync() const { ; }

string ASCENDRuntimeObj::toString() const { return "ASCEND Runtime"; }
|
||||
|
||||
} // namespace infini
|
|
@ -30,6 +30,9 @@
|
|||
#ifdef USE_KUNLUN
|
||||
#include "kunlun/kunlun_runtime.h"
|
||||
#endif
|
||||
#ifdef USE_ASCEND
|
||||
#include "ascend/ascend_runtime.h"
|
||||
#endif
|
||||
#ifdef USE_INTELCPU
|
||||
#include "intelcpu/mkl_runtime.h"
|
||||
#include "intelcpu/operator_timer.h"
|
||||
|
@ -175,6 +178,12 @@ static Ref<KUNLUNRuntimeObj> kunlun_runtime() {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_ASCEND
|
||||
static Ref<ASCENDRuntimeObj> ascend_runtime() {
|
||||
return make_ref<ASCENDRuntimeObj>();
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_INTELCPU
|
||||
static Ref<RuntimeObj> intelcpu_runtime() { return make_ref<MklRuntimeObj>(); }
|
||||
#endif
|
||||
|
@ -347,6 +356,10 @@ void export_functions(py::module &m) {
|
|||
#ifdef USE_KUNLUN
|
||||
.FUNCTION(kunlun_runtime)
|
||||
#endif
|
||||
|
||||
#ifdef USE_ASCEND
|
||||
.FUNCTION(ascend_runtime)
|
||||
#endif
|
||||
.FUNCTION(conv_attrs_of)
|
||||
.FUNCTION(conv_trans_attrs_of)
|
||||
.FUNCTION(matmul_attrs_of)
|
||||
|
@ -431,6 +444,11 @@ void init_graph_builder(py::module &m) {
|
|||
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
|
||||
m, "KUNLUNRuntime");
|
||||
#endif
|
||||
|
||||
#ifdef USE_ASCEND
|
||||
py::class_<ASCENDRuntimeObj, std::shared_ptr<ASCENDRuntimeObj>, RuntimeObj>(
|
||||
m, "ASCENDRuntime");
|
||||
#endif
|
||||
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor",
|
||||
py::buffer_protocol())
|
||||
.def("fuid", &TensorObj::getFuid, policy::automatic)
|
||||
|
|
|
@ -0,0 +1,99 @@
|
|||
#include "operators/batch_norm.h"
#include "aclnnop/level2/aclnn_batch_norm.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

// Inference-mode BatchNormalization on Ascend via aclnnBatchNorm
// (training flag is passed as false below).
class BatchNormAclnn : public ASCENDKernelWithoutConfig {

    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<BatchNormObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        // Raw device pointers. Input layout (per the indices used here):
        // 0 = input, 1 = mean, 2 = var, 3 = scale, 4 = bias.
        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outData = (op->getOutput()->getRawDataPtr<void *>());
        void *const meanData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const varData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const scaleData = (op->getInputs(3)->getRawDataPtr<void *>());
        void *const biasData = (op->getInputs(4)->getRawDataPtr<void *>());

        auto inD = op->getInputs(0)->getDims();
        auto inS = op->getInputs(0)->getStride();
        // mean/var/scale/bias all share the per-channel shape of input 1.
        auto paraD = op->getInputs(1)->getDims();
        auto paraS = op->getInputs(1)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        // aclnn descriptors require int64_t dims/strides.
        std::vector<int64_t> inputDim = MycastTo64(inD);
        std::vector<int64_t> inputStride = MycastTo64(inS);
        std::vector<int64_t> paraDim = MycastTo64(paraD);
        std::vector<int64_t> paraStride = MycastTo64(paraS);
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        auto inputTensor =
            aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
                            inputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            inputDim.data(), inputDim.size(), inData);
        auto outputTensor =
            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            outputDim.data(), outputDim.size(), outData);
        auto meanTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), meanData);
        auto varTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), varData);
        auto scaleTensor =
            aclCreateTensor(paraDim.data(), paraDim.size(), ACL_FLOAT,
                            paraStride.data(), 0, aclFormat::ACL_FORMAT_ND,
                            paraDim.data(), paraDim.size(), scaleData);
        auto biasTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), biasData);
        // NOTE(review): saveMean/saveInvstd alias the scale/bias buffers.
        // If aclnnBatchNorm writes these outputs (it may even with
        // training == false), scale/bias get clobbered — these should be
        // separate scratch allocations. Confirm against the aclnnBatchNorm
        // documentation before relying on repeated invocations.
        auto savemeanTensor =
            aclCreateTensor(paraDim.data(), paraDim.size(), ACL_FLOAT,
                            paraStride.data(), 0, aclFormat::ACL_FORMAT_ND,
                            paraDim.data(), paraDim.size(), scaleData);
        auto saveinvstdTensor = aclCreateTensor(
            paraDim.data(), paraDim.size(), ACL_FLOAT, paraStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, paraDim.data(), paraDim.size(), biasData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnBatchNormGetWorkspaceSize(
            inputTensor, scaleTensor, biasTensor, meanTensor, varTensor, false,
            op->getMomentum(), op->getEps(), outputTensor, savemeanTensor,
            saveinvstdTensor, &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        // NOTE(review): workspaceSize is consumed before `ret` is checked —
        // on failure the size is not meaningful; consider asserting first.
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnBatchNorm(workspaceAddr, workspaceSize, executor,
                             context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // Kernels are synchronous for now; ASCENDRuntimeObj::sync() relies
        // on this stream sync happening here.
        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // NOTE(review): tensor descriptors are never destroyed (leak per
        // invocation) — the destroy calls below are commented out.
        // aclDestroyTensor(inputTensor);
        // aclDestroyTensor(outputTensor);
        // aclDestroyTensor(meanTensor);
        // aclDestroyTensor(varTensor);
        // aclDestroyTensor(scaleTensor);
        // aclDestroyTensor(biasTensor);
        // aclDestroyTensor(savemeanTensor);
        // aclDestroyTensor(saveinvstdTensor);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::BatchNormalization, BatchNormAclnn,
                "batchnorm_ASCEND_float");
}; // namespace infini
|
|
@ -0,0 +1,73 @@
|
|||
#include "operators/concat.h"
#include "aclnnop/level2/aclnn_cat.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

// Concat on Ascend via aclnnCat: wraps every input in an aclTensor, bundles
// them into an aclTensorList and concatenates along op->getDim().
class ConcatAclnn : public ASCENDKernelWithoutConfig {

    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConcatObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
        int dim = op->getDim();
        int num = op->numInputs();

        std::vector<aclTensor *> inputsData{};

        // Build one aclTensor descriptor per input, over its device buffer.
        for (int i = 0; i < num; ++i) {
            auto inD = op->getInputs(i)->getDims();
            auto inS = op->getInputs(i)->getStride();
            // aclnn requires int64_t dims/strides.
            std::vector<int64_t> inputDim = MycastTo64(inD);
            std::vector<int64_t> inputStride = MycastTo64(inS);

            void *const inData = (op->getInputs(i)->getRawDataPtr<void *>());
            auto tmpTensor =
                aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
                                inputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
                                inputDim.data(), inputDim.size(), inData);

            inputsData.push_back(tmpTensor);
        }
        aclTensorList *tensorList =
            aclCreateTensorList(inputsData.data(), inputsData.size());

        void *const outData = (op->getOutput()->getRawDataPtr<void *>());
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        auto outputTensor =
            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                            outputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
                            outputDim.data(), outputDim.size(), outData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        // Two-phase aclnn call: query workspace, then execute on the stream.
        auto ret = aclnnCatGetWorkspaceSize(
            tensorList, int64_t(dim), outputTensor, &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnCat(workspaceAddr, workspaceSize, executor,
                       context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // Kernels are synchronous for now; the runtime's sync() is a no-op
        // and relies on this stream sync.
        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // NOTE(review): descriptors are leaked each call — destroy calls
        // are commented out.
        // aclDestroyTensorList(tensorList);
        // aclDestroyTensor(outputTensor);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::Concat, ConcatAclnn,
                "concat_ASCEND_float");
}; // namespace infini
|
|
@ -0,0 +1,93 @@
|
|||
#include "operators/conv.h"
#include "aclnnop/level2/aclnn_convolution.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"

namespace infini {

// 2D convolution (no bias, groups fixed to 1) on Ascend via aclnnConvolution.
class ConvAclnn : public ASCENDKernelWithoutConfig {

    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        // Padding (h,w), stride (h,w), dilation (h,w) from the operator.
        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        // const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        // const int cpg = op->getChannelPerGroup();
        // const int g = c / cpg;

        std::vector<int64_t> pads = {ph, pw};
        // std::vector<int64_t> ksize = {r, s};
        std::vector<int64_t> stride = {sh, sw};
        std::vector<int64_t> dilation = {dh, dw};
        // NOTE(review): outputPadding is only meaningful for transposed
        // convolution, and `transposed` is passed as false below — confirm
        // {sh-1, sw-1} is intentional rather than a leftover.
        std::vector<int64_t> outputPadding = {sh - 1, sw - 1};

        aclIntArray *convpads = aclCreateIntArray(pads.data(), pads.size());
        aclIntArray *convstride =
            aclCreateIntArray(stride.data(), stride.size());
        aclIntArray *convdilation =
            aclCreateIntArray(dilation.data(), dilation.size());
        aclIntArray *convOutputpadding =
            aclCreateIntArray(outputPadding.data(), outputPadding.size());

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto inputD = op->getInputs(0)->getDims();
        auto inputS = op->getInputs(0)->getStride();
        auto weightD = op->getInputs(1)->getDims();
        auto weightS = op->getInputs(1)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        // aclnn descriptors require int64_t dims/strides.
        std::vector<int64_t> inputDim = MycastTo64(inputD);
        std::vector<int64_t> inputStride = MycastTo64(inputS);
        std::vector<int64_t> weightDim = MycastTo64(weightD);
        std::vector<int64_t> weightStride = MycastTo64(weightS);
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        auto inputTensor =
            aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
                            inputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            inputDim.data(), inputDim.size(), aData);
        auto weightTensor =
            aclCreateTensor(weightDim.data(), weightDim.size(), ACL_FLOAT,
                            weightStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            weightDim.data(), weightDim.size(), bData);
        auto outputTensor =
            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            outputDim.data(), outputDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        // bias = nullptr, transposed = false, groups = 1, cubeMathType = 1.
        auto ret = aclnnConvolutionGetWorkspaceSize(
            inputTensor, weightTensor, nullptr, convstride, convpads,
            convdilation, false, convOutputpadding, 1, outputTensor, 1,
            &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnConvolution(workspaceAddr, workspaceSize, executor,
                               context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // Kernels are synchronous for now; the runtime's sync() relies on it.
        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // NOTE(review): tensor/int-array descriptors leak each call —
        // destroy calls are commented out.
        // aclDestroyTensor(inputTensor);
        // aclDestroyTensor(weightTensor);
        // aclDestroyTensor(outputTensor);

        return;
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::Conv, ConvAclnn, "conv_ASCEND_float");
}; // namespace infini
|
|
@ -0,0 +1,278 @@
|
|||
#include "operators/element_wise.h"
|
||||
#include "aclnnop/level2/aclnn_add.h"
|
||||
#include "aclnnop/level2/aclnn_div.h"
|
||||
#include "aclnnop/level2/aclnn_mul.h"
|
||||
#include "aclnnop/level2/aclnn_pow_tensor_tensor.h"
|
||||
#include "aclnnop/level2/aclnn_sub.h"
|
||||
#include "ascend/ascend_kernel_without_config.h"
|
||||
#include "ascend/ascend_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
/*
|
||||
class PowAclnn : public ASCENDKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
auto a = op->getInputs(0)->getDims();
|
||||
auto aS = op->getInputs(0)->getStride();
|
||||
auto b = op->getInputs(1)->getDims();
|
||||
auto bS = op->getInputs(1)->getStride();
|
||||
auto c = op->getInputs(0)->getDims();
|
||||
auto cS = op->getInputs(0)->getStride();
|
||||
|
||||
std::vector<int64_t> aDim = MycastTo64(a);
|
||||
std::vector<int64_t> aStride = MycastTo64(aS);
|
||||
std::vector<int64_t> bDim = MycastTo64(b);
|
||||
std::vector<int64_t> bStride = MycastTo64(bS);
|
||||
std::vector<int64_t> cDim = MycastTo64(c);
|
||||
std::vector<int64_t> cStride = MycastTo64(cS);
|
||||
|
||||
auto inputA = aclCreateTensor(
|
||||
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
|
||||
auto inputB = aclCreateTensor(
|
||||
bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);
|
||||
auto output = aclCreateTensor(
|
||||
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
|
||||
auto ret = aclnnPowTensorTensorGetWorkspaceSize(
|
||||
inputA, inputB, output, &workspaceSize, &executor);
|
||||
void *workspaceAddr = nullptr;
|
||||
if (workspaceSize > 0) {
|
||||
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||
}
|
||||
assert(ret == ACL_SUCCESS);
|
||||
ret = aclnnPowTensorTensor(workspaceAddr, workspaceSize, executor,
|
||||
context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
ret = aclDestroyTensor(inputA);
|
||||
ret = aclDestroyTensor(inputB);
|
||||
ret = aclDestroyTensor(output);
|
||||
|
||||
return;
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
// Generates an Ascend elementwise kernel class `<prefix>Aclnn` that forwards
// to the two-operand aclnn<prefix> operator (e.g. Mul, Div, PowTensorTensor).
// Fix vs. original: the output tensor descriptor is now built from the
// OUTPUT's dims/strides (op->getOutput()) instead of input 0's, which was
// wrong whenever the two inputs broadcast to a larger result shape.
#define DEFINE_ELEMENT_WISE_Aclnn(prefix)                                      \
    class prefix##Aclnn : public ASCENDKernelWithoutConfig {                   \
        void compute(const Operator &_op,                                      \
                     const RuntimeObj *_context) const override {              \
            auto op = as<ElementWiseObj>(_op);                                 \
            auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);   \
                                                                               \
            void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());   \
            void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());   \
            void *const cData = (op->getOutput()->getRawDataPtr<void *>());    \
                                                                               \
            auto a = op->getInputs(0) -> getDims();                            \
            auto aS = op->getInputs(0) -> getStride();                         \
            auto b = op->getInputs(1) -> getDims();                            \
            auto bS = op->getInputs(1) -> getStride();                         \
            auto c = op->getOutput() -> getDims();                             \
            auto cS = op->getOutput() -> getStride();                          \
                                                                               \
            /* aclnn descriptors require int64_t dims/strides. */              \
            std::vector<int64_t> aDim = MycastTo64(a);                         \
            std::vector<int64_t> aStride = MycastTo64(aS);                     \
            std::vector<int64_t> bDim = MycastTo64(b);                         \
            std::vector<int64_t> bStride = MycastTo64(bS);                     \
            std::vector<int64_t> cDim = MycastTo64(c);                         \
            std::vector<int64_t> cStride = MycastTo64(cS);                     \
                                                                               \
            auto inputA = aclCreateTensor(                                     \
                aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,        \
                aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);    \
            auto inputB = aclCreateTensor(                                     \
                bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,        \
                aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);    \
            auto output = aclCreateTensor(                                     \
                cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,        \
                aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);    \
                                                                               \
            uint64_t workspaceSize = 0;                                        \
            aclOpExecutor *executor;                                           \
                                                                               \
            /* Two-phase aclnn call: query workspace, then execute. */         \
            auto ret = aclnn##prefix##GetWorkspaceSize(                        \
                inputA, inputB, output, &workspaceSize, &executor);            \
            void *workspaceAddr = nullptr;                                     \
            if (workspaceSize > 0) {                                           \
                workspaceAddr = context->getWorkspace(workspaceSize);          \
            }                                                                  \
            assert(ret == ACL_SUCCESS);                                        \
            ret = aclnn##prefix(workspaceAddr, workspaceSize, executor,        \
                                context->ASCENDHandle());                      \
            assert(ret == ACL_SUCCESS);                                        \
                                                                               \
            ret = aclrtSynchronizeStream(context->ASCENDHandle());             \
            assert(ret == ACL_SUCCESS);                                        \
                                                                               \
            ret = aclDestroyTensor(inputA);                                    \
            ret = aclDestroyTensor(inputB);                                    \
            ret = aclDestroyTensor(output);                                    \
                                                                               \
            return;                                                            \
        }                                                                      \
    };
|
||||
|
||||
class AddAclnn : public ASCENDKernelWithoutConfig {
|
||||
virtual tuple<float, float, float> getAlphBeta() const {
|
||||
return {1.f, 1.f, 0.f};
|
||||
}
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
auto a = op->getInputs(0)->getDims();
|
||||
auto aS = op->getInputs(0)->getStride();
|
||||
auto b = op->getInputs(1)->getDims();
|
||||
auto bS = op->getInputs(1)->getStride();
|
||||
auto c = op->getInputs(0)->getDims();
|
||||
auto cS = op->getInputs(0)->getStride();
|
||||
|
||||
std::vector<int64_t> aDim = MycastTo64(a);
|
||||
std::vector<int64_t> aStride = MycastTo64(aS);
|
||||
std::vector<int64_t> bDim = MycastTo64(b);
|
||||
std::vector<int64_t> bStride = MycastTo64(bS);
|
||||
std::vector<int64_t> cDim = MycastTo64(c);
|
||||
std::vector<int64_t> cStride = MycastTo64(cS);
|
||||
|
||||
auto inputA = aclCreateTensor(
|
||||
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
|
||||
auto inputB = aclCreateTensor(
|
||||
bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);
|
||||
auto output = aclCreateTensor(
|
||||
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
|
||||
|
||||
auto [aAlpha, bAlpha, beta] = getAlphBeta();
|
||||
auto alpha = aclCreateScalar(&bAlpha, ACL_FLOAT);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
|
||||
auto ret = aclnnAddGetWorkspaceSize(inputA, inputB, alpha, output,
|
||||
&workspaceSize, &executor);
|
||||
void *workspaceAddr = nullptr;
|
||||
if (workspaceSize > 0) {
|
||||
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||
}
|
||||
assert(ret == ACL_SUCCESS);
|
||||
ret = aclnnAdd(workspaceAddr, workspaceSize, executor,
|
||||
context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
// ret = aclDestroyTensor(inputA);
|
||||
// ret = aclDestroyTensor(inputB);
|
||||
// ret = aclDestroyScalar(alpha);
|
||||
// ret = aclDestroyTensor(output);
|
||||
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
class SubAclnn : public ASCENDKernelWithoutConfig {
|
||||
virtual tuple<float, float, float> getAlphBeta() const {
|
||||
return {1.f, 1.f, 0.f};
|
||||
}
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
auto a = op->getInputs(0)->getDims();
|
||||
auto aS = op->getInputs(0)->getStride();
|
||||
auto b = op->getInputs(1)->getDims();
|
||||
auto bS = op->getInputs(1)->getStride();
|
||||
auto c = op->getInputs(0)->getDims();
|
||||
auto cS = op->getInputs(0)->getStride();
|
||||
|
||||
std::vector<int64_t> aDim = MycastTo64(a);
|
||||
std::vector<int64_t> aStride = MycastTo64(aS);
|
||||
std::vector<int64_t> bDim = MycastTo64(b);
|
||||
std::vector<int64_t> bStride = MycastTo64(bS);
|
||||
std::vector<int64_t> cDim = MycastTo64(c);
|
||||
std::vector<int64_t> cStride = MycastTo64(cS);
|
||||
|
||||
auto inputA = aclCreateTensor(
|
||||
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
|
||||
auto inputB = aclCreateTensor(
|
||||
bDim.data(), bDim.size(), ACL_FLOAT, bStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, bDim.data(), bDim.size(), bData);
|
||||
auto output = aclCreateTensor(
|
||||
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
|
||||
|
||||
auto [aAlpha, bAlpha, beta] = getAlphBeta();
|
||||
auto alpha = aclCreateScalar(&bAlpha, ACL_FLOAT);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
|
||||
auto ret = aclnnSubGetWorkspaceSize(inputA, inputB, alpha, output,
|
||||
&workspaceSize, &executor);
|
||||
void *workspaceAddr = nullptr;
|
||||
if (workspaceSize > 0) {
|
||||
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||
}
|
||||
assert(ret == ACL_SUCCESS);
|
||||
ret = aclnnSub(workspaceAddr, workspaceSize, executor,
|
||||
context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
ret = aclDestroyTensor(inputA);
|
||||
ret = aclDestroyTensor(inputB);
|
||||
ret = aclDestroyScalar(alpha);
|
||||
ret = aclDestroyTensor(output);
|
||||
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// Instantiate element-wise kernel classes from the macro:
// PowTensorTensorAclnn, DivAclnn, MulAclnn.
DEFINE_ELEMENT_WISE_Aclnn(PowTensorTensor);
DEFINE_ELEMENT_WISE_Aclnn(Div);
DEFINE_ELEMENT_WISE_Aclnn(Mul);

// Bind each kernel class to its operator type on the ASCEND device.
REGISTER_KERNEL(Device::ASCEND, OpType::Pow, PowTensorTensorAclnn,
                "pow_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Div, DivAclnn, "div_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Mul, MulAclnn, "mul_ASCEND_float");

REGISTER_KERNEL(Device::ASCEND, OpType::Add, AddAclnn, "add_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sub, SubAclnn, "sub_ASCEND_float");
// Abs is registered in the unary-kernel translation unit instead.
// REGISTER_KERNEL(Device::ASCEND, OpType::Abs, AbsAclnn, "abs_ASCEND_float");

}; // namespace infini
|
|
@ -0,0 +1,71 @@
|
|||
#include "operators/matmul.h"
|
||||
#include "aclnnop/level2/aclnn_matmul.h"
|
||||
#include "ascend/ascend_kernel_without_config.h"
|
||||
#include "ascend/ascend_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class MatmulAclnn : public ASCENDKernelWithoutConfig {

    // Matrix multiplication via aclnnMatmul: wraps the input/weight/output
    // device buffers in aclTensor descriptors, queries the workspace size,
    // launches on the runtime stream and synchronizes before returning.
    // NOTE(review): transA/transB of MatmulObj are not consulted here —
    // presumably only the non-transposed case is supported; confirm.
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<MatmulObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        // Raw device pointers: input A, input B, output C.
        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto selfD = op->getInputs(0)->getDims();
        auto selfS = op->getInputs(0)->getStride();
        auto matD = op->getInputs(1)->getDims();
        auto matS = op->getInputs(1)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        // ACL takes int64_t dims/strides; widen the framework's int vectors.
        std::vector<int64_t> selfDim = MycastTo64(selfD);
        std::vector<int64_t> selfStride = MycastTo64(selfS);
        std::vector<int64_t> matDim = MycastTo64(matD);
        std::vector<int64_t> matStride = MycastTo64(matS);
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        auto selfTensor = aclCreateTensor(
            selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, selfDim.data(), selfDim.size(), aData);
        auto matTensor = aclCreateTensor(
            matDim.data(), matDim.size(), ACL_FLOAT, matStride.data(), 0,
            aclFormat::ACL_FORMAT_ND, matDim.data(), matDim.size(), bData);
        auto outputTensor =
            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                            outputStride.data(), 0, aclFormat::ACL_FORMAT_ND,
                            outputDim.data(), outputDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        // The literal 1 is the cubeMathType selector — assumed to pick the
        // default math mode; TODO confirm against the aclnnMatmul docs.
        auto ret = aclnnMatmulGetWorkspaceSize(
            selfTensor, matTensor, outputTensor, 1, &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnMatmul(workspaceAddr, workspaceSize, executor,
                          context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // NOTE(review): tensor descriptors are never destroyed — leaks one
        // set of ACL handles per launch; confirm whether destruction after
        // stream sync is safe here.
        // aclDestroyTensor(selfTensor);
        // aclDestroyTensor(matTensor);
        // aclDestroyTensor(outputTensor);

        return;
    }
};
|
||||
|
||||
// Bind the aclnn-backed matmul kernel to OpType::MatMul on ASCEND.
REGISTER_KERNEL(Device::ASCEND, OpType::MatMul, MatmulAclnn,
                "matmul_ASCEND_float");
}; // namespace infini
|
|
@ -0,0 +1,76 @@
|
|||
#include "operators/pooling.h"
|
||||
#include "aclnnop/level2/aclnn_avgpool2d.h"
|
||||
#include "ascend/ascend_kernel_without_config.h"
|
||||
#include "ascend/ascend_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class AvgPooling : public ASCENDKernelWithoutConfig {

    // 2-D average pooling via aclnnAvgPool2d on NCHW tensors: build aclTensor
    // descriptors over the raw device buffers, query the workspace size,
    // launch on the runtime stream and synchronize before returning.
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PoolingObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        // Raw device pointers of the input and the output.
        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        // Only kh/kw (kernel extent) and ph/pw/sh/sw (pad, stride) are used;
        // n/c/h/w and dh/dw (dilation) are unpacked but unused below.
        auto [n, c, h, w, kh, kw] = op->getNCHWRS();
        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();

        std::vector<int64_t> ksize = {kh, kw};
        std::vector<int64_t> stride = {sh, sw};
        std::vector<int64_t> pad = {ph, pw};

        // Forces the averaging divisor to kh*kw regardless of how much of the
        // window is padding — presumably to match the framework's
        // count-include-pad semantics; TODO confirm against aclnnAvgPool2d.
        int64_t divisorOverride = kh * kw;

        auto selfD = op->getInputs(0)->getDims();
        auto selfS = op->getInputs(0)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        // ACL takes int64_t dims/strides; widen the framework's int vectors.
        std::vector<int64_t> selfDim = MycastTo64(selfD);
        std::vector<int64_t> selfStride = MycastTo64(selfS);
        std::vector<int64_t> outputDim = MycastTo64(outD);
        std::vector<int64_t> outputStride = MycastTo64(outS);

        aclIntArray *kernelSize = aclCreateIntArray(ksize.data(), ksize.size());
        aclIntArray *strides = aclCreateIntArray(stride.data(), stride.size());
        aclIntArray *paddings = aclCreateIntArray(pad.data(), pad.size());

        auto selfTensor = aclCreateTensor(
            selfDim.data(), selfDim.size(), ACL_FLOAT, selfStride.data(), 0,
            aclFormat::ACL_FORMAT_NCHW, selfDim.data(), selfDim.size(), aData);
        auto outputTensor =
            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            outputDim.data(), outputDim.size(), cData);

        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        // Positional literals: false = ceilMode, true = countIncludePad,
        // 1 = cubeMathType — meanings assumed from call order; TODO confirm.
        auto ret = aclnnAvgPool2dGetWorkspaceSize(
            selfTensor, kernelSize, strides, paddings, false, true,
            divisorOverride, 1, outputTensor, &workspaceSize, &executor);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        assert(ret == ACL_SUCCESS);
        ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor,
                             context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // NOTE(review): tensor/int-array handles are never destroyed — leaks
        // one set per launch; confirm whether destruction is safe here.
        // aclDestroyTensor(selfTensor);
        // aclDestroyTensor(outputTensor);

        return;
    }
};
|
||||
|
||||
// Bind the aclnn-backed average-pooling kernel to OpType::AveragePool.
REGISTER_KERNEL(Device::ASCEND, OpType::AveragePool, AvgPooling,
                "avgpooling_ASCEND_float");
}; // namespace infini
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
#include "operators/softmax.h"
|
||||
#include "aclnnop/level2/aclnn_softmax.h"
|
||||
#include "ascend/ascend_kernel_without_config.h"
|
||||
#include "ascend/ascend_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
class SoftmaxAclnn : public ASCENDKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<SoftmaxObj>(_op);
|
||||
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
int64_t axis = int64_t(op->getAxis());
|
||||
|
||||
auto a = op->getInputs(0)->getDims();
|
||||
auto aS = op->getInputs(0)->getStride();
|
||||
auto c = op->getInputs(0)->getDims();
|
||||
auto cS = op->getInputs(0)->getStride();
|
||||
|
||||
std::vector<int64_t> aDim = MycastTo64(a);
|
||||
std::vector<int64_t> aStride = MycastTo64(aS);
|
||||
std::vector<int64_t> cDim = MycastTo64(c);
|
||||
std::vector<int64_t> cStride = MycastTo64(cS);
|
||||
|
||||
auto input = aclCreateTensor(
|
||||
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
|
||||
auto output = aclCreateTensor(
|
||||
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
|
||||
auto ret = aclnnSoftmaxGetWorkspaceSize(input, axis, output,
|
||||
&workspaceSize, &executor);
|
||||
void *workspaceAddr = nullptr;
|
||||
if (workspaceSize > 0) {
|
||||
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||
}
|
||||
assert(ret == ACL_SUCCESS);
|
||||
ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor,
|
||||
context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
// aclDestroyTensor(input);
|
||||
// aclDestroyTensor(output);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// Bind the aclnn-backed softmax kernel to OpType::Softmax on ASCEND.
REGISTER_KERNEL(Device::ASCEND, OpType::Softmax, SoftmaxAclnn,
                "softmax_ASCEND_float");

}; // namespace infini
|
|
@ -0,0 +1,183 @@
|
|||
#include "operators/unary.h"
|
||||
#include "aclnnop/level2/aclnn_abs.h"
|
||||
#include "aclnnop/level2/aclnn_acos.h"
|
||||
#include "aclnnop/level2/aclnn_atan.h"
|
||||
#include "aclnnop/level2/aclnn_ceil.h"
|
||||
#include "aclnnop/level2/aclnn_cos.h"
|
||||
#include "aclnnop/level2/aclnn_exp.h"
|
||||
#include "aclnnop/level2/aclnn_floor.h"
|
||||
#include "aclnnop/level2/aclnn_gelu.h"
|
||||
#include "aclnnop/level2/aclnn_hardswish.h"
|
||||
#include "aclnnop/level2/aclnn_neg.h"
|
||||
#include "aclnnop/level2/aclnn_reciprocal.h"
|
||||
#include "aclnnop/level2/aclnn_relu.h"
|
||||
#include "aclnnop/level2/aclnn_round.h"
|
||||
#include "aclnnop/level2/aclnn_sigmoid.h"
|
||||
#include "aclnnop/level2/aclnn_sin.h"
|
||||
#include "aclnnop/level2/aclnn_sqrt.h"
|
||||
#include "aclnnop/level2/aclnn_tanh.h"
|
||||
#include "ascend/ascend_kernel_without_config.h"
|
||||
#include "ascend/ascend_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
class ReluAclnn : public ASCENDKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
auto a = op->getInputs(0)->getDims();
|
||||
std::vector<int64_t> aDim(a.size(), 1);
|
||||
for (size_t i = 0; i < a.size(); ++i) {
|
||||
aDim[i] = int64_t(a[i]);
|
||||
}
|
||||
auto aS = op->getInputs(0)->getStride();
|
||||
std::vector<int64_t> aStride(aS.size(), 1);
|
||||
for (size_t i = 0; i < aS.size(); ++i) {
|
||||
aStride[i] = int64_t(aS[i]);
|
||||
}
|
||||
auto c = op->getInputs(0)->getDims();
|
||||
std::vector<int64_t> cDim(c.size(), 1);
|
||||
for (size_t i = 0; i < c.size(); ++i) {
|
||||
cDim[i] = int64_t(c[i]);
|
||||
}
|
||||
auto cS = op->getInputs(0)->getStride();
|
||||
std::vector<int64_t> cStride(cS.size(), 1);
|
||||
for (size_t i = 0; i < cS.size(); ++i) {
|
||||
cStride[i] = int64_t(cS[i]);
|
||||
}
|
||||
|
||||
auto input = aclCreateTensor(
|
||||
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
|
||||
auto output = aclCreateTensor(
|
||||
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
|
||||
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
|
||||
auto ret =
|
||||
aclnnReluGetWorkspaceSize(input, output, &workspaceSize, &executor);
|
||||
void *workspaceAddr = nullptr;
|
||||
if (workspaceSize > 0) {
|
||||
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||
}
|
||||
assert(ret == ACL_SUCCESS);
|
||||
ret = aclnnRelu(workspaceAddr, workspaceSize, executor,
|
||||
context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
// aclDestroyTensor(input);
|
||||
// aclDestroyTensor(output);
|
||||
|
||||
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||
assert(ret == ACL_SUCCESS);
|
||||
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// DEFINE_UNARY_Aclnn(prefix) declares class `<prefix>Aclnn`, an ASCEND kernel
// that launches the one-input aclnn operator `aclnn<prefix>`
// (e.g. aclnnAbs, aclnnSqrt, aclnnSigmoid).
// Steps: widen dims/strides to int64_t, wrap the raw device buffers in
// aclTensor descriptors, query workspace, launch, synchronize.
// Comments are kept outside the macro body: a `//` comment inside a
// line-continued macro would swallow the trailing backslash.
// Fix vs. the original: the output descriptor is built from op->getOutput()
// (dims and strides), not from input 0.
#define DEFINE_UNARY_Aclnn(prefix)                                             \
    class prefix##Aclnn : public ASCENDKernelWithoutConfig {                   \
        void compute(const Operator &_op,                                      \
                     const RuntimeObj *_context) const override {              \
            auto op = as<UnaryObj>(_op);                                       \
            auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);   \
                                                                               \
            void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());   \
            void *const cData = (op->getOutput()->getRawDataPtr<void *>());    \
                                                                               \
            auto a = op->getInputs(0)->getDims();                              \
            std::vector<int64_t> aDim(a.size(), 1);                            \
            for (size_t i = 0; i < a.size(); ++i) {                            \
                aDim[i] = int64_t(a[i]);                                       \
            }                                                                  \
            auto aS = op->getInputs(0)->getStride();                           \
            std::vector<int64_t> aStride(aS.size(), 1);                        \
            for (size_t i = 0; i < aS.size(); ++i) {                           \
                aStride[i] = int64_t(aS[i]);                                   \
            }                                                                  \
            auto c = op->getOutput()->getDims();                               \
            std::vector<int64_t> cDim(c.size(), 1);                            \
            for (size_t i = 0; i < c.size(); ++i) {                            \
                cDim[i] = int64_t(c[i]);                                       \
            }                                                                  \
            auto cS = op->getOutput()->getStride();                            \
            std::vector<int64_t> cStride(cS.size(), 1);                        \
            for (size_t i = 0; i < cS.size(); ++i) {                           \
                cStride[i] = int64_t(cS[i]);                                   \
            }                                                                  \
                                                                               \
            auto input = aclCreateTensor(                                      \
                aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,        \
                aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);    \
            auto output = aclCreateTensor(                                     \
                cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,        \
                aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);    \
                                                                               \
            uint64_t workspaceSize = 0;                                        \
            aclOpExecutor *executor;                                           \
                                                                               \
            auto ret = aclnn##prefix##GetWorkspaceSize(                        \
                input, output, &workspaceSize, &executor);                     \
            void *workspaceAddr = nullptr;                                     \
            if (workspaceSize > 0) {                                           \
                workspaceAddr = context->getWorkspace(workspaceSize);          \
            }                                                                  \
            assert(ret == ACL_SUCCESS);                                        \
            ret = aclnn##prefix(workspaceAddr, workspaceSize, executor,        \
                                context->ASCENDHandle());                      \
            assert(ret == ACL_SUCCESS);                                        \
            ret = aclrtSynchronizeStream(context->ASCENDHandle());             \
            assert(ret == ACL_SUCCESS);                                        \
                                                                               \
            return;                                                            \
        }                                                                      \
    };
|
||||
|
||||
// Instantiate one kernel class per supported unary aclnn operator.
DEFINE_UNARY_Aclnn(Abs);
DEFINE_UNARY_Aclnn(Sigmoid);
DEFINE_UNARY_Aclnn(Hardswish);
DEFINE_UNARY_Aclnn(Gelu);

DEFINE_UNARY_Aclnn(Tanh);
DEFINE_UNARY_Aclnn(Sin);
DEFINE_UNARY_Aclnn(Cos);
DEFINE_UNARY_Aclnn(Acos);
DEFINE_UNARY_Aclnn(Atan);

DEFINE_UNARY_Aclnn(Ceil);
DEFINE_UNARY_Aclnn(Floor);
DEFINE_UNARY_Aclnn(Exp);
DEFINE_UNARY_Aclnn(Neg);
DEFINE_UNARY_Aclnn(Reciprocal);
DEFINE_UNARY_Aclnn(Sqrt);
DEFINE_UNARY_Aclnn(Round);

// Bind each kernel class to its operator type on the ASCEND device.
REGISTER_KERNEL(Device::ASCEND, OpType::Relu, ReluAclnn, "relu_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Abs, AbsAclnn, "abs_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sigmoid, SigmoidAclnn,
                "sigmoid_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::HardSwish, HardswishAclnn,
                "hardswish_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Tanh, TanhAclnn, "tanh_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Gelu, GeluAclnn, "gelu_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sin, SinAclnn, "sin_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Cos, CosAclnn, "cos_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Acos, AcosAclnn, "acos_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Atan, AtanAclnn, "atan_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Neg, NegAclnn, "neg_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Ceil, CeilAclnn, "ceil_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Floor, FloorAclnn,
                "floor_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Exp, ExpAclnn, "exp_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Reciprocal, ReciprocalAclnn,
                "reciprocal_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Sqrt, SqrtAclnn, "sqrt_ASCEND_float");
REGISTER_KERNEL(Device::ASCEND, OpType::Round, RoundAclnn,
                "round_ASCEND_float");
}; // namespace infini
|
|
@ -0,0 +1,58 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/batch_norm.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
// BatchNorm inference on the NPU against hand-computed expected values.
TEST(ascend_BatchNorm, run) {
    aclInit(nullptr);
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto npuRuntime = make_ref<ASCENDRuntimeObj>();

    // Build cpu graph
    Graph gCpu = make_ref<GraphObj>(cpuRuntime);
    auto iCpu = gCpu->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
    auto meanCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
    auto varCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
    auto scaleCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
    auto biasCpu = gCpu->addTensor(Shape{3}, DataType::Float32);

    // Build input data on CPU
    gCpu->dataMalloc();
    iCpu->setData(IncrementalGenerator());
    meanCpu->copyin(vector<float>{1, 6, 9});
    varCpu->copyin(vector<float>{4, 1, 9});
    scaleCpu->setData(OneGenerator());
    biasCpu->setData(ZeroGenerator());

    // Build NPU graph (comment previously said "CUDA" — copied from the
    // CUDA test this file was adapted from)
    Graph g = make_ref<GraphObj>(npuRuntime);
    auto i = g->cloneTensor(iCpu);
    auto mean = g->cloneTensor(meanCpu);
    auto var = g->cloneTensor(varCpu);
    auto scale = g->cloneTensor(scaleCpu);
    auto bias = g->cloneTensor(biasCpu);
    auto op =
        g->addOp<BatchNormObj>(i, nullptr, mean, var, scale, bias, 0.9, 0);

    // allocate NPU memory
    g->dataMalloc();

    // Execute on the NPU
    npuRuntime->run(g);

    // clone NPU output to CPU
    auto o = op->getOutput();
    auto ocpu = o->clone(cpuRuntime);

    // check results on CPU
    EXPECT_TRUE(ocpu->equalData(vector<float>{
        -0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.333333, 0.666667}));

    aclFinalize();
}
|
||||
} // namespace infini
|
|
@ -0,0 +1,65 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/concat.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const Shape &shape) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generator);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generator);
|
||||
Tensor inputCpu3 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu3->dataMalloc();
|
||||
inputCpu3->setData(generator);
|
||||
|
||||
// NPU
|
||||
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
|
||||
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
|
||||
auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
|
||||
auto inputNpu3 = npuGraph->cloneTensor(inputCpu3);
|
||||
auto npuOp = npuGraph->addOp<T>(TensorVec{inputNpu1, inputNpu2, inputNpu3},
|
||||
nullptr, 2);
|
||||
npuGraph->dataMalloc();
|
||||
inputNpu1->setData(generator);
|
||||
inputNpu2->setData(generator);
|
||||
inputNpu3->setData(generator);
|
||||
npuRuntime->run(npuGraph);
|
||||
auto outputNpu = npuOp->getOutput();
|
||||
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
|
||||
|
||||
// Check
|
||||
inputCpu1->print();
|
||||
inputCpu1->printData();
|
||||
inputCpu2->print();
|
||||
inputCpu2->printData();
|
||||
inputCpu3->print();
|
||||
inputCpu3->printData();
|
||||
outputNpu2Cpu->print();
|
||||
outputNpu2Cpu->printData();
|
||||
EXPECT_TRUE(1);
|
||||
}
|
||||
|
||||
// Smoke test: concatenate three 1x2x2x3 tensors along axis 2 on the NPU.
TEST(ascend_Concat, run) {
    // Bring the ACL runtime up for the duration of this test.
    aclInit(nullptr);
    testConcat<ConcatObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    aclFinalize();
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,58 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/conv.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
|
||||
const std::function<void(void *, size_t, DataType)> &generatorB,
|
||||
const Shape &shapeA, const Shape &shapeB) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
|
||||
// NPU
|
||||
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
|
||||
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
|
||||
auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
|
||||
auto npuOp =
|
||||
npuGraph->addOp<T>(inputNpu1, inputNpu2, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
npuGraph->dataMalloc();
|
||||
inputNpu1->setData(generatorA);
|
||||
inputNpu2->setData(generatorB);
|
||||
npuRuntime->run(npuGraph);
|
||||
auto outputNpu = npuOp->getOutput();
|
||||
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
cpuGraph->addTensor(inputCpu1);
|
||||
cpuGraph->addTensor(inputCpu2);
|
||||
auto cpuOp =
|
||||
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu1->setData(generatorA);
|
||||
inputCpu2->setData(generatorB);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// Check
|
||||
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
|
||||
}
|
||||
|
||||
// Conv correctness vs. the CPU reference: 1x3x32x32 input, 2x3x3x3 weights.
TEST(ascend_Conv, run) {
    // Bring the ACL runtime up for the duration of this test.
    aclInit(nullptr);
    testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
                      Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
    aclFinalize();
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,61 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/element_wise.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testElementWise(
|
||||
const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const Shape &shape) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu1->setData(generator);
|
||||
inputCpu2->setData(generator);
|
||||
|
||||
// NPU
|
||||
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
|
||||
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
|
||||
auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
|
||||
auto npuOp = npuGraph->addOp<T>(inputNpu1, inputNpu2, nullptr);
|
||||
npuGraph->dataMalloc();
|
||||
inputNpu1->setData(generator);
|
||||
inputNpu2->setData(generator);
|
||||
npuRuntime->run(npuGraph);
|
||||
auto outputNpu = npuOp->getOutput();
|
||||
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
|
||||
|
||||
// Check
|
||||
inputCpu1->print();
|
||||
inputCpu1->printData();
|
||||
inputCpu2->print();
|
||||
inputCpu2->printData();
|
||||
outputNpu2Cpu->print();
|
||||
outputNpu2Cpu->printData();
|
||||
EXPECT_TRUE(1);
|
||||
}
|
||||
|
||||
// Smoke-run every registered binary element-wise kernel on a 1x2x2x3 shape.
TEST(ascend_ElementWise, run) {
    // Bring the ACL runtime up for the duration of this test.
    aclInit(nullptr);
    testElementWise<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testElementWise<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testElementWise<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testElementWise<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testElementWise<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    aclFinalize();
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,59 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/matmul.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
|
||||
const std::function<void(void *, size_t, DataType)> &generatorB,
|
||||
bool transA, bool transB, const Shape &shapeA,
|
||||
const Shape &shapeB) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
|
||||
|
||||
// NPU
|
||||
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
|
||||
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
|
||||
auto inputNpu2 = npuGraph->cloneTensor(inputCpu2);
|
||||
auto npuOp = npuGraph->addOp<T>(inputNpu1, inputNpu2, nullptr);
|
||||
npuGraph->dataMalloc();
|
||||
inputNpu1->setData(generatorA);
|
||||
inputNpu2->setData(generatorB);
|
||||
npuRuntime->run(npuGraph);
|
||||
auto outputNpu = npuOp->getOutput();
|
||||
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
|
||||
cpuGraph->addTensor(inputCpu1);
|
||||
cpuGraph->addTensor(inputCpu2);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu1->setData(generatorA);
|
||||
inputCpu2->setData(generatorB);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
|
||||
// Check
|
||||
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu));
|
||||
}
|
||||
|
||||
// Plain (non-transposed) batched matmul: [1,2,3] x [1,3,4] -> [1,2,4].
TEST(ascend_Matmul, run) {
    aclInit(nullptr);
    testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(),
                          /*transA=*/false, /*transB=*/false, Shape{1, 2, 3},
                          Shape{1, 3, 4});
    aclFinalize();
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,47 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/pooling.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T, typename std::enable_if<std::is_base_of<PoolingObj, T>{},
|
||||
int>::type = 0>
|
||||
void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const Shape &shape) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
|
||||
// GPU
|
||||
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
|
||||
auto inputNpu = npuGraph->cloneTensor(inputCpu);
|
||||
auto npuOp =
|
||||
npuGraph->addOp<T>(inputNpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
|
||||
npuGraph->dataMalloc();
|
||||
inputNpu->setData(generator);
|
||||
npuRuntime->run(npuGraph);
|
||||
|
||||
auto outputNpu = npuOp->getOutput();
|
||||
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
|
||||
inputCpu->printData();
|
||||
outputNpu2Cpu->printData();
|
||||
EXPECT_TRUE(1);
|
||||
}
|
||||
|
||||
// NOTE(review): suite renamed from "cnnl_Pooling" — a copy-paste leftover
// from the Cambricon CNNL tests — to the ascend_* naming used by the other
// ASCEND test files.
TEST(ascend_Pooling, run) {
    aclInit(nullptr);
    // MaxPool disabled pending kernel support:
    // testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
    testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
    aclFinalize();
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,55 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/softmax.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testSoftmax(const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const Shape &shape, int axis, vector<float> Out) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
// inputCpu1->setData(generator);
|
||||
|
||||
// NPU
|
||||
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
|
||||
auto inputNpu1 = npuGraph->cloneTensor(inputCpu1);
|
||||
auto npuOp = npuGraph->addOp<T>(inputNpu1, nullptr, axis);
|
||||
npuGraph->dataMalloc();
|
||||
inputNpu1->setData(generator);
|
||||
npuRuntime->run(npuGraph);
|
||||
auto outputNpu = npuOp->getOutput();
|
||||
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
|
||||
|
||||
// Check
|
||||
EXPECT_TRUE(outputNpu2Cpu->equalData(Out));
|
||||
}
|
||||
|
||||
// NOTE(review): suite renamed from "ascend_ElementWise" — that name collided
// with TEST(ascend_ElementWise, run) in the element-wise test file; linking
// both into one gtest binary defines the same test fixture class twice (ODR
// violation / duplicate test registration).
TEST(ascend_Softmax, run) {
    aclInit(nullptr);
    // Softmax along axis 1 of a 2x2x2x2 incremental input.
    testSoftmax<SoftmaxObj>(
        IncrementalGenerator(), Shape{2, 2, 2, 2}, 1,
        vector<float>{0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138,
                      0.9820138, 0.9820138, 0.9820138, 0.0179862, 0.0179862,
                      0.0179862, 0.0179862, 0.9820138, 0.9820138, 0.9820138,
                      0.9820138});
    // Softmax along the innermost axis (3).
    testSoftmax<SoftmaxObj>(
        IncrementalGenerator(), Shape{2, 2, 2, 2}, 3,
        vector<float>{0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
                      0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
                      0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
                      0.7310586});
    aclFinalize();
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,64 @@
|
|||
#include "ascend/ascend_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/unary.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const Shape &shape) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||
auto npuRuntime = make_ref<ASCENDRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
|
||||
// GPU
|
||||
Graph npuGraph = make_ref<GraphObj>(npuRuntime);
|
||||
auto inputNpu = npuGraph->cloneTensor(inputCpu);
|
||||
auto npuOp = npuGraph->addOp<T>(inputNpu, nullptr);
|
||||
npuGraph->dataMalloc();
|
||||
inputNpu->setData(generator);
|
||||
npuRuntime->run(npuGraph);
|
||||
auto outputNpu = npuOp->getOutput();
|
||||
auto outputNpu2Cpu = outputNpu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
|
||||
cpuGraph->addTensor(inputCpu);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// Check
|
||||
EXPECT_TRUE(outputCpu->equalData(outputNpu2Cpu, 1e-3));
|
||||
}
|
||||
|
||||
// Run every supported unary operator once on the ASCEND NPU against the CPU
// reference, all with the same input shape.
TEST(ascend_Unary, run) {
    aclInit(nullptr);
    const Shape shape{1, 2, 2, 3};
    testUnary<ReluObj>(IncrementalGenerator(), shape);
    testUnary<AbsObj>(IncrementalGenerator(), shape);
    testUnary<SigmoidObj>(IncrementalGenerator(), shape);
    testUnary<HardSwishObj>(IncrementalGenerator(), shape);
    testUnary<TanhObj>(IncrementalGenerator(), shape);
    testUnary<SinObj>(IncrementalGenerator(), shape);
    testUnary<GeluObj>(IncrementalGenerator(), shape);
    testUnary<CosObj>(IncrementalGenerator(), shape);
    testUnary<ACosObj>(IncrementalGenerator(), shape);
    testUnary<ATanObj>(IncrementalGenerator(), shape);
    testUnary<NegObj>(IncrementalGenerator(), shape);
    testUnary<SqrtObj>(IncrementalGenerator(), shape);
    // Disabled pending kernel support:
    // testUnary<CeilObj>(IncrementalGenerator(), shape);
    // testUnary<FloorObj>(IncrementalGenerator(), shape);
    // testUnary<ExpObj>(IncrementalGenerator(), shape);
    // testUnary<ReciprocalObj>(IncrementalGenerator(), shape);
    // testUnary<RoundObj>(IncrementalGenerator(), shape);
    aclFinalize();
}
|
||||
|
||||
} // namespace infini
|
Loading…
Reference in New Issue