* support kunlun xpu and add an operator named Add

* add sub, mul, div, pow, maximum, minimum

* add code

* add xpu code

* add code

* add matmul

* add transpose

* add unary operator

* add unary operator

* add some operators

* add code

* support run resnet18 on xpu

* add code

* add max pool2d

* fix xpu code so it can run

* Add XPU operators (#120)

* add floordiv for xpu

* add batchnorm for xpu

* add more cast types for xpu

* add conv_trans for xpu

* add pad for xpu

* add logical ops for xpu

* fix format for xpu src and include

* fix format for xpu test

* fix format for xpu src

---------

Co-authored-by: Bolun <bolunz@u.nus.edu>

* Xpu abs (#121)

* add: unary kernel for xpu

* formatting

* format

* format

* format

* fix: pointer jump

* fix optype comments

* fix bug introduced while resolving conflict

* change cmake option for kunlunxin xpu from 'xpu' to 'kunlun'; fix bug after merging distributed infrastructure

* Add doc support for xpu (#141)

* fix

* fix

* fix pooling test

* format

* format

* fix

* fix

* set cmake version requirement

* fix cmakelists

* rename xpu to kunlun

* fix

* fix format

* fix format

* fix format

* fix: change name to kunlun

* format

* fix format

* clang format

* fix format

---------

Co-authored-by: root <root@localhost.localdomain>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: Bolun Zhang <48948016+Chamberlain0w0@users.noreply.github.com>
Co-authored-by: Bolun <bolunz@u.nus.edu>
Co-authored-by: zhangyue207 <138768300+zhangyue207@users.noreply.github.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
Co-authored-by: baominghelly <41820386+baominghelly@users.noreply.github.com>
Co-authored-by: Bolun <chamberlain0w0@gmail.com>
Hardy 2023-10-16 10:57:08 +08:00 committed by GitHub
parent 8e4d88fb9f
commit 1184fa131f
46 changed files with 2874 additions and 26 deletions


@ -1,16 +1,23 @@
cmake_minimum_required(VERSION 3.17) # FindCUDAToolkit
include(CMakeDependentOption)
project(InfiniTensor C CXX)
# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
option(USE_CUDA "Support CUDA GPU" OFF)
option(USE_BANG "Support BANG MLU" OFF)
option(USE_KUNLUN "Support KUNLUN XPU" OFF)
option(USE_INTELCPU "Support INTELCPU" OFF)
option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
option(BUILD_DIST "Build project for distributed running" OFF)
option(BUILD_TEST "Build tests" OFF)
if(USE_CUDA)
message("CMake 3.18 or higher is required for setting CUDAToolkit")
cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
else()
cmake_minimum_required(VERSION 3.12)
endif()
include(CMakeDependentOption)
project(InfiniTensor C CXX)
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
@ -128,6 +135,11 @@ if(USE_BANG)
list (APPEND SRC ${SRC_BANG})
endif()
if(USE_KUNLUN)
file(GLOB_RECURSE SRC_KUNLUN src/kunlun/*.cc src/kernels/kunlun/*.cc )
list (APPEND SRC ${SRC_KUNLUN})
endif()
if(USE_INTELCPU)
file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
list (APPEND SRC ${SRC_INTELCPU})
@ -243,6 +255,35 @@ if(USE_BANG)
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
endif()
if(USE_KUNLUN)
add_compile_definitions(USE_KUNLUN=1)
if ((NOT DEFINED KUNLUN_HOME) AND (NOT DEFINED ENV{KUNLUN_HOME}))
message(FATAL_ERROR "KUNLUN_HOME is not defined from cmake or env")
elseif (DEFINED KUNLUN_HOME)
set(KUNLUN_HOME ${KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
else()
set(KUNLUN_HOME $ENV{KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
endif()
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
include_directories("${KUNLUN_HOME}/XTDK/include/")
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64")
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/XTDK/shlib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
elseif(DEFINED TARGET_CPU_ARCH)
set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
else()
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
endif()
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
endif()
# # Python bindings
# pybind11_add_module(infini MODULE ${FFI})
# target_link_libraries(infini PRIVATE infini_cpp)
@ -275,6 +316,9 @@ if(BUILD_TEST)
if (USE_BANG)
build_test(test/kernels/bang/*.cc)
endif()
if (USE_KUNLUN)
build_test(test/kernels/kunlun/*.cc)
endif()
if (USE_INTELCPU)
build_test(test/kernels/intelcpu/*.cc)
endif()


@ -3,6 +3,7 @@
TYPE ?= Release
CUDA ?= OFF
BANG ?= OFF
KUNLUN ?= OFF
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
@ -25,6 +26,7 @@ endif
CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
CMAKE_OPT += -DUSE_BANG=$(BANG)
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
CMAKE_OPT += -DBUILD_TEST=$(TEST)


@ -133,6 +133,13 @@
make install-python BANG=ON
```
Build the CPU part together with the Kunlun XPU part:
```bash
export KUNLUN_HOME=/path/to/your/kunlun_home
make install-python KUNLUN=ON
```
3. Usage
After installation succeeds, you can write and run code with this project's Python interface. For details, see the sample code example/Resnet/resnet.py and the user manual.


@ -26,6 +26,7 @@
- `TYPE`: build mode (`debug`/`release`); defaults to `release`
- `CUDA`: whether to build the CUDA backend; defaults to `OFF`, set to `ON` to enable
- `BANG`: whether to build the Cambricon backend; defaults to `OFF`, set to `ON` to enable
- `KUNLUN`: whether to build the Kunlun backend; defaults to `OFF`, set to `ON` to enable
- `BACKTRACE`: whether to enable stack backtraces; defaults to `ON`, set to `OFF` to disable; recommended during debugging
- `TEST`: whether to build `googletest`; defaults to `ON`, set to `OFF` to disable; only needed for `test-cpp`

env.sh

@ -35,4 +35,4 @@ export LD_LIBRARY_PATH="${NEUWARE_HOME}/lib64:${LD_LIBRARY_PATH}"
# ├── tools
# ├── version
# └── XTDK
export XPU_HOME=/usr/local/xpu
export KUNLUN_HOME=/usr/local/xpu


@ -21,10 +21,10 @@ struct OpType {
Add, // Binary
And, // Binary
ArgMax, //
Asin, // Binary
Asinh, // Binary
Atan, // Binary
Atanh, // Binary
Asin, // Unary
Asinh, // Unary
Atan, // Unary
Atanh, // Unary
AveragePool, // Pool
BatchNormalization, //
Bernoulli, //


@ -30,7 +30,7 @@ using OpLists = list<Operator>;
using VType = uint32_t;
enum class Device { CPU = 1, CUDA, BANG, INTELCPU };
enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN };
/***************** Forward declaration end *****************/
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
@ -72,6 +72,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
}
bool isCuda() const { return device == Device::CUDA; }
bool isBang() const { return device == Device::BANG; }
bool isKUNLUN() const { return device == Device::KUNLUN; }
void copyBlob(const TensorObj *dst, const TensorObj *src) const;
// TODO: unify these copy APIs
virtual void copyBlobFromCPU(void *dst, const void *src,


@ -180,14 +180,15 @@ class TensorObj : public TensorBaseObj {
}
template <typename T>
bool equalDataImpl(const T *a, const T *b, size_t size) const {
bool equalDataImpl(const T *a, const T *b, size_t size,
double relativeError = 1e-6) const {
for (size_t i = 0; i < size; ++i) {
if constexpr (std::is_integral_v<T>) {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
1e-6) {
relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}
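For reference, a minimal standalone sketch (sample values assumed, not taken from the repository) of the relative-error criterion that `equalDataImpl` now takes as a parameter:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Same check as equalDataImpl: |a - b| / max(|a|, |b|) must not exceed
// relativeError. The inputs below are made up for illustration.
static bool nearlyEqual(float a, float b, double relativeError = 1e-6) {
    return std::fabs(a - b) / std::max(std::fabs(a), std::fabs(b)) <=
           relativeError;
}

int main() {
    std::printf("%d\n", nearlyEqual(1.000001f, 1.000002f, 1e-3)); // 1: within tolerance
    std::printf("%d\n", nearlyEqual(1.0f, 1.1f));                 // 0: relative error ~0.09
    return 0;
}
```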


@ -0,0 +1,20 @@
#pragma once
#include "core/common.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
#define checkKUNLUNError(call) \
{ \
auto err = call; \
if (XPU_SUCCESS != err) { \
fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__, \
__LINE__, xpu_strerror(err)); \
exit(EXIT_FAILURE); \
} \
}
namespace infini {
using KUNLUNPtr = void *;
} // namespace infini
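As a usage sketch only (assumed, not part of this commit), the macro above can wrap any XPU runtime call that returns a status code; the allocation below mirrors the call used in KUNLUNRuntimeObj::alloc:

```cpp
#include "kunlun/kunlun_common.h"

// Allocate device memory, aborting with a file/line message if the XPU
// runtime reports an error; release the pointer with xpu_free when done.
infini::KUNLUNPtr allocOnXpu(size_t bytes) {
    void *ptr = nullptr;
    checkKUNLUNError(
        xpu_malloc_ex(&ptr, bytes, XPUMemoryKind::XPU_MEM_MAIN));
    return ptr;
}
```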


@ -0,0 +1,24 @@
#pragma once
#include "core/kernel.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class KUNLUNKernelWithoutConfig : public Kernel {
public:
virtual void compute(const Operator &op, const PerfRecord &record,
const RuntimeObj *context) const {
compute(op, context);
}
virtual void compute(const Operator &op,
const RuntimeObj *context) const = 0;
// Premise: op is idempotent since it is called multiple times.
virtual PerfRecord tune(const Operator &op,
const RuntimeObj *_context) const {
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
[&]() { context->sync(); }));
}
};
} // namespace infini


@ -0,0 +1,73 @@
#pragma once
#include "core/runtime.h"
#include "kunlun/kunlun_common.h"
namespace infini {
class KUNLUNRuntimeObj : public RuntimeObj {
private:
baidu::xpu::api::Context *xdnn;
KUNLUNPtr workspace;
size_t workspaceSize;
public:
KUNLUNRuntimeObj() : RuntimeObj(Device::KUNLUN) {
xdnn = baidu::xpu::api::create_context();
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 3ll << 30; // 3 GB
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
workspace = alloc(workspaceSize);
}
virtual ~KUNLUNRuntimeObj() {
dealloc(workspace);
baidu::xpu::api::destroy_context(xdnn);
}
string toString() const override;
void run(const Graph &graph, bool tune = false,
bool profiling = false) const;
// double runEvaluation(const Graph &graph, int nWarmups,
// int nEvaluations) const;
void sync() const;
KUNLUNPtr alloc(size_t size) override {
void *ptr;
checkKUNLUNError(
xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
return ptr;
}
void dealloc(void *ptr) override { xpu_free(ptr); }
baidu::xpu::api::Context *KUNLUNHandle() const { return xdnn; }
KUNLUNPtr getWorkspace(size_t size) const {
IT_ASSERT(size <= workspaceSize);
return workspace;
}
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
}
void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
}
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
}
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
};
} // namespace infini
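A usage sketch (assumed, not part of the commit) of the host/device copy path this runtime exposes; only methods declared above are used:

```cpp
#include "kunlun/kunlun_runtime.h"
#include <vector>

// Round-trip a small buffer through Kunlun device memory. Graph execution
// and error handling are omitted; the buffer size is made up for illustration.
void roundTrip() {
    auto runtime = infini::make_ref<infini::KUNLUNRuntimeObj>();
    std::vector<float> host(1024, 1.0f), back(1024, 0.0f);
    size_t bytes = host.size() * sizeof(float);
    infini::KUNLUNPtr dev = runtime->alloc(bytes);
    runtime->copyBlobFromCPU(dev, host.data(), bytes); // host -> device
    runtime->copyBlobToCPU(back.data(), dev, bytes);   // device -> host
    runtime->dealloc(dev);
}
```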


@ -0,0 +1,10 @@
#pragma once
namespace infini {
namespace opTimer {
double getPerfConvXdnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name);
double getPerfMatmulXdnn(int b, int m, int n, int k, const char *name);
} // namespace opTimer
} // namespace infini


@ -35,6 +35,7 @@ class OnnxStub:
The Onnx model imported into infinitensor.
It can be generated from an Onnx model object.
"""
def __init__(self, model: ModelProto, runtime):
# We use some user-defined operators for distributed inference
try:
@ -74,7 +75,6 @@ class OnnxStub:
)
tensors[output.name].set_output()
node_name = []
new_node_name = []
for node in model.graph.node:
@ -244,7 +244,13 @@ class OnnxStub:
)
(k, d, p, s, ceil_mode) = (
attributes[name]
for name in ["kernel_shape", "dilations", "pads", "strides", "ceil_mode"]
for name in [
"kernel_shape",
"dilations",
"pads",
"strides",
"ceil_mode",
]
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
@ -289,7 +295,8 @@ class OnnxStub:
},
)
(k, p, s, ceil_mode) = (
attributes[name] for name in ["kernel_shape", "pads", "strides", "ceil_mode"]
attributes[name]
for name in ["kernel_shape", "pads", "strides", "ceil_mode"]
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
@ -714,10 +721,9 @@ class OnnxStub:
elif node.op_type == "Constant":
output_name = node.output[0]
attributes = _parse_attribute(node)
tensor = attributes['value']
tensor = attributes["value"]
dims = [d for d in tensor.dims]
tensors[output_name] = self.handler.tensor(
dims, tensor.data_type)
tensors[output_name] = self.handler.tensor(dims, tensor.data_type)
data[output_name] = tensor
tensors[output_name].set_weight()
else:


@ -208,7 +208,7 @@ class TestStringMethods(unittest.TestCase):
relu = make_node("Relu", ["x"], ["y"], name="relu")
make_and_import_model(make_graph([relu], "relu", [x], [y]))
'''Gelu operator is not supported by onnx 14.1 currently.'''
"""Gelu operator is not supported by onnx 14.1 currently."""
def test_gelu(self):
pass
# x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
@ -319,9 +319,15 @@ class TestStringMethods(unittest.TestCase):
indices = make_tensor_value_info("indices", TensorProto.INT64, [2, 1, 2])
output = make_tensor_value_info("output", TensorProto.FLOAT, [2, 1, 2])
gatherElements = make_node(
"GatherElements", ["data", "indices"], ["output"], axis=1, name="gatherElements"
"GatherElements",
["data", "indices"],
["output"],
axis=1,
name="gatherElements",
)
make_and_import_model(
make_graph([gatherElements], "gatherElements", [data, indices], [output])
)
make_and_import_model(make_graph([gatherElements], "gatherElements", [data, indices], [output]))
def test_reduce_mean(self):
data = make_tensor_value_info("data", TensorProto.FLOAT, [2, 3, 3, 4])


@ -11,7 +11,7 @@ proj_path = Path(sys.path[0]).parent
def format_file(file):
file = Path(proj_path.joinpath(file))
if file.suffix in c_style_file:
run(f"clang-format-14 -i {file}", cwd=proj_path, shell=True)
run(f"clang-format-14 -style=file -i {file}", cwd=proj_path, shell=True)
run(f"git add {file}", cwd=proj_path, shell=True)
elif file.suffix == py_file:
run(f"black {file}", cwd=proj_path, shell=True)


@ -100,7 +100,8 @@ bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
#define TEST_EQUAL(N) \
if (dtype == DataType(N)) \
return equalDataImpl(getRawDataPtr<DT<N>::t *>(), \
rhs->getRawDataPtr<DT<N>::t *>(), size());
rhs->getRawDataPtr<DT<N>::t *>(), size(), \
relativeError);
TEST_EQUAL(0) // fmt: new line
else TEST_EQUAL(1) //


@ -24,6 +24,9 @@
#ifdef USE_BANG
#include "bang/bang_runtime.h"
#endif
#ifdef USE_KUNLUN
#include "kunlun/kunlun_runtime.h"
#endif
#ifdef USE_INTELCPU
#include "intelcpu/mkl_runtime.h"
#include "intelcpu/operator_timer.h"
@ -158,6 +161,12 @@ static int tensor_dtype(Tensor t) {
static Ref<BangRuntimeObj> bang_runtime() { return make_ref<BangRuntimeObj>(); }
#endif
#ifdef USE_KUNLUN
static Ref<KUNLUNRuntimeObj> kunlun_runtime() {
return make_ref<KUNLUNRuntimeObj>();
}
#endif
#ifdef USE_INTELCPU
static Ref<RuntimeObj> intelcpu_runtime() { return make_ref<MklRuntimeObj>(); }
#endif
@ -292,6 +301,10 @@ void export_functions(py::module &m) {
#ifdef USE_BANG
.FUNCTION(bang_runtime)
#endif
#ifdef USE_KUNLUN
.FUNCTION(kunlun_runtime)
#endif
.FUNCTION(conv_attrs_of)
.FUNCTION(conv_trans_attrs_of)
.FUNCTION(matmul_attrs_of)
@ -365,6 +378,10 @@ void init_graph_builder(py::module &m) {
#ifdef USE_BANG
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
m, "BangRuntime");
#endif
#ifdef USE_KUNLUN
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
m, "KUNLUNRuntime");
#endif
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor",
py::buffer_protocol())


@ -58,6 +58,21 @@ template <typename T> class NaiveMul : public NativeElementWise<T> {
template <typename T> class NaiveDiv : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 / val1); }
};
template <typename T> class NaiveEqual : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 == val1); }
};
template <typename T> class NaiveGreaterEqual : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 >= val1); }
};
template <typename T> class NaiveGreaterThan : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 > val1); }
};
template <typename T> class NaiveLessEqual : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 <= val1); }
};
template <typename T> class NaiveLessThan : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 < val1); }
};
REGISTER_KERNEL(Device::CPU, OpType::Add, DataType::UInt32, NaiveAdd<uint32_t>,
"addNaive_CPU_uint32");
@ -75,4 +90,24 @@ REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::UInt32, NaiveDiv<uint32_t>,
"divNaive_CPU_uint32"); "divNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::Float32, NaiveDiv<float>, REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::Float32, NaiveDiv<float>,
"divNaive_CPU_float32"); "divNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Equal, DataType::UInt32,
NaiveEqual<uint32_t>, "equalNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Equal, DataType::Float32,
NaiveEqual<float>, "equalNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::GreaterOrEqual, DataType::UInt32,
NaiveGreaterEqual<uint32_t>, "greaterEqualNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::GreaterOrEqual, DataType::Float32,
NaiveGreaterEqual<float>, "greaterEqualNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Greater, DataType::UInt32,
NaiveGreaterThan<uint32_t>, "greaterThanNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Greater, DataType::Float32,
NaiveGreaterThan<float>, "greaterThanNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::LessOrEqual, DataType::UInt32,
NaiveLessEqual<uint32_t>, "lessEqualNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::LessOrEqual, DataType::Float32,
NaiveLessEqual<float>, "lessEqualNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Less, DataType::UInt32,
NaiveLessThan<uint32_t>, "lessEqualNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Less, DataType::Float32,
NaiveLessThan<float>, "lessEqualNaive_CPU_float32");
}; // namespace infini


@ -71,6 +71,26 @@ template <typename T> class NaiveSqrt : public NativeUnary<T> {
T doCompute(T val) const override { return std::sqrt(val); }
};
template <typename T> class NaiveCos : public NativeUnary<T> {
T doCompute(T val) const override { return std::cos(val); }
};
template <typename T> class NaiveSin : public NativeUnary<T> {
T doCompute(T val) const override { return std::sin(val); }
};
template <typename T> class NaiveTan : public NativeUnary<T> {
T doCompute(T val) const override { return std::tan(val); }
};
template <typename T> class NaiveSinh : public NativeUnary<T> {
T doCompute(T val) const override { return std::sinh(val); }
};
template <typename T> class NaiveCosh : public NativeUnary<T> {
T doCompute(T val) const override { return std::cosh(val); }
};
template <typename T> class NaiveGelu : public NativeUnary<T> {
T doCompute(T val) const override {
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
@ -81,6 +101,26 @@ template <typename T> class NaiveErf : public NativeUnary<T> {
T doCompute(T val) const override { return std::erf(val); }
};
template <typename T> class NaiveACos : public NativeUnary<T> {
T doCompute(T val) const override { return std::acos(val); }
};
template <typename T> class NaiveACosh : public NativeUnary<T> {
T doCompute(T val) const override { return std::acosh(val); }
};
template <typename T> class NaiveASin : public NativeUnary<T> {
T doCompute(T val) const override { return std::asin(val); }
};
template <typename T> class NaiveASinh : public NativeUnary<T> {
T doCompute(T val) const override { return std::asinh(val); }
};
template <typename T> class NaiveATanh : public NativeUnary<T> {
T doCompute(T val) const override { return std::atanh(val); }
};
template <typename T> class NaiveNeg : public NativeUnary<T> {
T doCompute(T val) const override { return -val; }
};
@ -104,6 +144,43 @@ template <typename T> class Clip : public CpuKernelWithoutConfig {
}
};
template <typename T> class Log : public CpuKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
auto op = as<LogObj>(_op);
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
T *outptr = op->getOutput()->getRawDataPtr<T *>();
auto logType = op->getType(); // get log type
auto len = op->getOutput()->size();
for (size_t offset = 0; offset < len; offset++) {
T res;
auto val = *inptr++;
switch (logType) {
case LogObj::LogE:
res = std::log(val);
*outptr++ = res;
break;
case LogObj::Log2:
res = std::log2(val);
*outptr++ = res;
break;
case LogObj::Log10:
res = std::log10(val);
*outptr++ = res;
break;
default:
printf("LogType not Defined");
break;
}
}
}
};
template <typename T> class NaiveATan : public NativeUnary<T> {
T doCompute(T val) const override { return std::atan(val); }
};
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32,
NaiveRelu<uint32_t>, "reluNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>,
@ -140,4 +217,28 @@ REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32,
NaiveSoftmax<float>, "softmaxNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Clip, DataType::Float32, Clip<float>,
"Clip_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Atan, DataType::Float32, NaiveATan<float>,
"Atan_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Log, DataType::Float32, Log<float>,
"Log_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Cos, DataType::Float32, NaiveCos<float>,
"Cos_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sin, DataType::Float32, NaiveSin<float>,
"Sin_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Tan, DataType::Float32, NaiveTan<float>,
"Tan_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sinh, DataType::Float32, NaiveSinh<float>,
"Sinh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Cosh, DataType::Float32, NaiveCosh<float>,
"Cosh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Acos, DataType::Float32, NaiveACos<float>,
"ACos_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Acosh, DataType::Float32,
NaiveACosh<float>, "ACosh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Asin, DataType::Float32, NaiveASin<float>,
"ASin_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Asinh, DataType::Float32,
NaiveASinh<float>, "ASinh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Atanh, DataType::Float32,
NaiveATanh<float>, "ATanh_CPU_float32");
}; // namespace infini


@ -59,7 +59,8 @@ void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
reinterpret_cast<int *>(in), reinterpret_cast<int *>(out), metaData,
num);
} else {
IT_TODO_HALT_MSG("GatherElements Cuda Kernel: Unsupported data type.\n");
IT_TODO_HALT_MSG(
"GatherElements Cuda Kernel: Unsupported data type.\n");
}
}
} // namespace infini


@ -0,0 +1,41 @@
#include "operators/batch_norm.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class BatchNormXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<BatchNormObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const input = (op->getInputs(0)->getRawDataPtr<void *>());
void *const mean = (op->getInputs(1)->getRawDataPtr<void *>());
void *const var = (op->getInputs(2)->getRawDataPtr<void *>());
void *const scale = (op->getInputs(3)->getRawDataPtr<void *>());
void *const bias = (op->getInputs(4)->getRawDataPtr<void *>());
void *const output = (op->getOutput()->getRawDataPtr<void *>());
auto dims = op->getInputs(0)->getDims();
if (dims.size() != 4)
IT_TODO_HALT();
int w = dims[3];
int h = dims[2];
int c = dims[1];
int n = dims[0];
auto ret = baidu::xpu::api::batch_norm_infer<float>(
context->KUNLUNHandle(), (float *)input, (float *)output, n, c, h,
w, op->getEps(), (float *)scale, (float *)bias, (float *)mean,
(float *)var, true);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::BatchNormalization, DataType::Float32,
BatchNormXdnn, "BatchNorm_xdnn_KUNLUN_Float32");
}; // namespace infini


@ -0,0 +1,98 @@
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/unary.h"
namespace infini {
class CastXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<CastObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
CastType type = op->getType();
int ret = 0;
switch (type) {
case CastType::Float2Float16:
ret = baidu::xpu::api::cast<float, float16>(
context->KUNLUNHandle(), (float *)aData, (float16 *)cData, len);
break;
case CastType::Float2Int64:
ret = baidu::xpu::api::cast<float, int64_t>(
context->KUNLUNHandle(), (float *)aData, (int64_t *)cData, len);
break;
case CastType::Float2Int32:
ret = baidu::xpu::api::cast<float, int>(
context->KUNLUNHandle(), (float *)aData, (int *)cData, len);
break;
case CastType::Float2Int16:
ret = baidu::xpu::api::cast<float, int16_t>(
context->KUNLUNHandle(), (float *)aData, (int16_t *)cData, len);
break;
case CastType::Float2Int8:
ret = baidu::xpu::api::cast<float, int8_t>(
context->KUNLUNHandle(), (float *)aData, (int8_t *)cData, len);
break;
case CastType::Int322Float:
ret = baidu::xpu::api::cast<int, float>(
context->KUNLUNHandle(), (int *)aData, (float *)cData, len);
break;
case CastType::Int322Int8:
ret = baidu::xpu::api::cast<int, int8_t>(
context->KUNLUNHandle(), (int *)aData, (int8_t *)cData, len);
break;
case CastType::Int322Int16:
ret = baidu::xpu::api::cast<int, int16_t>(
context->KUNLUNHandle(), (int *)aData, (int16_t *)cData, len);
break;
case CastType::Int162Float:
ret = baidu::xpu::api::cast<int16_t, float>(
context->KUNLUNHandle(), (int16_t *)aData, (float *)cData, len);
break;
case CastType::Int162Int32:
ret = baidu::xpu::api::cast<int16_t, int>(
context->KUNLUNHandle(), (int16_t *)aData, (int *)cData, len);
break;
case CastType::Int82Float:
ret = baidu::xpu::api::cast<int8_t, float>(
context->KUNLUNHandle(), (int8_t *)aData, (float *)cData, len);
break;
case CastType::Int82Int16:
ret = baidu::xpu::api::cast<int8_t, int16_t>(
context->KUNLUNHandle(), (int8_t *)aData, (int16_t *)cData,
len);
break;
case CastType::Int82Int32:
ret = baidu::xpu::api::cast<int8_t, int>(
context->KUNLUNHandle(), (int8_t *)aData, (int *)cData, len);
break;
case CastType::Int322Int64:
ret = baidu::xpu::api::cast<int, int64_t>(
context->KUNLUNHandle(), (int *)aData, (int64_t *)cData, len);
break;
case CastType::Int642Int32:
ret = baidu::xpu::api::cast<int64_t, int>(
context->KUNLUNHandle(), (int64_t *)aData, (int *)cData, len);
break;
case CastType::Int642Float:
ret = baidu::xpu::api::cast<int64_t, float>(
context->KUNLUNHandle(), (int64_t *)aData, (float *)cData, len);
break;
case CastType::Float162Float:
ret = baidu::xpu::api::cast<float16, float>(
context->KUNLUNHandle(), (float16 *)aData, (float *)cData, len);
break;
default:
IT_TODO_HALT();
}
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Cast, DataType::Float32, CastXdnn,
"Cast_xdnn_KUNLUN_Float32");
}; // namespace infini


@ -0,0 +1,37 @@
#include "operators/concat.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class ConcatXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConcatObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
int axis = op->getDim();
int num = op->numInputs();
std::vector<const float *> inputsData;
for (int i = 0; i < num; ++i) {
inputsData.push_back(
(float *)(op->getInputs(i)->getRawDataPtr<void *>()));
}
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
std::vector<std::vector<int>> dims;
for (int i = 0; i < num; ++i) {
auto dim = op->getInputs(i)->getDims();
if (dim.size() != 4) {
IT_TODO_HALT();
}
dims.push_back(dim);
}
auto ret = baidu::xpu::api::concat<float>(
context->KUNLUNHandle(), inputsData, (float *)cData, dims, axis);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Concat, DataType::Float32, ConcatXdnn,
"Concat_xdnn_KUNLUN_Float32");
}; // namespace infini


@ -0,0 +1,37 @@
#include "operators/conv.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class ConvXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();
const int g = c / cpg;
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
std::vector<int> pads = {ph, pw};
std::vector<int> ksize = {r, s};
std::vector<int> stride = {sh, sw};
std::vector<int> dilation = {dh, dw};
auto ret = baidu::xpu::api::conv2d<float, float, float, float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, n, c, h, w, f, ksize, stride, pads, dilation, g,
nullptr, nullptr, nullptr, true);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Conv, DataType::Float32, ConvXdnn,
"Conv_xdnn_KUNLUN_Float32");
}; // namespace infini


@ -0,0 +1,54 @@
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/conv.h"
namespace infini {
class ConvTransXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvBaseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();
const int g = c / cpg;
const bool isNCHW =
(op->getOpType() == OpType::ConvTransNHWC) ? false : true;
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
std::vector<int> pads = {ph, pw};
std::vector<int> ksize = {r, s};
std::vector<int> stride = {sh, sw};
std::vector<int> dilation = {dh, dw};
auto dimInputs0 = op->getInputs(0)->getDims();
auto dimInputs1 = op->getInputs(1)->getDims();
auto dimOutput = op->getOutput()->getDims();
if (dimInputs0.size() != 4)
IT_TODO_HALT();
if (dimInputs1.size() != 4)
IT_TODO_HALT();
if (dimOutput.size() != 4)
IT_TODO_HALT();
auto ret =
baidu::xpu::api::conv2d_transpose<float, float, float, float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, n, c, h, w, f, ksize, stride, pads, dilation, g,
nullptr, nullptr, nullptr, isNCHW);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::ConvTranspose, DataType::Float32,
ConvTransXdnn, "ConvTrans_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::ConvTransNHWC, DataType::Float32,
ConvTransXdnn, "ConvTranposedNHWC_xdnn_KUNLUN_Float32");
}; // namespace infini


@ -0,0 +1,476 @@
#include "operators/element_wise.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class AddXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_add<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
class SubXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_sub<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
class MulXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_mul<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
class DivXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_div<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
class PowXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_pow<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
class MaxXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_max<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
class MinXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_min<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
class EqualXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_equal<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class GreaterEqualXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_greater_equal<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class GreaterThanXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_greater_than<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class LessEqualXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_less_equal<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class LessThanXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_less_than<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class FloorDivXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_floordiv<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<int, float>(
context->KUNLUNHandle(), (int *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class MSELossXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MSELossObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
auto dim = op->getInputs(0)->getDims();
if (dim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::mse_loss<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, len);
assert(ret == 0);
return;
}
};
class AndXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::logical_and<bool>(
context->KUNLUNHandle(), (bool *)aData, (bool *)bData,
(bool *)wsData, len);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class OrXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::logical_or<bool>(
context->KUNLUNHandle(), (bool *)aData, (bool *)bData,
(bool *)wsData, len);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class XorXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::logical_xor<bool>(
context->KUNLUNHandle(), (bool *)aData, (bool *)bData,
(bool *)wsData, len);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class NotXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
KUNLUNPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
if (aDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::logical_not<bool>(
context->KUNLUNHandle(), (bool *)aData, (bool *)wsData, len);
ret = baidu::xpu::api::cast<bool, float>(
context->KUNLUNHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Add, DataType::Float32, AddXdnn,
"Add_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Sub, DataType::Float32, SubXdnn,
"Sub_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Mul, DataType::Float32, MulXdnn,
"Mul_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Div, DataType::Float32, DivXdnn,
"Div_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Pow, DataType::Float32, PowXdnn,
"Pow_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Max, DataType::Float32, MaxXdnn,
"Max_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Min, DataType::Float32, MinXdnn,
"Min_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Equal, DataType::Float32, EqualXdnn,
"Equal_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::GreaterOrEqual, DataType::Float32,
GreaterEqualXdnn, "GreaterEqual_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Greater, DataType::Float32,
GreaterThanXdnn, "GreaterThan_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::LessOrEqual, DataType::Float32,
LessEqualXdnn, "LessEqual_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Less, DataType::Float32, LessThanXdnn,
"LessThan_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::FloorDiv, DataType::Float32,
FloorDivXdnn, "FloorDiv_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::MSELoss, DataType::Float32, MSELossXdnn,
"MSELoss_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::And, DataType::Float32, AndXdnn,
"And_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Or, DataType::Float32, OrXdnn,
"Or_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Xor, DataType::Float32, XorXdnn,
"Xor_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Not, DataType::Float32, NotXdnn,
"Not_xdnn_KUNLUN_Float32");
}; // namespace infini


@ -0,0 +1,38 @@
#include "operators/matmul.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class MatmulXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MatmulObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
bool transA = op->getTransA();
bool transB = op->getTransB();
if (op->getInputs(0)->getDims().size() != 2 ||
op->getInputs(1)->getDims().size() != 2) {
IT_TODO_HALT();
}
auto m = transA ? op->getInputs(0)->getDims()[1]
: op->getInputs(0)->getDims()[0];
auto n = transB ? op->getInputs(1)->getDims()[0]
: op->getInputs(1)->getDims()[1];
auto k = transA ? op->getInputs(0)->getDims()[0]
: op->getInputs(0)->getDims()[1];
auto ret = baidu::xpu::api::fc<float, float, float, int>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, m, n, k, transA, transB, nullptr, nullptr, nullptr);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::MatMul, DataType::Float32, MatmulXdnn,
"Matmul_xdnn_KUNLUN_Float32");
}; // namespace infini
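For reference, a small standalone check (shapes made up for illustration, not taken from the commit) of how m, n and k are derived from the stored 2-D shapes and the transpose flags used above:

```cpp
#include <cassert>
#include <vector>

int main() {
    // A stored as [K, M] with transA = true, B stored as [K, N] with
    // transB = false, so the logical product produces an [M, N] output.
    std::vector<int> aDims = {8, 4}; // K = 8, M = 4
    std::vector<int> bDims = {8, 5}; // K = 8, N = 5
    bool transA = true, transB = false;
    int m = transA ? aDims[1] : aDims[0];
    int n = transB ? bDims[0] : bDims[1];
    int k = transA ? aDims[0] : aDims[1];
    assert(m == 4 && n == 5 && k == 8);
    return 0;
}
```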

src/kernels/kunlun/pad.cc

@ -0,0 +1,37 @@
#include "operators/pad.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class PadXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PadObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto dim = op->getInputs(0)->getDims();
int dim_size = dim.size();
std::vector<int> pads = op->getPads();
std::vector<int> paddings_left(pads.begin(), pads.begin() + dim_size);
std::vector<int> paddings_right(pads.begin() + dim_size, pads.end());
float paddingValue = 0.0;
auto ret = baidu::xpu::api::pad<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, dim,
paddings_left, paddings_right, paddingValue);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Pad, DataType::Float32, PadXdnn,
"Pad_xdnn_KUNLUN_Float32");
}; // namespace infini
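getPads() is assumed here to return the flattened ONNX-style pad vector, so the first dim_size entries are the leading (begin) pads and the last dim_size entries the trailing (end) pads. Tracing the pad test later in this commit (input {1, 2, 3, 2}, pads 1/1 on axis 0 and 0/1 on axis 3, with unspecified axes expanded to zero):
// pads           = {1, 0, 0, 0,  1, 0, 0, 1}
// paddings_left  = {1, 0, 0, 0}
// paddings_right = {1, 0, 0, 1}
// output shape   = {1+1+1, 2, 3, 2+0+1} = {3, 2, 3, 3}, i.e. the 54 values the test expects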

@@ -0,0 +1,62 @@
#include "operators/pooling.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class AvgPooling : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PoolingObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto [n, c, h, w, kh, kw] = op->getNCHWRS();
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
std::vector<int> ksize = {kh, kw};
std::vector<int> stride = {sh, sw};
std::vector<int> pad = {ph, pw};
auto ret = baidu::xpu::api::avg_pool2d<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, n, c, h, w,
ksize, stride, pad, true, true, nullptr, nullptr);
assert(ret == 0);
return;
}
};
class MaxPooling : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PoolingObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto [n, c, h, w, kh, kw] = op->getNCHWRS();
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
std::vector<int> ksize = {kh, kw};
std::vector<int> stride = {sh, sw};
std::vector<int> pad = {ph, pw};
int yh = (h + ph * 2 - kh) / sh + 1;
int yw = (w + pw * 2 - kw) / sw + 1;
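// Workspace for the argmax indices that max_pool2d writes back:
// sized as yh * yw int32 entries (4 bytes each), i.e. one output plane.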
KUNLUNPtr indices = context->getWorkspace(yh * yw * 4);
auto ret = baidu::xpu::api::max_pool2d<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData,
(int *)indices, n, c, h, w, ksize, stride, pad, true, nullptr,
nullptr, false);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::MaxPool, DataType::Float32, MaxPooling,
"MaxPool_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::AveragePool, DataType::Float32,
AvgPooling, "AvgPool_xdnn_Float32");
}; // namespace infini
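The yh/yw computed in MaxPooling is the usual floor-division output size. With the shapes used by the pooling test in this commit:
// h = 5, kh = 3, ph = 0, sh = 2  ->  yh = (5 + 2*0 - 3) / 2 + 1 = 2
// w = 5, kw = 3, pw = 0, sw = 2  ->  yw = 2, so a 5x5 input pools down to 2x2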

@@ -0,0 +1,41 @@
#include "operators/split.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class SplitXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SplitObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
int axis = op->getDim();
int num = op->numOutputs();
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
auto inputDim = op->getInputs(0)->getDims();
std::vector<float *> outputsData;
for (int i = 0; i < num; ++i) {
outputsData.push_back(
(float *)(op->getOutput(i)->getRawDataPtr<void *>()));
}
std::vector<int> splitList;
for (int i = 0; i < num; ++i) {
auto dim = op->getOutput(i)->getDims();
if (dim.size() != 4) {
IT_TODO_HALT();
}
splitList.push_back(dim[axis]);
}
auto ret = baidu::xpu::api::split<float>(
context->KUNLUNHandle(), (float *)inputData, outputsData, inputDim,
splitList, axis);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Split, DataType::Float32, SplitXdnn,
"Split_xdnn_KUNLUN_Float32");
}; // namespace infini
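splitList records the size of the split axis taken from each output tensor. For the split test later in this commit (input {1, 2, 2, 3} split into three pieces along axis 3, assuming the usual (dim, num) argument order):
// each output has dims {1, 2, 2, 1}, so splitList = {1, 1, 1}
// xdnn::split then copies one 1-wide slice of the last axis into each output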

@@ -0,0 +1,32 @@
#include "operators/transpose.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class TransposeXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<TransposeObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto dimin = op->getInputs(0)->getDims();
auto permute = op->getPermute();
if (dimin.size() != 4) {
IT_TODO_HALT();
}
auto ret = baidu::xpu::api::transpose<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, dimin,
permute);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Transpose, DataType::Float32,
TransposeXdnn, "Transpose_xdnn_KUNLUN_Float32");
}; // namespace infini
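A quick shape check for the 4-D-only path above, using the permutation from the transpose test in this commit:
// dimin = {1, 1, 2, 3}, permute = {0, 1, 3, 2}
// output dims = {dimin[0], dimin[1], dimin[3], dimin[2]} = {1, 1, 3, 2}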

src/kernels/kunlun/unary.cc Normal file
@@ -0,0 +1,550 @@
#include "operators/unary.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class ReluXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::relu<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class SigmoidXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::sigmoid<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class TanhXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::tanh<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class SquareXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::square<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class SqrtXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::sqrt<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class RsqrtXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::rsqrt<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ExpXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::exp<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class CeilXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::ceil<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ClipXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ClipObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
float min = op->getMin().value();
float max = op->getMax().value();
auto ret = baidu::xpu::api::clip<float>(context->KUNLUNHandle(),
(float *)aData, (float *)cData,
len, min, max);
assert(ret == 0);
return;
}
};
class FloorXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::floor<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class NegXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::neg<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class CopyXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &op,
const RuntimeObj *_context) const override {
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::copy<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ReciprocalXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::reciprocal<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class AbsXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::abs<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ATanXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::arctan<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class LogXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<LogObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
std::vector<int> divDim = {
1,
};
auto len = op->getInputs(0)->size();
// get ptr of tempspace
KUNLUNPtr temp = context->getWorkspace(len * sizeof(float));
LogObj::LogType type = op->getType();
// get output of xpu::api::loge(x)
auto ret = baidu::xpu::api::log<float>(
context->KUNLUNHandle(), (float *)aData, (float *)temp, len);
// get ptr of divider
KUNLUNPtr dd =
(float *)(context->getWorkspace((1 + len) * sizeof(float))) + len;
// choose from logE, log2, log10
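// log_b(x) = ln(x) / ln(b): LogE copies ln(x) through unchanged, while
// Log2 and Log10 broadcast-divide ln(x) by the constant ln(2) or ln(10).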
switch (type) {
float constant;
case LogObj::LogE:
// if use loge, copy from temp to cData
ret = baidu::xpu::api::copy<float>(
context->KUNLUNHandle(), (float *)temp, (float *)cData, len);
break;
case LogObj::Log2:
constant = std::log(2);
context->copyBlobFromCPU(dd, &constant, sizeof(float));
ret = baidu::xpu::api::broadcast_div<float>(
context->KUNLUNHandle(), (float *)temp, (float *)dd,
(float *)cData, aDim, divDim);
break;
case LogObj::Log10:
constant = std::log(10);
context->copyBlobFromCPU(dd, &constant, sizeof(float));
ret = baidu::xpu::api::broadcast_div<float>(
context->KUNLUNHandle(), (float *)temp, (float *)dd,
(float *)cData, aDim, divDim);
break;
default:
printf("LogType not support!");
break;
}
assert(ret == 0);
return;
}
};
class CosXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<CosObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::cos<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class SinXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SinObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::sin<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class TanXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<TanObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::tan<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class SinhXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SinHObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::sinh<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class CoshXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<CosHObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::cosh<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ErfXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ErfObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::erf<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ACosXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ACosObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::arccos<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ACoshXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ACosHObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::acosh<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ASinXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ASinObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::arcsin<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ASinhXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ASinHObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::asinh<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ATanhXdnn : public KUNLUNKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ATanHObj>(_op);
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::atanh<float>(
context->KUNLUNHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::KUNLUN, OpType::Relu, DataType::Float32, ReluXdnn,
"Relu_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Sigmoid, DataType::Float32, SigmoidXdnn,
"Sigmoid_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Tanh, DataType::Float32, TanhXdnn,
"Tanh_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Square, DataType::Float32, SquareXdnn,
"Square_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Sqrt, DataType::Float32, SqrtXdnn,
"Sqrt_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Rsqrt, DataType::Float32, RsqrtXdnn,
"Rsqrt_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Exp, DataType::Float32, ExpXdnn,
"Exp_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Ceil, DataType::Float32, CeilXdnn,
"Ceil_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Clip, DataType::Float32, ClipXdnn,
"Clip_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Floor, DataType::Float32, FloorXdnn,
"Floor_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Neg, DataType::Float32, NegXdnn,
"Neg_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Reciprocal, DataType::Float32,
ReciprocalXdnn, "Reciprocal_xdnn_KUNLUN_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Reshape, DataType::Float32, CopyXdnn,
"Reshape_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Flatten, DataType::Float32, CopyXdnn,
"Flatten_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Identity, DataType::Float32, CopyXdnn,
"Identity_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Abs, DataType::Float32, AbsXdnn,
"Abs_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Atan, DataType::Float32, ATanXdnn,
"Atan_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Log, DataType::Float32, LogXdnn,
"Log_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Cos, DataType::Float32, CosXdnn,
"Cos_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Sin, DataType::Float32, SinXdnn,
"Sin_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Tan, DataType::Float32, TanXdnn,
"Tan_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Sinh, DataType::Float32, SinhXdnn,
"Sinh_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Cosh, DataType::Float32, CoshXdnn,
"Cosh_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Erf, DataType::Float32, ErfXdnn,
"Erf_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Acos, DataType::Float32, ACosXdnn,
"ACos_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Acosh, DataType::Float32, ACoshXdnn,
"ACosh_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Asin, DataType::Float32, ASinXdnn,
"ASin_xdnn_Float32");
REGISTER_KERNEL(Device::KUNLUN, OpType::Asinh, DataType::Float32, ASinhXdnn,
"ASinh_xdnn_Float3 2");
REGISTER_KERNEL(Device::KUNLUN, OpType::Atanh, DataType::Float32, ATanhXdnn,
"ATanh_xdnn_Float32");
}; // namespace infini

@@ -0,0 +1,60 @@
#include "kunlun/kunlun_runtime.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
namespace infini {
void KUNLUNRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
bool profiling = false) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
double totalTime = 0;
std::map<OpType, double> opTime;
std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (!perfData && !tune) {
kernel->compute(op, this);
continue;
}
PerfRecord record;
if (!perfData) {
record = kernel->tune(op, this);
perfEngine.setPerfData(perfKey, record);
} else
record = perfData;
double t = record->time;
totalTime += t;
if (profiling) {
double t = timeit([&]() { kernel->compute(op, record, this); },
[&]() { sync(); }, 1, 1);
op->print();
printf(" op_time on kunlun xpu %lf\n", t);
totalTime += t;
opTime[op->getOpType()] += t;
opCnt[op->getOpType()]++;
}
}
}
void KUNLUNRuntimeObj::run(const Graph &graph, bool tune,
bool profiling) const {
if (profiling)
IT_TODO_HALT();
runWithoutSync(graph, tune, profiling);
sync();
}
void KUNLUNRuntimeObj::sync() const { ; }
string KUNLUNRuntimeObj::toString() const { return "KUNLUN Runtime"; }
} // namespace infini
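As a usage sketch (reusing only interfaces that appear elsewhere in this commit): a first run with tune=true fills the PerfEngine, and later runs replay the recorded kernels without re-tuning.
auto kunlun = make_ref<KUNLUNRuntimeObj>();
Graph g = make_ref<GraphObj>(kunlun);
// ... add operators, dataMalloc(), set input data ...
kunlun->run(g, /*tune=*/true); // tunes each kernel and stores its PerfRecord
kunlun->run(g);                // subsequent runs reuse the stored records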

@@ -0,0 +1,71 @@
#include "kunlun/operator_timer.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#include "utils/data_generator.h"
namespace infini {
namespace opTimer {
double getPerfConvKunlun(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name) {
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime kunlun = make_ref<KUNLUNRuntimeObj>();
Graph gKunlun = make_ref<GraphObj>(kunlun);
// Set input data on CPU in a CPU Graph
IT_ASSERT(c % group == 0);
Tensor i0Cpu = gCpu->addTensor({n, h, w, c}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({f, r, s, c / group}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to Kunlun
Tensor i0Kunlun = gKunlun->cloneTensor(i0Cpu);
Tensor w0Kunlun = gKunlun->cloneTensor(w0Cpu);
// Build Kunlun graph
auto conv = gKunlun->addOp<ConvObj>(i0Kunlun, w0Kunlun, nullptr, padh, padw,
strideh, stridew, dilationh, dilationw);
// allocate Kunlun memory
gKunlun->dataMalloc();
// Execute on Kunlun
bool tune = true;
kunlun->run(gKunlun, tune);
return kunlun->getPerfTime(gKunlun);
}
double getPerfMatmulKunlun(int b, int m, int n, int k, const char *name) {
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime kunlun = make_ref<KUNLUNRuntimeObj>();
Graph gKunlun = make_ref<GraphObj>(kunlun);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({b, m, k}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({b, k, n}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to Kunlun
Tensor i0Kunlun = gKunlun->cloneTensor(i0Cpu);
Tensor w0Kunlun = gKunlun->cloneTensor(w0Cpu);
// Build Kunlun graph
auto matmul = gKunlun->addOp<MatmulObj>(i0Kunlun, w0Kunlun, nullptr);
// allocate Kunlun memory
gKunlun->dataMalloc();
// Execute on Kunlun
bool tune = true;
kunlun->run(gKunlun, tune);
return kunlun->getPerfTime(gKunlun);
}
} // namespace opTimer
} // namespace infini

@@ -0,0 +1,61 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/element_wise.h"
#include "test.h"
namespace infini {
template <class T>
void testAdd(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
Tensor inputCpu2 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu1 = xpuGraph->cloneTensor(inputCpu1);
auto inputGpu2 = xpuGraph->cloneTensor(inputCpu2);
auto gpuOp = xpuGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
xpuGraph->dataMalloc();
inputGpu1->setData(generator);
inputGpu2->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
cpuGraph->dataMalloc();
inputCpu1->setData(generator);
inputCpu2->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}
TEST(xpu_add, run) {
testAdd<AddObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
testAdd<SubObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
testAdd<MulObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
testAdd<DivObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
testAdd<EqualObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
testAdd<GreaterEqualObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
testAdd<GreaterThanObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
testAdd<LessEqualObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
testAdd<LessThanObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
}
} // namespace infini

@@ -0,0 +1,61 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/batch_norm.h"
#include "test.h"
namespace infini {
TEST(XPU_BatchNorm, run) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build cpu graph
Graph gCpu = make_ref<GraphObj>(cpuRuntime);
auto iCpu = gCpu->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
auto meanCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto varCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto scaleCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto biasCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
// Build input data on CPU
gCpu->dataMalloc();
iCpu->setData(IncrementalGenerator());
meanCpu->copyin(vector<float>{1, 6, 9});
varCpu->copyin(vector<float>{4, 1, 9});
scaleCpu->setData(OneGenerator());
biasCpu->setData(ZeroGenerator());
// Build XPU graph
Graph g = make_ref<GraphObj>(xpuRuntime);
auto i = g->cloneTensor(iCpu);
auto mean = g->cloneTensor(meanCpu);
auto var = g->cloneTensor(varCpu);
auto scale = g->cloneTensor(scaleCpu);
auto bias = g->cloneTensor(biasCpu);
auto op =
g->addOp<BatchNormObj>(i, nullptr, mean, var, scale, bias, 0.9, 0);
// allocate XPU memory
g->dataMalloc();
i->setData(IncrementalGenerator());
mean->copyin(vector<float>{1, 6, 9});
var->copyin(vector<float>{4, 1, 9});
scale->setData(OneGenerator());
bias->setData(ZeroGenerator());
// Execute on XPU
xpuRuntime->run(g);
// clone XPU output to CPU
auto o = op->getOutput();
auto ocpu = o->clone(cpuRuntime);
// check results on CPU
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 2}));
EXPECT_TRUE(ocpu->equalData(vector<float>{
-0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.3333333, 0.6666667}));
}
} // namespace infini
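The expected values follow from (x - mean) / sqrt(var + eps) with scale 1, bias 0 and the eps of 0 passed above; for channel 0:
// inputs {0, 1, 2, 3}, mean 1, var 4:
// (0-1)/2 = -0.5, (1-1)/2 = 0, (2-1)/2 = 0.5, (3-1)/2 = 1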

@@ -0,0 +1,54 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/concat.h"
#include "test.h"
namespace infini {
template <class T>
void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generator);
Tensor inputCpu2 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generator);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu1 = xpuGraph->cloneTensor(inputCpu1);
auto inputGpu2 = xpuGraph->cloneTensor(inputCpu2);
auto gpuOp =
xpuGraph->addOp<T>(TensorVec{inputGpu1, inputGpu2}, nullptr, 2);
xpuGraph->dataMalloc();
inputGpu1->setData(generator);
inputGpu2->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// Check
inputCpu1->print();
inputCpu1->printData();
inputCpu2->print();
inputCpu2->printData();
outputGpu2Cpu->print();
outputGpu2Cpu->printData();
EXPECT_TRUE(1);
}
TEST(xpu_Concat, run) {
testConcat<ConcatObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini

@@ -0,0 +1,56 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
template <class T>
void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
const std::function<void(void *, size_t, DataType)> &generatorB,
const Shape &shapeA, const Shape &shapeB) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
// XPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputXpu1 = xpuGraph->cloneTensor(inputCpu1);
auto inputXpu2 = xpuGraph->cloneTensor(inputCpu2);
auto xpuOp =
xpuGraph->addOp<T>(inputXpu1, inputXpu2, nullptr, 1, 1, 1, 1, 1, 1);
xpuGraph->dataMalloc();
inputXpu1->setData(generatorA);
inputXpu2->setData(generatorB);
xpuRuntime->run(xpuGraph);
auto outputXpu = xpuOp->getOutput();
auto outputXpu2Cpu = outputXpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
auto cpuOp =
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
cpuGraph->dataMalloc();
inputCpu1->setData(generatorA);
inputCpu2->setData(generatorB);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputXpu2Cpu));
}
TEST(xpu_Conv, run) {
testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
Shape{1, 3, 32, 32}, Shape{2, 3, 3, 3});
}
} // namespace infini

@@ -0,0 +1,136 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/runtime.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
void testConvTransposedXdnn(
const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) {
const auto &[N, C, H, W, F, R, S] = tuple{1, 1, 2, 2, 1, 4, 4};
const int stride = 1, padding = 0, dilation = 1;
// Construct Runtime and graph for CPU and XPU
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime xpu = make_ref<KUNLUNRuntimeObj>();
Graph gXpu = make_ref<GraphObj>(xpu);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({N, F, H, W}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({F, C, R, S}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to XPU
Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
// Build XPU graph
auto conv = gXpu->addOp<ConvTransposed2dObj>(i0Xpu, w0Xpu, nullptr, padding,
padding, stride, stride,
dilation, dilation);
gXpu->dataMalloc();
i0Xpu->setData(generator);
w0Xpu->setData(generator);
// Execute on XPU
xpu->run(gXpu);
// copy output from XPU to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
// check results on CPU
EXPECT_TRUE(o0Cpu->equalData(ansVec));
}
void testConvTransposedNHWCXdnn(
const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) {
const auto &[N, C, H, W, F, R, S] = tuple{1, 1, 2, 2, 1, 4, 4};
const int stride = 1, padding = 0, dilation = 1;
// Construct Runtime and graph for CPU and XPU
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime xpu = make_ref<KUNLUNRuntimeObj>();
Graph gXpu = make_ref<GraphObj>(xpu);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({N, H, W, F}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({F, R, S, C}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to XPU
Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
// Build XPU graph
auto conv = gXpu->addOp<ConvTransposed2dNHWCObj>(
i0Xpu, w0Xpu, nullptr, padding, padding, stride, stride, dilation,
dilation);
gXpu->dataMalloc();
i0Xpu->setData(generator);
w0Xpu->setData(generator);
// Execute on XPU
xpu->run(gXpu);
// copy output from XPU to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
// check results on CPU
EXPECT_TRUE(o0Cpu->equalData(ansVec));
}
TEST(XPU_ConvTransposed, run) {
testConvTransposedXdnn(IncrementalGenerator(),
vector<float>{0., 0., 1., 2., 3., 0., 6.,
12., 18., 16., 8., 30., 36., 42.,
32., 16., 54., 60., 66., 48., 24.,
62., 67., 72., 45.});
}
TEST(XPU_ConvTransposedNHWC, run) {
testConvTransposedNHWCXdnn(IncrementalGenerator(),
vector<float>{0., 0., 1., 2., 3., 0., 6.,
12., 18., 16., 8., 30., 36., 42.,
32., 16., 54., 60., 66., 48., 24.,
62., 67., 72., 45.});
}
TEST(XPU_ConvTransposed, run1) {
// Construct Runtime and graph for CPU and XPU
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime xpu = make_ref<KUNLUNRuntimeObj>();
Graph gXpu = make_ref<GraphObj>(xpu);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({1, 2, 3, 3}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({2, 2, 3, 3}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to XPU
Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
// Build XPU graph
auto conv = gXpu->addOp<ConvTransposed2dObj>(i0Xpu, w0Xpu, nullptr, 0, 0);
gXpu->dataMalloc();
i0Xpu->setData(IncrementalGenerator());
w0Xpu->setData(IncrementalGenerator());
// Execute on XPU
xpu->run(gXpu);
// copy output from XPU to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
// check results on CPU
EXPECT_TRUE(o0Cpu->equalData(vector<float>{
162, 351, 569, 413, 224, 405, 876, 1417, 1024, 553,
747, 1611, 2598, 1869, 1005, 639, 1368, 2191, 1564, 835,
396, 843, 1343, 953, 506, 243, 531, 866, 629, 341,
621, 1344, 2173, 1564, 841, 1152, 2475, 3975, 2841, 1518,
963, 2052, 3271, 2320, 1231, 585, 1239, 1964, 1385, 731}));
}
} // namespace infini
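Both expected vectors are consistent with the standard transposed-convolution output size H_out = (H - 1) * stride - 2 * pad + dilation * (R - 1) + 1; for the first two tests:
// H = W = 2, R = S = 4, stride = 1, pad = 0, dilation = 1
// H_out = (2 - 1) * 1 - 2 * 0 + 1 * (4 - 1) + 1 = 5, so the output is 5 x 5 = 25 values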

@@ -0,0 +1,66 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/element_wise.h"
#include "test.h"
namespace infini {
using ExpectOutput = vector<float>;
template <class T>
void testElementWiseXdnn(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape, const ExpectOutput &ansVec) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor acpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
acpu->dataMalloc();
acpu->setData(generator);
Tensor bcpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
bcpu->dataMalloc();
bcpu->setData(generator);
// Build XPU graph
Graph g = make_ref<GraphObj>(xpuRuntime);
auto a = g->cloneTensor(acpu);
auto b = g->cloneTensor(bcpu);
auto op = g->addOp<T>(a, b, nullptr);
// allocate XPU memory
g->dataMalloc();
a->setData(generator);
b->setData(generator);
// Execute on XPU
xpuRuntime->run(g);
// clone XPU output to CPU
auto c = op->getOutput();
auto ccpu = c->clone(cpuRuntime);
// check results on CPU
EXPECT_TRUE(ccpu->equalData(ansVec));
}
TEST(xdnn_ElementWise, run) {
testElementWiseXdnn<AddObj>(
IncrementalGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});
testElementWiseXdnn<SubObj>(
IncrementalGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
testElementWiseXdnn<MulObj>(
IncrementalGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121});
testElementWiseXdnn<DivObj>(
OneGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
testElementWiseXdnn<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 1},
ExpectOutput{1, 1, 4, 27});
}
} // namespace infini

@@ -0,0 +1,58 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/matmul.h"
#include "test.h"
namespace infini {
template <class T>
void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
const std::function<void(void *, size_t, DataType)> &generatorB,
bool transA, bool transB, const Shape &shapeA,
const Shape &shapeB) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
// XPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputXpu1 = xpuGraph->cloneTensor(inputCpu1);
auto inputXpu2 = xpuGraph->cloneTensor(inputCpu2);
auto xpuOp = xpuGraph->addOp<T>(inputXpu1, inputXpu2, nullptr);
xpuGraph->dataMalloc();
inputXpu1->setData(generatorA);
inputXpu2->setData(generatorB);
xpuRuntime->run(xpuGraph);
auto outputXpu = xpuOp->getOutput();
auto outputXpu2Cpu = outputXpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
cpuGraph->dataMalloc();
inputCpu1->setData(generatorA);
inputCpu2->setData(generatorB);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
outputCpu->print();
outputXpu2Cpu->print();
// Check
EXPECT_TRUE(outputCpu->equalData(outputXpu2Cpu));
}
TEST(xpu_Matmul, run) {
testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
false, Shape{2, 3}, Shape{3, 4});
}
} // namespace infini

@@ -0,0 +1,40 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "kunlun/kunlun_kernel_without_config.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/pad.h"
#include "test.h"
namespace infini {
TEST(xpu_Pad, run) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor icpu =
make_ref<TensorObj>(Shape{1, 2, 3, 2}, DataType::Float32, cpuRuntime);
// Build XPU graph;
Graph g = make_ref<GraphObj>(xpuRuntime);
auto i = g->cloneTensor(icpu);
auto op = g->addOp<PadObj>(i, nullptr, vector<int>{1, 0, 1, 1},
vector<int>{0, 3});
// allocate XPU memory
g->dataMalloc();
i->setData(IncrementalGenerator());
// Execute on XPU
xpuRuntime->run(g);
// clone XPU output to CPU
auto o = op->getOutput();
auto cpuo = o->clone(cpuRuntime);
cpuo->printData();
// check results on CPU
EXPECT_TRUE(cpuo->equalData(
vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 2, 3, 0, 4, 5, 0, 6, 7, 0, 8, 9, 0, 10, 11, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
}
} // namespace infini

@@ -0,0 +1,51 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/pooling.h"
#include "test.h"
namespace infini {
template <class T>
void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp =
xpuGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 0, 0, 2, 2, 0);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
cpuGraph->addTensor(inputCpu);
auto cpuOp =
cpuGraph->addOp<T>(inputCpu, nullptr, 3, 3, 1, 1, 0, 0, 2, 2, 0);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}
TEST(xdnn_Pooling, run) {
testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
}
} // namespace infini

@@ -0,0 +1,48 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/split.h"
#include "test.h"
namespace infini {
template <class T>
void testSplit(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generator);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu1 = xpuGraph->cloneTensor(inputCpu1);
auto gpuOp = xpuGraph->addOp<T>(inputGpu1, std::nullopt, 3, 3);
xpuGraph->dataMalloc();
xpuRuntime->run(xpuGraph);
auto o0Cpu = gpuOp->getOutput(0)->clone(cpuRuntime);
auto o1Cpu = gpuOp->getOutput(1)->clone(cpuRuntime);
auto o2Cpu = gpuOp->getOutput(2)->clone(cpuRuntime);
// Check
inputCpu1->print();
inputCpu1->printData();
o0Cpu->print();
o0Cpu->printData();
o1Cpu->print();
o1Cpu->printData();
o2Cpu->print();
o2Cpu->printData();
EXPECT_TRUE(1);
}
TEST(xpu_Split, run) {
testSplit<SplitObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini

@@ -0,0 +1,43 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/transpose.h"
#include "test.h"
namespace infini {
template <class T>
void testTranspose(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
vector<int> permute = {0, 1, 3, 2};
auto gpuOp = xpuGraph->addOp<T>(inputGpu, nullptr, permute);
xpuGraph->dataMalloc();
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// Check
inputCpu->printData();
outputGpu2Cpu->printData();
EXPECT_TRUE(1);
}
TEST(xpu_Transpose, run) {
testTranspose<TransposeObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
}
} // namespace infini

@@ -0,0 +1,190 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "kunlun/kunlun_runtime.h"
#include "operators/unary.h"
#include "test.h"
namespace infini {
template <class T>
void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp = xpuGraph->addOp<T>(inputGpu, nullptr);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu, 1e-6));
}
void testClip(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
float min = 1.0;
float max = 5.0;
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp = xpuGraph->addOp<ClipObj>(inputGpu, nullptr, min, max);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<ClipObj>(inputCpu, nullptr, min, max);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}
void testCast(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp =
xpuGraph->addOp<CastObj>(inputGpu, nullptr, CastType::Float2Int32);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp =
cpuGraph->addOp<CastObj>(inputCpu, nullptr, CastType::Float2Int32);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}
template <LogObj::LogType T>
void testLog(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp = xpuGraph->addOp<LogObj>(inputGpu, nullptr, T);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<LogObj>(inputCpu, nullptr, T);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}
template <class T>
void testTrigon(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<KUNLUNRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp = xpuGraph->addOp<T>(inputGpu, nullptr);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu, 1e-3));
}
TEST(xdnn_Unary, run) {
testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<AbsObj>(ValGenerator<-1>(), Shape{1, 2, 2, 3});
testUnary<ATanObj>(OneGenerator(), Shape{1, 2, 2, 3});
testLog<LogObj::Log10>(ValGenerator<2>(), Shape{1, 2, 2, 3});
testLog<LogObj::Log2>(ValGenerator<2>(), Shape{1, 2, 2, 3});
testLog<LogObj::LogE>(ValGenerator<2>(), Shape{1, 2, 2, 3});
testTrigon<CosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<SinObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<TanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<SinHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<CosHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<ErfObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ACosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ACosHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ASinObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ASinHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ATanHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini