forked from jiuyuan/InfiniTensor
Dev for 202303ddl (#66)
* add activation operations relu, tanh, sigmoid on mlu
* commit for format
* add activation backward operation
* add test for activation_backward
* add test
* add convbpfilter
* fix
* add transpose code and test
* add trigonometric function operations on mlu: sin, cos, tan, asin, sinh, asinh
* add copy operation on mlu
* add ceil operation and floor operation
* add operation clip
* add operation cnnl div, test and test for divdemo bangc kernel
* add divnonan operation and test
* add erf operation
* add exp operation
* add operation fill
* add log operation
* add log1p operation
* add l2loss operation
* add maximum and minimum operation
* add mseloss operation
* add negTensor operation
* add power operation
* add reciprocal operation
* add sqrt and rsqrt operation
* add transform operation
* add addn operation
* add muln operation
* cherry-pick some operations
* add floordiv operation and floordivtrunc operation
* add floormod operation
* add cumsum operation
* add det operation
* add pad operation
* format
* add concat operation
* format
* add split operation
* fix concat and split operation
* add round operation
* add pooling operation
* add square operation
* add squaredDifference operation
* code format fix
* add flip operation
* code format fix
* add hardtanh operation
* add logic operation
* add addcdiv and addcmul operation
* add arange operation
* add bitcompute operation
* add net test
* fmt
* style: rename
* fix: replace CpuRuntime with NativeCpuRuntime
* fix code
* fix code
* fix code by review suggestion
* remove operations that are not onnx operations
* fix format
* clang format
* refactor: add a templated dataToString layer to tensor print
* fix: onnx export
* feat: add a graph-optimization interface
* add clip operation
* feat: support importing clip
* test: add the import/export tests to ci
* fix batch norm
* feat: add the Shape operator
* feat: support importing unsqueeze
* fix: correct the clip interface; feat: support importing transpose
* add broadcast operation
* fix elementwise-broadcast
* fix elementwise broadcast
* add broadcast for gpu elementwise
* feat: pad supports negative axes; unsupported paddings are exported as a standalone pad operator; support importing onnxsim-processed inception
* fix: correct the pooling tests
* feat: export pads; support inception import/export, added to ci
* feat: support densenet import/export, added to ci
* feat: import squeeze
* fix softmax
* feat: export clip and transpose
* feat: support bias for Conv
* fix: bias of conv
* fix: bias of conv
* feat: import split
* feat: export split
* fix: conv
* fix: conv group
* fix: matmul's bias was not placed in the inputs; corrected
* fix example
* fix: correct the reduce_mean export
* refactor: change the slice implementation to match onnx
* style: do not export the two runtime functions
* doc: Chinese user guide
* doc: complete the guide
* fix: repair the data-import problem
* fmt
* feat: add the basic Dropout structure; two outputs of different types are not supported yet
* feat: re-export the optimization interface; feat: import dropout
* build: add the BANG option to the Makefile
* fix code: test/kernels/bang/test* now uses NativeCpuRuntime; include/bang/bang_runtime changed for the cntoolkit upgrade
* feat: export the bang runtime
* add USE_BANG=1
* fix matmul
* fix reshape
* fix
* fix activation
* fix transpose
* format
* format
* update Makefile
* feat: support import/export of ConvTranspose
* add prelu on mlu
* fix: ConvTranspose
* feat: support import/export of PRelu
* add convtrans on mlu
* fmt
* docs: update README_CN.md
* fix code by review suggestions
* style
* fix: Softmax's axis can take a default value? This seems like non-standard onnx
* fix cuda & intelcpu bugs after merging

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: whjthu <haojie0429@gmail.com>
parent a1974aabcd
commit 26f0d13c26
@@ -2,6 +2,9 @@ name: Build and test cpu
 on:
   push:
     branch: 'master'
+    paths-ignore:
+      - '**.md'
+      - 'LICENSE'
   pull_request:
     paths-ignore:
       - '**.md'
@@ -11,8 +14,11 @@ env:
   protobuf-download: https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protobuf-cpp-3.21.12.tar.gz
   protobuf-version: "3.21.12"
   python-version: "3.10"
 
   resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
-  resnet-file: resnet18-v2-7.onnx
+  inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
+  densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
+  efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
 
 jobs:
   build:
@@ -31,28 +37,28 @@ jobs:
     - name: Install libdw
       run: sudo apt-get update && sudo apt-get install libdw-dev
 
-    - name: Cache protobuf
-      id: cache-protobuf
-      uses: actions/cache@v3
-      with:
-        path: protobuf-${{ env.protobuf-version }}
-        key: protobuf-${{ env.protobuf-version }}
+    # - name: Cache protobuf
+    #   id: cache-protobuf
+    #   uses: actions/cache@v3
+    #   with:
+    #     path: protobuf-${{ env.protobuf-version }}
+    #     key: protobuf-${{ env.protobuf-version }}
 
-    - name: Download and compile protobuf
-      if: steps.cache-protobuf.outputs.cache-hit != 'true'
-      run: |
-        wget ${{ env.protobuf-download }}
-        tar xf protobuf-cpp-${{ env.protobuf-version }}.tar.gz
-        cd protobuf-${{ env.protobuf-version }}
-        ./autogen.sh
-        ./configure CFLAGS="-fPIC" CXXFLAGS="-fPIC"
-        make -j8
+    # - name: Download and compile protobuf
+    #   if: steps.cache-protobuf.outputs.cache-hit != 'true'
+    #   run: |
+    #     wget ${{ env.protobuf-download }}
+    #     tar xf protobuf-cpp-${{ env.protobuf-version }}.tar.gz
+    #     cd protobuf-${{ env.protobuf-version }}
+    #     ./autogen.sh
+    #     ./configure CFLAGS="-fPIC" CXXFLAGS="-fPIC"
+    #     make -j8
 
-    - name: Install protobuf
-      run: |
-        cd protobuf-${{ env.protobuf-version }}
-        sudo make install
-        sudo ldconfig
+    # - name: Install protobuf
+    #   run: |
+    #     cd protobuf-${{ env.protobuf-version }}
+    #     sudo make install
+    #     sudo ldconfig
 
     - name: Build
       run: make
@@ -65,8 +71,12 @@ jobs:
         python -m pip install --upgrade pip
         make install-python
 
-    - name: Download test model
-      run: wget ${{ env.resnet-download }}
+    - name: Download test models
+      run: |
+        wget ${{ env.resnet-download }}
+        wget ${{ env.inception-download }}
+        wget ${{ env.densenet-download }}
+        wget ${{ env.efficientnet-download }}
 
     - name: Test onnx frontend
       run: make test-onnx
@@ -2,6 +2,9 @@ name: clang-format Check
 on:
   push:
     branch: 'master'
+    paths-ignore:
+      - '**.md'
+      - 'LICENSE'
   pull_request:
     paths-ignore:
      - '**.md'
@@ -10,3 +10,6 @@
 [submodule "3rd-party/backward-cpp"]
 	path = 3rd-party/backward-cpp
 	url = git@github.com:bombela/backward-cpp.git
+[submodule "example"]
+	path = example
+	url = git@github.com:wanghailu0717/NNmodel.git
@@ -7,8 +7,8 @@ option(USE_CUDA "Support CUDA GPU" OFF)
 option(USE_BANG "Support BANG MLU" OFF)
 option(USE_INTELCPU "Support INTELCPU" OFF)
 option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
-option(USE_PROTOBUF "Serialize and deserialize tensors" ON)
-option(BUILD_TEST "Build tests" ON)
+option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
+option(BUILD_TEST "Build tests" OFF)
 
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
@@ -78,7 +78,7 @@ if(BUILD_TEST_EINNET)
   include_directories(${DMLC_INCLUDE_DIR})
   include_directories(${DLPACK_INCLUDE_DIR})
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
 endif()
 
 if(BUILD_TEST)
@@ -142,7 +142,7 @@ if(USE_BACKTRACE)
 endif()
 
 if(USE_INTELCPU)
   add_compile_definitions(USE_INTELCPU=1)
   find_package(MKL CONFIG REQUIRED)
 
   # Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-link-line-advisor.html
@@ -150,10 +150,11 @@ if(USE_INTELCPU)
 
   set(DNNL_CONFIGURATION "cpu_gomp")
   find_package(dnnl CONFIG REQUIRED)
   if(dnnl_FOUND)
+    add_compile_definitions(USE_MKL=1)
     include_directories(BEFORE ${dnnl_DIR}/../../../cpu_gomp/include/)
     link_directories(${dnnl_DIR}/../../../cpu_gomp/lib)
     target_link_libraries(InfiniTensor dnnl)
   else()
     message(FATAL_ERROR "dnnl library not found")
   endif()
@@ -161,7 +162,7 @@ if(USE_INTELCPU)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMKL_ILP64 -qmkl=parallel -Werror ${WNO_ERRORS}")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DMKL_ILP64 -qmkl=parallel ${WNO_ERRORS}") # Enable assertion
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DMKL_ILP64 -qmkl=parallel ${WNO_ERRORS}") # Enable assertion
 
   find_package(IntelDPCPP REQUIRED)
 endif()
 
@@ -179,6 +180,7 @@ if(USE_CUDA)
 endif()
 
 if(USE_BANG)
+  add_compile_definitions(USE_BANG=1)
   include_directories(src/kernels/mlu/include)
   ################################################################################
   # Neuware Evironment
@@ -212,10 +214,8 @@ if(USE_BANG)
   ################################################################################
   # BangC Kernels
   ################################################################################
-  add_subdirectory(src/kernels/mlu)
 
   target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-  target_link_libraries(InfiniTensor bangops)
 endif()
 
 # # Python bindings
Makefile

@@ -1,14 +1,17 @@
 .PHONY : build clean install-python test-cpp test-onnx
 
 TYPE ?= release
-CUDA ?= off
+CUDA ?= OFF
+BANG ?= OFF
 INTELCPU ?= off
+BACKTRACE ?= ON
+TEST ?= ON
 
 CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
-ifeq ($(CUDA), ON)
-CMAKE_OPT += -DUSE_CUDA=ON
-endif
+CMAKE_OPT += -DUSE_CUDA=$(CUDA)
+CMAKE_OPT += -DUSE_BANG=$(BANG)
+CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
+CMAKE_OPT += -DBUILD_TEST=$(TEST)
 
 ifeq ($(INTELCPU), ON)
 CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
@@ -24,7 +24,9 @@ cmake -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp .. && make -j 12
 
 ---
 
-> Sets env: `CUDA=ON` to enable cuda.
+> - Sets env: `TEST=OFF` to accelerate compiling.
+> - Sets env: `CUDA=ON` to enable cuda.
+> - Sets env: `BANG=ON` to enable bang.
 
 ### CMake Options
 
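As a usage sketch of the options above (values are illustrative; only variables introduced in this diff are used):

```bash
# release build with the CUDA backend, skipping googletest
make CUDA=ON TEST=OFF

# Cambricon BANG backend with backtrace kept on for debugging
make BANG=ON BACKTRACE=ON
```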
@@ -0,0 +1,221 @@
+# User Guide
+
+## Contents
+
+- [Build](#build)
+- [Usage](#usage)
+- [Python Frontend Guide](#python-frontend-guide)
+  - [Importing an onnx Model](#importing-an-onnx-model)
+  - [Exporting an onnx Model](#exporting-an-onnx-model)
+  - [Running Inference](#running-inference)
+- [Testing](#testing)
+
+## Build
+
+Ubuntu-22.04 is recommended and is the environment assumed below.
+
+1. Install dependencies with apt
+
+   > If you are not on Ubuntu-22.04, some package versions may be too old.
+
+   ```bash
+   sudo apt-get install make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev
+   ```
+
+2. Upgrade pip and switch to the Tsinghua mirror
+
+   ```bash
+   python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
+   pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+   ```
+
+3. Build and install the python library
+
+   > The first run also installs the python dependencies, so it is slow.
+
+   Build the CPU part only:
+
+   ```bash
+   make install-python
+   ```
+
+   Build the GPU part as well:
+
+   ```bash
+   make install-python CUDA=ON
+   ```
+
+## Usage
+
+Project management functions are written into the [Makefile](Makefile), which supports:
+
+- build the project: `make`/`make build`
+- clean generated files: `make clean`
+- install the python library: `make install-python`
+- test the c++ backend: `make test-cpp`
+- test the python frontend: `make test-onnx`
+
+Options are passed through these environment variables:
+
+- `TYPE`: build mode (`debug`/`release`), default `release`
+- `CUDA`: whether to build the CUDA backend, default `OFF`, `ON` to enable
+- `BANG`: whether to build the Cambricon backend, default `OFF`, `ON` to enable
+- `BACKTRACE`: whether to enable stack backtraces, default `ON`, `OFF` to disable; recommended while debugging
+- `TEST`: whether to build `googletest`, default `ON`, `OFF` to disable; only needed for `test-cpp`
+
+## Python Frontend Guide
+
+`make install-python` installs the project's python frontend into the system directory under the name `pyinfinitensor`, so it can be used directly via `import pyinfinitensor`. At this stage, the main usage is to import a model from onnx and optimize it; the optimized model can then be exported back to onnx, or run directly for inference.
+
+### Importing an onnx Model
+
+Supported models:
+
+- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
+- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
+- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
+- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
+
+```python
+import onnx
+from pyinfinitensor.onnx import OnnxStub
+from pyinfinitensor import backend
+
+stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime())
+```
+
+[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) is the loader provided by onnx; it reads an onnx file into an in-memory onnx model.
+
+`OnnxStub` is the project's representation of an onnx model; constructing this object imports the onnx model into the project. The first constructor argument is the onnx model; the second is the backend runtime the model will run on, one of `backend.cpu_runtime()`, `backend.cuda_runtime()` or `backend.bang_runtime()`.
+
+The constructed stub object is used to manipulate the model and the runtime inside the project.
+
+### Optimization
+
+TODO
+
+### Exporting an onnx Model
+
+The optimized model can be exported as an onnx file for other runtimes.
+
+```python
+with open("optimized.onnx", "wb") as f:
+    f.write(stub.to_onnx("optimized").SerializeToString())
+```
+
+`stub.to_onnx(<name>)` converts the model into an onnx model object; `<name>` is written into the onnx model's `name` field. For serializing to a file, see the [official example](https://onnx.ai/onnx/intro/python.html#model-serialization).
+
+To inspect the exported model file visually, use the [facility provided by onnx](https://onnx.ai/onnx/api/shape_inference.html#infer-shapes) to infer the shapes of all tensors before exporting:
+
+```python
+from onnx.shape_inference import infer_shapes
+
+with open("optimized.onnx", "wb") as f:
+    f.write(infer_shapes(stub.to_onnx("optimized")).SerializeToString())
+```
+
+Then draw the computation graph with [Netron](https://netron.app/).
+
+### Running Inference
+
+Inference can also be run with the project's own runtime.
+
+The first step is to feed data into the computation graph. `OnnxStub.inputs` is a `Dict[str, Tensor]` holding the names and objects of all model inputs; it can be traversed with [`items()`](https://docs.python.org/zh-cn/3/library/stdtypes.html#dict.items).
+
+This snippet prints the name, shape and object pointer of every input tensor:
+
+```python
+for name, tensor in stub.inputs.items():
+    print(name, tensor.shape(), tensor)
+```
+
+For [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx), it prints:
+
+```plaintext
+data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
+```
+
+The address is random, of course. The output says that the model needs one input named "data" of shape 1×3×224×224, which usually means one 224×224 rgb image; the model itself is a 1000-class image classifier.
+
+For convenience, we feed the model random data here.
+
+```python
+import numpy
+
+stub.init()
+for name, tensor in stub.inputs.items():
+    print(name, tensor.shape(), tensor)
+    input = numpy.random.random(tensor.shape()).astype(numpy.float32)
+    tensor.copyin_float(input.flatten().tolist())
+```
+
+`stub.init()` allocates space for all tensors. The space is pre-allocated, so models with dynamic sizes are not supported.
+
+`tensor.copyin_float(<data>)` copies data into a tensor. Its argument must be a `List[float]`, i.e. the flattened data. Similar functions are `copyin_int32(<data>)` and `copyin_int64(<data>)`.
+
+Then call `stub.run()` to run inference:
+
+```python
+stub.run()
+```
+
+Finally, copy the results out, with code like:
+
+```python
+stub.init()
+for name, tensor in stub.outputs.items():
+    print(name, tensor.shape(), tensor)
+    print(tensor.copyout_float())
+```
+
+## Testing
+
+Besides the unit tests `make test-cpp` and `make test-onnx`, the correctness of importing, exporting and optimizing a single model can also be tested in other ways.
+
+This script uses onnxruntime to test whether an exported model is equivalent to the imported one:
+
+```python
+import onnx
+import numpy
+import sys
+from onnx import ModelProto, ValueInfoProto
+from pyinfinitensor.onnx import OnnxStub
+from pyinfinitensor import backend
+from onnxruntime import InferenceSession
+
+
+def infer(model: ModelProto, input) -> dict:
+    collection = set()
+    for node in model.graph.node:
+        for output in node.output:
+            collection.add(output)
+    model.graph.output.extend([ValueInfoProto(name=x) for x in collection])
+    session = InferenceSession(model.SerializeToString())
+    i = session.get_inputs()[0].name
+    return dict(
+        zip(
+            [x.name for x in session.get_outputs()],
+            [x.flatten() for x in session.run(None, {i: input})],
+        )
+    )
+
+
+model0 = onnx.load(sys.argv[1])
+model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new")
+
+input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
+input = numpy.random.random(input_shape).astype(numpy.float32)
+
+output0 = infer(model0, input)[model0.graph.output[0].name]
+output1 = infer(model1, input)[model1.graph.output[0].name]
+
+print("error =", sum((output1 - output0) ** 2) / len(output0))
+```
+
+To run the script, install onnxruntime first:
+
+```bash
+pip install onnxruntime
+```
+
+The printed `error = ...` is the mean squared error between the two models' output tensors. For the models tested, this error is at least 0 and no more than 1e-9.
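As a hedged usage note for the test script above: saved under a hypothetical name such as `compare_onnx.py`, it takes the model path as its single command-line argument (`sys.argv[1]`):

```bash
python compare_onnx.py resnet18-v2-7.onnx
# prints: error = <mean squared error of the two outputs>
```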
@@ -0,0 +1 @@
+Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9
@@ -1,22 +0,0 @@
-#pragma once
-#include "bang/bang_runtime.h"
-#include "bang_div.h"
-#include "operators/element_wise.h"
-namespace infini {
-
-void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
-    auto op = as<ElementWiseObj>(_op);
-    float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
-    float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
-    float *const cData = (op->getOutput()->getRawDataPtr<float *>());
-
-    auto dim = op->getInputs(0)->getDims();
-    auto context = dynamic_cast<const BangRuntimeObj *>(obj);
-    int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
-    if (op->getOpType() == OpType::Div)
-        div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
-    else
-        IT_TODO_HALT();
-}
-
-}; // namespace infini
@@ -2,4 +2,4 @@
 
 namespace infini {
 constexpr double E_CONSTANT = 2.718281828459;
 }
@@ -52,4 +52,14 @@ template <> inline DataType DataType::get<int16_t>() { return Int16; }
 template <> inline DataType DataType::get<int32_t>() { return Int32; }
 template <> inline DataType DataType::get<int64_t>() { return Int64; }
 
+template <int index> struct DT {};
+template <> struct DT<0> { using t = float; };
+template <> struct DT<1> { using t = uint32_t; };
+template <> struct DT<2> { using t = uint8_t; };
+template <> struct DT<3> { using t = int8_t; };
+template <> struct DT<4> { using t = uint16_t; };
+template <> struct DT<5> { using t = int16_t; };
+template <> struct DT<6> { using t = int32_t; };
+template <> struct DT<7> { using t = int64_t; };
+
 } // namespace infini
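The `DT<index>` specializations above map a compile-time index to a concrete element type. A minimal self-contained sketch of how such a table can drive dtype dispatch; the `dispatch` helper is hypothetical and not part of the header:

```cpp
#include <cstdint>
#include <cstdio>

// Mirror of the DT table added above (indices 0..7).
template <int index> struct DT {};
template <> struct DT<0> { using t = float; };
template <> struct DT<1> { using t = uint32_t; };
template <> struct DT<2> { using t = uint8_t; };
template <> struct DT<3> { using t = int8_t; };
template <> struct DT<4> { using t = uint16_t; };
template <> struct DT<5> { using t = int16_t; };
template <> struct DT<6> { using t = int32_t; };
template <> struct DT<7> { using t = int64_t; };

// Hypothetical dispatch: turn a runtime dtype index into a static type.
template <class F> void dispatch(int dtypeIndex, F f) {
    switch (dtypeIndex) {
    case 0: f(DT<0>::t{}); break;
    case 6: f(DT<6>::t{}); break;
    // ... remaining cases follow the same pattern ...
    }
}

int main() {
    dispatch(0, [](auto v) { printf("element size = %zu\n", sizeof(v)); });
}
```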
@@ -59,6 +59,8 @@ class GraphObj : public Object {
      */
     bool topo_sort();
 
+    void optimize();
+
     void dataMalloc();
 
     /**
@@ -46,6 +46,9 @@ class GraphHandlerObj {
 
     Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
                 int sh, int sw, int dh, int dw);
+    Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph,
+                            int pw, int sh, int sw, int dh, int dw, int oph,
+                            int opw);
     Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
                   Tensor bias, ActType act);
     Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,
@@ -68,10 +71,17 @@ class GraphHandlerObj {
     Tensor tanh(Tensor x, Tensor y);
     Tensor softmax(Tensor x, Tensor y, int axis);
     Tensor abs(Tensor x, Tensor y);
+    Tensor shape(Tensor x, Tensor y);
     Tensor identity(Tensor x, Tensor y);
     Tensor flatten(Tensor s, Tensor y, int axis);
+    Tensor pRelu(Tensor x, Tensor slope, Tensor y);
+    Tensor clip(Tensor x, Tensor y, std::optional<float> min,
+                std::optional<float> max);
+    Tensor transpose(Tensor data, Tensor transposed, Shape perm);
     Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
     Tensor concat(TensorVec inputs, Tensor output, int dim);
+    TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
+                    int num_outputs);
     Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
     Tensor reduceMean(Tensor data, Tensor reduced,
                       const optional<vector<int>> &axes, bool keepdims);
@@ -85,6 +95,8 @@ class GraphHandlerObj {
 
     inline bool topo_sort() { return g->topo_sort(); }
 
+    inline void optimize() { g->optimize(); }
+
     //------ runtime
 
     inline void data_malloc() { g->dataMalloc(); }
@@ -15,4 +15,4 @@ template <typename T> inline HashType hashVector(const vector<T> &vec) {
     return ret;
 }
 
 } // namespace infini
@@ -68,4 +68,4 @@ inline std::ostream &operator<<(std::ostream &os, const Ref<T> &obj) {
     return os;
 }
 
 } // namespace infini
@@ -6,6 +6,8 @@ enum class OpType {
     Unknown = 0,
     // linear
     Conv = 100,
+    ConvBackwardFilter,
+    ConvBackwardData,
     Matmul,
     ConvTrans,
     ConvTransNHWC,
@@ -34,10 +36,70 @@ enum class OpType {
     Softmax,
     Activation,
     Relu,
+    ReluBackward,
+    PRelu,
     Sigmoid,
+    SigmoidBackward,
     Tanh,
+    TanhBackward,
     Abs,
+    Sin,
+    Cos,
+    Tan,
+    ASin,
+    ACos,
+    ATan,
+    SinH,
+    CosH,
+    TanH,
+    ASinH,
+    ACosH,
+    ATanH,
     Resize,
+    Arange,
+    Shape,
+    Copy,
+    Ceil,
+    Floor,
+    Clip,
+    Erf,
+    Exp,
+    Fill,
+    Log,
+    L2Loss,
+    Maximum,
+    Minimum,
+    MSELoss,
+    Neg,
+    Power,
+    Reciprocal,
+    Sqrt,
+    Rsqrt,
+    Cast,
+    FloorDiv,
+    FloorMod,
+    Det,
+    Round,
+    Square,
+    SquaredDifference,
+    Hardtanh,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    GreaterEqual,
+    LessThan,
+    LessEqual,
+    And,
+    Or,
+    Xor,
+    Not,
+    BitAnd,
+    BitOr,
+    BitXor,
+    BitNot,
+    BitLeftShift,
+    BitRightShift,
+    Dropout,
     //
     MemBound = 300,
 };
@@ -55,6 +117,8 @@ class OpRegistry {
         FOP(Unknown);
         // linear
         FOP(Conv);
+        FOP(ConvBackwardFilter);
+        FOP(ConvBackwardData);
         FOP(Matmul);
         FOP(ConvTrans);
         FOP(G2BMM);
@@ -76,15 +140,72 @@ class OpRegistry {
         FOP(ReduceMean);
         FOP(Reshape);
         FOP(Identity);
+        FOP(Shape);
         // element wise
         FOP(BatchNorm);
         FOP(Softmax);
         FOP(Activation);
         FOP(Relu);
+        FOP(ReluBackward);
+        FOP(PRelu);
         FOP(Sigmoid);
+        FOP(SigmoidBackward);
         FOP(Tanh);
+        FOP(TanhBackward);
         FOP(Abs);
-        FOP(ConvTransNHWC);
+        FOP(Sin);
+        FOP(Cos);
+        FOP(Tan);
+        FOP(ASin);
+        FOP(ACos);
+        FOP(ATan);
+        FOP(SinH);
+        FOP(CosH);
+        FOP(TanH);
+        FOP(ASinH);
+        FOP(ACosH);
+        FOP(ATanH);
+        FOP(Copy);
+        FOP(Ceil);
+        FOP(Floor);
+        FOP(Clip);
+        FOP(Erf);
+        FOP(Exp);
+        FOP(Fill);
+        FOP(Log);
+        FOP(L2Loss);
+        FOP(Maximum);
+        FOP(Minimum);
+        FOP(MSELoss);
+        FOP(Neg);
+        FOP(Power);
+        FOP(Reciprocal);
+        FOP(Sqrt);
+        FOP(Rsqrt);
+        FOP(Cast);
+        FOP(FloorDiv);
+        FOP(FloorMod);
+        FOP(Det);
+        FOP(Round);
+        FOP(Square);
+        FOP(SquaredDifference);
+        FOP(Hardtanh);
+        FOP(Equal);
+        FOP(NotEqual);
+        FOP(GreaterThan);
+        FOP(GreaterEqual);
+        FOP(LessThan);
+        FOP(LessEqual);
+        FOP(And);
+        FOP(Or);
+        FOP(Xor);
+        FOP(Not);
+        FOP(BitAnd);
+        FOP(BitOr);
+        FOP(BitXor);
+        FOP(BitNot);
+        FOP(BitLeftShift);
+        FOP(BitRightShift);
         //
         FOP(MemBound);
     default:
@@ -49,4 +49,4 @@ class PerfEngine {
 void to_json(json &j, const PerfEngine &p);
 void from_json(const json &j, PerfEngine &p);
 
 } // namespace infini
@@ -40,4 +40,4 @@ std::vector<Ref<T>> wrefs_to_refs(const std::vector<WRef<T>> &wrefs) {
     return refs;
 }
 
 } // namespace infini
@@ -73,7 +73,7 @@ class TensorObj : public TensorBaseObj {
     // FIXME: std::fucntion copies the generator instead of passing it by ref.
     // Thus the internal state of generator cannot be updated.
     void setData(
-        const std::function<void(void *, size_t, DataType)> &generator) const;
+        std::function<void(void *, size_t, DataType)> const &generator) const;
     Tensor clone() const {
         auto obj = make_ref<TensorObj>(*this);
         obj->freeData();
@@ -100,26 +100,53 @@ class TensorObj : public TensorBaseObj {
     template <typename T> bool equalData(const vector<T> &dataVector) {
         IT_ASSERT(DataType::get<T>() == dtype);
         IT_ASSERT(size() == dataVector.size());
-        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size(),
-                             1e-6);
+        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
     }
 
     size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;
 
   private:
-    void printDataFloat(float *ptr) const;
-    void printDataUint32_t(uint32_t *ptr) const;
+    template <class T> string dataToString() const {
+        std::stringstream builder;
+        builder << "Tensor: " << guid << std::endl;
+
+        auto numDims = shape.size();
+        auto dimSzVec = vector<int>(numDims, 1);
+        auto ptr = data->getPtr<T *>();
+        dimSzVec[numDims - 1] = shape[numDims - 1];
+
+        for (int i = numDims - 1; i != 0; --i)
+            dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
+
+        for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
+            for (size_t j = 0; j < numDims; ++j)
+                if (i % dimSzVec[j] == 0)
+                    builder << "[";
+
+            builder << ptr[i];
+            for (size_t j = 0; j < numDims; ++j)
+                if ((int)i % dimSzVec[j] == dimSzVec[j] - 1)
+                    builder << "]";
+
+            if (i != size() - 1)
+                builder << ", ";
+
+            auto column = (size_t)dimSzVec[numDims - 1];
+            if (i % column == column - 1)
+                builder << std::endl;
+        }
+        return builder.str();
+    }
 
     template <typename T>
-    bool equalDataImpl(const T *a, const T *b, size_t size,
-                       double relativeError) const {
+    bool equalDataImpl(const T *a, const T *b, size_t size) const {
         for (size_t i = 0; i < size; ++i) {
             if constexpr (std::is_integral_v<T>) {
                 if (a[i] != b[i])
                     return false;
             } else if constexpr (std::is_floating_point_v<T>) {
                 if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
-                    relativeError) {
+                    1e-6) {
                     printf("Error on %lu: %f %f\n", i, a[i], b[i]);
                     return false;
                 }
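A note on the bracket logic in `dataToString` above: `dimSzVec` is a suffix product of the shape, so `dimSzVec[j]` counts the elements spanned by one step of dimension `j`. A standalone sketch (the shape value is illustrative):

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> shape = {2, 3}; // illustrative tensor shape
    auto numDims = shape.size();
    std::vector<int> dimSzVec(numDims, 1);
    // Same suffix-product construction as dataToString above.
    dimSzVec[numDims - 1] = shape[numDims - 1];
    for (int i = numDims - 1; i != 0; --i)
        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
    // For {2, 3} this yields {6, 3}: element i opens a '[' for every j
    // with i % dimSzVec[j] == 0 and closes one where the remainder is
    // dimSzVec[j] - 1, producing [[a, b, c], [d, e, f]].
    for (int v : dimSzVec)
        printf("%d ", v); // prints: 6 3
}
```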
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "operators/unary.h"
+
+namespace infini {
+void clip_kernel(float *input, float *output, int num, float minValue,
+                 float maxValue);
+
+}; // namespace infini
@@ -1,6 +1,8 @@
 #pragma once
 
 namespace infini {
-void div_kernel(float *a, float *b, float *c, int num);
-void pow_kernel(float *a, float *b, float *c, int num);
+void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
+void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
 }; // namespace infini
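The widened signatures pass explicit 4-D extents for each operand, which suggests numpy-style broadcasting inside the BangC kernels. A host-side reference sketch of that index mapping, as an assumption about the intended semantics rather than the actual kernel code:

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

using Shape4 = std::array<int, 4>;

// Flat row-major offset of output coordinate `idx` inside an operand of
// shape `s`; size-1 dimensions are broadcast by clamping the index to 0.
static size_t offset(const Shape4 &s, const Shape4 &idx) {
    size_t off = 0;
    for (int d = 0; d < 4; ++d)
        off = off * s[d] + (s[d] == 1 ? 0 : idx[d]);
    return off;
}

// Assumed reference semantics for div_kernel(a, b, c, a0..a3, b0..b3, c0..c3).
static void div_ref(const float *a, const float *b, float *c,
                    const Shape4 &sa, const Shape4 &sb, const Shape4 &sc) {
    Shape4 i{};
    for (i[0] = 0; i[0] < sc[0]; ++i[0])
        for (i[1] = 0; i[1] < sc[1]; ++i[1])
            for (i[2] = 0; i[2] < sc[2]; ++i[2])
                for (i[3] = 0; i[3] < sc[3]; ++i[3])
                    c[offset(sc, i)] = a[offset(sa, i)] / b[offset(sb, i)];
}

int main() {
    float a[6] = {2, 4, 6, 8, 10, 12}, b[3] = {2, 2, 2}, c[6];
    div_ref(a, b, c, {1, 1, 2, 3}, {1, 1, 1, 3}, {1, 1, 2, 3});
    printf("%g %g\n", c[0], c[5]); // prints: 1 6
}
```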
@@ -12,4 +12,4 @@ class MatchComputationKernel : public Pass {
     virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
 };
 
 } // namespace nnet
@@ -12,4 +12,4 @@ class MatchMemBoundKernel : public Pass {
     virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
 };
 
 } // namespace nnet
@@ -38,4 +38,4 @@ class Pass {
     const VecExpr &getTransformations();
 };
 
 } // namespace nnet
@@ -15,4 +15,4 @@ class Rule1VariableSplit : public Pass {
     Expr replaceIters(Expr cur, const Replace &replace);
 };
 
 } // namespace nnet
@@ -26,4 +26,4 @@ class Rule2VariableMerging : public Pass {
                                         pair<Iterator, int> pb);
 };
 
 } // namespace nnet
@@ -16,4 +16,4 @@ class Rule3StageSplit : public Pass {
     vector<vector<Var>> getSplitSummationIters(RangeOp rangeOp);
 };
 
 } // namespace nnet
@@ -19,4 +19,4 @@ class Rule4StageMerging : public Pass {
     virtual void transform(Formula &origin, int depth, Expr &rCur) override;
 };
 
 } // namespace nnet
@@ -13,4 +13,4 @@ class Rule5RangeRelaxation : public Pass {
     virtual void transform(Formula &origin, int depth, Expr &rCur) override;
 };
 
 } // namespace nnet
@@ -14,4 +14,4 @@ class Rule6KenerlMatching : public Pass {
     VecExpr matchElementWise(const RangeOp &rangeOp);
 };
 
 } // namespace nnet
@@ -13,4 +13,4 @@ class Rule7DLT : public Pass {
     vector<int> getFactors();
 };
 
 } // namespace nnet
@@ -45,4 +45,4 @@ class Rule8GuidedDLT : public Pass {
                           vector<Var> tensorDimAxes, vector<int> newShape);
 };
 
 } // namespace nnet
@@ -13,4 +13,4 @@ class Rule90TwoStageElementWise : public Pass {
     VecExpr matchTwoStageElementWise(const RangeOp &rangeOp);
 };
 
 } // namespace nnet
@@ -12,4 +12,4 @@ class Rule91MergeStagesWithSum : public Pass {
     virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
 };
 
 } // namespace nnet
@@ -12,4 +12,4 @@ class Rule9RangeMagnify : public Pass {
     virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
 };
 
 } // namespace nnet
@@ -43,4 +43,4 @@ class ReplaceKit {
                               const Expr &replacement);
 };
 
 } // namespace nnet
@@ -231,7 +231,7 @@ template <typename... T> std::string type_list_to_string() {
         result.pop_back();
     }
     return result;
-}
+} // namespace dbg
 
 template <typename... T> std::string get_type_name(type_tag<std::tuple<T...>>) {
     return "std::tuple<" + type_list_to_string<T...>() + ">";
@@ -855,4 +855,4 @@ auto identity(T &&, U &&...u) -> last_t<U...> {
 #define dbg(...) dbg::identity(__VA_ARGS__)
 #endif // DBG_MACRO_DISABLE
 
 #endif // DBG_MACRO_DBG_H
@@ -50,4 +50,4 @@ class DLT {
                          RangeOp rangeOp);
 };
 
 } // namespace nnet
@@ -55,4 +55,4 @@ class NMutator : public Mutator {
     // Graph transformConv1xk(Operator op);
 };
 
 } // namespace infini
@@ -35,4 +35,4 @@ template <typename T> class SubsetGenerator {
     }
 };
 
 } // namespace nnet
@@ -115,4 +115,4 @@ constexpr Ref<_Tp> make_ref_from_tuple(_Tuple &&__t) {
 // }
 // };
 
 // } // namespace nnet
@@ -27,4 +27,4 @@ namespace nnet {
 int matchExprResult(Derivator &derivator, string fn);
 bool checkExprLogSame(string fnPrefix, int start, int end);
 bool checkExprsEquvivalence(VecExpr exprs);
 } // namespace nnet
@@ -0,0 +1,32 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+class ActivationBackwardObj : public OperatorObj {
+  public:
+    ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
+                          Tensor x, Tensor diff_x);
+    OP_CLONE(ActivationBackwardObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 3; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+#define DEFINE_ACTIVATION_BACKWARD_OBJ(prefix, type)                           \
+    class prefix##Obj : public ActivationBackwardObj {                         \
+      public:                                                                  \
+        prefix##Obj(GraphObj *graph, Tensor y, Tensor diff_y, Tensor x,        \
+                    Tensor diff_x)                                             \
+            : ActivationBackwardObj(type, graph, y, diff_y, x, diff_x) {}      \
+    };
+
+DEFINE_ACTIVATION_BACKWARD_OBJ(ReluBackward, OpType::ReluBackward)
+DEFINE_ACTIVATION_BACKWARD_OBJ(SigmoidBackward, OpType::SigmoidBackward)
+DEFINE_ACTIVATION_BACKWARD_OBJ(TanhBackward, OpType::TanhBackward)
+}; // namespace infini
@@ -9,7 +9,7 @@ namespace infini {
  */
 class BatchNormObj : public OperatorObj {
     float momentum, eps;
-    bool training;
+    bool trainingMode;
 
   public:
     /**
@@ -28,11 +28,11 @@ class BatchNormObj : public OperatorObj {
      * Default is 0.9.
      * @param eps The epsilon value to use to avoid division by zero. Default is
      * 1e-5.
-     * @param training Set to true when used for training.
+     * @param trainingMode Set to true when used for training.
      */
     BatchNormObj(GraphObj *graph, Tensor input, Tensor output, Tensor mean,
                  Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
-                 float eps = 1e-5, bool training = false);
+                 float eps = 1e-5, bool trainingMode = false);
     OP_CLONE(BatchNormObj);
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     std::string toString() const override;
@@ -42,7 +42,7 @@ class BatchNormObj : public OperatorObj {
     int numOutputs() const override { return outputs.size(); }
     float getMomentum() const { return momentum; }
     float getEps() const { return eps; }
-    bool getTraining() const { return training; }
+    bool getTrainingMode() const { return trainingMode; }
 
   private:
     vector<int> getWorkloadVector() const override;
@@ -149,6 +149,29 @@ class ConvObj : public ConvBaseObj {
     void setAuxilaryAttributes(PaddingMode mode) override;
 };
 
+class ConvBackwardFilterObj : public ConvBaseObj {
+  private:
+    ActType act;
+
+  public:
+    ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY,
+                          Tensor diffW, int ph, int pw, int sh = 1, int sw = 1,
+                          int dh = 1, int dw = 1, Tensor bias = nullptr,
+                          ActType act = ActType::None);
+    // Constructors for setting padding mode
+    ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY,
+                          Tensor diffW, PaddingMode mode = PaddingMode::Same,
+                          int sh = 1, int sw = 1, int dh = 1, int dw = 1,
+                          Tensor bias = nullptr, ActType act = ActType::None);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    ActType getAct() const { return act; }
+    int getNumGroups() const override { return c / getChannelPerGroup(); }
+
+  private:
+    void setAuxilaryAttributes(PaddingMode mode) override;
+};
+
 class ConvTransposed2dObj : public ConvBaseObj {
   private:
     int oph, opw;
@@ -170,6 +193,7 @@ class ConvTransposed2dObj : public ConvBaseObj {
 
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     int getNumGroups() const override { return group; }
+    std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
 
   private:
     void setAuxilaryAttributes(PaddingMode mode) override;
@@ -0,0 +1,22 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+class DetObj : public OperatorObj {
+  public:
+    enum Mode { NormalDet = 0, LogDet };
+    DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
+    OP_CLONE(DetObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+    Mode getMode() const { return modeValue; }
+
+  private:
+    Mode modeValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+}; // namespace infini
@@ -0,0 +1,52 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Copy a tensor along a centain dimension for multiple times.
+ */
+class DropoutObj : public OperatorObj {
+    float ratio;
+    // bool training_mode; // TODO must be false.
+
+  public:
+    /**
+     * @brief Dropout takes an input floating-point tensor, an input ratio
+     * (floating-point scalar) and an input training_mode (boolean scalar). It
+     * produces two tensor outputs, output (floating-point tensor) and mask
+     * (bool tensor). If training_mode is true then the output Y will be a
+     * random dropout; Note that this Dropout scales the masked input data by
+     * the following equation, so to convert the trained model into inference
+     * mode, the user can simply not pass training_mode input or set it to
+     * false.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param data The input tensor.
+     * @param output The output tensor.
+     * @param mask The mask tensor.
+     * @param ratio The ratio of random dropout, with value in [0, 1). If this
+     * input was not set, or if it was set to 0, the output would be a simple
+     * copy of the input. If it's non-zero, output will be a random dropout of
+     * the scaled input, which is typically the case during training.
+     * @param training_mode If set to true then it indicates dropout is being
+     * used for training. It is an optional value hence unless specified
+     * explicitly, it is false. If it is false, ratio is ignored and the
+     * operation mimics inference mode where nothing will be dropped from the
+     * input data and if mask is requested as output it will contain all ones.
+     */
+    DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
+               float ratio, bool training_mode);
+    OP_CLONE(DropoutObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 2; }
+    float getRatio() const { return ratio; }
+    bool getTrainingMode() const { return false; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+} // namespace infini
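The "following equation" mentioned in the doc comment is not reproduced in the header; per the onnx Dropout specification it is:

```latex
\mathrm{output} = \mathrm{scale} \cdot \mathrm{data} \cdot \mathrm{mask},
\qquad \mathrm{scale} = \frac{1}{1 - \mathrm{ratio}}
```

With `training_mode` false (as `getTrainingMode()` hard-codes here), nothing is dropped and the mask is all ones, so the operator reduces to a copy.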
@@ -32,6 +32,25 @@ class ElementWiseObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };
 
+class MSELossObj : public OperatorObj {
+  public:
+    enum Reduction { None = 0, Sum, Mean };
+    MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
+               Reduction reduction, Tensor output);
+    OP_CLONE(MSELossObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    Reduction getReduction() const { return reductionMode; }
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    Reduction reductionMode;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
 #define DEFINE_ELEMENT_WISE_OBJ(prefix, type)                                 \
     class prefix##Obj : public ElementWiseObj {                               \
       public:                                                                 \

@@ -46,4 +65,26 @@ DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
 DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
 DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
 DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
+DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Maximum)
+DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Minimum)
+DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Power)
+DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv)
+DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod)
+DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference)
+DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal)
+DEFINE_ELEMENT_WISE_OBJ(NotEqual, OpType::NotEqual)
+DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::GreaterThan)
+DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterEqual)
+DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::LessThan)
+DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessEqual)
+DEFINE_ELEMENT_WISE_OBJ(And, OpType::And)
+DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or)
+DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor)
+DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not)
+DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitAnd)
+DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitOr)
+DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitXor)
+DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitNot)
+DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitLeftShift)
+DEFINE_ELEMENT_WISE_OBJ(BitRightShift, OpType::BitRightShift)
 }; // namespace infini
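For readers unfamiliar with the three Reduction modes of MSELossObj, the intended arithmetic is easy to state in numpy (a sketch; the function name is mine, not part of the API):

```python
import numpy as np

def mse_loss(x, y, reduction="Mean"):
    se = (x - y) ** 2                 # elementwise squared error, shape of x
    if reduction == "None":
        return se                     # per-element loss tensor
    return se.sum() if reduction == "Sum" else se.mean()
```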
@@ -47,10 +47,10 @@ class MatmulObj : public OperatorObj {
     std::string toString() const override;
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
 
-    int numInputs() const override { return 2; }
+    int numInputs() const override { return inputs.size(); }
     int numOutputs() const override { return 1; }
 
-    Tensor getBias() const { return inputs[2]; }
+    Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; }
     ActType getAct() const { return act; }
     auto getBMNKTransAB() const { return tuple(b, m, n, k, transA, transB); }
     bool getTransA() const { return transA; }
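With numInputs() now reporting inputs.size() and getBias() guarding against a missing third input, the bias is genuinely optional. The intended Gemm-like behavior, sketched in numpy:

```python
import numpy as np

def matmul(a, b, bias=None, transA=False, transB=False):
    a = a.swapaxes(-1, -2) if transA else a
    b = b.swapaxes(-1, -2) if transB else b
    y = a @ b
    return y if bias is None else y + bias  # bias broadcasts over rows
```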
@@ -7,7 +7,8 @@ namespace infini {
  *
  */
 class SliceObj : public OperatorObj {
-    vector<int> starts, ends; // the start no. and end no. for all dims.
+    template <class T> struct range_t { T start, end, step; };
+    vector<range_t<int>> axes;
 
   public:
     /**

@@ -33,9 +34,26 @@ class SliceObj : public OperatorObj {
 
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     std::string toString() const override;
-    int numInputs() const override { return 1; }
-    int numOutputs() const override { return 1; }
-    Shape getStart() const { return starts; }
+    inline int numInputs() const override { return 1; }
+    inline int numOutputs() const override { return 1; }
+    inline Shape getStarts() const {
+        Shape ans(axes.size());
+        std::transform(axes.begin(), axes.end(), ans.begin(),
+                       [](auto x) { return x.start; });
+        return ans;
+    }
+    inline Shape getEnds() const {
+        Shape ans(axes.size());
+        std::transform(axes.begin(), axes.end(), ans.begin(),
+                       [](auto x) { return x.end; });
+        return ans;
+    }
+    inline Shape getSteps() const {
+        Shape ans(axes.size());
+        std::transform(axes.begin(), axes.end(), ans.begin(),
+                       [](auto x) { return x.step; });
+        return ans;
+    }
 
   private:
     vector<int> getWorkloadVector() const override;
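Each range_t bundles (start, end, step) for one axis, and the three getters split the bundle back into per-axis vectors; the semantics are numpy basic slicing:

```python
import numpy as np

x = np.arange(24).reshape(2, 3, 4)
axes = [(0, 1, 1), (0, 3, 1), (1, 4, 2)]            # (start, end, step) per axis
starts, ends, steps = (list(t) for t in zip(*axes))  # what the getters return
y = x[tuple(slice(s, e, st) for s, e, st in axes)]   # shape (1, 3, 2)
```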
@@ -0,0 +1,22 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+class TransposeObj : public OperatorObj {
+  public:
+    TransposeObj(GraphObj *graph, Tensor input, Tensor output,
+                 vector<int> permute);
+    OP_CLONE(TransposeObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+    std::vector<int> getPermute() const { return transposePermute; }
+
+  private:
+    vector<int> transposePermute = {1, 1, 1, 1};
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+}; // namespace infini
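TransposeObj stores a permutation and applies it to the layout; numpy's transpose is the reference behavior (the {1, 1, 1, 1} initializer above is a placeholder that the constructor overwrites):

```python
import numpy as np

x = np.zeros((2, 3, 4, 5))
permute = [0, 2, 3, 1]                # e.g. NCHW -> NHWC
y = x.transpose(permute)
assert y.shape == (2, 4, 5, 3)        # output dim i = input dim permute[i]
```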
@@ -28,6 +28,244 @@ class UnaryObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };
 
+class ClipObj : public OperatorObj {
+  public:
+    ClipObj(GraphObj *graph, Tensor input, Tensor output,
+            std::optional<float> min, std::optional<float> max);
+    OP_CLONE(ClipObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    std::optional<float> getMin() const { return minValue; };
+    std::optional<float> getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    std::optional<float> minValue, maxValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class HardtanhObj : public OperatorObj {
+  public:
+    HardtanhObj(GraphObj *graph, Tensor input, Tensor output, float min,
+                float max);
+    OP_CLONE(HardtanhObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getMin() const { return minValue; };
+    float getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float minValue, maxValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class FlipObj : public OperatorObj {
+  public:
+    FlipObj(GraphObj *graph, Tensor input, Tensor output, vector<int> axis);
+    OP_CLONE(FlipObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    vector<int> getAxis() const { return axisValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> axisValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class FillObj : public OperatorObj {
+  public:
+    FillObj(GraphObj *graph, Tensor input, Tensor output, float value);
+    OP_CLONE(FillObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getValue() const { return setValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float setValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class L2LossObj : public OperatorObj {
+  public:
+    L2LossObj(GraphObj *graph, Tensor input, Tensor output);
+    OP_CLONE(L2LossObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class TransformObj : public OperatorObj {
+  public:
+    TransformObj(GraphObj *graph, Tensor input, Tensor output, float alpha,
+                 float beta);
+    OP_CLONE(TransformObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getAlpha() const { return alphaValue; }
+    float getBeta() const { return betaValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float alphaValue, betaValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class CastObj : public OperatorObj {
+  public:
+    enum CastType {
+        Float2Half = 0,
+        Float2Int64,
+        Float2Int32,
+        Float2Int16,
+        Float2Int8,
+        Int322Float,
+        Int322Int8,
+        Int322Int16,
+        Int162Float,
+        Int162Int32,
+        Int82Float,
+        Int82Int16,
+        Int82Int32,
+        Uint82Float,
+        Uint82Int32,
+        Uint82Int64,
+        Int322Int64,
+        Int642Int32,
+        Int642Uint32,
+        Int642Float,
+        Uint322Int64,
+    };
+    CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type);
+    OP_CLONE(CastObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    vector<DataType> inferDataType(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    CastType getType() const { return castType; }
+    DataType getOutputDataType() const;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    CastType castType;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class CumsumObj : public OperatorObj {
+  public:
+    CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis,
+              bool exclusive, bool reverse);
+    OP_CLONE(CumsumObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int getAxis() const { return axisValue; }
+    bool getExclusive() const { return exclusiveValue; }
+    bool getReverse() const { return reverseValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int axisValue;
+    bool exclusiveValue, reverseValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class ArangeObj : public OperatorObj {
+  public:
+    ArangeObj(GraphObj *graph, float start, float step, int length,
+              Tensor output);
+    OP_CLONE(ArangeObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 0; }
+    int numOutputs() const override { return 1; }
+    float getStartValue() { return startValue; }
+    float getStepValue() { return stepValue; }
+    int getLength() { return lengthValue; }
+
+  private:
+    float startValue, stepValue;
+    int lengthValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class ShapeObj : public OperatorObj {
+  public:
+    ShapeObj(GraphObj *graph, Tensor input, Tensor output);
+    OP_CLONE(ShapeObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+};
+
+class PReluObj : public OperatorObj {
+  public:
+    PReluObj(GraphObj *graph, Tensor input, Tensor alpha, Tensor output);
+    OP_CLONE(PReluObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class LogObj : public OperatorObj {
+  public:
+    enum LogType {
+        LogE = 0,
+        Log2,
+        Log10,
+    };
+    LogObj(GraphObj *graph, Tensor input, Tensor output, LogType type);
+    OP_CLONE(LogObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    LogType getType() const { return logType; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    LogType logType;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
 #define DEFINE_UNARY_OBJ(prefix, type)                                        \
     class prefix##Obj : public UnaryObj {                                     \
       public:                                                                 \
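ClipObj takes std::optional bounds because ONNX Clip allows either bound to be absent; in numpy terms the operator reduces to (a sketch of the semantics, not the kernel):

```python
import numpy as np

def clip(x, min=None, max=None):
    # np.clip accepts None for a missing bound, matching optional min/max.
    return np.clip(x, min, max)

y = clip(np.array([-2.0, 0.5, 9.0]), min=0.0)  # max unset: lower bound only
```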
@@ -42,4 +280,28 @@ DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
 // DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
 DEFINE_UNARY_OBJ(Abs, OpType::Abs)
 
+DEFINE_UNARY_OBJ(Sin, OpType::Sin)
+DEFINE_UNARY_OBJ(Cos, OpType::Cos)
+DEFINE_UNARY_OBJ(Tan, OpType::Tan)
+DEFINE_UNARY_OBJ(ASin, OpType::ASin)
+DEFINE_UNARY_OBJ(ACos, OpType::ACos)
+DEFINE_UNARY_OBJ(ATan, OpType::ATan)
+DEFINE_UNARY_OBJ(SinH, OpType::SinH)
+DEFINE_UNARY_OBJ(CosH, OpType::CosH)
+DEFINE_UNARY_OBJ(TanH, OpType::TanH)
+DEFINE_UNARY_OBJ(ASinH, OpType::ASinH)
+DEFINE_UNARY_OBJ(ACosH, OpType::ACosH)
+DEFINE_UNARY_OBJ(ATanH, OpType::ATanH)
+
+DEFINE_UNARY_OBJ(Copy, OpType::Copy)
+DEFINE_UNARY_OBJ(Ceil, OpType::Ceil)
+DEFINE_UNARY_OBJ(Floor, OpType::Floor)
+DEFINE_UNARY_OBJ(Erf, OpType::Erf)
+DEFINE_UNARY_OBJ(Exp, OpType::Exp)
+DEFINE_UNARY_OBJ(Neg, OpType::Neg)
+DEFINE_UNARY_OBJ(Reciprocal, OpType::Reciprocal)
+DEFINE_UNARY_OBJ(Sqrt, OpType::Sqrt)
+DEFINE_UNARY_OBJ(Rsqrt, OpType::Rsqrt)
+DEFINE_UNARY_OBJ(Round, OpType::Round)
+DEFINE_UNARY_OBJ(Square, OpType::Square)
 }; // namespace infini
@@ -8,7 +8,7 @@ version = "0.0.0"
 authors = [{ name = "YdrMaster", email = "ydrml@hotmail.com" }]
 description = "Python frontend of InfiniTensor"
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.7"
 keywords = ["optimizer"]
 license = { text = "Apache" }
 classifiers = ["Programming Language :: Python :: 3"]
@@ -22,12 +22,17 @@ from onnx.checker import (
     check_tensor,
 )
 from onnx.shape_inference import infer_shapes
+from onnx.numpy_helper import to_array
 from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
 from functools import reduce
 
-runtime = backend.runtime()
-
 
 class OnnxStub:
+    """
+    The Onnx model imported into infinitensor.
+    It can be generated from an Onnx model object.
+    """
+
     inputs: Dict[str, backend.Tensor] = {}
     outputs: Dict[str, backend.Tensor] = {}
     initializer: Dict[int, TensorProto] = {}
@@ -53,6 +58,8 @@ class OnnxStub:
         )
 
         for initializer in model.graph.initializer:
+            dims = [d for d in initializer.dims]
+            tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
             data[initializer.name] = initializer
 
         for node in model.graph.node:
@@ -61,14 +68,81 @@ class OnnxStub:
                 node,
                 {
                     "dilations": [1, 1],
-                    "pads": [0, 0],
+                    "pads": [0, 0, 0, 0],
                     "strides": [1, 1],
                 },
             )
             (d, p, s) = (
                 attributes[name] for name in ["dilations", "pads", "strides"]
             )
-            tensors[node.output[0]] = self.handler.conv(
+            if p[0] != p[2] or p[1] != p[3]:
+                adapt = "{}-adapt".format(node.output[0])
+                tensors[adapt] = self.handler.pad(
+                    tensors[node.input[0]], None, p, [-2, -1]
+                )
+                p = [0, 0, 0, 0]
+            else:
+                adapt = node.input[0]
+
+            if len(node.input) > 2:
+                bias = "{}-bias".format(node.output[0])
+                reshape = "{}-reshape".format(node.output[0])
+                tensors[bias] = self.handler.conv(
+                    tensors[adapt],
+                    tensors[node.input[1]],
+                    None,
+                    p[0],
+                    p[1],
+                    s[0],
+                    s[1],
+                    d[0],
+                    d[1],
+                )
+                tensors[reshape] = self.handler.reshape(
+                    tensors[node.input[2]],
+                    None,
+                    [
+                        1,
+                        reduce(
+                            lambda acc, x: acc * x,
+                            _search_shape(model, node.input[2]),
+                        ),
+                        1,
+                        1,
+                    ],
+                )
+                tensors[node.output[0]] = self.handler.add(
+                    tensors[bias],
+                    tensors[reshape],
+                    tensors.get(node.output[0]),
+                )
+            else:
+                tensors[node.output[0]] = self.handler.conv(
+                    tensors[adapt],
+                    tensors[node.input[1]],
+                    tensors.get(node.output[0]),
+                    p[0],
+                    p[1],
+                    s[0],
+                    s[1],
+                    d[0],
+                    d[1],
+                )
+        elif node.op_type == "ConvTranspose":
+            attributes = _parse_attribute(
+                node,
+                {
+                    "dilations": [1, 1],
+                    "pads": [0, 0],
+                    "strides": [1, 1],
+                    "output_padding": [0, 0],
+                },
+            )
+            (d, p, s, op) = (
+                attributes[name]
+                for name in ["dilations", "pads", "strides", "output_padding"]
+            )
+            tensors[node.output[0]] = self.handler.convTransposed2d(
                 tensors[node.input[0]],
                 tensors[node.input[1]],
                 tensors.get(node.output[0]),
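The backend conv accepts a single symmetric (ph, pw) pair, so the importer splits ONNX's four-value pads: asymmetric padding becomes an explicit Pad on the spatial axes followed by an unpadded conv. The shape effect can be checked with plain numpy:

```python
import numpy as np

x = np.zeros((1, 3, 10, 10), dtype=np.float32)
pads = [1, 0, 2, 1]                  # ONNX order: [top, left, bottom, right]
if pads[0] != pads[2] or pads[1] != pads[3]:
    x = np.pad(x, ((0, 0), (0, 0), (pads[0], pads[2]), (pads[1], pads[3])))
    pads = [0, 0, 0, 0]              # the conv then runs with zero padding
assert x.shape == (1, 3, 13, 11)
```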
@@ -78,6 +152,8 @@ class OnnxStub:
                 s[1],
                 d[0],
                 d[1],
+                op[0],
+                op[1],
             )
         elif node.op_type == "MatMul":
             tensors[node.output[0]] = self.handler.matmul(
@@ -129,7 +205,7 @@ class OnnxStub:
                 {
                     "kernel_shape": None,
                     "dilations": [1, 1],
-                    "pads": [0, 0],
+                    "pads": [0, 0, 0, 0],
                     "strides": [1, 1],
                 },
             )
@@ -137,56 +213,80 @@ class OnnxStub:
                 attributes[name]
                 for name in ["kernel_shape", "dilations", "pads", "strides"]
             )
-            tensors[node.output[0]] = self.handler.maxPool(
-                tensors[node.input[0]],
-                tensors.get(node.output[0]),
-                k[0],
-                k[1],
-                d[0],
-                d[1],
-                p[0],
-                p[1],
-                s[0],
-                s[1],
-            )
+            if p[0] != p[2] or p[1] != p[3]:
+                adapt = "{}-adapt".format(node.output[0])
+                tensors[adapt] = self.handler.pad(
+                    tensors.get(node.input[0]), None, p, [-2, -1]
+                )
+                tensors[node.output[0]] = self.handler.maxPool(
+                    tensors[adapt],
+                    tensors.get(node.output[0]),
+                    k[0],
+                    k[1],
+                    d[0],
+                    d[1],
+                    0,
+                    0,
+                    s[0],
+                    s[1],
+                )
+            else:
+                tensors[node.output[0]] = self.handler.maxPool(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                    k[0],
+                    k[1],
+                    d[0],
+                    d[1],
+                    p[0],
+                    p[1],
+                    s[0],
+                    s[1],
+                )
         elif node.op_type == "AveragePool":
             attributes = _parse_attribute(
                 node,
                 {
                     "kernel_shape": None,
-                    "pads": [0, 0],
+                    "pads": [0, 0, 0, 0],
                     "strides": [1, 1],
                 },
             )
             (k, p, s) = (
                 attributes[name] for name in ["kernel_shape", "pads", "strides"]
             )
-            tensors[node.output[0]] = self.handler.avgPool(
-                tensors[node.input[0]],
-                tensors.get(node.output[0]),
-                k[0],
-                k[1],
-                1,
-                1,
-                p[0],
-                p[1],
-                s[0],
-                s[1],
-            )
+            if p[0] != p[2] or p[1] != p[3]:
+                adapt = "{}-adapt".format(node.output[0])
+                tensors[adapt] = self.handler.pad(
+                    tensors.get(node.input[0]), None, p, [-2, -1]
+                )
+                tensors[node.output[0]] = self.handler.avgPool(
+                    tensors[adapt],
+                    tensors.get(node.output[0]),
+                    k[0],
+                    k[1],
+                    1,
+                    1,
+                    0,
+                    0,
+                    s[0],
+                    s[1],
+                )
+            else:
+                tensors[node.output[0]] = self.handler.avgPool(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                    k[0],
+                    k[1],
+                    1,
+                    1,
+                    p[0],
+                    p[1],
+                    s[0],
+                    s[1],
+                )
         elif node.op_type == "GlobalAveragePool":
-            shape = next(
-                (
-                    value.type.tensor_type.shape
-                    for value in model.graph.value_info
-                    if value.name == node.input[0]
-                ),
-                None,
-            ) or next(
-                input.type.tensor_type.shape
-                for input in model.graph.input
-                if input.name == node.input[0]
-            )
-            [_, _, h, w] = _take_shape_dim(shape)
+            [_, _, h, w] = _search_shape(model, node.input[0])
             tensors[node.output[0]] = self.handler.avgPool(
                 tensors[node.input[0]],
                 tensors.get(node.output[0]),
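GlobalAveragePool carries no attributes, so the importer recovers (h, w) with _search_shape and lowers the node to a regular average pool whose kernel covers the whole feature map; numerically:

```python
import numpy as np

x = np.random.rand(1, 4, 7, 5).astype(np.float32)
y = x.mean(axis=(2, 3), keepdims=True)  # same as avgPool with kernel (7, 5)
assert y.shape == (1, 4, 1, 1)
```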
@@ -248,58 +348,123 @@ class OnnxStub:
             tensors[node.output[0]] = self.handler.softmax(
                 tensors[node.input[0]],
                 tensors.get(node.output[0]),
-                next((attr.i for attr in node.attribute if attr.name == "axis")),
+                next(
+                    (attr.i for attr in node.attribute if attr.name == "axis"), -1
+                ),
             )
         elif node.op_type == "Abs":
             tensors[node.output[0]] = self.handler.abs(
                 tensors[node.input[0]],
                 tensors.get(node.output[0]),
             )
+        elif node.op_type == "Shape":
+            tensors[node.output[0]] = self.handler.shape(
+                tensors[node.input[0]],
+                tensors.get(node.output[0]),
+            )
         elif node.op_type == "Identity":
             tensors[node.output[0]] = self.handler.identity(
                 tensors[node.input[0]],
                 tensors.get(node.output[0]),
             )
         elif node.op_type == "Flatten":
             tensors[node.output[0]] = self.handler.flatten(
                 tensors[node.input[0]],
                 tensors.get(node.output[0]),
                 next((attr.i for attr in node.attribute if attr.name == "axis")),
             )
-        elif node.op_type == "Reshape":
-            input_shape = next(
-                (
-                    value.type.tensor_type.shape
-                    for value in model.graph.value_info
-                    if value.name == node.input[0]
-                ),
-                None,
-            ) or next(
-                input.type.tensor_type.shape
-                for input in model.graph.input
-                if input.name == node.input[0]
-            )
-            dims = _take_shape_dim(input_shape)
+        elif node.op_type == "PRelu":
+            tensors[node.output[0]] = self.handler.pRelu(
+                tensors[node.input[0]],
+                tensors[node.input[1]],
+                tensors.get(node.output[0]),
+            )
+        elif node.op_type == "Clip":
+            tensors[node.output[0]] = self.handler.clip(
+                tensors[node.input[0]],
+                tensors.get(node.output[0]),
+                next(_parse_data(data[node.input[1]]).__iter__(), None)
+                if len(node.input) > 1
+                else None,
+                next(_parse_data(data[node.input[2]]).__iter__(), None)
+                if len(node.input) > 2
+                else None,
+            )
+        elif node.op_type == "Transpose":
+            perm = next(
+                (attr.ints for attr in node.attribute if attr.name == "perm"), None
+            )
+            tensors[node.output[0]] = self.handler.transpose(
+                tensors[node.input[0]],
+                tensors.get(node.output[0]),
+                perm,
+            )
+        elif node.op_type == "Reshape":
+            dims = _search_shape(model, node.input[0])
             size = reduce(lambda acc, x: acc * x, dims)
-            output_shape = [int(i) for i in data[node.input[1]].int64_data]
-            for i, x in enumerate(output_shape):
+            input_shape = _parse_data(data[node.input[1]])
+            for i, x in enumerate(input_shape):
                 if x == 0:
-                    output_shape[i] = dims[i]
-            temp = reduce(lambda acc, x: acc * x, output_shape)
+                    input_shape[i] = dims[i]
+            temp = reduce(lambda acc, x: acc * x, input_shape, 1)
             if temp < 0:
-                output_shape[output_shape.index(-1)] = size // -temp
+                input_shape[input_shape.index(-1)] = size // -temp
+            tensors[node.output[0]] = self.handler.reshape(
+                tensors[node.input[0]],
+                tensors.get(node.output[0]),
+                input_shape,
+            )
+        elif node.op_type == "Squeeze":
+            input_shape = _search_shape(model, node.input[0])
+            axes = set(
+                [int(i) for i in data[node.input[1]].int64_data]
+                if len(node.input) > 1
+                else _parse_attribute(node, {"axes": None})["axes"]
+            )
+            assert all(input_shape[d] == 1 for d in axes)
+            output_shape = []
+            for i, x in enumerate(input_shape):
+                if i not in axes:
+                    output_shape.append(x)
             tensors[node.output[0]] = self.handler.reshape(
                 tensors[node.input[0]],
                 tensors.get(node.output[0]),
                 output_shape,
             )
+        elif node.op_type == "Unsqueeze":
+            input_shape = _search_shape(model, node.input[0])
+            axes = (
+                [int(i) for i in data[node.input[1]].int64_data]
+                if len(node.input) > 1
+                else _parse_attribute(node, {"axes": None})["axes"]
+            )
+            for i in axes:
+                input_shape.insert(i, 1)
+            tensors[node.output[0]] = self.handler.reshape(
+                tensors[node.input[0]],
+                tensors.get(node.output[0]),
+                input_shape,
+            )
         elif node.op_type == "Concat":
             tensors[node.output[0]] = self.handler.concat(
                 [tensors[name] for name in node.input],
                 tensors.get(node.output[0]),
                 next((attr.i for attr in node.attribute if attr.name == "axis")),
             )
+        elif node.op_type == "Split":
+            for name, tensor in zip(
+                node.output,
+                self.handler.split(
+                    tensors[node.input[0]],
+                    None,
+                    next(
+                        (attr.i for attr in node.attribute if attr.name == "axis"),
+                        0,
+                    ),
+                    len(node.output),
+                ),
+            ):
+                tensors[name] = tensor
         elif node.op_type == "Gather":
             tensors[node.output[0]] = self.handler.gather(
                 tensors[node.input[0]],
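The Reshape branch resolves ONNX's special shape values before calling the handler: 0 copies the matching input dimension, and a single -1 is inferred from the leftover size. The same logic as a standalone helper (hypothetical name):

```python
from functools import reduce

def resolve_shape(input_dims, shape):
    size = reduce(lambda acc, x: acc * x, input_dims)
    shape = [input_dims[i] if x == 0 else x for i, x in enumerate(shape)]
    prod = reduce(lambda acc, x: acc * x, shape, 1)
    if prod < 0:                      # exactly one -1 remains
        shape[shape.index(-1)] = size // -prod
    return shape

assert resolve_shape([2, 3, 4], [0, -1]) == [2, 12]
```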
@@ -331,6 +496,22 @@ class OnnxStub:
                 _parse_data(data[node.input[1]]),
                 _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
             )
+        elif node.op_type == "Dropout":
+            for name, tensor in zip(
+                node.output,
+                self.handler.dropout(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                    tensors.get(node.output[1]) if len(node.output) > 1 else None,
+                    _parse_data(data[node.input[1]])[0]
+                    if len(node.input) > 1
+                    else 0.5,
+                    _parse_data(data[node.input[2]])[0]
+                    if len(node.input) > 2
+                    else False,
+                ),
+            ):
+                tensors[name] = tensor
         else:
             raise Exception('Unsupported operator "{}"'.format(node.op_type))

@@ -344,11 +525,11 @@ class OnnxStub:
         else:
             self.initializer[obj.fuid()] = tensor
             if tensor.data_type == TensorProto.INT32:
-                obj.copyin_int32([int(i) for i in tensor.int32_data])
+                obj.copyin_int32(_parse_data(tensor))
             elif tensor.data_type == TensorProto.INT64:
-                obj.copyin_int64([int(i) for i in tensor.int64_data])
+                obj.copyin_int64(_parse_data(tensor))
             elif tensor.data_type == TensorProto.FLOAT:
-                obj.copyin_float([int(i) for i in tensor.float_data])
+                obj.copyin_float(_parse_data(tensor))
             else:
                 assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
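The Dropout import treats ratio and training_mode as optional inputs with the ONNX defaults of 0.5 and False; the lookup pattern, abstracted (names are illustrative, not part of the module):

```python
def optional_scalar(values, index, default):
    # values: already-parsed flat lists for each present input slot, else None.
    if index < len(values) and values[index] is not None:
        return values[index][0]
    return default

ratio = optional_scalar([None, [0.3]], 1, 0.5)   # explicit ratio -> 0.3
training = optional_scalar([None], 2, False)     # slot missing -> False
```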
@@ -398,14 +579,15 @@ class OnnxStub:
             self.count_in += 1
             name = "input{}".format(self.count_in)
             self.names[tensor] = name
-            shape = tensor.shape()
-            dtype = backend.tensor_dtype(tensor)
-            value_info = make_tensor_value_info(name, dtype, shape)
-            check_value_info(value_info)
-            self.inputs.append(value_info)
             if init != None:
                 init.name = name
                 self.initializers.append(init)
+            else:
+                shape = tensor.shape()
+                dtype = backend.tensor_dtype(tensor)
+                value_info = make_tensor_value_info(name, dtype, shape)
+                check_value_info(value_info)
+                self.inputs.append(value_info)
             return name
 
         def push_data_input(
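push_input now emits a graph input only for tensors without an initializer; since ONNX IR version 4 an initializer needs no matching graph input, so weights stay out of the input list. The rule, in outline (a sketch mirroring the code above, not a public API):

```python
def push_input(name, init, inputs, initializers):
    if init is not None:
        init.name = name
        initializers.append(init)    # constant data: initializer only
    else:
        inputs.append(name)          # true runtime input
    return name
```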
@@ -417,11 +599,8 @@ class OnnxStub:
             vals: Any,
         ) -> str:
             name = "{}_{}".format(node_name, attr_name)
-            value_info = make_tensor_value_info(name, elem_type, shape)
             tensor = make_tensor(name, elem_type, shape, vals)
-            check_value_info(value_info)
             check_tensor(tensor)
-            self.inputs.append(value_info)
             self.initializers.append(tensor)
             return name

@@ -459,20 +638,40 @@ class OnnxStub:
                 for (i, it) in enumerate(op.outputs())
             ]
             if ty == backend.OpType.Conv:
-                ph, pw, sh, sw, dh, dw = backend.conv_attrs_of(op)
+                ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
                 ctx.push_node(
                     make_node(
                         ty.name,
                         inputs,
                         outputs,
                         name,
+                        pads=[ph, pw, ph, pw],
+                        strides=[sh, sw],
+                        dilations=[dh, dw],
+                        group=op.inputs()[0].shape()[1] // op.inputs()[1].shape()[1],
+                    )
+                )
+            elif ty == backend.OpType.ConvTrans:
+                ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
+                ctx.push_node(
+                    make_node(
+                        "ConvTranspose",
+                        inputs,
+                        outputs,
+                        name,
                         pads=[ph, pw],
                         strides=[sh, sw],
                         dilations=[dh, dw],
+                        output_padding=[oph, opw],
                     )
                 )
             elif ty == backend.OpType.Matmul:
-                ctx.push_node(make_node("MatMul", inputs, outputs, name))
+                transA, transB = backend.matmul_attrs_of(op)
+                ctx.push_node(
+                    make_node(
+                        "Gemm", inputs, outputs, name, transA=transA, transB=transB
+                    )
+                )
             elif ty == backend.OpType.BatchNorm:
                 inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
                 momentum, eps, training = backend.batch_norm_attrs_of(op)

@@ -496,7 +695,7 @@ class OnnxStub:
                         outputs,
                         name,
                         kernel_shape=[kh, kw],
-                        pads=[ph, pw],
+                        pads=[ph, pw, ph, pw],
                         dilations=[dh, dw],
                         strides=[sh, sw],
                     )

@@ -510,7 +709,7 @@ class OnnxStub:
                         outputs,
                         name,
                         kernel_shape=[kh, kw],
-                        pads=[ph, pw],
+                        pads=[ph, pw, ph, pw],
                         strides=[sh, sw],
                     )
                 )

@@ -526,17 +725,21 @@ class OnnxStub:
                 backend.OpType.Softmax,
                 backend.OpType.Abs,
                 backend.OpType.Identity,
+                backend.OpType.PRelu,
             ]:
                 ctx.push_node(make_node(ty.name, inputs, outputs, name))
             elif ty == backend.OpType.Flatten:
                 raise Exception("TODO")
+            elif ty == backend.OpType.Transpose:
+                perm = backend.transpose_permute_of(op)
+                ctx.push_node(make_node(ty.name, inputs, outputs, name, perm=perm))
             elif ty == backend.OpType.Reshape:
                 shape = backend.reshape_shape_of(op)
                 inputs.append(
                     ctx.push_data_input(
                         name,
                         "shape",
-                        TensorProto.INT32,
+                        TensorProto.INT64,
                         [len(shape)],
                         shape,
                     )

@@ -545,29 +748,81 @@ class OnnxStub:
             elif ty == backend.OpType.Concat:
                 axis = backend.concat_axis_of(op)
                 ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
+            elif ty == backend.OpType.Split:
+                axis = backend.split_axis_of(op)
+                num_outputs = len(outputs)
+                split = op.inputs()[0].shape()[axis] // num_outputs
+                inputs.append(
+                    ctx.push_data_input(
+                        name,
+                        "split",
+                        TensorProto.INT64,
+                        [len(outputs)],
+                        [split for _ in range(0, num_outputs)],
+                    )
+                )
+                ctx.push_node(
+                    make_node(
+                        ty.name,
+                        inputs,
+                        outputs,
+                        name,
+                        axis=axis,
+                    )
+                )
             elif ty == backend.OpType.Gather:
                 axis = backend.gather_axis_of(op)
                 ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
             elif ty == backend.OpType.ReduceMean:
-                axes = backend.reduce_mean_axes_of(op)
+                axes, keepdims = backend.reduce_mean_attrs_of(op)
                 inputs.append(
                     ctx.push_data_input(
-                        name, "axes", TensorProto.INT32, [len(axes)], axes
+                        name, "axes", TensorProto.INT64, [len(axes)], axes
                     )
                 )
-                ctx.push_node(make_node(ty.name, inputs, outputs, name, keepdims=1))
+                ctx.push_node(
+                    make_node(ty.name, inputs, outputs, name, keepdims=keepdims)
+                )
             elif ty == backend.OpType.Slice:
                 raise Exception("TODO")
             elif ty == backend.OpType.Pad:
-                raise Exception("TODO")
+                pads = backend.pad_pads_of(op)
+                inputs.append(
+                    ctx.push_data_input(
+                        name, "pads", TensorProto.INT64, [len(pads)], pads
+                    )
+                )
+                ctx.push_node(make_node(ty.name, inputs, outputs, name))
+            elif ty == backend.OpType.Clip:
+                min, max = backend.clip_attrs_of(op)
+                if min != None:
+                    inputs.append(
+                        ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
+                    )
+                else:
+                    inputs.append(
+                        ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
+                    )
+                if max != None:
+                    inputs.append(
+                        ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
+                    )
+                else:
+                    inputs.append(
+                        ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
+                    )
+                ctx.push_node(make_node(ty.name, inputs, outputs, name))
             else:
-                raise Exception("Unsupported OpType {}".format(ty.name))
+                raise Exception("Unsupported OpType", ty)
 
         return ctx.build(name)
 
     def init(self) -> None:
         self.handler.data_malloc()
 
+    def optimize(self) -> None:
+        self.handler.optimize()
+
     def run(self) -> None:
         self.handler.run()

@@ -576,9 +831,39 @@ def from_onnx(model: ModelProto, runtime):
     stub = OnnxStub(model, runtime)
     return stub.inputs, stub.outputs, stub.handler
 
 
-def run_onnx(model: ModelProto, runtime):
-    stub = OnnxStub(model, runtime)
-    stub.run()
+def _search_shape(model: ModelProto, name: str) -> List[int]:
+    ans = (
+        next(
+            (
+                [
+                    (d.dim_value if d.dim_value > 0 else 1)
+                    for d in tensor.type.tensor_type.shape.dim
+                ]
+                for tensor in model.graph.value_info
+                if tensor.name == name
+            ),
+            None,
+        )
+        or next(
+            (
+                [
+                    (d.dim_value if d.dim_value > 0 else 1)
+                    for d in tensor.type.tensor_type.shape.dim
+                ]
+                for tensor in model.graph.input
+                if tensor.name == name
+            ),
+            None,
+        )
+        or next(
+            [int(d) for d in tensor.dims]
+            for tensor in model.graph.initializer
+            if tensor.name == name
+        )
+    )
+    return ans
 
 
 def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
     for attr in node.attribute:

@@ -598,15 +883,8 @@ def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[st
     return attrs
 
 
-def _parse_data(tensor: TensorProto) -> List[Union[int, float]]:
-    if tensor.data_type == TensorProto.INT32:
-        return [int(i) for i in tensor.int32_data]
-    elif tensor.data_type == TensorProto.INT64:
-        return [int(i) for i in tensor.int64_data]
-    elif tensor.data_type == TensorProto.FLOAT:
-        return [float(i) for i in tensor.float_data]
-    else:
-        assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
+def _parse_data(tensor: TensorProto) -> List[Any]:
+    return to_array(tensor).flatten().tolist()
 
 
 def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
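Routing _parse_data through onnx.numpy_helper.to_array makes it uniform across raw_data and the typed repeated fields; the round trip is easy to verify:

```python
import numpy as np
from onnx import numpy_helper

t = numpy_helper.from_array(np.arange(6, dtype=np.int64).reshape(2, 3), name="t")
flat = numpy_helper.to_array(t).flatten().tolist()  # what _parse_data returns
assert flat == [0, 1, 2, 3, 4, 5]
```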
@@ -7,18 +7,20 @@ from onnx.helper import (
     make_graph,
     make_tensor_value_info,
 )
-from onnx.checker import check_model
-from pyinfinitensor.onnx import from_onnx, backend, runtime, run_onnx
+from onnx.checker import check_model, check_graph
+from onnx.shape_inference import infer_shapes
+from pyinfinitensor.onnx import from_onnx, OnnxStub, backend
 
 
 def make_and_import_model(graph: onnx.GraphProto):
+    check_graph(graph)
     model = make_model(graph)
     check_model(model)
-    from_onnx(model, runtime)
+    from_onnx(model, backend.cpu_runtime())
 
 
 class TestStringMethods(unittest.TestCase):
-    #def test_run(self):
+    # def test_run(self):
     #     model_file = next(
     #         (name for name in os.listdir() if name.endswith(".onnx")), None
     #     )

@@ -31,16 +33,17 @@ class TestStringMethods(unittest.TestCase):
     #     run_onnx(onnx.load(model_file), runtime)
 
     def test_load(self):
-        model_file = next(
-            (name for name in os.listdir() if name.endswith(".onnx")), None
-        )
-        if model_file != None:
-            print(
-                "model: {file}({size:.2f} MiB)".format(
-                    file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
-                )
-            )
-            from_onnx(onnx.load(model_file), runtime)
+        for model_file in os.listdir():
+            if model_file.endswith(".onnx"):
+                print(
+                    "model: {file}({size:.2f} MiB)".format(
+                        file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
+                    )
+                )
+                model = OnnxStub(onnx.load(model_file), backend.cpu_runtime()).to_onnx(
+                    "new"
+                )
+                model = infer_shapes(model)
 
     def test_tensor(self):
         x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 2, 3])

@@ -55,7 +58,7 @@ class TestStringMethods(unittest.TestCase):
             ["i", "w"],
             ["o"],
             "conv",
-            pads=[1, 1],
+            pads=[1, 1, 1, 1],
             strides=[2, 1],
             dilations=[1, 2],
         )

@@ -102,7 +105,7 @@ class TestStringMethods(unittest.TestCase):
             ["y"],
             kernel_shape=[3, 3],
             dilations=[1, 1],
-            pads=[0, 0],
+            pads=[0, 0, 0, 0],
             strides=[2, 2],
             name="maxPool",
         )

@@ -116,7 +119,7 @@ class TestStringMethods(unittest.TestCase):
             ["x"],
             ["y"],
             kernel_shape=[3, 3],
-            pads=[0, 0],
+            pads=[0, 0, 0, 0],
             strides=[2, 2],
             name="avgPool",
         )

@@ -206,7 +209,7 @@ class TestStringMethods(unittest.TestCase):
 
     def test_flatten(self):
         x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
-        y = make_tensor_value_info("y", TensorProto.FLOAT, [1*3, 5 * 7])
+        y = make_tensor_value_info("y", TensorProto.FLOAT, [1 * 3, 5 * 7])
         flatten = make_node("Flatten", ["x"], ["y"], axis=2, name="flatten")
         # make_and_import_model(
         make_graph([flatten], "flatten", [x], [y])

@@ -254,22 +257,19 @@ class TestStringMethods(unittest.TestCase):
 
     def test_slice(self):
         data = make_tensor_value_info("data", TensorProto.UINT32, [10, 64, 162, 162])
-        output = make_tensor_value_info("output", TensorProto.UINT32, [1, 0, 99, 95])
-        starts = make_tensor_value_info("starts", TensorProto.INT64, [4])
-        starts_data = make_tensor("starts", TensorProto.INT64, [4], [2, 10, 1, 5])
-        ends = make_tensor_value_info("ends", TensorProto.INT64, [4])
-        ends_data = make_tensor("ends", TensorProto.INT64, [4], [3, 10, 100, 100])
+        output = make_tensor_value_info("output", TensorProto.UINT32, [1, 1, 99, 95])
+        starts = make_tensor("starts", TensorProto.INT64, [4], [2, 9, 1, 5])
+        ends = make_tensor("ends", TensorProto.INT64, [4], [3, 10, 100, 100])
         slice = make_node("Slice", ["data", "starts", "ends"], ["output"], name="slice")
-        # FIXME: the backend implements axis:[start, end], while ONNX uses axis:[start, end)
-        # make_and_import_model(
-        make_graph(
-            [slice],
-            "slice",
-            [data, starts, ends],
-            [output],
-            [starts_data, ends_data],
-        )
-        # )
+        make_and_import_model(
+            make_graph(
+                [slice],
+                "slice",
+                [data],
+                [output],
+                [starts, ends],
+            )
+        )
 
     def test_pad(self):
         data = make_tensor_value_info("data", TensorProto.UINT32, [1, 64, 162, 162])

@@ -300,10 +300,10 @@ class TestStringMethods(unittest.TestCase):
         graph = make_graph([matmul, add], "lr", [x, a, b], [y])
         model = make_model(graph)
         check_model(model)
-        from_onnx(model, runtime)
+        from_onnx(model, backend.cpu_runtime())
 
     def test_frontend(self):
-        handler = backend.GraphHandler(runtime)
+        handler = backend.GraphHandler(backend.cpu_runtime())
         a = handler.tensor([1, 2, 3], 12)
         b = handler.tensor([1, 2, 3], 12)
         c = handler.tensor([1, 2, 3], 12)
@ -114,6 +114,15 @@ bool GraphObj::topo_sort() {
|
||||||
return this->sorted = true;
|
return this->sorted = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GraphObj::optimize() {
|
||||||
|
for (auto &op : ops) {
|
||||||
|
switch (op->getOpType()) {
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void GraphObj::dataMalloc() {
|
void GraphObj::dataMalloc() {
|
||||||
for (auto &tensor : tensors) {
|
for (auto &tensor : tensors) {
|
||||||
tensor->dataMalloc();
|
tensor->dataMalloc();
|
||||||
|
|
|
@ -11,6 +11,8 @@
|
||||||
#include "operators/reshape.h"
|
#include "operators/reshape.h"
|
||||||
#include "operators/slice.h"
|
#include "operators/slice.h"
|
||||||
#include "operators/softmax.h"
|
#include "operators/softmax.h"
|
||||||
|
#include "operators/split.h"
|
||||||
|
#include "operators/transpose.h"
|
||||||
#include "operators/unary.h"
|
#include "operators/unary.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
@ -35,6 +37,24 @@ Tensor GraphHandlerObj::conv(Tensor input, Tensor weight, Tensor output, int ph,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
|
||||||
|
Tensor output, int ph, int pw, int sh,
|
||||||
|
int sw, int dh, int dw, int oph,
|
||||||
|
int opw) {
|
||||||
|
if (output) {
|
||||||
|
g->addOpWithOutputs<ConvTransposed2dObj>(std::move(input),
|
||||||
|
std::move(weight), output, ph,
|
||||||
|
pw, sh, sw, dh, dw, oph, opw);
|
||||||
|
return output;
|
||||||
|
} else {
|
||||||
|
return g
|
||||||
|
->addOp<ConvTransposed2dObj>(std::move(input), std::move(weight),
|
||||||
|
output, ph, pw, sh, sw, dh, dw, oph,
|
||||||
|
opw)
|
||||||
|
->getOutput();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
|
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
|
||||||
bool transB, Tensor bias, ActType act) {
|
bool transB, Tensor bias, ActType act) {
|
||||||
if (y) {
|
if (y) {
|
||||||
|
@ -128,9 +148,31 @@ DEFINE_UNARY_METHOD(relu, Relu)
|
||||||
DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
|
DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
|
||||||
DEFINE_UNARY_METHOD(tanh, Tanh)
|
DEFINE_UNARY_METHOD(tanh, Tanh)
|
||||||
DEFINE_UNARY_METHOD(abs, Abs)
|
DEFINE_UNARY_METHOD(abs, Abs)
|
||||||
|
DEFINE_UNARY_METHOD(shape, Shape)
|
||||||
|
|
||||||
// see operators/reshape.h
|
// see operators/reshape.h
|
||||||
DEFINE_UNARY_METHOD(identity, Identity)
|
DEFINE_UNARY_METHOD(identity, Identity)
|
||||||
|
|
||||||
|
Tensor GraphHandlerObj::pRelu(Tensor x, Tensor slope, Tensor y) {
|
||||||
|
if (y) {
|
||||||
|
g->addOpWithOutputs<PReluObj>(std::move(x), std::move(slope), y);
|
||||||
|
return y;
|
||||||
|
} else {
|
||||||
|
return g->addOp<PReluObj>(std::move(x), std::move(slope), y)
|
||||||
|
->getOutput();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Tensor GraphHandlerObj::clip(Tensor x, Tensor y, std::optional<float> min,
|
||||||
|
std::optional<float> max) {
|
||||||
|
if (y) {
|
||||||
|
g->addOpWithOutputs<ClipObj>(std::move(x), y, min, max);
|
||||||
|
return y;
|
||||||
|
} else {
|
||||||
|
return g->addOp<ClipObj>(std::move(x), y, min, max)->getOutput();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) {
|
Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) {
|
||||||
if (output) {
|
if (output) {
|
||||||
g->addOpWithOutputs<SoftmaxObj>(std::move(input), output, axis);
|
g->addOpWithOutputs<SoftmaxObj>(std::move(input), output, axis);
|
||||||
|
@ -151,6 +193,16 @@ Tensor GraphHandlerObj::flatten(Tensor input, Tensor output, int axis) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Tensor GraphHandlerObj::transpose(Tensor data, Tensor transposed, Shape perm) {
|
||||||
|
if (transposed) {
|
||||||
|
g->addOpWithOutputs<TransposeObj>(std::move(data), transposed, perm);
|
||||||
|
return transposed;
|
||||||
|
} else {
|
||||||
|
return g->addOp<TransposeObj>(std::move(data), transposed, perm)
|
||||||
|
->getOutput();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) {
|
Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) {
|
||||||
if (reshaped) {
|
if (reshaped) {
|
||||||
g->addOpWithOutputs<ReshapeObj>(std::move(data), reshaped,
|
g->addOpWithOutputs<ReshapeObj>(std::move(data), reshaped,
|
||||||
|
@ -171,6 +223,18 @@ Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
|
||||||
|
int axis, int num_outputs) {
|
||||||
|
if (outputs) {
|
||||||
|
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
|
||||||
|
num_outputs);
|
||||||
|
return *outputs;
|
||||||
|
} else {
|
||||||
|
return g->addOp<SplitObj>(std::move(input), outputs, axis, num_outputs)
|
||||||
|
->getOutputs();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Tensor GraphHandlerObj::gather(Tensor data, Tensor indices, Tensor output,
|
Tensor GraphHandlerObj::gather(Tensor data, Tensor indices, Tensor output,
|
||||||
int axis) {
|
int axis) {
|
||||||
if (output) {
|
if (output) {
|
||||||
|
|
|
@@ -64,79 +64,24 @@ vector<size_t> TensorObj::getStride() const {

void TensorObj::printData() const {
    IT_ASSERT(data != nullptr);
-    void *ptr = nullptr;
-    Blob buffer;
-    if (!runtime->isCpu()) {
-        buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
-        runtime->copyBlobToCPU(buffer->getPtr<void *>(),
-                               getRawDataPtr<void *>(), getBytes());
-        ptr = buffer->getPtr<void *>();
-    } else
-        ptr = data->getPtr<float *>();
-    if (dtype == DataType::Float32)
-        printDataFloat(static_cast<float *>(ptr));
-    else if (dtype == DataType::UInt32)
-        printDataUint32_t(static_cast<uint32_t *>(ptr));
-    else
-        IT_TODO_HALT();
-}
-
-void TensorObj::printDataFloat(float *ptr) const {
-    std::cout << "Tensor: " << guid << std::endl;
-    auto numDims = shape.size();
-    auto dimSzVec = std::vector<int>(numDims, 1);
-    dimSzVec[numDims - 1] = shape[numDims - 1];
-    for (int i = numDims - 1; i != 0; --i)
-        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
-    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
-        if (iEnd > 1000 && i > 20 && i < iEnd - 20) {
-            printf("... , ");
-            i = iEnd - 20;
-            continue;
-        }
-        for (size_t j = 0; j < numDims; ++j) {
-            if (i % dimSzVec[j] == 0) {
-                std::cout << "[";
-            }
-        }
-        printf("%.1f", ptr[i]);
-        for (size_t j = 0; j < numDims; ++j) {
-            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
-                std::cout << "]";
-            }
-        }
-        if (i != size() - 1)
-            std::cout << ", ";
-        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
-            std::cout << std::endl;
-    }
-}
-
-void TensorObj::printDataUint32_t(uint32_t *ptr) const {
-    IT_ASSERT(data != nullptr);
-    std::cout << "Tensor: " << guid << std::endl;
-    auto numDims = shape.size();
-    auto dimSzVec = std::vector<int>(numDims, 1);
-    dimSzVec[numDims - 1] = shape[numDims - 1];
-    for (int i = numDims - 1; i != 0; --i)
-        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
-    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
-        for (size_t j = 0; j < numDims; ++j) {
-            if (i % dimSzVec[j] == 0) {
-                std::cout << "[";
-            }
-        }
-        std::cout << ptr[i];
-        for (size_t j = 0; j < numDims; ++j) {
-            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
-                std::cout << "]";
-            }
-        }
-        if (i != size() - 1)
-            std::cout << ", ";
-        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
-            std::cout << std::endl;
-    }
-}
+    if (!runtime->isCpu())
+        IT_TODO_HALT();
+
+#define TRY_PRINT(N)                                                           \
+    if (dtype == DataType(N))                                                  \
+        std::cout << dataToString<DT<N>::t>() << std::endl;
+
+    TRY_PRINT(0) // fmt: new line
+    else TRY_PRINT(1) //
+    else TRY_PRINT(2) //
+    else TRY_PRINT(3) //
+    else TRY_PRINT(4) //
+    else TRY_PRINT(5) //
+    else TRY_PRINT(6) //
+    else TRY_PRINT(7) //
+    else IT_TODO_HALT();
+
+#undef TRY_PRINT
}

bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
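TRY_PRINT dispatches on a numeric dtype code: DataType(N) rebuilds the N-th dtype tag and DT<N>::t names the matching C++ type, so one macro body covers every supported dtype. A self-contained sketch of the same code-to-type dispatch (names here are illustrative, not the project's):

#include <cstdint>
#include <iostream>

// Illustrative stand-in for the DT<N> code-to-type table.
template <int N> struct TypeOf;
template <> struct TypeOf<0> { using t = float; };
template <> struct TypeOf<1> { using t = std::uint32_t; };

template <int N> void printAs(const void *p) {
    // Recover the static type from the compile-time code and print.
    std::cout << *static_cast<const typename TypeOf<N>::t *>(p) << '\n';
}

int main() {
    float x = 2.5f;
    int dtypeCode = 0; // runtime tag, as stored on the tensor
    if (dtypeCode == 0)
        printAs<0>(&x); // the macro chain in the diff plays this role
    else if (dtypeCode == 1)
        printAs<1>(&x);
}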
@@ -147,19 +92,27 @@ bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
    IT_ASSERT(rhs->getRuntime()->isCpu());
    if (size() != rhs->size())
        return false;
-    if (getDType() == DataType::UInt32)
-        return equalDataImpl(getRawDataPtr<uint32_t *>(),
-                             rhs->getRawDataPtr<uint32_t *>(), size(), 0);
-    else if (getDType() == DataType::Float32)
-        return equalDataImpl(getRawDataPtr<float *>(),
-                             rhs->getRawDataPtr<float *>(), size(),
-                             relativeError);
-    else
-        IT_TODO_HALT();
+
+#define TEST_EQUAL(N)                                                          \
+    if (dtype == DataType(N))                                                  \
+        return equalDataImpl(getRawDataPtr<DT<N>::t *>(),                      \
+                             rhs->getRawDataPtr<DT<N>::t *>(), size());
+
+    TEST_EQUAL(0) // fmt: new line
+    else TEST_EQUAL(1) //
+    else TEST_EQUAL(2) //
+    else TEST_EQUAL(3) //
+    else TEST_EQUAL(4) //
+    else TEST_EQUAL(5) //
+    else TEST_EQUAL(6) //
+    else TEST_EQUAL(7) //
+    else IT_TODO_HALT();
+
+#undef TEST_EQUAL
}

void TensorObj::dataMalloc() {
-    if (data == nullptr)
+    if (!data)
        data = runtime->allocBlob(getBytes());
}
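Note that the macro version no longer forwards relativeError: the old Float32 branch passed it through, while TEST_EQUAL calls equalDataImpl with only the pointers and element count. That only round-trips if equalDataImpl defaults its tolerance, something like the hedged sketch below (a guess at the shape of the helper, not the actual declaration):

#include <cmath>
#include <cstddef>
#include <type_traits>

// Hypothetical shape of the comparison helper this macro relies on; the
// default tolerance argument is an assumption, not taken from this PR.
template <typename T>
bool equalDataImpl(const T *a, const T *b, size_t n,
                   double relativeError = 1e-6) {
    for (size_t i = 0; i < n; ++i) {
        if constexpr (std::is_integral_v<T>) {
            if (a[i] != b[i]) // exact match for integer dtypes
                return false;
        } else {
            // relative-error comparison for floating-point dtypes
            if (std::fabs(a[i] - b[i]) >
                std::fabs(a[i]) * relativeError + 1e-12)
                return false;
        }
    }
    return true;
}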
@@ -201,9 +154,9 @@ Shape TensorObj::getPosByOffset(size_t offset, Shape dim) const {
size_t TensorObj::getOffsetByPos(Shape pos, Shape dim) const {
    int n = dim.size();
    size_t offset = pos.at(0);
-    for (auto i = 1; i < n; i++) {
+    for (auto i = 1; i < n; i++)
        offset = offset * dim.at(i) + pos.at(i);
-    }
    return offset;
}

@@ -213,10 +166,10 @@ size_t TensorObj::getOffsetByBroadcastOffset(size_t bcOffset,
    Shape pos = bcPos;
    int n = shape.size();
-    for (auto i = 0; i < n; i++) {
+    for (auto i = 0; i < n; i++)
        if (shape.at(i) == 1)
            pos[i] = 0;
-    }
    return getOffsetByPos(pos, shape);
}
}; // namespace infini
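getOffsetByPos is plain row-major linearization, offset = ((p0·d1 + p1)·d2 + p2)·…, and the broadcast helper simply zeroes coordinates on size-1 axes before reusing it. A worked instance (values chosen for illustration):

#include <cassert>
#include <vector>

// Row-major linearization, matching getOffsetByPos above.
size_t offsetOf(const std::vector<int> &pos, const std::vector<int> &dim) {
    size_t offset = pos.at(0);
    for (size_t i = 1; i < dim.size(); i++)
        offset = offset * dim.at(i) + pos.at(i);
    return offset;
}

int main() {
    // pos {1, 2, 3} in dims {4, 5, 6}: (1 * 5 + 2) * 6 + 3 = 45
    assert(offsetOf({1, 2, 3}, {4, 5, 6}) == 45);
    // With broadcast shape {4, 1, 6}, pos {1, 2, 3} first maps to {1, 0, 3}.
    assert(offsetOf({1, 0, 3}, {4, 1, 6}) == 9);
}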
@@ -1,6 +1,7 @@
#include "cuda/cuda_runtime.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
+#include "core/runtime.h"
#include "operators/conv.h"
#include "operators/matmul.h"
namespace infini {

@@ -16,10 +17,11 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);
        // IT_ASSERT(perfData, "No perf data for OP " + op->toString());
-        if (perfData)
+        if (perfData) {
            kernel->compute(op, perfData, this);
-        else
+        } else {
            kernel->compute(op, this);
+        }
    }
}

@@ -73,4 +75,4 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }

string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }

} // namespace infini
@@ -3,15 +3,24 @@
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/gather.h"
+#include "operators/matmul.h"
+#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
#include "operators/reshape.h"
+#include "operators/split.h"
+#include "operators/transpose.h"
+#include "operators/unary.h"
+#include <algorithm>
#include <pybind11/stl.h>

#ifdef USE_CUDA
#include "cuda/cuda_runtime.h"
#include "cuda/operator_timer.h"
#endif
+#ifdef USE_BANG
+#include "bang/bang_runtime.h"
+#endif
#ifdef USE_INTELCPU
#include "intelcpu/mkl_runtime.h"
#include "intelcpu/operator_timer.h"

@@ -57,6 +66,7 @@ void export_values(py::module &m) {
        .VALUE(OpType, G2BMM)
        .VALUE(OpType, GBMM)
        .VALUE(OpType, Pad)
+        .VALUE(OpType, Clip)
        .VALUE(OpType, Slice)
        .VALUE(OpType, Concat)
        .VALUE(OpType, Split)

@@ -78,11 +88,12 @@ void export_values(py::module &m) {
        .VALUE(OpType, Softmax)
        .VALUE(OpType, Activation)
        .VALUE(OpType, Relu)
+        .VALUE(OpType, PRelu)
        .VALUE(OpType, Sigmoid)
        .VALUE(OpType, Tanh)
        .VALUE(OpType, Abs)
        .VALUE(OpType, Resize)
-        .VALUE(OpType, MemBound)
+        .VALUE(OpType, Dropout)
        .export_values();

#undef VALUE
@@ -112,6 +123,10 @@ static int tensor_dtype(Tensor t) {
static Ref<CudaRuntimeObj> cuda_runtime() { return make_ref<CudaRuntimeObj>(); }
#endif

+#ifdef USE_BANG
+static Ref<BangRuntimeObj> bang_runtime() { return make_ref<BangRuntimeObj>(); }
+#endif
+
#ifdef USE_INTELCPU
static Ref<RuntimeObj> intelcpu_runtime() { return make_ref<MklRuntimeObj>(); }
#endif

@@ -123,11 +138,27 @@ static std::tuple<int, int, int, int, int, int> conv_attrs_of(Operator op) {
                           conv->getDw(), conv->getSh(), conv->getSw());
}

+static std::tuple<int, int, int, int, int, int, int, int>
+conv_trans_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::ConvTrans);
+    auto conv = dynamic_cast<const ConvTransposed2dObj *>(op.get());
+    auto [oph, opw] = conv->getOutputPadding();
+    return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
+                           conv->getDw(), conv->getSh(), conv->getSw(), oph,
+                           opw);
+}
+
+static std::tuple<bool, bool> matmul_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Matmul);
+    auto matmul = dynamic_cast<const MatmulObj *>(op.get());
+    return std::make_tuple(matmul->getTransA(), matmul->getTransB());
+}
+
static std::tuple<float, float, bool> batch_norm_attrs_of(Operator op) {
    IT_ASSERT(op->getOpType() == OpType::BatchNorm);
    auto batchnorm = dynamic_cast<const BatchNormObj *>(op.get());
    return std::make_tuple(batchnorm->getMomentum(), batchnorm->getEps(),
-                           batchnorm->getTraining());
+                           batchnorm->getTrainingMode());
}

static std::tuple<int, int, int, int, int, int, int, int>
@@ -140,45 +171,88 @@ pool_attrs_of(Operator op) {
                           pool->getSh(), pool->getSw());
}

+static std::tuple<std::optional<float>, std::optional<float>>
+clip_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Clip);
+    auto clip = dynamic_cast<const ClipObj *>(op.get());
+    return std::make_tuple(clip->getMin(), clip->getMax());
+}
+
+static std::tuple<vector<int>, bool> reduce_mean_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::ReduceMean);
+    auto reduce_mean = dynamic_cast<const ReduceMeanObj *>(op.get());
+    auto &set = reduce_mean->getAxes();
+    return std::make_tuple(vector(set.begin(), set.end()),
+                           reduce_mean->getKeepDims());
+}
+
static int concat_axis_of(Operator op) {
    IT_ASSERT(op->getOpType() == OpType::Concat);
    return dynamic_cast<const ConcatObj *>(op.get())->getDim();
}

+static int split_axis_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Split);
+    return dynamic_cast<const SplitObj *>(op.get())->getDim();
+}
+
static int gather_axis_of(Operator op) {
    IT_ASSERT(op->getOpType() == OpType::Gather);
    return dynamic_cast<const GatherObj *>(op.get())->getAxis();
}

-static vector<int> reduce_mean_axes_of(Operator op) {
-    IT_ASSERT(op->getOpType() == OpType::ReduceMean);
-    auto &set = dynamic_cast<const ReduceMeanObj *>(op.get())->getAxes();
-    return vector(set.begin(), set.end());
-}
-
-static Shape reshape_shape_of(Operator op) {
-    IT_ASSERT(op->getOpType() == OpType::Reshape);
-    return dynamic_cast<const ReshapeObj *>(op.get())->getShape();
-}
+static vector<int64_t> reshape_shape_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Reshape);
+    auto shape = dynamic_cast<const ReshapeObj *>(op.get())->getShape();
+    vector<int64_t> ans(shape.size());
+    std::transform(shape.begin(), shape.end(), ans.begin(),
+                   [](auto x) { return static_cast<int64_t>(x); });
+    return ans;
+}
+
+static vector<int64_t> pad_pads_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Pad);
+    auto shape = dynamic_cast<const PadObj *>(op.get())->getPads();
+    vector<int64_t> ans(shape.size());
+    std::transform(shape.begin(), shape.end(), ans.begin(),
+                   [](auto x) { return static_cast<int64_t>(x); });
+    return ans;
+}
+
+static vector<int> transpose_permute_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Transpose);
+    return dynamic_cast<const TransposeObj *>(op.get())->getPermute();
+}

void export_functions(py::module &m) {
#define FUNCTION(NAME) def(#NAME, &NAME)
-#ifdef USE_CUDA
-    m.def("runtime", cuda_runtime)
-#elif USE_INTELCPU
-    m.def("runtime", intelcpu_runtime)
-#else
-    m.def("runtime", &NativeCpuRuntimeObj::getInstance)
-#endif
+    m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
+#ifdef USE_CUDA
+        .def("cuda_runtime", cuda_runtime)
+#endif
+#ifdef USE_INTELCPU
+        .def("intelcpu_runtime", intelcpu_runtime)
+#endif
+#ifdef USE_CUDA
+        .FUNCTION(cuda_runtime)
+#endif
+#ifdef USE_BANG
+        .FUNCTION(bang_runtime)
+#endif
        .FUNCTION(conv_attrs_of)
+        .FUNCTION(conv_trans_attrs_of)
+        .FUNCTION(matmul_attrs_of)
        .FUNCTION(batch_norm_attrs_of)
        .FUNCTION(pool_attrs_of)
+        .FUNCTION(clip_attrs_of)
+        .FUNCTION(reduce_mean_attrs_of)
        .FUNCTION(tensor_dtype)
        .FUNCTION(reshape_shape_of)
+        .FUNCTION(pad_pads_of)
+        .FUNCTION(transpose_permute_of)
        .FUNCTION(concat_axis_of)
-        .FUNCTION(gather_axis_of)
-        .FUNCTION(reduce_mean_axes_of);
+        .FUNCTION(split_axis_of)
+        .FUNCTION(gather_axis_of);
#undef FUNCTION
}
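The FUNCTION macro is just pybind11 method-chaining sugar: each entry expands to a def call whose Python name is the stringified C++ symbol. A minimal standalone module showing the same expansion (module and function names here are illustrative):

#include <pybind11/pybind11.h>
namespace py = pybind11;

static int answer() { return 42; }

PYBIND11_MODULE(demo, m) {
#define FUNCTION(NAME) def(#NAME, &NAME)
    // Expands to m.def("answer", &answer): the Python name is the
    // stringified C++ identifier, exactly as in export_functions above.
    m.FUNCTION(answer);
#undef FUNCTION
}

With pybind11/stl.h included, the std::tuple, std::optional and std::vector return types of the attrs-of helpers convert to Python tuples, None-or-float and lists automatically.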
@@ -191,6 +265,10 @@ void init_graph_builder(py::module &m) {
#ifdef USE_CUDA
    py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
        m, "CudaRuntime");
+#endif
+#ifdef USE_BANG
+    py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
+        m, "BangRuntime");
#endif
    py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
        .def("fuid", &TensorObj::getFuid, policy::automatic)

@@ -215,6 +293,7 @@ void init_graph_builder(py::module &m) {
        .def(py::init<Runtime>())
        .def("tensor", &Handler::tensor, policy::move)
        .def("conv", &Handler::conv, policy::move)
+        .def("convTransposed2d", &Handler::convTransposed2d, policy::move)
        .def("matmul", &Handler::matmul, policy::move)
        .def("batchNorm", &Handler::batchNorm, policy::move)
        .def("maxPool", &Handler::maxPool, policy::move)

@@ -229,15 +308,21 @@ void init_graph_builder(py::module &m) {
        .def("tanh", &Handler::tanh, policy::move)
        .def("softmax", &Handler::softmax, policy::move)
        .def("abs", &Handler::abs, policy::move)
+        .def("shape", &Handler::shape, policy::move)
        .def("identity", &Handler::identity, policy::move)
        .def("flatten", &Handler::flatten, policy::move)
+        .def("pRelu", &Handler::pRelu, policy::move)
+        .def("clip", &Handler::clip, policy::move)
+        .def("transpose", &Handler::transpose, policy::move)
        .def("reshape", &Handler::reshape, policy::move)
        .def("concat", &Handler::concat, policy::move)
+        .def("split", &Handler::split, policy::move)
        .def("gather", &Handler::gather, policy::move)
        .def("reduce_mean", &Handler::reduceMean, policy::move)
        .def("slice", &Handler::slice, policy::move)
        .def("pad", &Handler::pad, policy::move)
        .def("topo_sort", &Handler::topo_sort, policy::automatic)
+        .def("optimize", &Handler::optimize, policy::automatic)
        .def("operators", &Handler::operators, policy::move)
        .def("data_malloc", &Handler::data_malloc, policy::automatic)
        .def("run", &Handler::run, policy::automatic);
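Here policy is presumably an alias for py::return_value_policy and Handler for GraphHandlerObj (assumptions from context; both aliases sit outside this hunk). Methods handing back a Tensor use move, while topo_sort/optimize/data_malloc/run return nothing or a primitive and use automatic. A minimal binding exercising the same two policies (types and module name are illustrative):

#include <pybind11/pybind11.h>
namespace py = pybind11;
using policy = py::return_value_policy;

struct Box {
    int v = 0;
    Box take() { // returned by value: move it out to Python
        Box b;
        b.v = v + 1;
        return b;
    }
    int size() const { return v; } // primitive: automatic is fine
};

PYBIND11_MODULE(demo_policy, m) {
    py::class_<Box>(m, "Box")
        .def(py::init<>())
        .def("take", &Box::take, policy::move)
        .def("size", &Box::size, policy::automatic);
}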
@@ -0,0 +1,208 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class UnaryCnnl : public BangKernelWithoutConfig {
    virtual cnnlActivationMode_t getOpType() const = 0;
    virtual float getCoef() const = 0;
    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        int len = dim.size();
        int size = 1;
        for (int i = 0; i < len; ++i) {
            size *= dim[i];
        }

        int dim_array[1] = {size};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
                                               CNNL_DTYPE_FLOAT, 1, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
                                               CNNL_DTYPE_FLOAT, 1, dim_array));

        // get op descriptor
        cnnlActivationDescriptor_t opDesc;
        checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
        checkCnnlError(cnnlSetActivationDescriptor(
            opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));

        auto [alpha, beta] = getAlphBeta();
        cnnlStatus_t stat =
            cnnlActivationForward(context->cnnlHandle(), opDesc, &alpha, aDesc,
                                  aData, &beta, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
    }
};

class RoundCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class SquareCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlSquare(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class PReluCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PReluObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        int alpha_array[4] = {1, 1, 1, 1};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, alpha_array));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
                                      bDesc, bData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class ReluCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_RELU;
    }
    float getCoef() const override { return 0.0; }
};

class SigmoidCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_SIGMOID;
    }
    float getCoef() const override { return 0.0; }
};

class TanhCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_TANH;
    }
    float getCoef() const override { return 0.0; }
};

REGISTER_KERNEL(Device::BANG, OpType::Relu, DataType::Float32, ReluCnnl,
                "Relu_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::PRelu, DataType::Float32, PReluCnnl,
                "PRelu_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl,
                "Sigmoid_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Tanh, DataType::Float32, TanhCnnl,
                "Tanh_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl,
                "Round_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Square, DataType::Float32, SquareCnnl,
                "Square_cnnl_BANG_Float32");

}; // namespace infini
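One sharp edge in all of these compute bodies: the early return on a failed CNNL status skips the descriptor destruction below it, leaking the descriptors that were already created. An RAII guard would make the cleanup unconditional; a hedged sketch meant to sit in this file (the helper is not part of the PR):

// Hypothetical RAII wrapper; create/destroy calls mirror the CNNL
// descriptor pattern used in the kernels above.
struct TensorDescGuard {
    cnnlTensorDescriptor_t desc{};
    TensorDescGuard() { checkCnnlError(cnnlCreateTensorDescriptor(&desc)); }
    ~TensorDescGuard() { cnnlDestroyTensorDescriptor(desc); }
    TensorDescGuard(const TensorDescGuard &) = delete;
    TensorDescGuard &operator=(const TensorDescGuard &) = delete;
};
// Declared at the top of compute(), the descriptor would be destroyed on
// every exit path, including the early return after a failed CNNL call.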
@@ -0,0 +1,94 @@
#include "operators/activation_backward.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class ActivationBackwardCnnl : public BangKernelWithoutConfig {
    virtual cnnlActivationMode_t getOpType() const = 0;
    virtual float getCoef() const = 0;
    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ActivationBackwardObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const yData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const diffYData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const xData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const diffXData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t yDesc, diffYDesc, xDesc, diffXDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
        checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
        checkCnnlError(cnnlSetTensorDescriptor(diffYDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
        checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
        checkCnnlError(cnnlSetTensorDescriptor(diffXDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlActivationDescriptor_t opDesc;
        checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
        checkCnnlError(cnnlSetActivationDescriptor(
            opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));

        auto [alpha, beta] = getAlphBeta();
        cnnlStatus_t stat = cnnlActivationBackward(
            context->cnnlHandle(), opDesc, &alpha, yDesc, yData, diffYDesc,
            diffYData, xDesc, xData, &beta, diffXDesc, diffXData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(yDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(diffYDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(xDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(diffXDesc));
        checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
    }
};

class ReluBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_RELU;
    }
    float getCoef() const override { return 0.0; }
};

class SigmoidBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_SIGMOID;
    }
    float getCoef() const override { return 0.0; }
};

class TanhBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_TANH;
    }
    float getCoef() const override { return 0.0; }
};

REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, DataType::Float32,
                ReluBackwardCnnl, "ReluBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, DataType::Float32,
                SigmoidBackwardCnnl, "SigmoidBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, DataType::Float32,
                TanhBackwardCnnl, "TanhBackward_cnnl_BANG_Float32");

}; // namespace infini
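For reference, the gradients these backward kernels hand to CNNL (output y, input x, upstream gradient dy) are the textbook ones; stated here as standard results, not taken from this PR:

// Standard activation gradients:
//   relu:    dx = (x > 0) ? dy : 0
//   sigmoid: dx = dy * y * (1 - y)
//   tanh:    dx = dy * (1 - y * y)
inline float reluBackwardRef(float x, float dy) { return x > 0.f ? dy : 0.f; }
inline float sigmoidBackwardRef(float y, float dy) { return dy * y * (1.f - y); }
inline float tanhBackwardRef(float y, float dy) { return dy * (1.f - y * y); }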
@@ -0,0 +1,185 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CastCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<CastObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        cnnlCastDataType_t NlCastType;
        CastObj::CastType type = op->getType();
        switch (type) {
        case CastObj::Float2Int64:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
            NlCastType = CNNL_CAST_FLOAT_TO_INT64;
            break;
        case CastObj::Float2Int32:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            NlCastType = CNNL_CAST_FLOAT_TO_INT32;
            break;
        case CastObj::Float2Int16:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
            NlCastType = CNNL_CAST_FLOAT_TO_INT16;
            break;
        case CastObj::Float2Int8:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
            NlCastType = CNNL_CAST_FLOAT_TO_INT8;
            break;
        case CastObj::Int322Float:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            NlCastType = CNNL_CAST_INT32_TO_FLOAT;
            break;
        case CastObj::Int322Int8:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
            NlCastType = CNNL_CAST_INT32_TO_INT8;
            break;
        case CastObj::Int322Int16:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
            NlCastType = CNNL_CAST_INT32_TO_INT16;
            break;
        case CastObj::Int162Float:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            NlCastType = CNNL_CAST_INT16_TO_FLOAT;
            break;
        case CastObj::Int162Int32:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            NlCastType = CNNL_CAST_INT16_TO_INT32;
            break;
        case CastObj::Int82Float:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            NlCastType = CNNL_CAST_INT8_TO_FLOAT;
            break;
        case CastObj::Int82Int16:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
            NlCastType = CNNL_CAST_INT8_TO_INT16;
            break;
        case CastObj::Int82Int32:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            NlCastType = CNNL_CAST_INT8_TO_INT32;
            break;
        case CastObj::Uint82Float:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            NlCastType = CNNL_CAST_UINT8_TO_FLOAT;
            break;
        case CastObj::Uint82Int32:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            NlCastType = CNNL_CAST_UINT8_TO_INT32;
            break;
        case CastObj::Uint82Int64:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
            NlCastType = CNNL_CAST_UINT8_TO_INT64;
            break;
        case CastObj::Int322Int64:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
            NlCastType = CNNL_CAST_INT32_TO_INT64;
            break;
        case CastObj::Int642Int32:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            NlCastType = CNNL_CAST_INT64_TO_INT32;
            break;
        case CastObj::Int642Uint32:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
            NlCastType = CNNL_CAST_INT64_TO_UINT32;
            break;
        case CastObj::Int642Float:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            NlCastType = CNNL_CAST_INT64_TO_FLOAT;
            break;
        case CastObj::Uint322Int64:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
            NlCastType = CNNL_CAST_UINT32_TO_INT64;
            break;
        default:
            IT_TODO_HALT();
        }
        cnnlStatus_t stat = cnnlCastDataType(context->cnnlHandle(), aDesc,
                                             aData, NlCastType, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl,
                "Cast_cnnl_BANG_Float32");

}; // namespace infini
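Every case in this switch carries the same three facts (source dtype, destination dtype, CNNL cast enum), so a lookup table would shrink the twenty-odd cases to one line of dispatch. A self-contained sketch of the idea with stand-in enums (illustrative names, not the project's):

#include <map>
#include <stdexcept>

// Stand-ins for CastObj::CastType, CNNL dtypes and cnnlCastDataType_t.
enum class CastKind { Float2Int32, Int322Float };
enum class Dtype { Float, Int32 };

struct CastInfo {
    Dtype src, dst;
    int cnnlCastEnum; // would hold e.g. CNNL_CAST_FLOAT_TO_INT32
};

static const std::map<CastKind, CastInfo> castTable = {
    {CastKind::Float2Int32, {Dtype::Float, Dtype::Int32, 0}},
    {CastKind::Int322Float, {Dtype::Int32, Dtype::Float, 1}},
};

// One lookup replaces the whole switch: set aDesc from info.src,
// cDesc from info.dst, and pass info.cnnlCastEnum to the cast call.
CastInfo lookupCast(CastKind k) {
    auto it = castTable.find(k);
    if (it == castTable.end())
        throw std::runtime_error("unsupported cast"); // IT_TODO_HALT analogue
    return it->second;
}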
@@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CeilCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Ceil, DataType::Float32, CeilCnnl,
                "Ceil_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,42 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ClipCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ClipObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float min = op->getMin().value();
        float max = op->getMax().value();

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Clip, DataType::Float32, ClipCnnl,
                "Clip_cnnl_BANG_Float32");

}; // namespace infini
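Since the graph handler's clip takes std::optional<float> bounds, the .value() calls above throw std::bad_optional_access whenever an ONNX Clip omits a bound. A defensive variant would substitute the float limits; a hedged sketch of the substitution:

#include <limits>
// Substitute open bounds instead of asserting both are present:
float min = op->getMin().value_or(std::numeric_limits<float>::lowest());
float max = op->getMax().value_or(std::numeric_limits<float>::max());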
@@ -0,0 +1,68 @@
#include "operators/concat.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class ConcatCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConcatObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numInputs();
        int axis = op->getDim();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
        }
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;

        int dim_array[num][4];
        for (int i = 0; i < num; ++i) {
            auto dim = op->getInputs(i)->getDims();
            if (dim.size() != 4) {
                IT_TODO_HALT();
            }
            dim_array[i][0] = dim[0];
            dim_array[i][1] = dim[1];
            dim_array[i][2] = dim[2];
            dim_array[i][3] = dim[3];
        }

        auto dim = op->getOutput()->getDims();
        int dimout_array[4] = {dim[0], dim[1], dim[2], dim[3]};

        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(
            desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimout_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dim_array[i]));
        }

        size_t wsSize;
        cnnlGetConcatWorkspaceSize(context->cnnlHandle(), num, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlConcat(context->cnnlHandle(), num, axis, descArray, argv,
                       wsData, wsSize, desc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Concat, DataType::Float32, ConcatCnnl,
                "Concat_cnnl_BANG_Float32");
}; // namespace infini
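void *argv[num], int dim_array[num][4] and cnnlTensorDescriptor_t descArray[num] above are variable-length arrays, a GCC/Clang extension rather than standard C++. std::vector covers the same use portably; a hedged sketch of the substitution:

#include <array>
#include <vector>

// Portable stand-in for the VLA-based buffers above (sketch only; the CNNL
// calls would receive descArray.data() / argv.data() instead of raw arrays,
// relying on vector's contiguous-storage guarantee).
struct ConcatBuffers {
    std::vector<void *> argv;
    std::vector<std::array<int, 4>> dims;
    explicit ConcatBuffers(int num) : argv(num), dims(num) {}
};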
@@ -0,0 +1,88 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/conv.h"

namespace infini {
class ConvTransCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvBaseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        const int cpg = op->getChannelPerGroup();
        const int g = c / cpg;

        int pad[4] = {ph, ph, pw, pw};
        int stride[2] = {sh, sw};
        int dilation[2] = {dh, dw};

        cnnlConvolutionDescriptor_t convDesc;
        checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
        checkCnnlError(cnnlSetConvolutionDescriptor(
            convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dimInputs0 = op->getInputs(0)->getDims();
        auto dimInputs1 = op->getInputs(1)->getDims();
        auto dimOutput = op->getOutput()->getDims();

        if (dimInputs0.size() != 4)
            IT_TODO_HALT();
        if (dimInputs1.size() != 4)
            IT_TODO_HALT();
        if (dimOutput.size() != 4)
            IT_TODO_HALT();

        int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
                          dimInputs0[3]};
        int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
                          dimInputs1[3]};
        int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
                         dimOutput[3]};

        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, inputs0));
        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, inputs1));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, output));

        cnnlConvolutionBwdDataAlgo_t algo;
        cnnlGetConvolutionBackwardDataAlgorithm(
            context->cnnlHandle(), aDesc, bDesc, convDesc, cDesc,
            CNNL_CONVOLUTION_BWD_DATA_FASTEST, &algo);
        size_t wsSize;
        cnnlGetConvolutionBackwardDataWorkspaceSize(context->cnnlHandle(),
                                                    aDesc, bDesc, convDesc,
                                                    cDesc, algo, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlConvolutionBackwardData(
            context->cnnlHandle(), NULL, aDesc, aData, bDesc, bData, convDesc,
            algo, wsData, wsSize, NULL, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destroying them.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::ConvTrans, DataType::Float32,
                ConvTransCnnl, "ConvTrans_cnnl_BANG_Float32");
}; // namespace infini
@@ -0,0 +1,159 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/conv.h"

namespace infini {
class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvBackwardFilterObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        const int cpg = op->getChannelPerGroup();
        const int g = c / cpg;

        int pad[4] = {ph, ph, pw, pw};
        int stride[2] = {sh, sw};
        int dilation[2] = {dh, dw};

        cnnlConvolutionDescriptor_t convDesc;
        checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
        checkCnnlError(cnnlSetConvolutionDescriptor(
            convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, aDescTrans, bDescTrans,
            cDescTrans;
        auto dimInputs0 = op->getInputs(0)->getDims();
        auto dimInputs1 = op->getInputs(1)->getDims();
        auto dimOutput = op->getOutput()->getDims();

        if (dimInputs0.size() != 4)
            IT_TODO_HALT();
        if (dimInputs1.size() != 4)
            IT_TODO_HALT();
        if (dimOutput.size() != 4)
            IT_TODO_HALT();

        int inputs0Array[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
                               dimInputs0[3]};
        int inputs1Array[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
                               dimInputs1[3]};
        int outputArray[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
                              dimOutput[3]};

        int inputs0ArrayTrans[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
                                    dimInputs0[1]};
        int inputs1ArrayTrans[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
                                    dimInputs1[1]};
        int outputArrayTrans[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
                                   dimOutput[1]};

        int transMode[4] = {0, 2, 3, 1};
        cnnlTransposeDescriptor_t transDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&transDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(transDesc, 4, transMode));

        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));

        checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               inputs0ArrayTrans));

        size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
                              dimInputs0[3] * sizeof(float);
        BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);

        cnnlStatus_t stat =
            cnnlTranspose(context->cnnlHandle(), transDesc, aDesc, aData,
                          aDescTrans, wsTrans1Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               inputs1ArrayTrans));

        size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
                              dimInputs1[3] * sizeof(float);
        BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);

        stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
                             bDescTrans, wsTrans2Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));

        checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               outputArrayTrans));

        size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
                              dimOutput[3] * sizeof(float);
        BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);

        cnnlConvolutionBwdFilterAlgo_t algo;
        cnnlGetConvolutionBackwardFilterAlgorithm(
            context->cnnlHandle(), convDesc, aDescTrans, bDescTrans, cDescTrans,
            CNNL_CONVOLUTION_BWD_FILTER_FASTEST, &algo);

        size_t wsSize;
        cnnlGetConvolutionBackwardFilterWorkspaceSize(
            context->cnnlHandle(), aDescTrans, bDescTrans, cDescTrans, convDesc,
            algo, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        stat = cnnlConvolutionBackwardFilter(
            context->cnnlHandle(), NULL, aDescTrans, wsTrans1Data, bDescTrans,
            wsTrans2Data, convDesc, algo, wsData, wsSize, NULL, cDescTrans,
            wsTrans3Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        int transMode2[4] = {0, 3, 1, 2};
        cnnlTransposeDescriptor_t transOutputDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&transOutputDesc));
        checkCnnlError(
            cnnlSetTransposeDescriptor(transOutputDesc, 4, transMode2));

        stat = cnnlTranspose(context->cnnlHandle(), transOutputDesc, cDescTrans,
                             wsTrans3Data, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(aDescTrans));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDescTrans));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDescTrans));
        checkCnnlError(cnnlDestroyTransposeDescriptor(transDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(transOutputDesc));
        checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter, DataType::Float32,
                ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG_Float32");
}; // namespace infini
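Note on the hunk above: the same three-step pattern repeats for each tensor. Describe the buffer in NCHW, describe it again in NHWC, and cnnlTranspose between the two with permutation {0, 2, 3, 1} (and {0, 3, 1, 2}, its inverse, to bring the filter gradient back to NCHW). A minimal host-side sketch of just the permutation bookkeeping (plain C++, no CNNL calls; names are illustrative):

#include <array>
#include <cassert>

// New axis i takes its extent from old axis perm[i].
std::array<int, 4> permuteDims(const std::array<int, 4> &dims,
                               const std::array<int, 4> &perm) {
    std::array<int, 4> out{};
    for (int i = 0; i < 4; ++i)
        out[i] = dims[perm[i]];
    return out;
}

int main() {
    const std::array<int, 4> nchw = {1, 3, 32, 32};
    const std::array<int, 4> toNHWC = {0, 2, 3, 1};
    const std::array<int, 4> toNCHW = {0, 3, 1, 2}; // inverse of toNHWC
    auto nhwc = permuteDims(nchw, toNHWC);
    assert((nhwc == std::array<int, 4>{1, 32, 32, 3}));
    assert(permuteDims(nhwc, toNCHW) == nchw); // round trip restores NCHW
}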
@@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CopyCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlCopy(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Copy, DataType::Float32, CopyCnnl,
                "Copy_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,53 @@
#include "operators/det.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class DetCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<DetObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        DetObj::Mode mode = op->getMode();
        cnnlDetMode_t nlMode;
        if (mode == DetObj::LogDet) {
            nlMode = CNNL_DET_MODE_LOGDET;
        } else {
            nlMode = CNNL_DET_MODE_DET;
        }
        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dimin = op->getInputs(0)->getDims();
        auto dimout = op->getOutput()->getDims();
        if (dimin.size() != 4 || dimout.size() != 2)
            IT_TODO_HALT();

        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
        int dimout_array[2] = {dimout[0], dimout[1]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, dimout_array));

        cnnlStatus_t stat =
            cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Det, DataType::Float32, DetCnnl,
                "Det_cnnl_BANG_Float32");
}; // namespace infini
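DetObj::LogDet selects CNNL_DET_MODE_LOGDET and every other mode falls back to the plain determinant; the shape check (4-D in, 2-D out) reflects a batch of square matrices reduced to one scalar each. For intuition, a 2x2 host-side reference (illustrative only; cnnlDet handles the batched n-by-n case on device):

#include <cmath>

// The two modes dispatched above, written out for a single 2x2 matrix
// [[a, b], [c, d]].
float det2x2(float a, float b, float c, float d) { return a * d - b * c; }
float logDet2x2(float a, float b, float c, float d) {
    return std::log(det2x2(a, b, c, d)); // only defined for det > 0
}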
@@ -1,5 +1,4 @@
#include "operators/element_wise.h"
-#include "bang/bang_element_wise.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

@@ -66,6 +65,514 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
    }
};

class LogicOpCnnl : public BangKernelWithoutConfig {
    virtual cnnlLogicOp_t getOpType() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlLogicOp(context->cnnlHandle(), getOpType(), aDesc, aData, bDesc,
                        bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

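One thing worth flagging for this kernel and the ones that follow: each early return on a failed status leaves the already-created descriptors alive. A small RAII guard would make those paths leak-free; a sketch (not part of this PR, built only on the create/destroy pair already used above):

// Sketch: tie descriptor lifetime to scope so early returns cannot leak.
struct TensorDescGuard {
    cnnlTensorDescriptor_t desc = nullptr;
    TensorDescGuard() { checkCnnlError(cnnlCreateTensorDescriptor(&desc)); }
    ~TensorDescGuard() {
        if (desc)
            cnnlDestroyTensorDescriptor(desc); // never throw from a destructor
    }
    TensorDescGuard(const TensorDescGuard &) = delete;
    TensorDescGuard &operator=(const TensorDescGuard &) = delete;
};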
class BitComputeCnnl : public BangKernelWithoutConfig {
    virtual cnnlBitComputeOp_t getOpType() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        size_t wsSize;
        cnnlGetBitComputeWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                       cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlBitCompute_v2(context->cnnlHandle(), getOpType(), aDesc, aData,
                              bDesc, bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class DivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlDiv_v2(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MaximumCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlMaximum(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                        cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MinimumCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlMinimum(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                        cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MSELossCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<MSELossObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        MSELossObj::Reduction reduction = op->getReduction();
        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        int dim_out[4] = {1, 1, 1, 1};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        if (reduction == MSELossObj::None) {
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
        } else {
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_out));
        }
        cnnlStatus_t stat;
        if (reduction == MSELossObj::None) {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        } else if (reduction == MSELossObj::Sum) {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_SUM, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        } else {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_MEAN, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        }

        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

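The three branches differ only in the reduction constant: None keeps the elementwise (a - b)^2 tensor, while Sum and Mean collapse it to the {1, 1, 1, 1} output set up above. A host-side reference for the reduced cases (illustrative; the device work is done by cnnlMSELoss):

#include <cstddef>

// Reference semantics for CNNL_MSE_LOSS_SUM / CNNL_MSE_LOSS_MEAN.
float mseLossRef(const float *a, const float *b, size_t n, bool mean) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        float d = a[i] - b[i];
        sum += d * d;
    }
    return mean ? sum / static_cast<float>(n) : sum;
}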
class PowerCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlPow(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                    aDesc, aData, bDesc, bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class FloorDivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                     &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlFloorDiv_v2(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class FloorModCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                     &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlFloorMod(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                         cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,
                                              bDesc, cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlSquaredDifference(context->cnnlHandle(), aDesc, aData, bDesc,
                                  bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class AddCnnl : public ElementWiseCnnl {
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; }
};

@@ -81,12 +588,57 @@ class MulCnnl : public ElementWiseCnnl {
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
};

-class ElementWiseBang : public BangKernelWithoutConfig {
-    void compute(const Operator &_op,
-                 const RuntimeObj *_context) const override {
-        element_wise_kernel(_context, _op);
-    }
-};
+class EqualCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_EQ; }
+};
+class NotEqualCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NE; }
+};
+class GreaterThanCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GT; }
+};
+class GreaterEqualCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GE; }
+};
+class LessThanCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LT; }
+};
+class LessEqualCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LE; }
+};
+class AndCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_AND; }
+};
+class OrCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_OR; }
+};
+class XorCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_XOR; }
+};
+class NotCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NOT; }
+};
+
+class BitAndCnnl : public BitComputeCnnl {
+    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BAND_OP; }
+};
+class BitOrCnnl : public BitComputeCnnl {
+    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BOR_OP; }
+};
+class BitXorCnnl : public BitComputeCnnl {
+    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BXOR_OP; }
+};
+class BitNotCnnl : public BitComputeCnnl {
+    cnnlBitComputeOp_t getOpType() const override { return CNNL_BNOT_OP; }
+};
+// class BitLeftShiftCnnl : public BitComputeCnnl {
+//     cnnlBitComputeOp_t getOpType() const override { return
+//     CNNL_BLEFT_SHIFT_OP_V2; }
+// };
+// class BitRightShiftCnnl : public BitComputeCnnl {
+//     cnnlBitComputeOp_t getOpType() const override { return
+//     CNNL_BLEFT_SHIFT_OP_V2; }
+// };

REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
                "Add_cnnl_BANG_Float32");
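The subclasses above are the whole point of the LogicOpCnnl / BitComputeCnnl design: the base class owns the descriptor setup and the CNNL call, and each operator contributes one overridden getOpType(). The same idea stripped of CNNL, as a self-contained sketch:

#include <iostream>

// Base class owns the algorithm; subclasses supply only the op selector,
// mirroring getOpType() in the kernels above.
struct BinaryKernel {
    virtual ~BinaryKernel() = default;
    virtual char opSymbol() const = 0;
    int compute(int a, int b) const {
        switch (opSymbol()) {
        case '&':
            return a & b;
        case '|':
            return a | b;
        case '^':
            return a ^ b;
        default:
            return 0;
        }
    }
};
struct AndKernel : BinaryKernel {
    char opSymbol() const override { return '&'; }
};

int main() { std::cout << AndKernel{}.compute(6, 3) << '\n'; } // prints 2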
@@ -95,8 +647,56 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
                "Mul_cnnl_BANG_Float32");

-REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
-                "Div_Bang_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, DivCnnl,
+                "Div_cnnl_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Maximum, DataType::Float32, MaximumCnnl,
+                "Maximum_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Minimum, DataType::Float32, MinimumCnnl,
+                "Minimum_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::MSELoss, DataType::Float32, MSELossCnnl,
+                "MSELoss_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Power, DataType::Float32, PowerCnnl,
+                "Power_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl,
+                "FloorDiv_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl,
+                "FloorMod_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, DataType::Float32,
+                SquaredDifferenceCnnl, "SquaredDifference_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Equal, DataType::Float32, EqualCnnl,
+                "Equal_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::NotEqual, DataType::Float32, NotEqualCnnl,
+                "NotEqual_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::GreaterThan, DataType::Float32,
+                GreaterThanCnnl, "GreaterThan_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::GreaterEqual, DataType::Float32,
+                GreaterEqualCnnl, "GreaterEqual_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::LessThan, DataType::Float32, LessThanCnnl,
+                "LessThan_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::LessEqual, DataType::Float32,
+                LessEqualCnnl, "LessEqual_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::And, DataType::Float32, AndCnnl,
+                "And_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Or, DataType::Float32, OrCnnl,
+                "Or_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Xor, DataType::Float32, XorCnnl,
+                "Xor_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Not, DataType::Float32, NotCnnl,
+                "Not_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::BitAnd, DataType::Float32, BitAndCnnl,
+                "BitAnd_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::BitOr, DataType::Float32, BitOrCnnl,
+                "BitOr_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::BitXor, DataType::Float32, BitXorCnnl,
+                "BitXor_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::BitNot, DataType::Float32, BitNotCnnl,
+                "BitNot_cnnl_BANG_Float32");
+// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift, DataType::Float32,
+//                 BitLeftShiftCnnl,
+//                 "BitLeftShift_cnnl_BANG_Float32");
+// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift, DataType::Float32,
+//                 BitRightShiftCnnl,
+//                 "BitRightShift_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
//                 ElementWiseBang,
//                 "Pow_Bang_Float32");
@@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ErfCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Erf, DataType::Float32, ErfCnnl,
                "Erf_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ExpCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Exp, DataType::Float32, ExpCnnl,
                "Exp_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,40 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class FillCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<FillObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float value = op->getValue();

        cnnlTensorDescriptor_t cDesc;
        auto dim = op->getOutput()->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlFill(context->cnnlHandle(), value, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Fill, DataType::Float32, FillCnnl,
                "Fill_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class FloorCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Floor, DataType::Float32, FloorCnnl,
                "Floor_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,42 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class HardtanhCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<HardtanhObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float min = op->getMin();
        float max = op->getMax();

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData,
                                         max, min, aDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Hardtanh, DataType::Float32, HardtanhCnnl,
                "Hardtanh_cnnl_BANG_Float32");

}; // namespace infini
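Hardtanh is an elementwise clamp to [min, max], which is why no separate output descriptor is created: the shape is unchanged and aDesc is reused for cData. A one-line host reference for the clamp (illustrative only):

#include <algorithm>

// What cnnlHardtanh computes per element, on the host for reference.
float hardtanhRef(float x, float minVal, float maxVal) {
    return std::min(std::max(x, minVal), maxVal);
}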
@@ -0,0 +1,40 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class L2LossCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<L2LossObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::L2Loss, DataType::Float32, L2LossCnnl,
                "L2Loss_cnnl_BANG_Float32");

}; // namespace infini
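cnnlL2Loss reduces the whole tensor to one scalar, so only the input descriptor is created and cData is passed bare. Assuming the common convention sum(x^2) / 2 (as in TensorFlow's l2_loss; the CNNL docs should be checked for the exact definition), a host reference:

#include <cstddef>

// Host-side reference under the assumed sum(x^2) / 2 convention.
float l2LossRef(const float *x, size_t n) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i)
        sum += x[i] * x[i];
    return 0.5f * sum;
}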
@@ -0,0 +1,62 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class LogCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<LogObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        auto type = op->getType();
        cnnlLogBase_t base;
        switch (type) {
        case LogObj::Log2:
            base = CNNL_LOG_2;
            break;
        case LogObj::LogE:
            base = CNNL_LOG_E;
            break;
        case LogObj::Log10:
            base = CNNL_LOG_10;
            break;
        default:
            IT_TODO_HALT();
        }

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       base, aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Log, DataType::Float32, LogCnnl,
                "Log_cnnl_BANG_Float32");

}; // namespace infini
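The three enum values cover the usual bases; anything else halts. They are all one identity apart, log_b(x) = ln(x) / ln(b), which a host reference makes explicit (illustrative only):

#include <cmath>

// Change-of-base identity behind CNNL_LOG_2 / CNNL_LOG_E / CNNL_LOG_10.
float logRef(float x, float base) { return std::log(x) / std::log(base); }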
@@ -18,19 +18,27 @@ class MatmulCnnl : public BangKernelWithoutConfig {
        auto dimInputs0 = op->getInputs(0)->getDims();
        auto dimInputs1 = op->getInputs(1)->getDims();
        auto dimOutput = op->getOutput()->getDims();
-        if (dimInputs0.size() != 3)
-            IT_TODO_HALT();
-        if (dimInputs1.size() != 3)
-            IT_TODO_HALT();
-        if (dimOutput.size() != 3)
-            IT_TODO_HALT();
+        int input0_batch_size = 1;
+        int input1_batch_size = 1;
+        int output_batch_size = 1;
+        for (size_t i = 0; i < dimInputs0.size() - 2; ++i) {
+            input0_batch_size *= dimInputs0[i];
+            input1_batch_size *= dimInputs1[i];
+            output_batch_size *= dimOutput[i];
+        }

        bool transA = op->getTransA();
        bool transB = op->getTransB();

-        int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]};
-        int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]};
-        int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]};
+        int inputs0Array[3] = {input0_batch_size,
+                               dimInputs0[dimInputs0.size() - 2],
+                               dimInputs0[dimInputs0.size() - 1]};
+        int inputs1Array[3] = {input1_batch_size,
+                               dimInputs1[dimInputs1.size() - 2],
+                               dimInputs1[dimInputs1.size() - 1]};
+        int outputArray[3] = {output_batch_size,
+                              dimOutput[dimOutput.size() - 2],
+                              dimOutput[dimOutput.size() - 1]};

        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
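This change lifts the old 3-D-only restriction by folding every leading dimension into a single batch dimension, so the descriptors stay rank-3. Note that the loop bound comes from dimInputs0 but indexes all three shapes, so it implicitly assumes equal ranks for both inputs and the output. A sketch of the folding:

#include <cassert>
#include <vector>

// Fold all but the trailing two dimensions into one batch dimension,
// e.g. {2, 3, 4, 5} -> {6, 4, 5}, matching the hunk above.
std::vector<int> foldBatchDims(const std::vector<int> &dims) {
    assert(dims.size() >= 2);
    int batch = 1;
    for (size_t i = 0; i + 2 < dims.size(); ++i)
        batch *= dims[i];
    return {batch, dims[dims.size() - 2], dims[dims.size() - 1]};
}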
@@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class NegTensorCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Neg, DataType::Float32, NegTensorCnnl,
                "Neg_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,65 @@
#include "operators/pad.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class PadCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PadObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        // Use the input dims here; the padded output dims are derived below
        // by adding the pads (using the output dims would count them twice).
        auto dim = op->getInputs(0)->getDims();
        int dim_size = dim.size();
        int dim_array[dim_size];
        for (int i = 0; i < dim_size; ++i) {
            dim_array[i] = dim[i];
        }
        int paddings[dim_size * 2];
        std::vector<int> pads = op->getPads();
        if (pads.size() == 2 && dim_size != 1) {
            // A single (begin, end) pair is broadcast to every dimension.
            for (int i = 0; i < dim_size * 2; i += 2) {
                paddings[i] = pads[0];
                paddings[i + 1] = pads[1];
            }
        } else {
            // ONNX-style pads: all begins first, then all ends.
            for (int i = 0; i < dim_size * 2; i += 2) {
                paddings[i] = pads[i / 2];
                paddings[i + 1] = pads[i / 2 + dim_size];
            }
        }
        int dimout_array[dim_size];
        for (int i = 0; i < dim_size; ++i) {
            dimout_array[i] = dim[i] + paddings[2 * i] + paddings[2 * i + 1];
        }
        float paddingValue = 0.0;
        // input
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dim_size, dim_array));
        // output
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
                                               CNNL_DTYPE_FLOAT, dim_size,
                                               dimout_array));

        cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
                                    paddings, &paddingValue, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Pad, DataType::Float32, PadCnnl,
                "Pad_cnnl_BANG_Float32");

}; // namespace infini
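cnnlPad takes the paddings interleaved per dimension as (before, after) pairs, while ONNX stores all the begins first and then all the ends; the two loops above perform exactly that conversion. A standalone sketch of the same conversion (interleavePads is an illustrative helper, not part of this diff):

#include <vector>

// ONNX pads layout: [b0, b1, ..., e0, e1, ...].
// cnnlPad layout:   [b0, e0, b1, e1, ...].
std::vector<int> interleavePads(const std::vector<int> &pads, int ndim) {
    std::vector<int> out(ndim * 2);
    for (int i = 0; i < ndim; ++i) {
        out[2 * i] = pads[i];            // pad before dimension i
        out[2 * i + 1] = pads[i + ndim]; // pad after dimension i
    }
    return out;
}
// e.g. ndim = 2, pads = {1, 2, 3, 4} -> {1, 3, 2, 4}.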
@@ -0,0 +1,73 @@
#include "operators/pooling.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class PoolingCnnl : public BangKernelWithoutConfig {
    virtual cnnlPoolingMode_t getPoolingMode() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PoolingObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outData = (op->getOutput()->getRawDataPtr<void *>());

        const auto [n, c, h, w, kh, kw] = op->getNCHWRS();
        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();

        // get inputs
        int inArray[4] = {n, c, h, w};
        cnnlTensorDescriptor_t inDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
        checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, inArray));

        // get pooling descriptor (shared by max and average pooling)
        cnnlPoolingDescriptor_t poolingDesc;
        checkCnnlError(cnnlCreatePoolingDescriptor(&poolingDesc));
        checkCnnlError(cnnlSetPooling2dDescriptor_v2(
            poolingDesc, getPoolingMode(), CNNL_NOT_PROPAGATE_NAN, kh, kw, ph,
            ph, pw, pw, sh, sw, dh, dw, false));

        // get outputs
        auto outVec = op->getOutput()->getDims();
        int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
        cnnlTensorDescriptor_t outDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
        checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, outArray));
        size_t wsSize;
        cnnlGetPoolingWorkspaceSize(context->cnnlHandle(), getPoolingMode(),
                                    outVec[3], outVec[2], &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        float alpha = 1.f, beta = 0.f;
        checkCnnlError(cnnlPoolingForward(context->cnnlHandle(), poolingDesc,
                                          &alpha, inDesc, inData, &beta,
                                          outDesc, outData, wsData, wsSize));

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
        checkCnnlError(cnnlDestroyPoolingDescriptor(poolingDesc));
    }
};

class maxPoolCnnl : public PoolingCnnl {
    cnnlPoolingMode_t getPoolingMode() const override {
        return CNNL_POOLING_MAX;
    }
};

class avgPoolCnnl : public PoolingCnnl {
    cnnlPoolingMode_t getPoolingMode() const override {
        return CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
    }
};

REGISTER_KERNEL(Device::BANG, OpType::MaxPool, DataType::Float32, maxPoolCnnl,
                "MaxPool_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AvgPool, DataType::Float32, avgPoolCnnl,
                "AvgPool_cnnl_BANG_Float32");
}; // namespace infini
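For reference, the output extent that makes outArray consistent with the pooling descriptor follows the usual pooling arithmetic; a sketch assuming the symmetric padding set above (pooledExtent is an illustrative helper, not part of this diff):

// Output extent of one pooled dimension with input size `in`, kernel `k`,
// symmetric padding `p`, stride `s`, and dilation `d` (integer division):
int pooledExtent(int in, int k, int p, int s, int d) {
    return (in + 2 * p - d * (k - 1) - 1) / s + 1;
}
// e.g. in = 32, k = 3, p = 1, s = 2, d = 1 -> (32 + 2 - 2 - 1) / 2 + 1 = 16.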
@@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ReciprocalCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Reciprocal, DataType::Float32,
                ReciprocalCnnl, "Reciprocal_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,42 @@
#include "operators/reshape.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class CopyBang : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ReshapeObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        auto inData = op->getInputs(0)->getRawDataPtr<void *>();
        auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        int len = dim.size();
        int size = 1;
        for (int i = 0; i < len; ++i) {
            size *= dim[i];
        }

        int dim_array[1] = {size};
        // Treat the tensor as a flat 1-D buffer; reshape never moves data.
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
                                               CNNL_DTYPE_FLOAT, 1, dim_array));
        cnnlStatus_t stat =
            cnnlCopy(context->cnnlHandle(), aDesc, inData, aDesc, outData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};
// reshape/flatten/identity all act as copying from input to output.
REGISTER_KERNEL(Device::BANG, OpType::Reshape, DataType::Float32, CopyBang,
                "Reshape_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Flatten, DataType::Float32, CopyBang,
                "Flatten_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Identity, DataType::Float32, CopyBang,
                "Identity_BANG_Float32");

} // namespace infini
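Since Reshape, Flatten, and Identity only reinterpret the shape of a contiguous buffer, the kernel can view the tensor as 1-D and copy it wholesale. A CPU analogue of what cnnlCopy is asked to do here (illustrative sketch, not part of this diff):

#include <cstring>

// Reshape never reorders elements, so a raw element-for-element copy of
// the flat buffer is sufficient.
void reshapeCopy(const float *in, float *out, size_t numElements) {
    std::memcpy(out, in, numElements * sizeof(float));
}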
@@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class RsqrtCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                         aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Rsqrt, DataType::Float32, RsqrtCnnl,
                "Rsqrt_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,69 @@
#include "operators/split.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class SplitCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<SplitObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numOutputs();
        int axis = op->getDim();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getOutput(i)->getRawDataPtr<void *>();
        }
        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;

        int dimout_array[num][4];
        for (int i = 0; i < num; ++i) {
            auto dim = op->getOutput(i)->getDims();
            if (dim.size() != 4) {
                IT_TODO_HALT();
            }
            dimout_array[i][0] = dim[0];
            dimout_array[i][1] = dim[1];
            dimout_array[i][2] = dim[2];
            dimout_array[i][3] = dim[3];
        }
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4) {
            IT_TODO_HALT();
        }
        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dimout_array[i]));
        }

        size_t wsSize;
        cnnlGetSplitWorkspaceSize(context->cnnlHandle(), num, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlSplit(context->cnnlHandle(), num, axis, desc, inputData, wsData,
                      wsSize, descArray, argv);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Split, DataType::Float32, SplitCnnl,
                "Split_cnnl_BANG_Float32");
}; // namespace infini
@@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class SqrtCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                        aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Sqrt, DataType::Float32, SqrtCnnl,
                "Sqrt_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,60 @@
#include "operators/transpose.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class TransposeCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<TransposeObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dimin = op->getInputs(0)->getDims();
        auto dimout = op->getOutput()->getDims();
        if (dimin.size() != 4 || dimout.size() != 4)
            IT_TODO_HALT();

        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
        int dimout_array[4] = {dimout[0], dimout[1], dimout[2], dimout[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimout_array));

        // get op descriptor
        auto permute = op->getPermute();
        cnnlTransposeDescriptor_t opDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute.data()));

        size_t wsSize;
        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, opDesc,
                                      &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlTranspose_v2(context->cnnlHandle(), opDesc, aDesc, aData, cDesc,
                             cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Transpose, DataType::Float32,
                TransposeCnnl, "Transpose_cnnl_BANG_Float32");
}; // namespace infini
@@ -0,0 +1,184 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class TrigonCnnl : public BangKernelWithoutConfig {
    virtual cnnlTrigonFunctionMode_t getOpType() const = 0;
    virtual cnnlComputationPreference_t getPrefer() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlTrigonDescriptor_t opDesc;
        checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
        checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));

        cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc,
                                              aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTrigonDescriptor(opDesc));
    }
};

class SinCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_SIN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class CosCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_COS;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class TanCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_TAN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ASinCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ASIN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ACosCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ACOS;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ATanCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ATAN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class SinHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_SINH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class CosHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_COSH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class TanHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_TANH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ASinHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ASINH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ACosHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ACOSH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ATanHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ATANH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Sin, DataType::Float32, SinCnnl,
                "Sin_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Cos, DataType::Float32, CosCnnl,
                "Cos_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Tan, DataType::Float32, TanCnnl,
                "Tan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ASin, DataType::Float32, ASinCnnl,
                "ASin_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ACos, DataType::Float32, ACosCnnl,
                "ACos_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ATan, DataType::Float32, ATanCnnl,
                "ATan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SinH, DataType::Float32, SinHCnnl,
                "SinH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::CosH, DataType::Float32, CosHCnnl,
                "CosH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::TanH, DataType::Float32, TanHCnnl,
                "TanH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ASinH, DataType::Float32, ASinHCnnl,
                "ASinH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ACosH, DataType::Float32, ACosHCnnl,
                "ACosH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ATanH, DataType::Float32, ATanHCnnl,
                "ATanH_cnnl_BANG_Float32");

}; // namespace infini
@@ -11,17 +11,37 @@ template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
        T *inptr1 = op->getInputs(1)->getRawDataPtr<T *>();
        T *outptr = op->getOutput()->getRawDataPtr<T *>();

        int a[4] = {1, 1, 1, 1};
        int b[4] = {1, 1, 1, 1};
        int c[4] = {1, 1, 1, 1};
        auto a_input = op->getInputs(0)->getDims();
        auto b_input = op->getInputs(1)->getDims();
        auto c_output = op->getOutput()->getDims();
        // Right-align each shape into a 4-D frame padded with leading 1s.
        std::copy(a_input.begin(), a_input.end(), a + (4 - a_input.size()));
        std::copy(b_input.begin(), b_input.end(), b + (4 - b_input.size()));
        std::copy(c_output.begin(), c_output.end(), c + (4 - c_output.size()));

        auto n = op->getOutput()->size();
        for (size_t i = 0; i < n; ++i) {
            // Decompose the flat output index into 4-D coordinates.
            int c0_index = i / (c[1] * c[2] * c[3]);
            int c1_index = (i % (c[1] * c[2] * c[3])) / (c[2] * c[3]);
            int c2_index = ((i % (c[1] * c[2] * c[3])) % (c[2] * c[3])) / c[3];
            int c3_index = ((i % (c[1] * c[2] * c[3])) % (c[2] * c[3])) % c[3];

            // Broadcast: a size-1 input dimension maps every output
            // coordinate onto index 0 via the modulo.
            int a0_index = c0_index % a[0];
            int a1_index = c1_index % a[1];
            int a2_index = c2_index % a[2];
            int a3_index = c3_index % a[3];

            int b0_index = c0_index % b[0];
            int b1_index = c1_index % b[1];
            int b2_index = c2_index % b[2];
            int b3_index = c3_index % b[3];
            outptr[i] = doCompute(
                inptr0[a0_index * a[1] * a[2] * a[3] + a1_index * a[2] * a[3] +
                       a2_index * a[3] + a3_index],
                inptr1[b0_index * b[1] * b[2] * b[3] + b1_index * b[2] * b[3] +
                       b2_index * b[3] + b3_index]);
        }
    }
};
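A worked example of the index arithmetic above, for output shape c = {2, 3, 1, 4} and input shape a = {1, 3, 1, 4} (values chosen purely for illustration):

// flat output index i = 17:
//   c0 = 17 / (3 * 1 * 4) = 1, remainder r = 17 % 12 = 5
//   c1 = r / (1 * 4) = 1
//   c2 = (r % 4) / 4 = 0,  c3 = (r % 4) % 4 = 1
// broadcasting input a via modulo:
//   (1 % 1, 1 % 3, 0 % 1, 1 % 4) = (0, 1, 0, 1)
//   flat input index = 0 * 12 + 1 * 4 + 0 * 4 + 1 = 5
// so output element 17 reads input element 5: the size-1 dimension a[0]
// maps both output batches onto the same input slice.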
@@ -55,4 +75,4 @@ REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::UInt32, NaiveDiv<uint32_t>,
                "divNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::Float32, NaiveDiv<float>,
                "divNaive_CPU_float32");
}; // namespace infini
@@ -56,6 +56,25 @@ template <typename T> class NaiveAbs : public NativeUnary<T> {
    T doCompute(T val) const override { return val < 0 ? -val : val; }
};

template <typename T> class Clip : public CpuKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
        auto op = as<ClipObj>(_op);
        T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
        T *outptr = op->getOutput()->getRawDataPtr<T *>();
        auto minValue = op->getMin();
        auto maxValue = op->getMax();

        auto n = op->getOutput()->size();
        for (size_t offset = 0; offset < n; offset++) {
            auto val = *inptr++;
            // min/max are optional; clamp only against the bounds present.
            *outptr++ = (minValue && val < *minValue)   ? *minValue
                        : (maxValue && val > *maxValue) ? *maxValue
                                                        : val;
        }
    }
};

REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32,
                NaiveRelu<uint32_t>, "reluNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>,
@@ -76,4 +95,6 @@ REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::UInt32,
                NaiveSoftmax<uint32_t>, "softmaxNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32,
                NaiveSoftmax<float>, "softmaxNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Clip, DataType::Float32, Clip<float>,
                "Clip_CPU_float32");
}; // namespace infini
@@ -20,18 +20,17 @@ class BatchNormCudnn : public CudaKernelWithoutConfig {
        auto dims = op->getInputs(0)->getDims();
        // cudnnBatchNormalizationForwardInference supports 4D and 5D tensors,
        // but this kernel currently handles only 4D.
        IT_ASSERT(dims.size() == 4);

        int dimArray[4], strideArray[4], dimPArray[4], stridePArray[4];
        for (size_t i = 0; i < dims.size(); ++i) {
            dimArray[i] = dims[i];
            strideArray[i] = op->getInputs(0)->getStride()[i];
            dimPArray[i] = 1;
            stridePArray[i] = 1;
        }
        // The per-channel parameter tensors (scale/bias/mean/variance) have
        // shape (1, C, 1, 1), with C taken from the scale input.
        dimPArray[1] = op->getInputs(1)->getDims()[0];
        stridePArray[0] = op->getInputs(1)->getDims()[0];
        // get inputs
        cudnnTensorDescriptor_t inDesc;
        checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
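The derived-parameter descriptor built above gives scale, bias, mean, and variance the per-channel shape cuDNN expects for spatial batch norm; a sketch of the same shape/stride computation (fillParamDims is an illustrative helper, not part of this diff):

// scale/bias/mean/variance hold one value per channel, so their 4-D view
// is (1, C, 1, 1); for a contiguous NCHW tensor of that shape the strides
// are (C, 1, 1, 1), matching dimPArray/stridePArray above.
void fillParamDims(int channels, int dimP[4], int strideP[4]) {
    dimP[0] = 1, dimP[1] = channels, dimP[2] = 1, dimP[3] = 1;
    strideP[0] = channels, strideP[1] = 1, strideP[2] = 1, strideP[3] = 1;
}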
@@ -0,0 +1,27 @@
#include "cuda/cuda_clip.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "operators/unary.h"

namespace infini {

class ClipCuda : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ClipObj>(_op);

        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
        auto min = op->getMin();
        auto max = op->getMax();
        auto dim = op->getInputs(0)->getDims();
        int num = dim[0] * dim[1] * dim[2] * dim[3]; // assumes a 4-D tensor
        // NAN acts as a sentinel for an absent bound.
        clip_kernel((float *)inputData, (float *)outputData, num,
                    min ? *min : NAN, max ? *max : NAN);
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Clip, DataType::Float32, ClipCuda,
                "Clip_CUDA_Float32");

}; // namespace infini
@@ -0,0 +1,32 @@
#include "core/common.h"
#include "core/constants.h"
#include "cuda/cuda_common.h"
#include <math.h>

using infini::E_CONSTANT;
constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }

__global__ void _clip_kernel(float *input, float *output, int n, float minValue,
                             float maxValue) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        output[i] = (!isnan(minValue) && input[i] < minValue)   ? minValue
                    : (!isnan(maxValue) && input[i] > maxValue) ? maxValue
                                                                : input[i];
    }
}

namespace infini {
void clip_kernel(float *input, float *output, int num, float minValue,
                 float maxValue) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    // CUDA launch syntax takes the grid size first, then the block size.
    _clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
                                          maxValue);
}

}; // namespace infini
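The launch arithmetic sizes the grid by ceiling division so that the grid-stride loop covers every element; worked numbers for the constants above (illustrative):

// block_work_size() = thread_work_size() * num_threads() = 4 * 128 = 512,
// so blocksize = 512 threads per block. For num = 1000:
//   gridsize = (1000 + 512 - 1) / 512 = 2 blocks, i.e. 1024 threads,
// indices 1000..1023 simply fail the loop guard `i < n`, and any leftover
// elements would be picked up by the `i += stride` grid-stride loop.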
@@ -19,24 +19,37 @@ class ElementWiseCudnn : public CudaKernelWithoutConfig {
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cudnnTensorDescriptor_t aDesc, bDesc, cDesc;
        auto a_dim = op->getInputs(0)->getDims();
        auto b_dim = op->getInputs(1)->getDims();
        auto c_dim = op->getOutput()->getDims();

        if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4)
            IT_TODO_HALT();

        int a[4] = {1, 1, 1, 1};
        int b[4] = {1, 1, 1, 1};
        int c[4] = {1, 1, 1, 1};

        // Right-align each shape into 4-D, padding leading dims with 1s.
        std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
        std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));

        // get inputs
        checkCudnnError(cudnnCreateTensorDescriptor(&aDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW,
                                                   CUDNN_DATA_FLOAT, a[0], a[1],
                                                   a[2], a[3]));

        checkCudnnError(cudnnCreateTensorDescriptor(&bDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(bDesc, CUDNN_TENSOR_NCHW,
                                                   CUDNN_DATA_FLOAT, b[0], b[1],
                                                   b[2], b[3]));

        // get outputs
        checkCudnnError(cudnnCreateTensorDescriptor(&cDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW,
                                                   CUDNN_DATA_FLOAT, c[0], c[1],
                                                   c[2], c[3]));

        // get op descriptor
        cudnnOpTensorDescriptor_t opDesc;
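The std::copy calls right-align each shape in a 4-D frame padded with leading 1s, which is the same trailing-dimension alignment that NumPy and ONNX broadcasting use; a self-contained sketch (alignTo4D is an illustrative helper, not part of this diff):

#include <algorithm>
#include <vector>

// Right-align a shape of rank <= 4 into 4-D, padding leading dims with 1s.
void alignTo4D(const std::vector<int> &dims, int out[4]) {
    out[0] = out[1] = out[2] = out[3] = 1;
    std::copy(dims.begin(), dims.end(), out + (4 - dims.size()));
}
// e.g. {3, 5} -> {1, 1, 3, 5}, so an operand of shape {3, 5} broadcasts
// against one of shape {8, 4, 3, 5} dimension-by-dimension.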
@@ -81,13 +94,27 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
        float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
        float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
        float *const cData = (op->getOutput()->getRawDataPtr<float *>());
        auto a_dim = op->getInputs(0)->getDims();
        auto b_dim = op->getInputs(1)->getDims();
        auto c_dim = op->getOutput()->getDims();

        if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4)
            IT_TODO_HALT();

        int a[4] = {1, 1, 1, 1};
        int b[4] = {1, 1, 1, 1};
        int c[4] = {1, 1, 1, 1};

        std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
        std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));

        if (op->getOpType() == OpType::Div)
            div_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1],
                       b[2], b[3], c[0], c[1], c[2], c[3]);
        else if (op->getOpType() == OpType::Pow)
            pow_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1],
                       b[2], b[3], c[0], c[1], c[2], c[3]);
        else
            IT_TODO_HALT();
    }
@@ -5,34 +5,75 @@ constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }

__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1,
                            int a2, int a3, int b0, int b1, int b2, int b3,
                            int c0, int c1, int c2, int c3) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    int n = c0 * c1 * c2 * c3;

    for (int i = index; i < n; i += stride) {
        // Decompose the flat output index into 4-D coordinates, then map
        // each coordinate onto the (possibly size-1) input dims via modulo.
        int c0_index = i / (c1 * c2 * c3);
        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;

        int a0_index = c0_index % a0;
        int a1_index = c1_index % a1;
        int a2_index = c2_index % a2;
        int a3_index = c3_index % a3;

        int b0_index = c0_index % b0;
        int b1_index = c1_index % b1;
        int b2_index = c2_index % b2;
        int b3_index = c3_index % b3;
        z[i] = x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
                 a2_index * a3 + a3_index] /
               y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
                 b2_index * b3 + b3_index];
    }
}

__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1,
                            int a2, int a3, int b0, int b1, int b2, int b3,
                            int c0, int c1, int c2, int c3) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    int n = c0 * c1 * c2 * c3;

    for (int i = index; i < n; i += stride) {
        int c0_index = i / (c1 * c2 * c3);
        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;

        int a0_index = c0_index % a0;
        int a1_index = c1_index % a1;
        int a2_index = c2_index % a2;
        int a3_index = c3_index % a3;

        int b0_index = c0_index % b0;
        int b1_index = c1_index % b1;
        int b2_index = c2_index % b2;
        int b3_index = c3_index % b3;
        z[i] = pow(x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
                     a2_index * a3 + a3_index],
                   y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
                     b2_index * b3 + b3_index]);
    }
}

namespace infini {
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
                int b0, int b1, int b2, int b3, int c0, int c1, int c2,
                int c3) {
    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    // CUDA launch syntax takes the grid size first, then the block size.
    _div_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
                                         b3, c0, c1, c2, c3);
}
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
                int b0, int b1, int b2, int b3, int c0, int c1, int c2,
                int c3) {
    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _pow_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
                                         b3, c0, c1, c2, c3);
}

}; // namespace infini
@@ -34,7 +34,7 @@ class SliceCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
    void compute(const Operator &op,
                 const RuntimeObj *_context) const override {
        do_compute(op->getOutput(), op->getInputs(0),
                   as<SliceObj>(op)->getStarts(), false);
    }
};
@@ -60,6 +60,52 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
    }
};

class SoftmaxCudnn : public CudaKernelWithoutConfig {
    virtual cudnnSoftmaxAlgorithm_t getAlgorithmType() const = 0;
    virtual cudnnSoftmaxMode_t getModeType() const = 0;
    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);

        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());

        cudnnTensorDescriptor_t inputDesc, outputDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() > 4)
            IT_TODO_HALT();
        int dim_array[4] = {1, 1, 1, 1};
        memcpy(dim_array + (4 - dim.size()), dim.data(),
               dim.size() * sizeof(int));

        // get inputs
        checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(
            inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dim_array[0],
            dim_array[1], dim_array[2], dim_array[3]));

        // get outputs
        checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(
            outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dim_array[0],
            dim_array[1], dim_array[2], dim_array[3]));

        auto [alpha, beta] = getAlphBeta();
        cudnnStatus_t stat = cudnnSoftmaxForward(
            context->cudnnHandle(), getAlgorithmType(), getModeType(), &alpha,
            inputDesc, inputData, &beta, outputDesc, outputData);
        if (stat != CUDNN_STATUS_SUCCESS)
            return;

        // Destroying descriptors in CUDA does not require a sync, but cuDNN
        // does not state whether a sync is required before destruction.
        checkCudnnError(cudnnDestroyTensorDescriptor(inputDesc));
        checkCudnnError(cudnnDestroyTensorDescriptor(outputDesc));
    }
};

class ReluCudnn : public ActivationCudnn {
    cudnnActivationMode_t getOpType() const override {
        return CUDNN_ACTIVATION_RELU;
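cuDNN blends the result as out = alpha * softmax(in) + beta * out, so the default {1, 0} from getAlphBeta() is a plain overwrite. The concrete subclasses are not shown in this diff; a hypothetical one, using real cuDNN enums, would look like:

class SoftmaxAccurateCudnn : public SoftmaxCudnn {
    // CUDNN_SOFTMAX_ACCURATE subtracts the row maximum before
    // exponentiating to avoid overflow; CUDNN_SOFTMAX_MODE_CHANNEL
    // normalizes across the C dimension at each spatial position.
    cudnnSoftmaxAlgorithm_t getAlgorithmType() const override {
        return CUDNN_SOFTMAX_ACCURATE;
    }
    cudnnSoftmaxMode_t getModeType() const override {
        return CUDNN_SOFTMAX_MODE_CHANNEL;
    }
};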
@@ -23,7 +23,7 @@ class MklSlice : public MklKernelWithoutConfig {
        std::vector<dnnl_dim_t> sDims, offsets;
        for (int i = 0; i < ndim; ++i) {
            sDims.push_back(oDims.at(i));
            offsets.push_back(op->getStarts().at(i));
        }
        auto sliceMd = srcMd.submemory_desc(sDims, offsets);
        auto sliceMemory =
@@ -1,46 +0,0 @@
cmake_minimum_required(VERSION 3.3)
project(bangops)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC")

# check `NEUWARE_HOME` env
message(${NEUWARE_HOME})
if(EXISTS ${NEUWARE_HOME})
    include_directories("${NEUWARE_HOME}/include")
    link_directories("${NEUWARE_HOME}/lib64")
    link_directories("${NEUWARE_HOME}/lib")
    set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}")
else()
    message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.")
endif()

# setup cmake search path
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
    "${CMAKE_SOURCE_DIR}/cmake"
    "${NEUWARE_HOME}/cmake"
    "${NEUWARE_HOME}/cmake/modules"
)

# include FindBANG.cmake and check cncc
find_package(BANG)
if(NOT BANG_FOUND)
    message(FATAL_ERROR "BANG cannot be found.")
elseif (NOT BANG_CNCC_EXECUTABLE)
    message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
endif()

# setup cncc flags
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread")
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220"
                                         "--bang-mlu-arch=mtp_270"
                                         "--bang-mlu-arch=mtp_290"
                                         "--bang-mlu-arch=mtp_372"
)

file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu")
bang_add_library(bangops SHARED ${src_files})
Some files were not shown because too many files have changed in this diff.