forked from jiuyuan/InfiniTensor
Compare commits
3 Commits
master
...
add_paddle
Author | SHA1 | Date |
---|---|---|
learner2468 | a68ac10107 | |
learner2468 | 6d62350631 | |
learner2468 | 57954fd523 |
|
@ -14,10 +14,10 @@ env:
|
|||
protobuf-version: "3.21.12"
|
||||
python-version: "3.10"
|
||||
|
||||
resnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/resnet18-v2-7.onnx
|
||||
inception-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/inception-v2-9.onnx
|
||||
densenet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/densenet-12.onnx
|
||||
efficientnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/efficientnet-lite4-11.onnx
|
||||
resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
|
||||
inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
|
||||
densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
|
||||
efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
|
|
@ -13,6 +13,3 @@
|
|||
[submodule "example"]
|
||||
path = examples/NNmodel
|
||||
url = git@github.com:wanghailu0717/NNmodel.git
|
||||
[submodule "examples/distributed/onnxsim_large_model"]
|
||||
path = examples/distributed/onnxsim_large_model
|
||||
url = git@github.com:luchangli03/onnxsim_large_model.git
|
||||
|
|
|
@ -13,7 +13,7 @@ if(USE_CUDA)
|
|||
message("CMake 3.18 or higher is required for setting CUDAToolkit")
|
||||
cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
|
||||
else()
|
||||
cmake_minimum_required(VERSION 3.17)
|
||||
cmake_minimum_required(VERSION 3.12)
|
||||
endif()
|
||||
|
||||
include(CMakeDependentOption)
|
||||
|
@ -53,13 +53,11 @@ endif()
|
|||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
|
||||
add_compile_options(-Wno-error=unused-variable)
|
||||
|
||||
find_package(
|
||||
Python
|
||||
COMPONENTS Interpreter Development
|
||||
REQUIRED)
|
||||
|
||||
# OpenMP
|
||||
find_package(OpenMP)
|
||||
if(OpenMP_C_FOUND)
|
||||
|
@ -263,14 +261,7 @@ if(USE_BANG)
|
|||
# BangC Kernels
|
||||
################################################################################
|
||||
|
||||
if (BUILD_DIST)
|
||||
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
|
||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
|
||||
add_compile_definitions(INFINI_USE_CNCL=1)
|
||||
else()
|
||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||
endif()
|
||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||
endif()
|
||||
|
||||
if(USE_KUNLUN)
|
||||
|
@ -284,9 +275,9 @@ if(USE_KUNLUN)
|
|||
endif()
|
||||
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
|
||||
|
||||
include_directories("${KUNLUN_HOME}/include/")
|
||||
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
|
||||
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
|
||||
include_directories("${KUNLUN_HOME}/XTDK/include/")
|
||||
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64")
|
||||
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/XTDK/shlib")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
|
||||
|
||||
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
|
||||
|
@ -299,13 +290,6 @@ if(USE_KUNLUN)
|
|||
endif()
|
||||
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
|
||||
|
||||
if (BUILD_DIST)
|
||||
message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
|
||||
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
find_package(XCCL REQUIRED)
|
||||
add_compile_definitions(INFINI_USE_XCCL=1)
|
||||
target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
|
||||
endif()
|
||||
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
|
||||
endif()
|
||||
|
||||
|
@ -340,11 +324,9 @@ if(BUILD_TEST)
|
|||
endif()
|
||||
if (USE_BANG)
|
||||
build_test(test/kernels/bang/*.cc)
|
||||
build_test(test/bang/*.cc)
|
||||
endif()
|
||||
if (USE_KUNLUN)
|
||||
build_test(test/kernels/kunlun/*.cc)
|
||||
build_test(test/kunlun/*.cc)
|
||||
endif()
|
||||
if (USE_INTELCPU)
|
||||
build_test(test/kernels/intelcpu/*.cc)
|
||||
|
|
3
Makefile
3
Makefile
|
@ -7,9 +7,7 @@ KUNLUN ?= OFF
|
|||
INTELCPU ?= off
|
||||
BACKTRACE ?= ON
|
||||
TEST ?= ON
|
||||
DIST ?= OFF
|
||||
NNET ?= OFF
|
||||
DIST ?= OFF
|
||||
FORMAT_ORIGIN ?=
|
||||
# Docker build options
|
||||
DOCKER_NAME ?= infinitensor
|
||||
|
@ -31,7 +29,6 @@ CMAKE_OPT += -DUSE_BANG=$(BANG)
|
|||
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
|
||||
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
|
||||
CMAKE_OPT += -DBUILD_TEST=$(TEST)
|
||||
CMAKE_OPT += -DBUILD_DIST=$(DIST)
|
||||
CMAKE_OPT += -DBUILD_NNET=$(NNET)
|
||||
|
||||
ifeq ($(INTELCPU), ON)
|
||||
|
|
|
@ -33,14 +33,13 @@ There are several configurable CMake options, see the [CMakeLists.txt](/CMakeLis
|
|||
|
||||
## Roadmap
|
||||
|
||||
- [RefactorGraph](https://github.com/InfiniTensor/RefactorGraph) is a newly designed AI framework that is set to replace the current main branch.
|
||||
- [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
|
||||
- Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
|
||||
- Supported hardware
|
||||
- ✔ NVIDIA GPU
|
||||
- ✔ Cambricon MLU
|
||||
- ✔ Kunlunxin XPU
|
||||
- ⬜ Ascend NPU
|
||||
- ⬜ Kunlunxin XPU
|
||||
|
||||
## Contributor Guide
|
||||
|
||||
|
|
|
@ -1,76 +0,0 @@
|
|||
SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
|
||||
SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
|
||||
|
||||
set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
|
||||
set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
|
||||
set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
|
||||
|
||||
if ($ENV{CNCL_ROOT_DIR})
|
||||
message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
|
||||
endif()
|
||||
list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
|
||||
# Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
|
||||
list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
|
||||
|
||||
find_path(CNCL_INCLUDE_DIRS
|
||||
NAMES cncl.h
|
||||
HINTS ${CNCL_INCLUDE_DIR})
|
||||
|
||||
if (USE_STATIC_CNCL)
|
||||
MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
|
||||
SET(CNCL_LIBNAME "CNCL_static")
|
||||
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||
endif()
|
||||
else()
|
||||
SET(CNCL_LIBNAME "cncl")
|
||||
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_library(CNCL_LIBRARIES
|
||||
NAMES ${CNCL_LIBNAME}
|
||||
HINTS ${CNCL_LIB_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
|
||||
|
||||
if(CNCL_FOUND) # obtaining CNCL version and some sanity checks
|
||||
set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
|
||||
message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
|
||||
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
|
||||
list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
|
||||
include(CheckCXXSymbolExists)
|
||||
check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED)
|
||||
|
||||
if (CNCL_VERSION_DEFINED)
|
||||
set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
|
||||
file(WRITE ${file} "
|
||||
#include <iostream>
|
||||
#include <cncl.h>
|
||||
int main()
|
||||
{
|
||||
std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
|
||||
int x;
|
||||
CNCLGetVersion(&x);
|
||||
return x == CNCL_VERSION_CODE;
|
||||
}
|
||||
")
|
||||
try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
|
||||
RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
|
||||
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
|
||||
LINK_LIBRARIES ${CNCL_LIBRARIES})
|
||||
if (NOT CNCL_VERSION_MATCHED)
|
||||
message(FATAL_ERROR "Found CNCL header version and library version do not match! \
|
||||
(include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
|
||||
endif()
|
||||
message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
|
||||
else()
|
||||
# message(STATUS "CNCL version < 2.3.5-5")
|
||||
endif ()
|
||||
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
|
||||
|
||||
message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
|
||||
mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
|
||||
endif()
|
|
@ -1,27 +0,0 @@
|
|||
# Find the xccl libraries
|
||||
set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
|
||||
set(XCCL_LIB_DIR $ENV{KUNLUN_HOME} CACHE PATH "Folder contains KUNLUN XCCL libraries")
|
||||
|
||||
list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
|
||||
|
||||
find_path(XCCL_INCLUDE_DIRS # ${XCCL_INCLUDE_DIR}
|
||||
NAMES xpu/bkcl.h
|
||||
HINTS XCCL_INCLUDE_DIR)
|
||||
|
||||
find_library(XCCL_LIBRARIES # ${XCCL_LIB_DIR}
|
||||
NAMES lib64/libbkcl.so
|
||||
HINTS XCCL_LIB_DIR)
|
||||
|
||||
message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
|
||||
message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
|
||||
|
||||
if (XCCL_FOUND)
|
||||
set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
|
||||
message (STATUS "Determing XCCL version from ${XCCL_HEADER_FILE}...")
|
||||
list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
|
||||
message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
|
||||
mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
|
||||
endif()
|
|
@ -2,7 +2,6 @@
|
|||
|
||||
## 目录
|
||||
|
||||
|
||||
- [环境支持](#环境支持)
|
||||
- [神经网络支持](#神经网络支持)
|
||||
- [技术支持](#技术支持)
|
||||
|
@ -20,10 +19,10 @@
|
|||
|
||||
目前已经验证过的神经网络模型有
|
||||
|
||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||
|
||||
## 技术支持
|
||||
|
||||
|
|
|
@ -3,10 +3,9 @@
|
|||
## 目录
|
||||
|
||||
- [使用方法](#使用方法)
|
||||
- [python 前端应用指南](#python-前端应用指南)
|
||||
- [导入 onnx 模型](#导入-onnx-模型)
|
||||
- [优化](#优化)
|
||||
- [导出 onnx 模型](#导出-onnx-模型)
|
||||
- [python-前端应用指南](#python-前端应用指南)
|
||||
- [导入-onnx-模型](#导入-onnx-模型)
|
||||
- [导出-onnx-模型](#导出-onnx-模型)
|
||||
- [执行推理](#执行推理)
|
||||
- [样例代码](#样例代码)
|
||||
- [技术支持](#技术支持)
|
||||
|
@ -14,7 +13,7 @@
|
|||
|
||||
## 使用方法
|
||||
|
||||
项目管理功能已写到 [Makefile](../Makefile),支持下列功能:
|
||||
项目管理功能已写到 [Makefile](Makefile),支持下列功能:
|
||||
|
||||
- 编译项目:`make`/`make build`
|
||||
- 清理生成文件:`make clean`
|
||||
|
@ -39,10 +38,10 @@
|
|||
|
||||
支持的模型:
|
||||
|
||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||
|
||||
```python
|
||||
import onnx
|
||||
|
@ -97,7 +96,7 @@ for name, tensor in stub.inputs.items():
|
|||
print(name, tensor.shape(), tensor)
|
||||
```
|
||||
|
||||
对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出:
|
||||
对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出:
|
||||
|
||||
```plaintext
|
||||
data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
|
||||
|
@ -138,7 +137,7 @@ for name, tensor in stub.outputs.items():
|
|||
|
||||
### 样例代码
|
||||
|
||||
您可以参照[resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py)的样例代码进行了解,并尝试运行。在这个文件中,我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式:
|
||||
您可以参照[./example/Resnet/resnet.py](./example/ResNet/resnet.py)的样例代码进行了解,并尝试运行。在这个文件中,我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式:
|
||||
|
||||
```python
|
||||
python resnet.py -h
|
||||
|
|
|
@ -1,39 +0,0 @@
|
|||
# 分布式脚本
|
||||
|
||||
## 英伟达平台运行方式
|
||||
|
||||
#### 1. 运行pytorch模型并生成输入和标准输出,可选择导出onnx
|
||||
|
||||
使用 `--export_onnx` 设置导出onnx的目录,默认为当前路径 `./`,不使用这个flag则只进行计算和生成输入输出。
|
||||
|
||||
```bash
|
||||
python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
|
||||
```
|
||||
|
||||
会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`,目前只支持单一输入输出。
|
||||
|
||||
#### 2. 运行InfiniTensor分布式脚本
|
||||
|
||||
```bash
|
||||
python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
|
||||
```
|
||||
|
||||
## 寒武纪平台运行方式
|
||||
|
||||
**将上述运行脚本 `run_pytorch.py` 以及 `cuda_launch.py` 针对寒武纪平台做了相应的适配,具体见 `run_pytorch_mlu.py` 以及 `bang_launch.py`。**
|
||||
|
||||
#### 1. 运行pytorch模型并生成输入和标准输出,可选择导出onnx
|
||||
|
||||
使用 `--export_onnx` 设置导出onnx的目录,默认为当前路径 `./`,不使用这个flag则只进行计算和生成输入输出。
|
||||
|
||||
```bash
|
||||
python run_pytorch_mlu.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
|
||||
```
|
||||
|
||||
会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`,目前只支持单一输入输出。
|
||||
|
||||
#### 2. 运行InfiniTensor分布式脚本
|
||||
|
||||
```bash
|
||||
python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
|
||||
```
|
|
@ -1,187 +0,0 @@
|
|||
import sys
|
||||
sys.path.append('../')
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import multiprocessing as mp
|
||||
from pyinfinitensor.onnx import OnnxStub, backend
|
||||
import onnx
|
||||
from onnx.external_data_helper import convert_model_to_external_data
|
||||
from onnx.shape_inference import infer_shapes_path
|
||||
import numpy as np
|
||||
from parallel_opt import parallel_model
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||
parser.add_argument(
|
||||
"--nproc_per_node", type=int, default=1, help="number of processes per node"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--name", type=str, default="test", help="name of this instance."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model", type=str, required=True, help="path to the ONNX model file."
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||
parser.add_argument(
|
||||
"--gen_std",
|
||||
action="store_true",
|
||||
help="whether to generate the standard results.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
args.num_nodes,
|
||||
args.nproc_per_node,
|
||||
args.name,
|
||||
args.model,
|
||||
args.batch_size,
|
||||
args.length,
|
||||
args.gen_std,
|
||||
args.type,
|
||||
)
|
||||
|
||||
|
||||
def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
|
||||
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
|
||||
load_inputs(stub, world_size, rank)
|
||||
# stub.tune()
|
||||
stub.run()
|
||||
# get outputs
|
||||
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||
|
||||
# bench
|
||||
for _ in range(n):
|
||||
stub.run()
|
||||
begin = time.time()
|
||||
for _ in range(n * 2):
|
||||
stub.run()
|
||||
end = time.time()
|
||||
avg_time = (end - begin) / (n * 2)
|
||||
print(f"average time: {avg_time}")
|
||||
return outputs
|
||||
|
||||
def load_inputs(stub, world_size=1, rank=0):
|
||||
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||
input = np.load(f"./data/input_{i}.npy")
|
||||
if all(x == y for x,y in zip(input.shape,tensor.shape())):
|
||||
tensor.copyin_numpy(input)
|
||||
else:
|
||||
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
|
||||
|
||||
|
||||
def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
|
||||
results = np.load(f"./data/output.npy")
|
||||
outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
|
||||
print("outputs abs mean:", abs(outputs).mean())
|
||||
print("max abs diff:", abs(outputs - results).max())
|
||||
|
||||
def start_worker(
|
||||
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
|
||||
):
|
||||
dist_name = name + "_dist"
|
||||
model = parallel_model(model, world_size, rank)
|
||||
extern_path = f"./{dist_name}_rank{rank}.pb"
|
||||
if os.path.exists(extern_path):
|
||||
os.remove(extern_path)
|
||||
onnx.save_model(
|
||||
model,
|
||||
f"./{dist_name}_rank{rank}.onnx",
|
||||
save_as_external_data=True,
|
||||
location=extern_path,
|
||||
)
|
||||
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||
runtime = backend.BangRuntime(local_rank)
|
||||
# print("init comm")
|
||||
runtime.init_comm(
|
||||
dist_name,
|
||||
world_size,
|
||||
rank,
|
||||
)
|
||||
run_and_compare(name, model, runtime, world_size, rank, data_type)
|
||||
|
||||
|
||||
def start_single(name, model, data_type):
|
||||
runtime = backend.BangRuntime(0)
|
||||
run_and_compare(name, model, runtime, data_type=data_type)
|
||||
|
||||
def generate_input_output(model):
|
||||
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
|
||||
runtime = backend.BangRuntime(0)
|
||||
stub = OnnxStub(model, runtime)
|
||||
position_id = 0
|
||||
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||
input = tensor.copyout_numpy()
|
||||
if np.issubdtype(input.dtype, np.integer):
|
||||
if input.size == 1:
|
||||
# input = np.array([position_id])
|
||||
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||
else:
|
||||
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||
elif input.dtype == np.bool_:
|
||||
input = np.random.randint(0,2,size=input.shape) > 0
|
||||
else:
|
||||
if i == 0:
|
||||
input = np.ones(input.shape).astype(input.dtype)
|
||||
position_id = input.shape[-1] - 1
|
||||
else:
|
||||
input = np.random.rand(*input.shape).astype(input.dtype)
|
||||
tensor.copyin_numpy(input)
|
||||
np.save(f"./data/input_{i}", input)
|
||||
stub.run()
|
||||
time.sleep(0.01)
|
||||
output = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||
if np.isnan(output).any():
|
||||
print("Nan in output")
|
||||
np.save(f"./data/output", output)
|
||||
|
||||
|
||||
def main():
|
||||
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
|
||||
data_type = "default" if data_type == "fp32" else data_type
|
||||
|
||||
model = onnx.load(model_path)
|
||||
|
||||
# generate standart output
|
||||
if gen_std:
|
||||
print(f"generate standard data for {name}.")
|
||||
# a small vocabulary size to fit all LLM.
|
||||
generate_input_output(model)
|
||||
return
|
||||
|
||||
if nproc_per_node == 1:
|
||||
# run single process.
|
||||
# use standalone process to isolate bang.
|
||||
print("run model by single MLU.")
|
||||
# p = mp.Process(target=start_single, args=(name, model, data_type))
|
||||
# p.start()
|
||||
# p.join()
|
||||
start_single(name, model, data_type)
|
||||
return
|
||||
|
||||
# run distributed parallel.
|
||||
world_size = nnodes * nproc_per_node
|
||||
print(f"run model by {world_size} MLU in parallel.")
|
||||
workers = [
|
||||
mp.Process(
|
||||
target=start_worker,
|
||||
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
|
||||
)
|
||||
for rank in range(world_size)
|
||||
]
|
||||
|
||||
for w in workers:
|
||||
w.start()
|
||||
|
||||
for w in workers:
|
||||
w.join()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,249 +0,0 @@
|
|||
import argparse
|
||||
import torch
|
||||
import torch_mlu
|
||||
from transformers import BertModel, BertConfig
|
||||
from transformers import GPT2Model, GPT2Config
|
||||
from transformers import OPTModel, OPTConfig
|
||||
from transformers import AlbertModel, AlbertConfig
|
||||
from transformers import LlamaModel, LlamaConfig
|
||||
import time
|
||||
import numpy as np
|
||||
import onnx
|
||||
import sys
|
||||
import os
|
||||
from onnx.external_data_helper import convert_model_to_external_data
|
||||
from onnxsim import simplify
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
|
||||
parser.add_argument(
|
||||
"--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||
parser.add_argument(
|
||||
"--export_onnx",
|
||||
type=str,
|
||||
nargs="?",
|
||||
default=None,
|
||||
const="./",
|
||||
help="whether and where to export onnx file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
args.model,
|
||||
args.batch_size,
|
||||
args.length,
|
||||
args.export_onnx,
|
||||
args.type
|
||||
)
|
||||
|
||||
|
||||
def get_model(modelname):
|
||||
match modelname:
|
||||
case "albert":
|
||||
model = AlbertModel.from_pretrained("albert/albert-base-v2")
|
||||
voc_size = AlbertConfig().vocab_size
|
||||
case "bert":
|
||||
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
|
||||
voc_size = BertConfig().vocab_size
|
||||
case "gpt2":
|
||||
model = GPT2Model.from_pretrained("GPT2")
|
||||
voc_size = GPT2Config().vocab_size
|
||||
case "opt":
|
||||
model = OPTModel.from_pretrained("facebook/opt-125m")
|
||||
voc_size = OPTConfig().vocab_size
|
||||
case "llama":
|
||||
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
voc_size = LlamaConfig().vocab_size
|
||||
case _:
|
||||
raise KeyError(modelname)
|
||||
|
||||
model = model.eval()
|
||||
return model, voc_size
|
||||
|
||||
def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
|
||||
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
|
||||
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
|
||||
np.save("./data/input_0", data)
|
||||
inputs = torch.from_numpy(data).to("mlu")
|
||||
torch_model = torch_model.to("mlu")
|
||||
if dtype == "fp16":
|
||||
torch_model = torch_model.half()
|
||||
|
||||
n_iter = 20
|
||||
with torch.no_grad():
|
||||
for _ in range(10):
|
||||
outputs = torch_model(inputs)
|
||||
torch.mlu.synchronize()
|
||||
begin = time.time()
|
||||
with torch.no_grad():
|
||||
for _ in range(n_iter):
|
||||
torch.mlu.synchronize()
|
||||
outputs = torch_model(inputs)
|
||||
torch.mlu.synchronize()
|
||||
torch.mlu.synchronize()
|
||||
end = time.time()
|
||||
|
||||
avg_time = (end - begin) / n_iter
|
||||
outputs = outputs.last_hidden_state.to("cpu")
|
||||
print("outputs abs mean:", abs(np.array(outputs)).mean())
|
||||
print(f"average time: {avg_time}")
|
||||
# torch.mlu.memory.empty_cache()
|
||||
np.save("./data/output", np.array(outputs))
|
||||
print("Save input & output into ./data.")
|
||||
|
||||
|
||||
def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
|
||||
data = data.to("mlu")
|
||||
model = model.to("mlu")
|
||||
if dtype == "fp16":
|
||||
model = model.half()
|
||||
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
|
||||
if modelname != "llama":
|
||||
# use onnxsim to simplify
|
||||
onnx_model = onnx.load(path)
|
||||
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
|
||||
# onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
|
||||
assert check
|
||||
add_value_info_for_constants(onnx_model)
|
||||
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
|
||||
if extern:
|
||||
extern_path = path.replace('.onnx', '.pb')
|
||||
if os.path.exists(extern_path):
|
||||
os.remove(extern_path)
|
||||
extern_path = extern_path.split("/")[-1]
|
||||
convert_model_to_external_data(
|
||||
onnx_model,
|
||||
all_tensors_to_one_file=True,
|
||||
location=extern_path,
|
||||
size_threshold=1024,
|
||||
convert_attribute=False,
|
||||
)
|
||||
onnx.save(onnx_model, path)
|
||||
else:
|
||||
# use third party tool to simplify llama
|
||||
# reference: https://github.com/luchangli03/onnxsim_large_model/
|
||||
sys.path.append("onnxsim_large_model")
|
||||
from onnx_utils import set_onnx_input_shape
|
||||
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
|
||||
|
||||
in_model_path = path
|
||||
out_model_path = path
|
||||
if not out_model_path:
|
||||
out_model_path = in_model_path[:-5] + ".sim.onnx"
|
||||
if os.path.isdir(out_model_path):
|
||||
out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
|
||||
|
||||
onnx_model = onnx.load(in_model_path)
|
||||
print(f"load model from {in_model_path} success")
|
||||
|
||||
size_th_bytes = 1024 * 1024
|
||||
|
||||
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
|
||||
print(f"compress model success")
|
||||
|
||||
onnx_model = set_onnx_input_shape(onnx_model, "")
|
||||
|
||||
tensor_size_threshold = f"1024KB"
|
||||
skipped_optimizers = []
|
||||
skipped_optimizers.append("eliminate_duplicate_initializer")
|
||||
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
|
||||
tensor_size_threshold=tensor_size_threshold)
|
||||
if not check:
|
||||
raise ValueError(f"simplify compressed model {in_model_path} failed")
|
||||
|
||||
print(f"simplify model success")
|
||||
|
||||
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
|
||||
print(f"uncompress model success")
|
||||
|
||||
add_value_info_for_constants(onnx_model)
|
||||
|
||||
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
|
||||
|
||||
|
||||
def add_value_info_for_constants(model : onnx.ModelProto):
|
||||
"""
|
||||
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||
that info explicitly as ValueInfoProtos.
|
||||
Mutates the model.
|
||||
Args:
|
||||
model: The ModelProto to update.
|
||||
"""
|
||||
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||
if model.ir_version < 4:
|
||||
return
|
||||
|
||||
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
|
||||
inputs = {i.name for i in graph.input}
|
||||
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||
for init in graph.initializer:
|
||||
# Check it really is a constant, not an input
|
||||
if init.name in inputs:
|
||||
continue
|
||||
|
||||
# The details we want to add
|
||||
elem_type = init.data_type
|
||||
shape = init.dims
|
||||
|
||||
# Get existing or create new value info for this constant
|
||||
vi = existing_info.get(init.name)
|
||||
if vi is None:
|
||||
vi = graph.value_info.add()
|
||||
vi.name = init.name
|
||||
|
||||
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||
tt = vi.type.tensor_type
|
||||
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||
tt.elem_type = elem_type
|
||||
if not tt.HasField("shape"):
|
||||
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||
tt.shape.dim.extend([])
|
||||
for dim in shape:
|
||||
tt.shape.dim.add().dim_value = dim
|
||||
|
||||
# Handle subgraphs
|
||||
for node in graph.node:
|
||||
for attr in node.attribute:
|
||||
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||
if attr.ref_attr_name != "":
|
||||
continue
|
||||
|
||||
if attr.type == onnx.AttributeProto.GRAPH:
|
||||
add_const_value_infos_to_graph(attr.g)
|
||||
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||
for g in attr.graphs:
|
||||
add_const_value_infos_to_graph(g)
|
||||
|
||||
|
||||
return add_const_value_infos_to_graph(model.graph)
|
||||
|
||||
|
||||
def main():
|
||||
torch.backends.mlu.matmul.allow_tf32 = False
|
||||
torch.backends.cnnl.allow_tf32 = False
|
||||
modelname, batchsize, seqlen, export_path, dtype = parse_args()
|
||||
if dtype == "tf32":
|
||||
torch.backends.mlu.matmul.allow_tf32 = True
|
||||
else:
|
||||
os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
|
||||
|
||||
model, voc_size = get_model(modelname)
|
||||
if export_path is not None:
|
||||
filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
|
||||
path = os.path.join(export_path, filename)
|
||||
if not os.path.exists(path):
|
||||
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
|
||||
export_onnx(modelname, model, param, path, True, dtype)
|
||||
else:
|
||||
print("Onnx path exists, skipping export.")
|
||||
|
||||
run_pytorch(model, voc_size, batchsize, seqlen, dtype)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,188 +0,0 @@
|
|||
import argparse
|
||||
import torch
|
||||
from transformers import BertModel, BertConfig
|
||||
from transformers import GPT2Model, GPT2Config
|
||||
from transformers import OPTModel, OPTConfig
|
||||
import time
|
||||
import numpy as np
|
||||
import onnx
|
||||
import os
|
||||
from onnx.external_data_helper import convert_model_to_external_data
|
||||
from onnxsim import simplify
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
|
||||
parser.add_argument(
|
||||
"--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||
parser.add_argument(
|
||||
"--export_onnx",
|
||||
type=str,
|
||||
nargs="?",
|
||||
default=None,
|
||||
const="./",
|
||||
help="whether and where to export onnx file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
args.model,
|
||||
args.batch_size,
|
||||
args.length,
|
||||
args.export_onnx,
|
||||
args.type,
|
||||
)
|
||||
|
||||
|
||||
def get_model(modelname):
|
||||
match modelname:
|
||||
case "bert":
|
||||
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
|
||||
voc_size = BertConfig().vocab_size
|
||||
case "gpt2":
|
||||
model = GPT2Model.from_pretrained("gpt2")
|
||||
voc_size = GPT2Config().vocab_size
|
||||
case "opt":
|
||||
model = model = OPTModel.from_pretrained("./opt-125m")
|
||||
voc_size = OPTConfig().vocab_size
|
||||
case _:
|
||||
raise KeyError(modelname)
|
||||
|
||||
model = model.eval()
|
||||
return model, voc_size
|
||||
|
||||
def run_pytorch(torch_model, voc_size, batchsize, len):
|
||||
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
|
||||
np.save("test_inputs", data)
|
||||
inputs = torch.from_numpy(data).to("cuda")
|
||||
torch_model = torch_model.to("cuda")
|
||||
|
||||
n_iter = 20
|
||||
with torch.no_grad():
|
||||
for _ in range(10):
|
||||
outputs = torch_model(inputs)
|
||||
torch.cuda.synchronize()
|
||||
begin = time.time()
|
||||
with torch.no_grad():
|
||||
for _ in range(n_iter):
|
||||
torch.cuda.synchronize()
|
||||
outputs = torch_model(inputs)
|
||||
#
|
||||
torch.cuda.synchronize()
|
||||
torch.cuda.synchronize()
|
||||
end = time.time()
|
||||
|
||||
avg_time = (end - begin) / n_iter
|
||||
outputs = outputs.last_hidden_state.to("cpu")
|
||||
print("outputs abs mean:", abs(np.array(outputs)).mean())
|
||||
print(f"average time: {avg_time}")
|
||||
torch.cuda.memory.empty_cache()
|
||||
np.save("test_results", np.array(outputs, dtype=np.float32))
|
||||
print("Save input & output as test_inputs.npy and test_results.npy")
|
||||
|
||||
|
||||
def export_onnx(model, data, path, extern=False):
|
||||
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
|
||||
onnx_model = onnx.load(path)
|
||||
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
|
||||
#onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
|
||||
assert check
|
||||
add_value_info_for_constants(onnx_model)
|
||||
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
|
||||
if extern:
|
||||
extern_path = path.replace('.onnx', '.pb')
|
||||
if os.path.exists(extern_path):
|
||||
os.remove(extern_path)
|
||||
convert_model_to_external_data(
|
||||
onnx_model,
|
||||
all_tensors_to_one_file=True,
|
||||
location=extern_path,
|
||||
size_threshold=1024,
|
||||
convert_attribute=False,
|
||||
)
|
||||
onnx.save(onnx_model, path)
|
||||
|
||||
def add_value_info_for_constants(model : onnx.ModelProto):
|
||||
"""
|
||||
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||
that info explicitly as ValueInfoProtos.
|
||||
Mutates the model.
|
||||
Args:
|
||||
model: The ModelProto to update.
|
||||
"""
|
||||
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||
if model.ir_version < 4:
|
||||
return
|
||||
|
||||
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
|
||||
inputs = {i.name for i in graph.input}
|
||||
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||
for init in graph.initializer:
|
||||
# Check it really is a constant, not an input
|
||||
if init.name in inputs:
|
||||
continue
|
||||
|
||||
# The details we want to add
|
||||
elem_type = init.data_type
|
||||
shape = init.dims
|
||||
|
||||
# Get existing or create new value info for this constant
|
||||
vi = existing_info.get(init.name)
|
||||
if vi is None:
|
||||
vi = graph.value_info.add()
|
||||
vi.name = init.name
|
||||
|
||||
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||
tt = vi.type.tensor_type
|
||||
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||
tt.elem_type = elem_type
|
||||
if not tt.HasField("shape"):
|
||||
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||
tt.shape.dim.extend([])
|
||||
for dim in shape:
|
||||
tt.shape.dim.add().dim_value = dim
|
||||
|
||||
# Handle subgraphs
|
||||
for node in graph.node:
|
||||
for attr in node.attribute:
|
||||
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||
if attr.ref_attr_name != "":
|
||||
continue
|
||||
|
||||
if attr.type == onnx.AttributeProto.GRAPH:
|
||||
add_const_value_infos_to_graph(attr.g)
|
||||
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||
for g in attr.graphs:
|
||||
add_const_value_infos_to_graph(g)
|
||||
|
||||
|
||||
return add_const_value_infos_to_graph(model.graph)
|
||||
|
||||
|
||||
def main():
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
modelname, batchsize, seqlen, export_path, data_type = parse_args()
|
||||
if data_type == "tf32":
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
else:
|
||||
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||
|
||||
model, voc_size = get_model(modelname)
|
||||
if export_path is not None:
|
||||
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
|
||||
path = os.path.join(export_path, filename)
|
||||
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
|
||||
export_onnx(model, param, path, True)
|
||||
|
||||
if data_type == "fp16":
|
||||
model = model.half()
|
||||
run_pytorch(model, voc_size, batchsize, seqlen)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,14 +0,0 @@
|
|||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
models=("bert" "gpt2" "llama")
|
||||
batch_size=(1 32)
|
||||
seq_len=(100 500)
|
||||
nproc=(1 2 4)
|
||||
|
||||
for model in "${models[@]}"; do
|
||||
for bs in "${batch_size[@]}"; do
|
||||
for len in "${seq_len[@]}"; do
|
||||
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only
|
||||
done
|
||||
done
|
||||
done
|
|
@ -1,280 +0,0 @@
|
|||
import sys
|
||||
sys.path.append('../')
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import multiprocessing as mp
|
||||
from pyinfinitensor.onnx import OnnxStub, backend
|
||||
import onnx
|
||||
from onnx.external_data_helper import convert_model_to_external_data
|
||||
from onnx.shape_inference import infer_shapes_path
|
||||
import numpy as np
|
||||
from parallel_opt import parallel_model
|
||||
from functools import wraps
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||
parser.add_argument(
|
||||
"--nproc_per_node", type=int, default=2, help="number of processes per node"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model", type=str, default="", help="path to the ONNX model file."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gen_std",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="whether to generate the standard results.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--run_single",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="whether run model with single process with standard inputs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input_dir",
|
||||
default="./",
|
||||
help="path to save model input data"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--result_dir",
|
||||
default="./",
|
||||
help="path to save model standard output"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--internal_model_dir",
|
||||
default="./",
|
||||
help="path to save internal onnx model for parallel run"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# check path, mkdir if not exist
|
||||
check_exists(args.input_dir)
|
||||
check_exists(args.result_dir)
|
||||
check_exists(args.internal_model_dir)
|
||||
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
args.num_nodes,
|
||||
args.nproc_per_node,
|
||||
args.name,
|
||||
args.model,
|
||||
args.gen_std,
|
||||
args.run_single,
|
||||
args.input_dir,
|
||||
args.result_dir,
|
||||
args.internal_model_dir
|
||||
)
|
||||
|
||||
|
||||
"""
|
||||
utils function for this scripts
|
||||
"""
|
||||
def check_exists(path: str):
|
||||
if not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
|
||||
def np_assert(base, test, rtol=1e-2, atol=1e-1):
|
||||
# np.testing.assert_allclose(test, base, rtol, atol)
|
||||
print("max abs diff:", abs(base - test).max())
|
||||
|
||||
|
||||
"""
|
||||
Perf wrapper, run function n times
|
||||
then average
|
||||
"""
|
||||
def perf_it(n):
|
||||
def decorator(func):
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
# warmup
|
||||
for _ in range(n):
|
||||
func(*args, **kwargs)
|
||||
|
||||
t_total = 0
|
||||
for _ in range(n):
|
||||
t0 = time.time()
|
||||
func(*args, **kwargs)
|
||||
t1 = time.time()
|
||||
t_total += t1 - t0
|
||||
avg_time = (t_total) / n
|
||||
print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
|
||||
return avg_time
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
|
||||
"""
|
||||
Run InfiniTensor model with Standard input
|
||||
check=True: check with standard output gen by pytorch
|
||||
perf=True: run n times to get avg time
|
||||
"""
|
||||
def run_model(task_name,
|
||||
model,
|
||||
runtime,
|
||||
world_size=1,
|
||||
rank=0,
|
||||
n=10,
|
||||
check=True,
|
||||
perf=True):
|
||||
|
||||
stub = OnnxStub(model, runtime,
|
||||
use_naive_allocator=True \
|
||||
if task_name == "llama" else False)
|
||||
|
||||
# load in Onnx model inputs
|
||||
def load_inputs(stub: OnnxStub):
|
||||
# check exists
|
||||
inputs = []
|
||||
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||
input_path = os.path.join(input_dir, \
|
||||
f"{task_name}_input_{i}.npy")
|
||||
print(input_path)
|
||||
if os.path.exists(input_path):
|
||||
input = np.load(input_path)
|
||||
else :
|
||||
raise KeyError(f"{i} th input of model not exists")
|
||||
# check shape
|
||||
if all(x == y for x,y in zip(input.shape, tensor.shape())):
|
||||
tensor.copyin_numpy(input)
|
||||
else:
|
||||
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
|
||||
|
||||
load_inputs(stub)
|
||||
# stub.tune()
|
||||
stub.run()
|
||||
time.sleep(0.01)
|
||||
output = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||
|
||||
# check output results with standard output
|
||||
if check:
|
||||
st_output_path = os.path.join(result_dir, \
|
||||
f"{task_name}_output.npy")
|
||||
assert os.path.exists(st_output_path) , \
|
||||
"standard output not exists"
|
||||
st_output = np.load(st_output_path)
|
||||
if np.isnan(output).any():
|
||||
print("Nan in output")
|
||||
exit()
|
||||
np_assert(st_output, output)
|
||||
|
||||
# perf
|
||||
if perf:
|
||||
@perf_it(n)
|
||||
def perf_infinitensor(stub: OnnxStub):
|
||||
stub.run()
|
||||
perf_infinitensor(stub)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
"""
|
||||
Start a worker in Parallel
|
||||
"""
|
||||
def start_worker(name: str,
|
||||
world_size: int,
|
||||
rank: int,
|
||||
local_rank: int,
|
||||
model: onnx.ModelProto):
|
||||
|
||||
dist_name = name + "_dist"
|
||||
# partial a onnx model to world_size part
|
||||
model = parallel_model(model, world_size, rank)
|
||||
onnx.save(model, os.path.join(internal_model_dir, \
|
||||
f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
|
||||
runtime = backend.KUNLUNRuntime(local_rank)
|
||||
# print("init comm")
|
||||
runtime.init_comm(
|
||||
dist_name,
|
||||
world_size,
|
||||
rank,
|
||||
)
|
||||
run_model(name, model, runtime, world_size, rank)
|
||||
|
||||
|
||||
"""
|
||||
generate standard input/output with
|
||||
sigle card run
|
||||
"""
|
||||
def gen_standard(task_name: str, model: onnx.ModelProto):
|
||||
runtime = backend.KUNLUNRuntime(0)
|
||||
stub = OnnxStub(model, runtime)
|
||||
position_id = 0
|
||||
# generate random input for model
|
||||
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||
input = tensor.copyout_numpy()
|
||||
if np.issubdtype(input.dtype, np.integer):
|
||||
if input.size == 1:
|
||||
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||
else:
|
||||
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||
elif input.dtype == np.bool_:
|
||||
input = np.random.randint(0,2,size=input.shape) > 0
|
||||
else:
|
||||
if i == 0:
|
||||
input = np.ones(input.shape).astype(input.dtype)
|
||||
position_id = input.shape[-1] - 1
|
||||
else:
|
||||
input = np.random.rand(*input.shape).astype(input.dtype)
|
||||
tensor.copyin_numpy(input)
|
||||
np.save(os.path.join(input_dir, \
|
||||
f"{task_name}_input_{i}.npy"), input)
|
||||
stub.run()
|
||||
# print(stub.outputs)
|
||||
output = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||
if np.isnan(output).any():
|
||||
print("Nan in output")
|
||||
exit()
|
||||
np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
global input_dir, result_dir, internal_model_dir
|
||||
|
||||
nnodes, nproc_per_node, task_name, \
|
||||
model_path, gen_std, run_single, \
|
||||
input_dir, result_dir, internal_model_dir = parse_args()
|
||||
|
||||
# load input onnx model
|
||||
model = onnx.load(model_path)
|
||||
|
||||
# generate standart output
|
||||
if gen_std:
|
||||
print("Generate inputs and outputs.")
|
||||
gen_standard(task_name, model)
|
||||
return
|
||||
|
||||
if run_single:
|
||||
print("Run model by one GPU card.")
|
||||
runtime = backend.KUNLUNRuntime(0)
|
||||
run_model(task_name, model, runtime)
|
||||
return
|
||||
|
||||
# run distributed parallel.
|
||||
world_size = nnodes * nproc_per_node
|
||||
print(f"Run model by {world_size} GPU in parallel.")
|
||||
workers = [
|
||||
mp.Process(
|
||||
target=start_worker,
|
||||
args=(task_name, world_size, rank, rank % nproc_per_node, model),
|
||||
)
|
||||
for rank in range(world_size)
|
||||
]
|
||||
|
||||
for w in workers:
|
||||
w.start()
|
||||
|
||||
for w in workers:
|
||||
w.join()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,36 +0,0 @@
|
|||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
# models=("bert" "gpt2" "llama")
|
||||
models=("bert" "gpt2")
|
||||
batch_size=(1 32)
|
||||
seq_len=(100 500)
|
||||
nproc=(1 2 4)
|
||||
|
||||
results_dir="results"
|
||||
|
||||
if [ -d "$results_dir" ]; then
|
||||
echo "directory ./$results_dir exists"
|
||||
else
|
||||
mkdir -p "$results_dir"
|
||||
echo "mkdir $results_dir, logs saved there"
|
||||
fi
|
||||
|
||||
|
||||
for model in "${models[@]}"; do
|
||||
for bs in "${batch_size[@]}"; do
|
||||
for len in "${seq_len[@]}"; do
|
||||
# run pytorch model
|
||||
echo "Run pytorch $model with batch_size=$bs length=$len ."
|
||||
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
|
||||
for n in "${nproc[@]}"; do
|
||||
# run infinitensor
|
||||
echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
|
||||
python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini
|
||||
# delete internal files
|
||||
find ./ -type f -name "*.onnx" -delete
|
||||
find ./ -type f -name "*.pb" -delete
|
||||
done
|
||||
find ./ -type f -name "*.npy" -delete
|
||||
done
|
||||
done
|
||||
done
|
|
@ -1,35 +0,0 @@
|
|||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
# models=("bert" "gpt2" "llama")
|
||||
models=("llama")
|
||||
batch_size=(1 )
|
||||
seq_len=(100 500)
|
||||
nproc=(1 2 4)
|
||||
|
||||
results_dir="results"
|
||||
|
||||
if [ -d "$results_dir" ]; then
|
||||
echo "directory ./$results_dir exists"
|
||||
else
|
||||
mkdir -p "$results_dir"
|
||||
echo "mkdir $results_dir, logs saved there"
|
||||
fi
|
||||
|
||||
|
||||
for model in "${models[@]}"; do
|
||||
for bs in "${batch_size[@]}"; do
|
||||
for len in "${seq_len[@]}"; do
|
||||
echo "Run pytorch llama with batch_size="$bs" and length="$len""
|
||||
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
|
||||
for n in "${nproc[@]}"; do
|
||||
# run pytorch model
|
||||
echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
|
||||
python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
|
||||
# delete internal files
|
||||
find ./ -type f -name "*.onnx" -delete
|
||||
find ./ -type f -name "*0c" -delete
|
||||
done
|
||||
find ./ -type f -name "*.npy" -delete
|
||||
done
|
||||
done
|
||||
done
|
|
@ -1,245 +0,0 @@
|
|||
import argparse
|
||||
import torch
|
||||
from transformers import BertModel, BertConfig
|
||||
from transformers import GPT2Model, GPT2Config
|
||||
from transformers import OPTModel, OPTConfig
|
||||
from transformers import LlamaModel, LlamaConfig
|
||||
import time
|
||||
import numpy as np
|
||||
import onnx
|
||||
import os
|
||||
import sys
|
||||
from onnx.external_data_helper import convert_model_to_external_data
|
||||
from onnxsim import simplify
|
||||
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
|
||||
parser.add_argument(
|
||||
"--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||
parser.add_argument(
|
||||
"--export_onnx",
|
||||
type=str,
|
||||
nargs="?",
|
||||
default=None,
|
||||
const="./",
|
||||
help="whether and where to export onnx file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input_dir",
|
||||
type=str,
|
||||
default="./",
|
||||
help="path to save pytorch model input data"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--result_dir",
|
||||
type=str,
|
||||
default="./",
|
||||
help="path to save pytorch model output data"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--export_only",
|
||||
action="store_true"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
args.model,
|
||||
args.batch_size,
|
||||
args.length,
|
||||
args.export_onnx,
|
||||
args.input_dir,
|
||||
args.result_dir,
|
||||
args.export_only
|
||||
)
|
||||
|
||||
|
||||
def get_model(modelname):
|
||||
if modelname == "bert":
|
||||
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
|
||||
voc_size = BertConfig().vocab_size
|
||||
elif modelname == "gpt2":
|
||||
model = GPT2Model.from_pretrained("gpt2")
|
||||
voc_size = GPT2Config().vocab_size
|
||||
elif modelname == "opt":
|
||||
model = OPTModel.from_pretrained("./opt-125m")
|
||||
voc_size = OPTConfig().vocab_size
|
||||
elif modelname == "llama":
|
||||
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
voc_size = LlamaConfig().vocab_size
|
||||
else :
|
||||
raise KeyError(modelname)
|
||||
|
||||
model = model.eval()
|
||||
return model, voc_size
|
||||
|
||||
def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
|
||||
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
|
||||
np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
|
||||
inputs = torch.from_numpy(data).to("cuda")
|
||||
torch_model = torch_model.to("cuda")
|
||||
|
||||
n_iter = 10
|
||||
with torch.no_grad():
|
||||
for _ in range(10):
|
||||
outputs = torch_model(inputs)
|
||||
torch.cuda.synchronize()
|
||||
begin = time.time()
|
||||
with torch.no_grad():
|
||||
for _ in range(n_iter):
|
||||
torch.cuda.synchronize()
|
||||
outputs = torch_model(inputs)
|
||||
#
|
||||
torch.cuda.synchronize()
|
||||
torch.cuda.synchronize()
|
||||
end = time.time()
|
||||
|
||||
avg_time = (end - begin) / n_iter
|
||||
outputs = outputs.last_hidden_state.to("cpu")
|
||||
print("outputs abs mean:", abs(np.array(outputs)).mean())
|
||||
print(f"average time: {avg_time}")
|
||||
torch.cuda.memory.empty_cache()
|
||||
np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
|
||||
np.array(outputs))
|
||||
print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
|
||||
|
||||
|
||||
def export_onnx(model_name, model, data, path, extern=False):
|
||||
# torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
|
||||
|
||||
if model_name != "llama":
|
||||
onnx_model = onnx.load(path)
|
||||
onnx_model, check = simplify(onnx_model,
|
||||
skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
|
||||
# skipped_optimizers=['fuse_qkv'])
|
||||
assert check
|
||||
add_value_info_for_constants(onnx_model)
|
||||
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
|
||||
if extern:
|
||||
extern_path = path.replace('.onnx', '.pb')
|
||||
if os.path.exists(extern_path):
|
||||
os.remove(extern_path)
|
||||
convert_model_to_external_data(
|
||||
onnx_model,
|
||||
all_tensors_to_one_file=True,
|
||||
location=extern_path.split("/")[-1],
|
||||
size_threshold=1024,
|
||||
convert_attribute=False,
|
||||
)
|
||||
onnx.save(onnx_model, path)
|
||||
else:
|
||||
sys.path.append("onnxsim_large_model")
|
||||
from onnx_utils import set_onnx_input_shape
|
||||
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
|
||||
|
||||
in_model_path = path
|
||||
out_model_path = in_model_path[:-5] + ".sim.onnx"
|
||||
|
||||
onnx_model = onnx.load(in_model_path)
|
||||
print(f"load model from {in_model_path} success")
|
||||
|
||||
size_th_bytes = 1024 * 1024
|
||||
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
|
||||
print("compress model success")
|
||||
|
||||
onnx_model = set_onnx_input_shape(onnx_model, "")
|
||||
tensor_size_threshold = f"1024KB"
|
||||
skipped_optimizers = []
|
||||
skipped_optimizers.append("eliminate_duplicate_initializer")
|
||||
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
|
||||
tensor_size_threshold=tensor_size_threshold)
|
||||
if not check:
|
||||
raise ValueError(f"simplify compressed model {in_model_path} failed")
|
||||
|
||||
print(f"simplify model success")
|
||||
|
||||
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
|
||||
print(f"uncompress model success")
|
||||
|
||||
add_value_info_for_constants(onnx_model)
|
||||
|
||||
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
|
||||
|
||||
|
||||
def add_value_info_for_constants(model : onnx.ModelProto):
|
||||
"""
|
||||
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||
that info explicitly as ValueInfoProtos.
|
||||
Mutates the model.
|
||||
Args:
|
||||
model: The ModelProto to update.
|
||||
"""
|
||||
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||
if model.ir_version < 4:
|
||||
return
|
||||
|
||||
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
|
||||
inputs = {i.name for i in graph.input}
|
||||
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||
for init in graph.initializer:
|
||||
# Check it really is a constant, not an input
|
||||
if init.name in inputs:
|
||||
continue
|
||||
|
||||
# The details we want to add
|
||||
elem_type = init.data_type
|
||||
shape = init.dims
|
||||
|
||||
# Get existing or create new value info for this constant
|
||||
vi = existing_info.get(init.name)
|
||||
if vi is None:
|
||||
vi = graph.value_info.add()
|
||||
vi.name = init.name
|
||||
|
||||
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||
tt = vi.type.tensor_type
|
||||
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||
tt.elem_type = elem_type
|
||||
if not tt.HasField("shape"):
|
||||
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||
tt.shape.dim.extend([])
|
||||
for dim in shape:
|
||||
tt.shape.dim.add().dim_value = dim
|
||||
|
||||
# Handle subgraphs
|
||||
for node in graph.node:
|
||||
for attr in node.attribute:
|
||||
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||
if attr.ref_attr_name != "":
|
||||
continue
|
||||
|
||||
if attr.type == onnx.AttributeProto.GRAPH:
|
||||
add_const_value_infos_to_graph(attr.g)
|
||||
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||
for g in attr.graphs:
|
||||
add_const_value_infos_to_graph(g)
|
||||
|
||||
|
||||
return add_const_value_infos_to_graph(model.graph)
|
||||
|
||||
|
||||
def main():
|
||||
global input_dir, result_dir
|
||||
|
||||
modelname, batchsize, seqlen, \
|
||||
export_path, input_dir, result_dir, export_only = parse_args()
|
||||
|
||||
model, voc_size = get_model(modelname) # pytorch model
|
||||
|
||||
if export_path is not None:
|
||||
os.makedirs(export_path, exist_ok=True)
|
||||
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
|
||||
path = os.path.join(export_path, filename)
|
||||
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
|
||||
export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
|
||||
if export_only:
|
||||
return
|
||||
|
||||
run_pytorch(model, voc_size, batchsize, seqlen, modelname)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -10,6 +10,9 @@ import numpy as np
|
|||
from parallel_opt import parallel_model
|
||||
|
||||
|
||||
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||
|
@ -29,9 +32,6 @@ def parse_args():
|
|||
action="store_true",
|
||||
help="whether to generate the standard results.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
|
@ -42,13 +42,12 @@ def parse_args():
|
|||
args.batch_size,
|
||||
args.length,
|
||||
args.gen_std,
|
||||
args.type,
|
||||
)
|
||||
|
||||
|
||||
def run_model(model, runtime, inputs, n=10, data_type = "default"):
|
||||
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
|
||||
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
|
||||
def run_model(model, runtime, inputs, n=10):
|
||||
stub = OnnxStub(model, runtime)
|
||||
for tensor, input in zip(stub.inputs.values(), inputs):
|
||||
tensor.copyin_numpy(input)
|
||||
# stub.tune()
|
||||
stub.run()
|
||||
|
@ -56,7 +55,7 @@ def run_model(model, runtime, inputs, n=10, data_type = "default"):
|
|||
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||
|
||||
# bench
|
||||
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
|
||||
for tensor, input in zip(stub.inputs.values(), inputs):
|
||||
tensor.copyin_numpy(input)
|
||||
begin = time.time()
|
||||
for _ in range(n):
|
||||
|
@ -67,17 +66,17 @@ def run_model(model, runtime, inputs, n=10, data_type = "default"):
|
|||
return outputs
|
||||
|
||||
|
||||
def run_and_compare(name, model, runtime, data_type):
|
||||
def run_and_compare(name, model, runtime):
|
||||
input_ids = np.load(f"{name}_inputs.npy")
|
||||
position_ids = np.arange(input_ids.shape[-1])
|
||||
results = np.load(f"{name}_results.npy")
|
||||
outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
|
||||
outputs = run_model(model, runtime, (input_ids, position_ids))
|
||||
print("outputs abs mean:", abs(outputs).mean())
|
||||
print("max abs diff:", abs(outputs - results).max())
|
||||
np.testing.assert_allclose(outputs, results, rtol=1e-6, atol=1e-3)
|
||||
|
||||
|
||||
def start_worker(
|
||||
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
|
||||
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
|
||||
):
|
||||
dist_name = name + "_dist"
|
||||
model = parallel_model(model, world_size, rank)
|
||||
|
@ -90,7 +89,7 @@ def start_worker(
|
|||
save_as_external_data=True,
|
||||
location=extern_path,
|
||||
)
|
||||
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||
runtime = backend.CudaRuntime(local_rank)
|
||||
# print("init comm")
|
||||
runtime.init_comm(
|
||||
|
@ -98,12 +97,12 @@ def start_worker(
|
|||
world_size,
|
||||
rank,
|
||||
)
|
||||
run_and_compare(name, model, runtime, data_type)
|
||||
run_and_compare(name, model, runtime)
|
||||
|
||||
|
||||
def start_single(name, model, data_type):
|
||||
def start_single(name, model):
|
||||
runtime = backend.CudaRuntime(0)
|
||||
run_and_compare(name, model, runtime, data_type)
|
||||
run_and_compare(name, model, runtime)
|
||||
|
||||
|
||||
def gen_standard(name, model, voc_size, bs, len):
|
||||
|
@ -118,10 +117,8 @@ def gen_standard(name, model, voc_size, bs, len):
|
|||
|
||||
|
||||
def main():
|
||||
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
|
||||
data_type = "default" if data_type == "fp32" else data_type
|
||||
if data_type != "tf32":
|
||||
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
|
||||
|
||||
model = onnx.load(model_path)
|
||||
|
||||
# generate standart output
|
||||
|
@ -135,7 +132,7 @@ def main():
|
|||
# run single process.
|
||||
# use standalone process to isolate cuda.
|
||||
print("run model by single GPU.")
|
||||
p = mp.Process(target=start_single, args=(name, model, data_type))
|
||||
p = mp.Process(target=start_single, args=(name, model))
|
||||
p.start()
|
||||
p.join()
|
||||
|
||||
|
@ -145,7 +142,7 @@ def main():
|
|||
workers = [
|
||||
mp.Process(
|
||||
target=start_worker,
|
||||
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
|
||||
args=(name, world_size, rank, rank % nproc_per_node, model),
|
||||
)
|
||||
for rank in range(world_size)
|
||||
]
|
|
@ -1 +0,0 @@
|
|||
Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65
|
|
@ -110,11 +110,12 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
|
|||
s_dim = 0
|
||||
elif in_plc.dim == 2:
|
||||
s_dim = 1
|
||||
|
||||
assert s_dim != -1
|
||||
assert out_dims[s_dim] % tp_world_size == 0, out_dims
|
||||
out_dims[s_dim] //= tp_world_size
|
||||
# if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
|
||||
node.input[1] = node.output[0] + "_shape"
|
||||
# node.input[1] = node.output[0] + "_shape"
|
||||
data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
|
||||
place[node.output[0]] = Shard(s_dim)
|
||||
|
||||
|
@ -136,7 +137,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
|
|||
place[node.output[0]] = Shard(list(perm).index(plc.dim))
|
||||
|
||||
def shard_node(node: NodeProto):
|
||||
if node.op_type in ["Relu", "Tanh", "Softmax", "Cast"]:
|
||||
if node.op_type in ["Relu", "Tanh", "Softmax"]:
|
||||
place[node.output[0]] = place[node.input[0]]
|
||||
elif node.op_type in ["Where"]:
|
||||
place[node.output[0]] = place[node.input[1]]
|
||||
|
@ -176,14 +177,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
|
|||
input in data for input in node.input
|
||||
):
|
||||
# FIXME(constroy): the last MatMul should not be sharded as TP.
|
||||
if (
|
||||
node.output[0] in output
|
||||
or (
|
||||
index + 1 < len(model.graph.node)
|
||||
and model.graph.node[index + 1].output[0]
|
||||
)
|
||||
in output
|
||||
):
|
||||
if node.output[0] in output:
|
||||
continue
|
||||
groups = 1
|
||||
# If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups
|
||||
|
@ -243,5 +237,5 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
|
|||
if tt.HasField("shape"):
|
||||
tt.ClearField("shape")
|
||||
model = helper.make_model(graph)
|
||||
#model = onnx.shape_inference.infer_shapes(model)
|
||||
model = onnx.shape_inference.infer_shapes(model)
|
||||
return model
|
||||
|
|
|
@ -1,145 +0,0 @@
|
|||
import os
|
||||
from pyinfinitensor.onnx import OnnxStub, backend
|
||||
import numpy as np
|
||||
import onnx
|
||||
import torch
|
||||
from transformers import LlamaModel, LlamaForCausalLM
|
||||
from tqdm import tqdm
|
||||
import onnx_graphsurgeon as gs
|
||||
from onnxsim import simplify
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='')
|
||||
parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
|
||||
parser.add_argument('--layer', dest='n_layers', type=int, default=2)
|
||||
parser.add_argument('--iter', dest='n_iter', type=int, default=1)
|
||||
parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
|
||||
parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
|
||||
default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
|
||||
parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
|
||||
default="/data1/shared/llama")
|
||||
args = parser.parse_args()
|
||||
|
||||
ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
|
||||
ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
|
||||
|
||||
def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
|
||||
param = torch.zeros(
|
||||
(args.batchsize, 1024), dtype=torch.long)
|
||||
logits = model(param, past_key_values=None)
|
||||
param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
|
||||
|
||||
torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
|
||||
"position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
|
||||
do_constant_folding=True,)
|
||||
onnx_model = onnx.load(ONNX_MODEL_PATH)
|
||||
print("simplifing onnx model")
|
||||
onnx_model, check = simplify(onnx_model, skipped_optimizers=[
|
||||
'eliminate_duplicate_initializer'])
|
||||
assert check
|
||||
|
||||
onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
|
||||
print("simlifing finished.")
|
||||
|
||||
|
||||
@gs.Graph.register()
|
||||
def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
|
||||
for inp in inputs:
|
||||
inp.outputs.clear()
|
||||
for out in outputs:
|
||||
out.inputs.clear()
|
||||
for inp in inputs_added:
|
||||
inputs.append(inp)
|
||||
for out in outputs_removed:
|
||||
out.inputs.clear()
|
||||
return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
|
||||
|
||||
|
||||
def replace_onnx_with_attention_op():
|
||||
graph = gs.import_onnx(
|
||||
onnx.load(ONNX_MODEL_PATH))
|
||||
tmap = graph.tensors()
|
||||
for i in range(args.n_layers):
|
||||
inputs = [
|
||||
tmap["onnx::Concat_" + str((i+1)*2)],
|
||||
tmap["onnx::Concat_" + str((i+1)*2+1)],
|
||||
tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
|
||||
tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
|
||||
tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
|
||||
outputs = [
|
||||
tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
|
||||
|
||||
inputs_added = [graph.inputs[1]]
|
||||
outputs_removed = []
|
||||
|
||||
graph.replace_with_attention(
|
||||
inputs, outputs, inputs_added, outputs_removed)
|
||||
|
||||
graph.outputs = [tmap[graph.outputs[0].name]]
|
||||
graph.cleanup(True).toposort()
|
||||
onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
kvcache_torch = None
|
||||
torch_model = LlamaForCausalLM.from_pretrained(
|
||||
args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
|
||||
|
||||
n_heads = torch_model.config.num_attention_heads
|
||||
n_dims = torch_model.config.hidden_size // n_heads
|
||||
|
||||
if not os.path.exists(ONNX_MODEL_PATH):
|
||||
print("exporting onnx graph")
|
||||
export_onnx(torch_model, ONNX_MODEL_PATH)
|
||||
replace_onnx_with_attention_op()
|
||||
else:
|
||||
print("will use exsiting onnx graph")
|
||||
|
||||
onnx_model = onnx.load(ONNX_MODEL_PATH)
|
||||
stub = OnnxStub(onnx_model, backend.cuda_runtime())
|
||||
|
||||
count_wrong = 0
|
||||
for i in tqdm(range(0, args.n_max_length)):
|
||||
query = np.random.randint(
|
||||
torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
|
||||
position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
|
||||
|
||||
####################################
|
||||
# pytorch
|
||||
####################################
|
||||
outputs_torch = torch_model(
|
||||
torch.tensor(query), past_key_values=kvcache_torch)
|
||||
logit_torch = outputs_torch['logits']
|
||||
kvcache_torch = outputs_torch['past_key_values']
|
||||
|
||||
####################################
|
||||
# infinitensor
|
||||
####################################
|
||||
# copyin input
|
||||
(list(stub.inputs.items()))[0][1].copyin_int64(
|
||||
query.reshape(-1).tolist())
|
||||
(list(stub.inputs.items()))[1][1].copyin_int64(
|
||||
position_id.reshape(-1).tolist())
|
||||
|
||||
stub.run()
|
||||
|
||||
####################################
|
||||
# validation
|
||||
####################################
|
||||
# copyout output
|
||||
logits_it = np.array((list(stub.outputs.items()))
|
||||
[0][1].copyout_float())
|
||||
|
||||
try:
|
||||
np.testing.assert_allclose(
|
||||
logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
|
||||
except Exception as e:
|
||||
try:
|
||||
np.testing.assert_allclose(
|
||||
np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
|
||||
except:
|
||||
count_wrong = count_wrong + 1
|
||||
|
||||
result = "{}/{} failed.".format(count_wrong, args.n_max_length)
|
||||
print(result)
|
||||
del stub
|
|
@ -1,5 +1,6 @@
|
|||
|
||||
import paddle
|
||||
import numpy as np
|
||||
import paddle.vision.transforms as T
|
||||
from paddle.vision.datasets import Cifar10
|
||||
from pyinfinitensor.onnx import OnnxStub, backend
|
||||
|
@ -77,4 +78,4 @@ def run_cifar_train_and_infer():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_cifar_train_and_infer()
|
||||
run_cifar_train_and_infer()
|
|
@ -1,4 +1,5 @@
|
|||
import paddle
|
||||
import numpy as np
|
||||
import paddle.vision.transforms as T
|
||||
from paddle.vision.datasets import Cifar10
|
||||
from pyinfinitensor.onnx import OnnxStub, backend
|
||||
|
@ -77,4 +78,4 @@ def run_cifar_train_and_infer():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_cifar_train_and_infer()
|
||||
run_cifar_train_and_infer()
|
|
@ -1,5 +1,6 @@
|
|||
|
||||
import paddle
|
||||
import numpy as np
|
||||
import paddle.vision.transforms as T
|
||||
from paddle.vision.datasets import Cifar10
|
||||
from pyinfinitensor.onnx import OnnxStub, backend
|
||||
|
@ -78,4 +79,4 @@ def run_cifar_train_and_infer():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_cifar_train_and_infer()
|
||||
run_cifar_train_and_infer()
|
|
@ -2,10 +2,6 @@
|
|||
#include "cnnl.h"
|
||||
#include "cnrt.h"
|
||||
#include "core/common.h"
|
||||
#include "core/data_type.h"
|
||||
#ifdef INFINI_USE_CNCL
|
||||
#include "cncl.h"
|
||||
#endif
|
||||
|
||||
#define checkBangError(call) \
|
||||
{ \
|
||||
|
@ -31,70 +27,4 @@ namespace infini {
|
|||
|
||||
using BangPtr = void *;
|
||||
|
||||
inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
|
||||
if (dataType == DataType::Float32) {
|
||||
return CNNL_DTYPE_FLOAT;
|
||||
}
|
||||
if (dataType == DataType::Float16) {
|
||||
return CNNL_DTYPE_HALF;
|
||||
}
|
||||
if (dataType == DataType::Double) {
|
||||
return CNNL_DTYPE_DOUBLE;
|
||||
}
|
||||
if (dataType == DataType::Int8) {
|
||||
return CNNL_DTYPE_INT8;
|
||||
}
|
||||
if (dataType == DataType::Int32) {
|
||||
return CNNL_DTYPE_INT32;
|
||||
}
|
||||
if (dataType == DataType::UInt8) {
|
||||
return CNNL_DTYPE_UINT8;
|
||||
}
|
||||
if (dataType == DataType::BFloat16) {
|
||||
return CNNL_DTYPE_BFLOAT16;
|
||||
}
|
||||
if (dataType == DataType::Int64) {
|
||||
return CNNL_DTYPE_INT64;
|
||||
}
|
||||
if (dataType == DataType::Bool) {
|
||||
return CNNL_DTYPE_BOOL;
|
||||
}
|
||||
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
|
||||
" not supported in CNNL.");
|
||||
}
|
||||
|
||||
#ifdef INFINI_USE_CNCL
|
||||
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
|
||||
if (dataType == DataType::Float32) {
|
||||
return cnclFloat32;
|
||||
}
|
||||
if (dataType == DataType::Float16) {
|
||||
return cnclHalf;
|
||||
}
|
||||
if (dataType == DataType::Int8) {
|
||||
return cnclInt8;
|
||||
}
|
||||
if (dataType == DataType::Int16) {
|
||||
return cnclInt16;
|
||||
}
|
||||
if (dataType == DataType::Int32) {
|
||||
return cnclInt32;
|
||||
}
|
||||
if (dataType == DataType::UInt8) {
|
||||
return cnclUint8;
|
||||
}
|
||||
if (dataType == DataType::UInt16) {
|
||||
return cnclUint16;
|
||||
}
|
||||
if (dataType == DataType::UInt32) {
|
||||
return cnclUint32;
|
||||
}
|
||||
if (dataType == DataType::BFloat16) {
|
||||
return cnclBfloat16;
|
||||
}
|
||||
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
|
||||
" not supported in CNCL.");
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -7,19 +7,17 @@ namespace infini {
|
|||
class BangRuntimeObj : public RuntimeObj {
|
||||
private:
|
||||
cnnlHandle_t cnnl;
|
||||
cnrtQueue_t queue;
|
||||
std::unique_ptr<CommunicatorObj> comm;
|
||||
BangPtr workspace;
|
||||
size_t workspaceSize;
|
||||
mutable size_t cursor;
|
||||
|
||||
public:
|
||||
explicit BangRuntimeObj(int deviceId = 0)
|
||||
: RuntimeObj(Device::BANG, deviceId) {
|
||||
BangRuntimeObj() : RuntimeObj(Device::BANG) {
|
||||
cnInit(0);
|
||||
CNdev dev;
|
||||
cnDeviceGet(&dev, deviceId);
|
||||
cnDeviceGet(&dev, 0);
|
||||
checkBangError(cnrtSetDevice(dev));
|
||||
cnrtQueue_t queue;
|
||||
checkBangError(cnrtQueueCreate(&queue));
|
||||
|
||||
checkCnnlError(cnnlCreate(&cnnl));
|
||||
|
@ -32,7 +30,6 @@ class BangRuntimeObj : public RuntimeObj {
|
|||
}
|
||||
virtual ~BangRuntimeObj() {
|
||||
dealloc(workspace);
|
||||
checkBangError(cnrtQueueDestroy(queue));
|
||||
checkCnnlError(cnnlDestroy(cnnl));
|
||||
}
|
||||
string toString() const override;
|
||||
|
@ -76,9 +73,10 @@ class BangRuntimeObj : public RuntimeObj {
|
|||
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||
CNRT_MEM_TRANS_DIR_PEER2PEER));
|
||||
}
|
||||
void initComm(const string &name, int worldSize, int rank) final;
|
||||
CommunicatorObj &getCommunicator() const override { return *comm; }
|
||||
cnrtQueue_t getBangQueue() const { return queue; }
|
||||
|
||||
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
|
||||
|
||||
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
|
||||
|
||||
private:
|
||||
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
||||
|
|
|
@ -1,79 +0,0 @@
|
|||
#pragma once
|
||||
#include "bang_common.h"
|
||||
#include "core/communicator.h"
|
||||
#include <chrono>
|
||||
#include <cncl.h>
|
||||
#include <cnrt.h>
|
||||
#include <cstdlib>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
|
||||
namespace infini {
|
||||
|
||||
class CnclCommunicatorObj final : public CommunicatorObj {
|
||||
private:
|
||||
cnclComm_t *comms;
|
||||
|
||||
public:
|
||||
CnclCommunicatorObj(const string &name, int worldSize, int rank)
|
||||
: CommunicatorObj(worldSize, rank) {
|
||||
const std::string filePath("./" + name + "_cncl_id.bin");
|
||||
cnclCliqueId clique_id;
|
||||
if (rank == 0) {
|
||||
CNCL_CHECK(cnclGetCliqueId(&clique_id));
|
||||
std::ofstream ofs(filePath, std::ios::binary);
|
||||
ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
|
||||
|
||||
} else {
|
||||
auto begin = std::chrono::steady_clock::now();
|
||||
while (!std::filesystem::exists(filePath)) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
|
||||
"time limit (10s) exceeded.");
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
}
|
||||
std::ifstream ifs(filePath, std::ios::binary);
|
||||
ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
|
||||
}
|
||||
|
||||
int num_comms = 1;
|
||||
int *dev_list = new int[num_comms];
|
||||
int *rank_list = new int[num_comms];
|
||||
comms = new cnclComm_t[num_comms];
|
||||
uint32_t num_dev = 0;
|
||||
checkBangError(cnrtGetDeviceCount(&num_dev));
|
||||
|
||||
for (int i = 0; i < num_comms; i++) {
|
||||
rank_list[i] = rank;
|
||||
dev_list[i] = rank_list[i] % num_dev;
|
||||
}
|
||||
|
||||
CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list, rank_list,
|
||||
worldSize, &clique_id));
|
||||
|
||||
if (rank == 0) {
|
||||
std::filesystem::remove(filePath);
|
||||
}
|
||||
|
||||
delete[] dev_list;
|
||||
delete[] rank_list;
|
||||
}
|
||||
|
||||
~CnclCommunicatorObj() {
|
||||
CNCL_CHECK(cnclDestroyComms(comms, 1));
|
||||
delete[] comms;
|
||||
}
|
||||
|
||||
// Get the actual cnclComm_t
|
||||
cnclComm_t getCnclComm() { return comms[0]; }
|
||||
|
||||
virtual string toString() const final {
|
||||
std::ostringstream oss;
|
||||
oss << "CNCL communicator";
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -61,30 +61,16 @@ template <typename T> auto enum_to_underlying(T e) {
|
|||
}
|
||||
|
||||
template <typename T> std::string vecToString(const std::vector<T> &vec) {
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (size_t i = 0; i < vec.size(); ++i) {
|
||||
ss << vec.at(i);
|
||||
if (i < vec.size() - 1) {
|
||||
ss << ",";
|
||||
}
|
||||
std::string ret;
|
||||
ret.append("[");
|
||||
for (auto d : vec) {
|
||||
ret.append(std::to_string(d));
|
||||
ret.append(",");
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
template <typename T> std::string vecToString(const T *st, size_t length) {
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
size_t i = 0;
|
||||
for (i = 0; i < length; i++) {
|
||||
ss << *(st + i);
|
||||
if (i < length - 1) {
|
||||
ss << ",";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
if (!vec.empty())
|
||||
ret.pop_back();
|
||||
ret.append("]");
|
||||
return ret;
|
||||
}
|
||||
|
||||
double timeit(
|
||||
|
|
|
@ -5,10 +5,6 @@
|
|||
#include <cstdint>
|
||||
#include <iostream>
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#endif
|
||||
|
||||
namespace infini {
|
||||
|
||||
class GraphHandlerObj {
|
||||
|
@ -30,14 +26,12 @@ class GraphHandlerObj {
|
|||
int pw, int sh, int sw, int dh, int dw, int oph,
|
||||
int opw);
|
||||
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
|
||||
Tensor bias, ActType act,
|
||||
std::string matmul_compute_type = "default");
|
||||
Tensor bias, ActType act);
|
||||
Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
|
||||
Tensor var, Tensor scale, Tensor bias,
|
||||
float momentum, float eps, bool training);
|
||||
Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
|
||||
Tensor bias, float eps, int axis, int stash_type);
|
||||
Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
|
||||
|
||||
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
|
||||
int ph, int pw, int sh, int sw, int ceilMode);
|
||||
|
@ -53,7 +47,6 @@ class GraphHandlerObj {
|
|||
Tensor max(Tensor a, Tensor b, Tensor c);
|
||||
|
||||
Tensor relu(Tensor x, Tensor y);
|
||||
Tensor silu(Tensor x, Tensor y);
|
||||
Tensor gelu(Tensor x, Tensor y);
|
||||
Tensor sigmoid(Tensor x, Tensor y);
|
||||
Tensor hardSigmoid(Tensor x, Tensor y);
|
||||
|
@ -72,21 +65,12 @@ class GraphHandlerObj {
|
|||
std::optional<float> max);
|
||||
Tensor transpose(Tensor data, Tensor transposed, Shape perm);
|
||||
Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
|
||||
Tensor resize(Tensor input, Tensor output,
|
||||
const std::optional<vector<int>> &axes, Tensor sizes,
|
||||
Tensor scales, Tensor roi, vector<uint32_t> sizes_,
|
||||
vector<float> scales_, vector<float> roi_, string mode,
|
||||
string ratioPolicy, string nearestMode,
|
||||
string coordTransMode);
|
||||
Tensor squeeze(Tensor input, Tensor output, Shape axes);
|
||||
Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
|
||||
Tensor concat(TensorVec inputs, Tensor output, int dim);
|
||||
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
|
||||
Tensor input_q, Tensor input_k, Tensor input_v,
|
||||
Tensor position_id, Tensor output_matmul);
|
||||
Tensor RoPE(Tensor pos, Tensor input, Tensor output);
|
||||
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
|
||||
std::variant<int, vector<int>> numOrRatio);
|
||||
int num_outputs);
|
||||
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
|
||||
Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
|
||||
Tensor reduceMean(Tensor data, Tensor reduced,
|
||||
|
@ -110,13 +94,8 @@ class GraphHandlerObj {
|
|||
Tensor allReduceAvg(Tensor input, Tensor output);
|
||||
TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
|
||||
Tensor broadcast(Tensor input, Tensor output, int root);
|
||||
Tensor send(Tensor input, int source, int destination, Tensor output);
|
||||
Tensor recv(Tensor output, int source, int destination, Shape dims,
|
||||
int outputType, Tensor input);
|
||||
Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
|
||||
std::string mode);
|
||||
Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
|
||||
int size);
|
||||
|
||||
//------ modifiers
|
||||
|
||||
|
@ -143,12 +122,6 @@ class GraphHandlerObj {
|
|||
inline void run() { g->getRuntime()->run(g); }
|
||||
|
||||
inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
|
||||
|
||||
#ifdef USE_CUDA
|
||||
inline void run_with_cudagraph() {
|
||||
(as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -2,11 +2,10 @@
|
|||
#include "core/common.h"
|
||||
#include "core/operator.h"
|
||||
#include "core/tensor.h"
|
||||
#include "utils/operator_utils.h"
|
||||
#include <functional>
|
||||
#include <nlohmann/json.hpp>
|
||||
namespace infini {
|
||||
using json = nlohmann::json;
|
||||
namespace infini {
|
||||
|
||||
class RuntimeObj; // Forward declaration for Kernel::compute
|
||||
|
||||
|
@ -30,6 +29,7 @@ class Kernel {
|
|||
public:
|
||||
Kernel() {}
|
||||
virtual ~Kernel() {}
|
||||
|
||||
/**
|
||||
* @param op The operator to be executed.
|
||||
* @param record The parameters for kernel execution. If extra parameters
|
||||
|
@ -102,9 +102,11 @@ class KernelRegistry {
|
|||
}
|
||||
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
|
||||
auto it = kernels.find(kernelAttrs);
|
||||
IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
|
||||
get_kernel_attrs_str(kernelAttrs) +
|
||||
"}");
|
||||
IT_ASSERT(it != kernels.end(),
|
||||
"Kernel not found for key {" +
|
||||
to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
|
||||
", " + std::to_string(std::get<1>(kernelAttrs)) + ", " +
|
||||
std::get<2>(kernelAttrs).toString() + "}");
|
||||
return std::get<0>(it->second);
|
||||
}
|
||||
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
|
||||
|
@ -129,16 +131,15 @@ class CpuKernelWithoutConfig : public Kernel {
|
|||
|
||||
} // namespace infini
|
||||
|
||||
#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt) \
|
||||
#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt) \
|
||||
namespace infini { \
|
||||
static const bool _CAT(_register_kernel_, cnt) = \
|
||||
KernelRegistry::getInstance().registerKernel(KernelAttrs{device, \
|
||||
opType}, \
|
||||
new kernel(), name); \
|
||||
KernelRegistry::getInstance().registerKernel( \
|
||||
KernelAttrs{device, opType, dataType}, new kernel(), name); \
|
||||
}
|
||||
|
||||
#define REGISTER_KERNEL(device, opType, kernel, name) \
|
||||
_REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
|
||||
#define REGISTER_KERNEL(device, opType, dataType, kernel, name) \
|
||||
_REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
|
||||
|
||||
#define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt) \
|
||||
namespace infini { \
|
||||
|
|
|
@ -151,14 +151,11 @@ struct OpType {
|
|||
ReduceSum, // Reduce
|
||||
ReduceSumSquare, // Reduce
|
||||
Relu, // Unary
|
||||
Silu, // Unary
|
||||
Reshape,
|
||||
Resize,
|
||||
ReverseSequence,
|
||||
RoiAlign,
|
||||
RoPE, // Fusion
|
||||
Round, // Unary
|
||||
RMSNorm, // Fusion
|
||||
Round, // Unary
|
||||
STFT,
|
||||
Scan,
|
||||
Scatter,
|
||||
|
@ -235,8 +232,6 @@ struct OpType {
|
|||
AllReduceAvg,
|
||||
AllGather,
|
||||
Broadcast,
|
||||
Send,
|
||||
Recv,
|
||||
} type;
|
||||
|
||||
constexpr OpType(decltype(type) t) : type(t) {}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
#include "core/tensor.h"
|
||||
|
||||
namespace infini {
|
||||
using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
|
||||
using KernelAttrs = std::tuple<Device, OpType::underlying_t, DataType>;
|
||||
|
||||
struct OpPerfKey {
|
||||
HashType hash;
|
||||
|
@ -90,7 +90,6 @@ class OperatorObj : public Object {
|
|||
OpType getOpType() const { return type; }
|
||||
// HACK: set correct data type
|
||||
DataType getDType() const { return getInputs(0)->getDType(); }
|
||||
DataType getOutDType() const { return getOutput()->getDType(); }
|
||||
virtual int numInputs() const = 0;
|
||||
virtual int numOutputs() const = 0;
|
||||
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include <nlohmann/json_fwd.hpp>
|
||||
namespace infini {
|
||||
using json = nlohmann::json;
|
||||
namespace infini {
|
||||
|
||||
class PerfEngine {
|
||||
public:
|
||||
|
|
|
@ -15,7 +15,6 @@ class GraphObj;
|
|||
class GraphHandlerObj;
|
||||
class RuntimeObj;
|
||||
class BlobObj;
|
||||
template <typename T> class WorkspaceObj;
|
||||
|
||||
using TensorBase = Ref<TensorBaseObj>;
|
||||
using Tensor = Ref<TensorObj>;
|
||||
|
@ -24,7 +23,6 @@ using Graph = Ref<GraphObj>;
|
|||
using GraphHandler = Ref<GraphHandlerObj>;
|
||||
using Runtime = Ref<RuntimeObj>;
|
||||
using Blob = Ref<BlobObj>;
|
||||
template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
|
||||
|
||||
using TensorVec = vector<Tensor>;
|
||||
using OpVec = vector<Operator>;
|
||||
|
|
|
@ -4,14 +4,11 @@
|
|||
#include "utils/data_convert.h"
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
|
||||
#if USE_CUDA
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#endif
|
||||
#if USE_BANG
|
||||
#include "bang/bang_runtime.h"
|
||||
#endif
|
||||
|
||||
namespace infini {
|
||||
|
||||
// TODO: how to deal with this
|
||||
|
@ -45,16 +42,8 @@ class TensorObj : public TensorBaseObj {
|
|||
bool isOutput() const { return tensorType == TensorType::output; }
|
||||
bool isOthers() const { return tensorType == TensorType::others; }
|
||||
void setWeight() { tensorType = TensorType::weight; }
|
||||
void setInput() {
|
||||
if (!this->isWeight()) {
|
||||
tensorType = TensorType::input;
|
||||
}
|
||||
}
|
||||
void setOutput() {
|
||||
if (!this->isWeight()) {
|
||||
tensorType = TensorType::output;
|
||||
}
|
||||
}
|
||||
void setInput() { tensorType = TensorType::input; }
|
||||
void setOutput() { tensorType = TensorType::output; }
|
||||
string tensorTypeToString() const {
|
||||
switch (tensorType) {
|
||||
case TensorType::weight:
|
||||
|
@ -144,7 +133,6 @@ class TensorObj : public TensorBaseObj {
|
|||
}
|
||||
|
||||
void printData() const;
|
||||
void dumpData(std::ofstream &ofs) const;
|
||||
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
|
||||
|
||||
template <typename T> bool equalData(const vector<T> &dataVector) {
|
||||
|
@ -200,20 +188,13 @@ class TensorObj : public TensorBaseObj {
|
|||
if (a[i] != b[i])
|
||||
return false;
|
||||
} else if constexpr (std::is_floating_point_v<T>) {
|
||||
if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
|
||||
fabs(a[i] - b[i]) > relativeError) {
|
||||
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
|
||||
return false;
|
||||
} else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
|
||||
fabs(a[i] - b[i]) /
|
||||
std::max(fabs(a[i]), fabs(b[i])) >
|
||||
relativeError) {
|
||||
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
|
||||
relativeError) {
|
||||
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
} else
|
||||
static_assert(!sizeof(T), "Unsupported data type");
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -248,8 +229,8 @@ class TensorObj : public TensorBaseObj {
|
|||
// // std::cerr << "Init beginned " << std::endl;
|
||||
// #pragma omp parallel for
|
||||
// for (size_t i = 0; i < iEnd; ++i)
|
||||
// data[i] = fastrand(random_seed[omp_get_thread_num() *
|
||||
// 16]) % 10000;
|
||||
// data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
|
||||
// 10000;
|
||||
// // std::cerr << "Init finished" << std::endl;
|
||||
// computed = ComputedFull;
|
||||
// return true;
|
||||
|
@ -294,8 +275,8 @@ class TensorObj : public TensorBaseObj {
|
|||
// auto nDim = dims.size();
|
||||
// auto nBroadcastDim = ds.size() - nDim;
|
||||
// for (size_t i = 0; i < nDim; ++i)
|
||||
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
|
||||
// i] >= dims[i])
|
||||
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
|
||||
// dims[i])
|
||||
// return (size_t)-1;
|
||||
// size_t idx = 0;
|
||||
// for (size_t i = 0; i < nDim; ++i)
|
||||
|
@ -354,14 +335,12 @@ class TensorObj : public TensorBaseObj {
|
|||
// return (g_seed >> 16) & 0x7FFF;
|
||||
// }
|
||||
|
||||
// std::vector<std::vector<int>> const *getSplittingPoints()
|
||||
// const {
|
||||
// std::vector<std::vector<int>> const *getSplittingPoints() const {
|
||||
// assert(!splittingPoints.empty());
|
||||
// return &splittingPoints;
|
||||
// }
|
||||
|
||||
// bool setSplittingPoints(std::vector<std::vector<int>> value)
|
||||
// {
|
||||
// bool setSplittingPoints(std::vector<std::vector<int>> value) {
|
||||
// assert(!value.empty());
|
||||
// splittingPoints = value;
|
||||
// return true;
|
||||
|
|
|
@ -1,42 +0,0 @@
|
|||
#pragma once
|
||||
#include "core/runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T> class WorkspaceObj {
|
||||
private:
|
||||
T workspace; // workspace pointer
|
||||
size_t workspaceSize; // Size of workspace
|
||||
size_t workspaceAlloc; // currently use workspace size
|
||||
|
||||
public:
|
||||
WorkspaceObj(T workspace_, size_t workspaceSize_)
|
||||
: workspace(workspace_), workspaceSize(workspaceSize_) {
|
||||
workspaceAlloc = 0;
|
||||
}
|
||||
virtual ~WorkspaceObj() {
|
||||
// Dealloc workspace in RuntimeObj
|
||||
// Set workspace = nullptr here
|
||||
workspace = nullptr;
|
||||
}
|
||||
size_t getWorkspaceSize() const { return workspaceSize; }
|
||||
|
||||
T getWorkspace(size_t size) {
|
||||
// Get unused workspace
|
||||
IT_ASSERT(size + workspaceAlloc <= workspaceSize);
|
||||
auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
|
||||
workspaceAlloc += size;
|
||||
return ret;
|
||||
}
|
||||
T getWorkspace() {
|
||||
// Override getWorkspace in order to dealloc in runtime
|
||||
return workspace;
|
||||
}
|
||||
void resetWorkspace() {
|
||||
// Reset workspaceAlloc every time end kernel
|
||||
workspaceAlloc = 0;
|
||||
}
|
||||
size_t getWorkspaceAlloc() const { return workspaceAlloc; }
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -1,5 +1,4 @@
|
|||
#pragma once
|
||||
#include "core/common.h"
|
||||
#include <cstdio>
|
||||
|
||||
struct AttentionKVCacheMetadata {
|
||||
|
@ -11,7 +10,6 @@ namespace infini {
|
|||
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
|
||||
float *input_q, float *input_k, float *input_v,
|
||||
int *position_id, float *output_matmul,
|
||||
const AttentionKVCacheMetadata &compMeta,
|
||||
float *output_O_temp, float *output_sum_temp);
|
||||
const AttentionKVCacheMetadata &compMeta);
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
#include <cuda_profiler_api.h>
|
||||
#include <cudnn.h>
|
||||
#include <curand.h>
|
||||
#include <memory>
|
||||
|
||||
#define checkCudaError(call) \
|
||||
if (auto err = call; err != cudaSuccess) \
|
||||
|
@ -112,20 +111,4 @@ inline const char *curandGetErrorString(curandStatus_t error) {
|
|||
|
||||
using CudaPtr = void *;
|
||||
|
||||
class CUDAStream {
|
||||
public:
|
||||
CUDAStream(const CUDAStream &) = delete;
|
||||
CUDAStream(CUDAStream &&) = delete;
|
||||
void operator=(const CUDAStream &) = delete;
|
||||
void operator=(CUDAStream &&) = delete;
|
||||
static cudaStream_t getCurrentStream() { return _stream; }
|
||||
static void Init() { CUDAStream::_stream = 0; };
|
||||
static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
|
||||
static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
|
||||
|
||||
private:
|
||||
CUDAStream(){};
|
||||
static cudaStream_t _stream;
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -1,20 +1,13 @@
|
|||
#pragma once
|
||||
|
||||
namespace infini {
|
||||
void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||
int c2, int c3);
|
||||
void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||
int c2, int c3);
|
||||
void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||
int c2, int c3);
|
||||
void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||
int c2, int c3);
|
||||
|
||||
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
|
||||
|
||||
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
|
||||
void div_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
|
||||
void add_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
|
||||
void pow_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
|
||||
void less_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3);
|
||||
}; // namespace infini
|
||||
|
|
|
@ -3,10 +3,7 @@
|
|||
#include "operators/unary.h"
|
||||
#include "utils/small_array.h"
|
||||
namespace infini {
|
||||
void expandKernel(int dType, void *input, void *output, int nDims,
|
||||
int outputsize, SmallArray inputShape,
|
||||
SmallArray outputShape);
|
||||
void expandKernel(float *input, float *output, int nDims, int outputsize,
|
||||
SmallArray inputShape, SmallArray outputShape);
|
||||
|
||||
void expandRowKernel(int dType, void *input, void *output, int n_rows,
|
||||
int row_len);
|
||||
}; // namespace infini
|
||||
|
|
|
@ -8,10 +8,4 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
void LaynormKernel(const float *input, const float *scale, const float eps,
|
||||
int size, int scaleSize, const int dimsize, const int stride,
|
||||
float *output);
|
||||
void LaynormKernel(const half *input, const half *scale, const half eps,
|
||||
int size, int scaleSize, const int dimsize, const int stride,
|
||||
half *output, const half *bias, int biasSize);
|
||||
void LaynormKernel(const half *input, const half *scale, const half eps,
|
||||
int size, int scaleSize, const int dimsize, const int stride,
|
||||
half *output);
|
||||
}; // namespace infini
|
||||
|
|
|
@ -1,10 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "operators/rms_norm.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
|
||||
int num_tokens, int hidden_size);
|
||||
|
||||
}; // namespace infini
|
|
@ -1,12 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "operators/rope.h"
|
||||
#include "utils/small_array.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void rope_kernel(int dType, int *pos, void *input, void *output, int size,
|
||||
int dim_model, int dim_head, int hidden_stride,
|
||||
int pos_stride);
|
||||
|
||||
}; // namespace infini
|
|
@ -14,9 +14,6 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
std::unique_ptr<CommunicatorObj> comm;
|
||||
CudaPtr workspace;
|
||||
size_t workspaceSize;
|
||||
bool isCudaGraphCreated;
|
||||
cudaGraph_t cudaGraph;
|
||||
cudaGraphExec_t cudaGraphInstance;
|
||||
|
||||
public:
|
||||
explicit CudaRuntimeObj(int deviceId = 0)
|
||||
|
@ -29,16 +26,9 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
// size_t longformerNum = 3lu * (1 << 30);
|
||||
workspaceSize = 7ll << 30; // 7 GB
|
||||
workspace = alloc(workspaceSize);
|
||||
isCudaGraphCreated = false;
|
||||
CUDAStream::Init();
|
||||
}
|
||||
virtual ~CudaRuntimeObj() {
|
||||
try {
|
||||
if (isCudaGraphCreated) {
|
||||
checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
|
||||
checkCudaError(cudaGraphDestroy(cudaGraph));
|
||||
CUDAStream::destroyStream();
|
||||
}
|
||||
dealloc(workspace);
|
||||
checkCudnnError(cudnnDestroy(cudnn));
|
||||
checkCublasError(cublasDestroy(cublas));
|
||||
|
@ -85,8 +75,6 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
|
||||
void runWithoutSync(const Graph &graph) const;
|
||||
|
||||
void runWithCudaGraph(const Graph &graph);
|
||||
|
||||
// init communicator
|
||||
void initComm(const string &name, int worldSize, int rank) final;
|
||||
|
||||
|
|
|
@ -3,6 +3,4 @@
|
|||
namespace infini {
|
||||
void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
||||
int dimsize, int stride);
|
||||
void softmax_kernel(int num_blocks, half *input, half *output, int size,
|
||||
int dimsize, int stride);
|
||||
} // namespace infini
|
||||
}
|
||||
|
|
|
@ -8,8 +8,8 @@ const int DIM_MAX_SIZE = 8;
|
|||
// Concat operator acts like element tensors composing to one big tensor,and
|
||||
// split operator acts like one big tensor being composed by element
|
||||
// tensors.
|
||||
template <typename T> struct ElementTensorMetadata {
|
||||
T *data[BATCH_SIZE];
|
||||
struct ElementTensorMetadata {
|
||||
float *data[BATCH_SIZE];
|
||||
int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
|
||||
// the composed tensor.
|
||||
int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
|
||||
|
@ -20,17 +20,16 @@ template <typename T> struct ElementTensorMetadata {
|
|||
data[i], dimBgNo[i], dimSize[i], nElements[i]);
|
||||
}
|
||||
};
|
||||
template <typename T> struct ComposedTensorMetadata {
|
||||
|
||||
struct ComposedTensorMetadata {
|
||||
int dimSize[DIM_MAX_SIZE];
|
||||
int stride[DIM_MAX_SIZE];
|
||||
T *data;
|
||||
float *data;
|
||||
};
|
||||
|
||||
namespace infini {
|
||||
void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
|
||||
const ComposedTensorMetadata<float> &compMeta, int dim,
|
||||
int batchSize, int nDims, bool isSplit);
|
||||
void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
|
||||
const ComposedTensorMetadata<half> &compMeta, int dim,
|
||||
void split_concat_kernel(const ElementTensorMetadata &eleMeta,
|
||||
const ComposedTensorMetadata &compMeta, int dim,
|
||||
int batchSize, int nDims, bool isSplit);
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
|
||||
void transpose_kernel(float *input, float *output, int nDims, int size,
|
||||
SmallArray strides, SmallArray outputShape);
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -3,22 +3,48 @@
|
|||
#include "operators/unary.h"
|
||||
|
||||
namespace infini {
|
||||
template <typename T> void softmax_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void relu_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void silu_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void tanh_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void abs_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void neg_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void gelu_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void erf_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
|
||||
void softmax_kernel(float *input, float *output, size_t num);
|
||||
void relu_kernel(float *input, float *output, size_t num);
|
||||
void sigmoid_kernel(float *input, float *output, size_t num);
|
||||
void tanh_kernel(float *input, float *output, size_t num);
|
||||
void abs_kernel(float *input, float *output, size_t num);
|
||||
void sqrt_kernel(float *input, float *output, size_t num);
|
||||
void neg_kernel(float *input, float *output, size_t num);
|
||||
void gelu_kernel(float *input, float *output, size_t num);
|
||||
void erf_kernel(float *input, float *output, size_t num);
|
||||
void hard_sigmoid_kernel(float *input, float *output, size_t num);
|
||||
void hard_swish_kernel(float *input, float *output, size_t num);
|
||||
|
||||
template <typename INPUT, typename OUTPUT>
|
||||
void cast_kernel(INPUT *input, OUTPUT *output, size_t num);
|
||||
void unary_kernel(const Operator &_op) {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
|
||||
float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
|
||||
|
||||
void unary_kernel(const Operator &_op);
|
||||
size_t num = op->getOutput()->size();
|
||||
if (op->getOpType() == OpType::Softmax)
|
||||
softmax_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Relu)
|
||||
relu_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Sigmoid)
|
||||
sigmoid_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::HardSigmoid)
|
||||
hard_sigmoid_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::HardSwish)
|
||||
hard_swish_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Tanh)
|
||||
tanh_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Abs)
|
||||
abs_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Sqrt)
|
||||
sqrt_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Gelu)
|
||||
gelu_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Neg)
|
||||
neg_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Erf)
|
||||
erf_kernel(inputData, outputData, num);
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -1,29 +1,11 @@
|
|||
#pragma once
|
||||
#include "core/tensor.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void cudaPrintFloat(float *x, int len);
|
||||
|
||||
void cudaPrintTensor(const Tensor &tensor);
|
||||
void cudaPrintTensor(const Tensor &tensor) {
|
||||
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
|
||||
}
|
||||
|
||||
cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
|
||||
cudaDataType cublasDataTypeConvert(DataType);
|
||||
|
||||
template <int index> struct DT_CUDA {};
|
||||
template <> struct DT_CUDA<0> { using t = bool; };
|
||||
template <> struct DT_CUDA<1> { using t = float; };
|
||||
template <> struct DT_CUDA<2> { using t = unsigned char; };
|
||||
template <> struct DT_CUDA<3> { using t = char; };
|
||||
template <> struct DT_CUDA<4> { using t = unsigned short; };
|
||||
template <> struct DT_CUDA<5> { using t = short; };
|
||||
template <> struct DT_CUDA<6> { using t = int; };
|
||||
template <> struct DT_CUDA<7> { using t = long long; };
|
||||
template <> struct DT_CUDA<9> { using t = bool; };
|
||||
template <> struct DT_CUDA<10> { using t = half; };
|
||||
template <> struct DT_CUDA<11> { using t = double; };
|
||||
template <> struct DT_CUDA<12> { using t = unsigned int; };
|
||||
template <> struct DT_CUDA<13> { using t = unsigned long long; };
|
||||
template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
|
||||
} // namespace infini
|
||||
} // namespace infini
|
|
@ -3,15 +3,10 @@
|
|||
#include "utils/small_array.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void whereKernel(const float *inputX, const float *inputY,
|
||||
const uint8_t *condition, float *output, int nDims,
|
||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||
int ySize, int cSize);
|
||||
void whereKernel(const half *inputX, const half *inputY,
|
||||
const uint8_t *condition, half *output, int nDims,
|
||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||
int ySize, int cSize);
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -53,8 +53,7 @@ inline void initGatherMetaData(GatherMetaData &metaData,
|
|||
metaData.inStride[i] = in->getStride()[i];
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
|
||||
void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num);
|
||||
|
||||
void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
|
||||
size_t num);
|
||||
|
|
|
@ -1,23 +0,0 @@
|
|||
#include "core/op_type.h"
|
||||
#include "kunlun/kunlun_common.h"
|
||||
|
||||
namespace infini {
|
||||
using KunlunActType = xdnn::Activation_t;
|
||||
KunlunActType parseActType(ActType act) {
|
||||
switch (act) {
|
||||
case ActType::None:
|
||||
return KunlunActType::LINEAR;
|
||||
case ActType::Tanh:
|
||||
return KunlunActType::TANH;
|
||||
case ActType::Sigmoid:
|
||||
return KunlunActType::SIGMOID;
|
||||
case ActType::Relu:
|
||||
return KunlunActType::RELU6;
|
||||
default:
|
||||
fprintf(stderr, "Activation Type not support yet!\n");
|
||||
break;
|
||||
}
|
||||
return KunlunActType::LINEAR;
|
||||
}
|
||||
|
||||
}; // namespace infini
|
|
@ -3,8 +3,6 @@
|
|||
#include "xpu/runtime_ex.h"
|
||||
#include "xpu/xdnn.h"
|
||||
|
||||
namespace xdnn = baidu::xpu::api;
|
||||
|
||||
#define checkKUNLUNError(call) \
|
||||
{ \
|
||||
auto err = call; \
|
||||
|
|
|
@ -1,35 +1,28 @@
|
|||
#pragma once
|
||||
#include "core/runtime.h"
|
||||
#include "core/workspace.h"
|
||||
#include "kunlun/kunlun_common.h"
|
||||
#ifdef INFINI_USE_XCCL
|
||||
#include "kunlun/xccl_communicator.h"
|
||||
#endif
|
||||
|
||||
namespace infini {
|
||||
|
||||
class KUNLUNRuntimeObj : public RuntimeObj {
|
||||
private:
|
||||
xdnn::Context *ctx;
|
||||
std::unique_ptr<CommunicatorObj> comm;
|
||||
// KUNLUNPtr workspace;
|
||||
// size_t workspaceSize;
|
||||
Workspace<KUNLUNPtr> workspace;
|
||||
baidu::xpu::api::Context *xdnn;
|
||||
KUNLUNPtr workspace;
|
||||
size_t workspaceSize;
|
||||
|
||||
public:
|
||||
KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
|
||||
xpu_set_device(deviceId);
|
||||
ctx = xdnn::create_context();
|
||||
KUNLUNRuntimeObj() : RuntimeObj(Device::KUNLUN) {
|
||||
xdnn = baidu::xpu::api::create_context();
|
||||
// 10GB for Longformer
|
||||
// size_t longformerNum = 3lu * (1 << 30);
|
||||
size_t workspaceSize = 2llu << 30; // 2 GB
|
||||
KUNLUNPtr wkspacePtr = alloc(workspaceSize);
|
||||
workspace =
|
||||
make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
|
||||
workspaceSize = 3ll << 30; // 3 GB
|
||||
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
|
||||
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
|
||||
workspace = alloc(workspaceSize);
|
||||
}
|
||||
virtual ~KUNLUNRuntimeObj() {
|
||||
KUNLUNPtr wkspacePtr = workspace->getWorkspace();
|
||||
dealloc(wkspacePtr);
|
||||
xdnn::destroy_context(ctx);
|
||||
dealloc(workspace);
|
||||
baidu::xpu::api::destroy_context(xdnn);
|
||||
}
|
||||
string toString() const override;
|
||||
|
||||
|
@ -38,41 +31,40 @@ class KUNLUNRuntimeObj : public RuntimeObj {
|
|||
// double runEvaluation(const Graph &graph, int nWarmups,
|
||||
// int nEvaluations) const;
|
||||
void sync() const;
|
||||
|
||||
KUNLUNPtr alloc(size_t size) override {
|
||||
void *ptr;
|
||||
checkKUNLUNError(
|
||||
xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
|
||||
xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
|
||||
return ptr;
|
||||
}
|
||||
void dealloc(void *ptr) override { xpu_free(ptr); }
|
||||
|
||||
xdnn::Context *KUNLUNHandle() const { return ctx; }
|
||||
// Get $size workspace by bytes
|
||||
baidu::xpu::api::Context *KUNLUNHandle() const { return xdnn; }
|
||||
KUNLUNPtr getWorkspace(size_t size) const {
|
||||
auto ret = workspace->getWorkspace(size);
|
||||
return ret;
|
||||
IT_ASSERT(size <= workspaceSize);
|
||||
return workspace;
|
||||
}
|
||||
Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
|
||||
|
||||
void copyBlobFromCPU(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
|
||||
}
|
||||
|
||||
void copyBlobToCPU(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
|
||||
}
|
||||
|
||||
void copyBlobInsideRuntime(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
|
||||
}
|
||||
void initComm(const string &name, int worldSize, int rank) final;
|
||||
|
||||
CommunicatorObj &getCommunicator() const final { return *comm; }
|
||||
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
|
||||
|
||||
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
|
||||
|
||||
private:
|
||||
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
||||
|
|
|
@ -1,60 +0,0 @@
|
|||
#pragma once
|
||||
#include "core/communicator.h"
|
||||
#include "xpu/bkcl.h"
|
||||
#include <chrono>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
|
||||
#define checkXcclError(call) \
|
||||
{ \
|
||||
auto err = call; \
|
||||
if (BKCL_SUCCESS != err) { \
|
||||
fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
|
||||
class XcclCommunicatorObj final : public CommunicatorObj {
|
||||
private:
|
||||
BKCLContext_t comm;
|
||||
|
||||
public:
|
||||
XcclCommunicatorObj(const string &name, int worldSize, int rank)
|
||||
: CommunicatorObj(worldSize, rank) {
|
||||
const std::string filePath("./" + name + "_xccl_id.bin");
|
||||
BKCLUniqueId commId;
|
||||
if (rank == 0) {
|
||||
checkXcclError(bkcl_get_unique_id(&commId));
|
||||
std::ofstream ofs(filePath, std::ios::binary);
|
||||
ofs.write((char *)&commId, sizeof(BKCLUniqueId));
|
||||
} else {
|
||||
auto begin = std::chrono::steady_clock::now();
|
||||
while (!std::filesystem::exists(filePath)) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
_IT_ASSERT_2(now < begin + std::chrono::seconds(100),
|
||||
"time limit (100s) exceeded.");
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
}
|
||||
std::ifstream ifs(filePath, std::ios::binary);
|
||||
ifs.read((char *)&commId, sizeof(BKCLUniqueId));
|
||||
}
|
||||
checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
|
||||
if (rank == 0) {
|
||||
std::filesystem::remove(filePath);
|
||||
}
|
||||
}
|
||||
|
||||
BKCLContext_t getXcclComm() { return comm; }
|
||||
|
||||
~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
|
||||
virtual string toString() const final {
|
||||
std::ostringstream oss;
|
||||
oss << "XCCL communicator";
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -1,29 +0,0 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
class LRNObj : public OperatorObj {
|
||||
|
||||
public:
|
||||
LRNObj(GraphObj *graph, Tensor inputX, Tensor inputY, float alpha,
|
||||
float beta, float bias, int size);
|
||||
OP_CLONE(LRNObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return inputs.size(); }
|
||||
int numOutputs() const override { return 1; }
|
||||
auto getAlphaBetaBias() const {
|
||||
return tuple(alpha_value, beta_value, bias_value);
|
||||
}
|
||||
auto getSize() const { return size_value; }
|
||||
|
||||
private:
|
||||
float alpha_value, beta_value, bias_value;
|
||||
int size_value;
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -17,9 +17,6 @@ class MatmulObj : public OperatorObj {
|
|||
// Auxiliary attributes which are not a part of operator attributes.
|
||||
int b, m, n, k;
|
||||
|
||||
// Specifies the data precision for the matrix multiply.
|
||||
std::string computeType = "default";
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Matmul operator with batch broadcast and tensor transpose
|
||||
|
@ -41,11 +38,10 @@ class MatmulObj : public OperatorObj {
|
|||
* @param transB If matrix B should be transposed when computing.
|
||||
* @param bias The bias tensor.
|
||||
* @param act The activation function.
|
||||
* @param computeType Specifies the data precision for the matrix multiply.
|
||||
*/
|
||||
MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
|
||||
bool transA = false, bool transB = false, Tensor bias = nullptr,
|
||||
ActType act = ActType::None, std::string computeType = "default");
|
||||
ActType act = ActType::None);
|
||||
OP_CLONE(MatmulObj);
|
||||
|
||||
std::string toString() const override;
|
||||
|
@ -64,7 +60,6 @@ class MatmulObj : public OperatorObj {
|
|||
int getN() const { return n; }
|
||||
int getK() const { return k; }
|
||||
auto getBMNK() const { return tuple{b, m, n, k}; }
|
||||
std::string getComputeType() const { return computeType; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
|
|
|
@ -1,46 +0,0 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
*
|
||||
* https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2193/user-guide/docs/index.html
|
||||
*/
|
||||
class RecvObj : public OperatorObj {
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new SendRecv object
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param input default nullptr, because recv does not have input.
|
||||
* @param output recv output
|
||||
* @param source the send rank
|
||||
* @param destination the recv rank
|
||||
* @param dims The shape of the output tensor.
|
||||
*/
|
||||
RecvObj(GraphObj *graph, Tensor output, int source, int destination,
|
||||
Shape dims, int outputType, Tensor input = nullptr);
|
||||
OP_CLONE(RecvObj);
|
||||
|
||||
int numInputs() const override { return inputs.size(); }
|
||||
int numOutputs() const override { return 1; }
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
std::string toString() const override;
|
||||
DataType getDType() const;
|
||||
int getSourceRank() const { return source; }
|
||||
int getDestinationRank() const { return destination; }
|
||||
inline Shape getShape() const { return dims; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||
|
||||
protected:
|
||||
int source;
|
||||
int destination;
|
||||
Shape dims;
|
||||
int outputType;
|
||||
};
|
||||
} // namespace infini
|
|
@ -27,60 +27,6 @@ class ResizeObj : public OperatorObj {
|
|||
enum class EKeepAspectRatioPolicy { stretch, notLarger, notSmaller, none };
|
||||
enum class ECoeffMode { nearest, linear, cubic };
|
||||
|
||||
static ECoordinateTransMode fromECoordinateTransModeStr(string mode) {
|
||||
if (mode == "half_pixel") {
|
||||
return ECoordinateTransMode::halfPixel;
|
||||
} else if (mode == "asymmetric") {
|
||||
return ECoordinateTransMode::asymmetric;
|
||||
} else if (mode == "align_corners") {
|
||||
return ECoordinateTransMode::alignCorners;
|
||||
} else if (mode == "pytorch_half_pixel") {
|
||||
return ECoordinateTransMode::pytorchHalfPixel;
|
||||
} else if (mode == "tf_crop_and_resize") {
|
||||
return ECoordinateTransMode::tfCropAndResize;
|
||||
} else {
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
|
||||
static ENearestMode fromENearestModeStr(string mode) {
|
||||
if (mode == "round_prefer_floor") {
|
||||
return ENearestMode::roundPreferFloor;
|
||||
} else if (mode == "round_prefer_ceil") {
|
||||
return ENearestMode::roundPreferCeil;
|
||||
} else if (mode == "floor") {
|
||||
return ENearestMode::floor;
|
||||
} else if (mode == "ceil") {
|
||||
return ENearestMode::ceil;
|
||||
} else {
|
||||
return ENearestMode::none;
|
||||
}
|
||||
}
|
||||
|
||||
static EKeepAspectRatioPolicy fromRatioPolicyStr(string ratioPolicyStr) {
|
||||
if (ratioPolicyStr == "stretch") {
|
||||
return EKeepAspectRatioPolicy::stretch;
|
||||
} else if (ratioPolicyStr == "not_larger") {
|
||||
return EKeepAspectRatioPolicy::notLarger;
|
||||
} else if (ratioPolicyStr == "not_smaller") {
|
||||
return EKeepAspectRatioPolicy::notSmaller;
|
||||
} else {
|
||||
return EKeepAspectRatioPolicy::none;
|
||||
}
|
||||
}
|
||||
|
||||
static ECoeffMode fromECoeffModeStr(string mode) {
|
||||
if (mode == "nearest") {
|
||||
return ECoeffMode::nearest;
|
||||
} else if (mode == "linear") {
|
||||
return ECoeffMode::linear;
|
||||
} else if (mode == "cubic") {
|
||||
return ECoeffMode::cubic;
|
||||
} else {
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
vector<int> axes;
|
||||
vector<float> scales;
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* @brief Fused RMSNorm Operator
|
||||
*
|
||||
*/
|
||||
class RMSNormObj : public OperatorObj {
|
||||
int dim;
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new RMSNorm object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param input The input tensor.
|
||||
* @param output The output tensor.
|
||||
*/
|
||||
RMSNormObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output);
|
||||
OP_CLONE(RMSNormObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 2; }
|
||||
int numOutputs() const override { return 1; }
|
||||
int getDim() const { return dim; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
} // namespace infini
|
|
@ -1,29 +0,0 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
class RoPEObj : public OperatorObj {
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new RotaryEmbedding object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param pos The positon id of the query.
|
||||
* @param input The input tensor.
|
||||
* @param output The output tensor.
|
||||
*/
|
||||
RoPEObj(GraphObj *graph, Tensor pos, Tensor input, Tensor output);
|
||||
OP_CLONE(RoPEObj);
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 2; }
|
||||
int numOutputs() const override { return 1; }
|
||||
DataType getDType() const { return getInputs(1)->getDType(); }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -1,42 +0,0 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
*
|
||||
* https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2193/user-guide/docs/index.html
|
||||
*/
|
||||
class SendObj : public OperatorObj {
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new SendRecv object
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param input send input
|
||||
* @param output recv output
|
||||
* @param source the send rank
|
||||
* @param destination the recv rank
|
||||
*/
|
||||
SendObj(GraphObj *graph, Tensor input, int source, int destination,
|
||||
Tensor output = nullptr);
|
||||
OP_CLONE(SendObj);
|
||||
|
||||
int numInputs() const override { return 1; }
|
||||
int numOutputs() const override { return outputs.size(); }
|
||||
std::string toString() const override;
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
int getSourceRank() const { return source; }
|
||||
int getDestinationRank() const { return destination; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||
|
||||
protected:
|
||||
int source;
|
||||
int destination;
|
||||
};
|
||||
} // namespace infini
|
|
@ -1,39 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
/**
|
||||
* @brief Remove single-dimensional entries from the shape of a tensor.
|
||||
*
|
||||
*/
|
||||
class SqueezeObj : public OperatorObj {
|
||||
Shape axes;
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new Squeeze object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param input The input tensor.
|
||||
* @param output The output tensor.
|
||||
* @param axes List of integers indicating the dimensions to squeeze.
|
||||
*/
|
||||
SqueezeObj(GraphObj *graph, Tensor input, Tensor output, Shape axes);
|
||||
OP_CLONE(SqueezeObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 1; }
|
||||
int numOutputs() const override { return 1; }
|
||||
|
||||
inline Shape getAxes() const { return axes; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -159,7 +159,6 @@ enum class CastType {
|
|||
Uint322Int64,
|
||||
Float162Float,
|
||||
BFloat162Float,
|
||||
Float2Float,
|
||||
};
|
||||
|
||||
class CastObj : public OperatorObj {
|
||||
|
@ -259,7 +258,6 @@ class LogObj : public OperatorObj {
|
|||
};
|
||||
|
||||
DEFINE_UNARY_OBJ(Relu, OpType::Relu)
|
||||
DEFINE_UNARY_OBJ(Silu, OpType::Silu)
|
||||
DEFINE_UNARY_OBJ(Gelu, OpType::Gelu)
|
||||
DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
|
||||
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
|
||||
|
|
|
@ -1,38 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* @brief nsert single-dimensional entries to the shape of an input tensor.
|
||||
*
|
||||
*/
|
||||
class UnsqueezeObj : public OperatorObj {
|
||||
Shape axes;
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new Unsqueeze object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param input The input tensor.
|
||||
* @param output The output tensor.
|
||||
* @param axes List of integers indicating the dimensions to be inserted.
|
||||
*/
|
||||
UnsqueezeObj(GraphObj *graph, Tensor input, Tensor output, Shape axes);
|
||||
OP_CLONE(UnsqueezeObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 1; }
|
||||
int numOutputs() const override { return 1; }
|
||||
|
||||
inline Shape getAxes() const { return axes; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,14 @@
|
|||
#pragma once
|
||||
|
||||
namespace infini {
|
||||
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
|
||||
int nDims, int size) {
|
||||
for (int i = nDims - size - 1; i >= 0; --i) {
|
||||
modifyShape.data[i] = 1;
|
||||
}
|
||||
for (int i = nDims - 1; i >= nDims - size; --i) {
|
||||
modifyShape.data[i] = originShape[i - nDims + size];
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -91,12 +91,6 @@ template <int val> class ValGenerator : public DataGenerator {
|
|||
fill<uint32_t>(data, size);
|
||||
}
|
||||
void fill(float *data, size_t size) override { fill<float>(data, size); }
|
||||
void fill_fp16(uint16_t *data, size_t size) {
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
float x = 1.0f * val;
|
||||
data[i] = float_to_fp16(x);
|
||||
}
|
||||
}
|
||||
};
|
||||
typedef ValGenerator<1> OneGenerator;
|
||||
typedef ValGenerator<0> ZeroGenerator;
|
||||
|
|
|
@ -2,33 +2,16 @@
|
|||
#ifndef OPERATOR_UTIL_H
|
||||
#define OPERATOR_UTIL_H
|
||||
|
||||
#include "core/operator.h"
|
||||
#include "core/tensor.h"
|
||||
|
||||
#include "utils/small_array.h"
|
||||
#include <numeric>
|
||||
|
||||
namespace infini {
|
||||
|
||||
// Launch a broadcast shape based on the shape of input A and B
|
||||
Shape infer_broadcast(const Shape &A, const Shape &B);
|
||||
// Launch the real axis based on rank and current axis
|
||||
int get_real_axis(const int &axis, const int &rank);
|
||||
// Check if tensor B is unidirectional broadcastable to tensor A
|
||||
// check if tensor B is unidirectional broadcastable to tensor A
|
||||
bool is_unidirectional_broadcasting(const Shape &A, const Shape &B);
|
||||
// Locate the index with size from Shape
|
||||
Shape locate_index(size_t inputN, const Shape &shape);
|
||||
// Delocate the ShapeIndex from Shape with broadcast
|
||||
size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
|
||||
const Shape &stride);
|
||||
// Convert KernelAttrs to a string representation
|
||||
std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
|
||||
// VectorProd
|
||||
int shapeProd(std::vector<int>::iterator start, std::vector<int>::iterator end);
|
||||
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
|
||||
int nDims, int size);
|
||||
void broadcastShape(const Shape &tempShape, Shape &modifyShape);
|
||||
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
||||
|
|
|
@ -4,14 +4,6 @@ namespace infini {
|
|||
#define SMALL_ARRAY_SIZE 8
|
||||
struct SmallArray {
|
||||
int data[SMALL_ARRAY_SIZE];
|
||||
|
||||
int prod(int start, int end) {
|
||||
int result = 1;
|
||||
for (int i = start; i < end; ++i) {
|
||||
result *= data[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -295,36 +295,6 @@ class TestStringMethods(unittest.TestCase):
|
|||
make_graph([reshape], "reshape", [data, shape], [reshaped], [shape_data])
|
||||
)
|
||||
|
||||
def test_resize(self):
|
||||
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 128, 40, 40])
|
||||
roi = make_tensor("roi", TensorProto.FLOAT, [0], [])
|
||||
scales = make_tensor("scales", TensorProto.FLOAT, [4], [1, 1, 2, 2])
|
||||
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 128, 80, 80])
|
||||
reshape = make_node("Resize", ["x", "roi", "scales"], ["y"], name="resize")
|
||||
make_and_import_model(make_graph([reshape], "resize", [x], [y], [roi, scales]))
|
||||
|
||||
def test_squeeze(self):
|
||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 1, 5])
|
||||
axes = make_tensor_value_info("axes", TensorProto.INT64, [2])
|
||||
axes_data = make_tensor("axes", TensorProto.INT64, [2], [0, 2])
|
||||
output = make_tensor_value_info("output", TensorProto.FLOAT, [3, 5])
|
||||
squeeze = make_node("Squeeze", ["input", "axes"], ["output"], name="squeeze")
|
||||
make_and_import_model(
|
||||
make_graph([squeeze], "squeeze", [input, axes], [output], [axes_data])
|
||||
)
|
||||
|
||||
def test_unsqueeze(self):
|
||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [2, 3, 4, 5])
|
||||
axes = make_tensor_value_info("axes", TensorProto.INT64, [2])
|
||||
axes_data = make_tensor("axes", TensorProto.INT64, [2], [0, 2])
|
||||
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 2, 1, 3, 4, 5])
|
||||
unsqueeze = make_node(
|
||||
"Unsqueeze", ["input", "axes"], ["output"], name="unsqueeze"
|
||||
)
|
||||
make_and_import_model(
|
||||
make_graph([unsqueeze], "unsqueeze", [input, axes], [output], [axes_data])
|
||||
)
|
||||
|
||||
def test_concat(self):
|
||||
input1 = make_tensor_value_info("input1", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
input2 = make_tensor_value_info("input2", TensorProto.FLOAT, [1, 3, 2, 5])
|
||||
|
@ -367,7 +337,7 @@ class TestStringMethods(unittest.TestCase):
|
|||
"ReduceMean", ["data"], ["reduced"], keepdims=1, name="reduceMean"
|
||||
)
|
||||
make_and_import_model(make_graph([reduceMean], "reduceMean", [data], [reduced]))
|
||||
|
||||
|
||||
def test_reduce_sum(self):
|
||||
data = make_tensor_value_info("data", TensorProto.FLOAT, [2, 3, 3, 4])
|
||||
reduced = make_tensor_value_info("reduced", TensorProto.FLOAT, [1, 1, 1, 1])
|
||||
|
@ -463,20 +433,7 @@ class TestStringMethods(unittest.TestCase):
|
|||
def test_split(self):
|
||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
split = make_node("Split", ["input"], ["output"], name="split", axis=0)
|
||||
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
make_and_import_model(make_graph([split], "split", [input], [output]))
|
||||
|
||||
def test_split1(self):
|
||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
splitAttr = make_tensor("split", TensorProto.INT64, [2], [2, 1])
|
||||
output1 = make_tensor_value_info("output1", TensorProto.FLOAT, [1, 2, 2, 4])
|
||||
output2 = make_tensor_value_info("output2", TensorProto.FLOAT, [1, 1, 2, 4])
|
||||
split = make_node(
|
||||
"Split", ["input", "split"], ["output1", "output2"], name="split", axis=1
|
||||
)
|
||||
make_and_import_model(
|
||||
make_graph([split], "split", [input], [output1, output2], [splitAttr])
|
||||
)
|
||||
make_and_import_model(make_graph([split], "split", [input], []))
|
||||
|
||||
def test_allBroadcast(self):
|
||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
|
@ -551,29 +508,6 @@ class TestStringMethods(unittest.TestCase):
|
|||
where = make_node("Where", ["x", "y", "con"], ["output"], name="where")
|
||||
make_and_import_model(make_graph([where], "where", [x, y, con], [output]))
|
||||
|
||||
def test_send(self):
|
||||
sendInput = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
send = make_node("Send", ["input"], [], name="send", source=0, destination=1)
|
||||
graph = make_graph([send], "send", [sendInput], [])
|
||||
model = make_model(graph)
|
||||
from_onnx(model, backend.cpu_runtime())
|
||||
|
||||
def test_recv(self):
|
||||
recvOutput = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
recv = make_node(
|
||||
"Recv",
|
||||
[],
|
||||
["output"],
|
||||
name="recv",
|
||||
source=0,
|
||||
destination=1,
|
||||
shape=[1, 3, 5, 7],
|
||||
dataType=1,
|
||||
)
|
||||
graph = make_graph([recv], "recv", [], [recvOutput])
|
||||
model = make_model(graph)
|
||||
from_onnx(model, backend.cpu_runtime())
|
||||
|
||||
|
||||
class TestDynamicTensor(unittest.TestCase):
|
||||
def test_dynamic_tensor(self):
|
||||
|
@ -583,7 +517,6 @@ class TestDynamicTensor(unittest.TestCase):
|
|||
for root, dirs, files in os.walk(current_path):
|
||||
if filename in files:
|
||||
model_file = os.path.join(root, filename)
|
||||
|
||||
model = OnnxStub(onnx.load(model_file), backend.cpu_runtime())
|
||||
output_key = list(model.outputs.keys())[0]
|
||||
old_output_shape = model.getShape(output_key)
|
||||
|
|
|
@ -1,9 +1,6 @@
|
|||
#include "bang/bang_runtime.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/perf_engine.h"
|
||||
#ifdef INFINI_USE_CNCL
|
||||
#include "bang/cncl_communicator.h"
|
||||
#endif
|
||||
|
||||
namespace infini {
|
||||
|
||||
|
@ -16,7 +13,8 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
|
|||
std::map<OpType, int> opCnt;
|
||||
for (auto &op : graph->getOperators()) {
|
||||
// HACK: set correct data type
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
@ -61,15 +59,4 @@ void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
|
|||
|
||||
string BangRuntimeObj::toString() const { return "BANG Runtime"; }
|
||||
|
||||
void BangRuntimeObj::initComm(const string &name, int worldSize, int rank) {
|
||||
IT_ASSERT(worldSize > 0);
|
||||
IT_ASSERT(rank >= 0);
|
||||
IT_ASSERT(rank < worldSize);
|
||||
IT_ASSERT(!comm) << "communicator is already initialized.";
|
||||
#ifdef INFINI_USE_CNCL
|
||||
comm = std::make_unique<CnclCommunicatorObj>(name, worldSize, rank);
|
||||
#else
|
||||
IT_TODO_HALT_MSG("Not compiled with CNCL.");
|
||||
#endif
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
|
@ -11,33 +11,20 @@ GraphObj::GraphObj(Runtime runtime, OpVec ops_in)
|
|||
map<UidBaseType, Tensor> tensorPool;
|
||||
// Clone tensors
|
||||
for (const auto &op : ops_in) {
|
||||
for (const auto &t : op->getInputs()) {
|
||||
if (t) {
|
||||
if (tensorPool.find(t->getFuid()) == tensorPool.end())
|
||||
tensorPool[t->getFuid()] = cloneTensor(t);
|
||||
}
|
||||
}
|
||||
for (const auto &t : op->getOutputs()) {
|
||||
if (t) {
|
||||
if (tensorPool.find(t->getFuid()) == tensorPool.end())
|
||||
tensorPool[t->getFuid()] = cloneTensor(t);
|
||||
}
|
||||
}
|
||||
for (const auto &t : op->getInputs())
|
||||
if (tensorPool.find(t->getFuid()) == tensorPool.end())
|
||||
tensorPool[t->getFuid()] = cloneTensor(t);
|
||||
for (const auto &t : op->getOutputs())
|
||||
if (tensorPool.find(t->getFuid()) == tensorPool.end())
|
||||
tensorPool[t->getFuid()] = cloneTensor(t);
|
||||
}
|
||||
// Clone operators and add connections
|
||||
for (const auto &op : ops_in) {
|
||||
TensorVec inputs, outputs;
|
||||
for (const auto &t : op->getInputs()) {
|
||||
if (t) {
|
||||
inputs.emplace_back(tensorPool.at(t->getFuid()));
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto &t : op->getOutputs()) {
|
||||
if (t) {
|
||||
outputs.emplace_back(tensorPool.at(t->getFuid()));
|
||||
}
|
||||
}
|
||||
for (const auto &t : op->getInputs())
|
||||
inputs.emplace_back(tensorPool.at(t->getFuid()));
|
||||
for (const auto &t : op->getOutputs())
|
||||
outputs.emplace_back(tensorPool.at(t->getFuid()));
|
||||
addOperatorAndConnect(op->clone(inputs, outputs));
|
||||
}
|
||||
}
|
||||
|
@ -46,21 +33,17 @@ void GraphObj::addOperatorAndConnect(const Operator &op) {
|
|||
sorted = false;
|
||||
ops.push_back(op);
|
||||
for (auto &input : op->getInputs()) {
|
||||
if (input) {
|
||||
input->addTarget(op);
|
||||
if (auto pred = input->getSource()) {
|
||||
pred->addSuccessors(op);
|
||||
op->addPredecessors(pred);
|
||||
}
|
||||
input->addTarget(op);
|
||||
if (auto pred = input->getSource()) {
|
||||
pred->addSuccessors(op);
|
||||
op->addPredecessors(pred);
|
||||
}
|
||||
}
|
||||
for (auto &output : op->getOutputs()) {
|
||||
if (output) {
|
||||
output->setSource(op);
|
||||
for (auto &succ : output->getTargets()) {
|
||||
succ->addPredecessors(op);
|
||||
op->addSuccessors(succ);
|
||||
}
|
||||
output->setSource(op);
|
||||
for (auto &succ : output->getTargets()) {
|
||||
succ->addPredecessors(op);
|
||||
op->addSuccessors(succ);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -87,33 +70,48 @@ string GraphObj::toString() const {
|
|||
}
|
||||
|
||||
bool GraphObj::topo_sort() {
|
||||
if (this->sorted) {
|
||||
if (this->sorted)
|
||||
return true;
|
||||
}
|
||||
|
||||
// std::unordered_set<Tensor> inputs;
|
||||
std::unordered_set<Operator> waiting(this->ops.begin(), this->ops.end());
|
||||
std::vector<Operator> sorted;
|
||||
std::unordered_set<OperatorObj *> flags;
|
||||
sorted.reserve(ops.size());
|
||||
flags.reserve(ops.size());
|
||||
while (sorted.size() < ops.size()) {
|
||||
|
||||
while (!waiting.empty()) {
|
||||
// Any node is move to sorted in this loop.
|
||||
auto modified = false;
|
||||
for (auto const &op : ops) {
|
||||
if (auto const &inputs = op->getInputs();
|
||||
flags.find(op.get()) == flags.end() &&
|
||||
std::all_of(inputs.begin(), inputs.end(),
|
||||
[&flags](auto const &input) {
|
||||
auto ptr = input->getSource().get();
|
||||
return !ptr || flags.find(ptr) != flags.end();
|
||||
})) {
|
||||
// Find head nodes.
|
||||
for (auto it = waiting.begin(); it != waiting.end();) {
|
||||
const auto &this_inputs = (*it)->getInputs();
|
||||
// If none of the input tensors is in waiting list,
|
||||
// this node is a head node.
|
||||
const auto is_head = std::all_of(
|
||||
this_inputs.begin(), this_inputs.end(), [&](const auto &input) {
|
||||
auto src = input->getSource();
|
||||
return src // If the source node is in the waiting list,
|
||||
// means that this node is not the head node.
|
||||
? waiting.find(src) == waiting.end()
|
||||
// This tensor has no source node,
|
||||
// it must be a input tensor.
|
||||
: (/*inputs.insert(input),*/ true);
|
||||
});
|
||||
// Moves head node to sorted.
|
||||
if (is_head) {
|
||||
modified = true;
|
||||
sorted.emplace_back(op);
|
||||
flags.insert(op.get());
|
||||
sorted.emplace_back(std::move(*it));
|
||||
it = waiting.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
// Waiting list never modifies during a pass,
|
||||
// sorting fails.
|
||||
if (!modified) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Done.
|
||||
this->ops = std::move(sorted);
|
||||
return this->sorted = true;
|
||||
}
|
||||
|
@ -157,7 +155,6 @@ void GraphObj::shape_infer() {
|
|||
|
||||
void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
|
||||
// topological sorting first
|
||||
|
||||
IT_ASSERT(topo_sort() == true);
|
||||
if (useNaiveAllocator) {
|
||||
// can not set memory pool when use naive allocator
|
||||
|
@ -167,10 +164,7 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
|
|||
// note: behavior may not match running in non-naive mode, and it may
|
||||
// not reproduce the bug
|
||||
for (auto &tensor : tensors) {
|
||||
if (!tensor->isWeight() ||
|
||||
(tensor->isWeight() && !weightAllocated)) {
|
||||
tensor->dataMalloc();
|
||||
}
|
||||
tensor->dataMalloc();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -228,28 +222,24 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
|
|||
// memory should be allocated for the op's output first
|
||||
auto outputs = op->getOutputs();
|
||||
for (auto &tensor : outputs) {
|
||||
if (tensor) {
|
||||
if (tensor->isOthers()) {
|
||||
tensorToOffset[tensor.get()] =
|
||||
allocator.alloc(tensor->getBytes());
|
||||
}
|
||||
if (tensor->isOthers()) {
|
||||
tensorToOffset[tensor.get()] =
|
||||
allocator.alloc(tensor->getBytes());
|
||||
}
|
||||
}
|
||||
auto inputs = op->getInputs();
|
||||
for (auto &tensor : inputs) {
|
||||
if (tensor) {
|
||||
if (tensor->isOthers()) {
|
||||
auto tensorIter = tensorToRefCount.find(tensor.get());
|
||||
IT_ASSERT(tensorIter != tensorToRefCount.end());
|
||||
IT_ASSERT(tensorToRefCount[tensor.get()] > 0);
|
||||
tensorToRefCount[tensor.get()] -= 1;
|
||||
if (tensorToRefCount[tensor.get()] == 0) {
|
||||
// indicate that this tensor will no longer be used and
|
||||
// perform memory free
|
||||
tensorToRefCount.erase(tensor.get());
|
||||
allocator.free(tensorToOffset[tensor.get()],
|
||||
tensor->getBytes());
|
||||
}
|
||||
if (tensor->isOthers()) {
|
||||
auto tensorIter = tensorToRefCount.find(tensor.get());
|
||||
IT_ASSERT(tensorIter != tensorToRefCount.end());
|
||||
IT_ASSERT(tensorToRefCount[tensor.get()] > 0);
|
||||
tensorToRefCount[tensor.get()] -= 1;
|
||||
if (tensorToRefCount[tensor.get()] == 0) {
|
||||
// indicate that this tensor will no longer be used and
|
||||
// perform memory free
|
||||
tensorToRefCount.erase(tensor.get());
|
||||
allocator.free(tensorToOffset[tensor.get()],
|
||||
tensor->getBytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,27 +10,18 @@
|
|||
#include "operators/expand.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/layer_norm.h"
|
||||
#include "operators/lrn.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/pad.h"
|
||||
#include "operators/pooling.h"
|
||||
#include "operators/recv.h"
|
||||
#include "operators/reduce.h"
|
||||
#include "operators/reshape.h"
|
||||
#include "operators/resize.h"
|
||||
#include "operators/rms_norm.h"
|
||||
#include "operators/rope.h"
|
||||
#include "operators/send.h"
|
||||
#include "operators/slice.h"
|
||||
#include "operators/softmax.h"
|
||||
#include "operators/split.h"
|
||||
#include "operators/squeeze.h"
|
||||
#include "operators/transpose.h"
|
||||
#include "operators/unary.h"
|
||||
#include "operators/unsqueeze.h"
|
||||
#include "operators/where.h"
|
||||
#include <numeric>
|
||||
#include <variant>
|
||||
|
||||
namespace infini {
|
||||
|
||||
|
@ -74,17 +65,15 @@ Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
|
|||
}
|
||||
|
||||
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
|
||||
bool transB, Tensor bias, ActType act,
|
||||
std::string matmul_compute_type) {
|
||||
bool transB, Tensor bias, ActType act) {
|
||||
if (y) {
|
||||
g->addOpWithOutputs<MatmulObj>(std::move(a), std::move(b), y, transA,
|
||||
transB, std::move(bias), act,
|
||||
matmul_compute_type);
|
||||
transB, std::move(bias), act);
|
||||
return y;
|
||||
} else {
|
||||
return g
|
||||
->addOp<MatmulObj>(std::move(a), std::move(b), y, transA, transB,
|
||||
std::move(bias), act, matmul_compute_type)
|
||||
std::move(bias), act)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
@ -125,17 +114,6 @@ Tensor GraphHandlerObj::layerNormalization(Tensor input, Tensor scale,
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::rmsNorm(Tensor input, Tensor weight, Tensor output) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<RMSNormObj>(std::move(input), std::move(weight),
|
||||
output);
|
||||
return output;
|
||||
} else {
|
||||
return g->addOp<RMSNormObj>(std::move(input), std::move(weight), output)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
|
||||
int dh, int dw, int ph, int pw, int sh, int sw,
|
||||
int ceilMode) {
|
||||
|
@ -196,7 +174,6 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
|
|||
} \
|
||||
}
|
||||
|
||||
DEFINE_UNARY_METHOD(silu, Silu)
|
||||
DEFINE_UNARY_METHOD(relu, Relu)
|
||||
DEFINE_UNARY_METHOD(gelu, Gelu)
|
||||
DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
|
||||
|
@ -273,64 +250,6 @@ Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) {
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::resize(Tensor input, Tensor output,
|
||||
const std::optional<vector<int>> &axes,
|
||||
Tensor sizes, Tensor scales, Tensor roi,
|
||||
vector<uint32_t> sizes_, vector<float> scales_,
|
||||
vector<float> roi_, string mode,
|
||||
string ratioPolicy, string nearestMode,
|
||||
string coordTransMode) {
|
||||
if (sizes_.size() > 0) {
|
||||
sizes->dataMalloc();
|
||||
sizes->copyin<uint32_t>(sizes_);
|
||||
}
|
||||
if (scales_.size() > 0) {
|
||||
scales->dataMalloc();
|
||||
scales->copyin<float>(scales_);
|
||||
}
|
||||
if (roi_.size() > 0) {
|
||||
roi->dataMalloc();
|
||||
roi->copyin<float>(roi_);
|
||||
}
|
||||
ResizeObj::EKeepAspectRatioPolicy ratioPolicy_ =
|
||||
ResizeObj::fromRatioPolicyStr(ratioPolicy);
|
||||
ResizeObj::ENearestMode nearestMode_ =
|
||||
ResizeObj::fromENearestModeStr(nearestMode);
|
||||
ResizeObj::ECoordinateTransMode coordTransMode_ =
|
||||
ResizeObj::fromECoordinateTransModeStr(coordTransMode);
|
||||
ResizeObj::ECoeffMode mode_ = ResizeObj::fromECoeffModeStr(mode);
|
||||
if (output) {
|
||||
if (mode == "nearest") {
|
||||
g->addOpWithOutputs<ResizeObj>(
|
||||
std::move(input), output, std::move(axes), std::move(sizes),
|
||||
std::move(scales), std::move(roi), ratioPolicy_, nearestMode_,
|
||||
coordTransMode_);
|
||||
} else {
|
||||
g->addOpWithOutputs<ResizeObj>(
|
||||
std::move(input), output, std::move(axes), std::move(sizes),
|
||||
std::move(scales), std::move(roi), mode_, ratioPolicy_,
|
||||
coordTransMode_);
|
||||
}
|
||||
return output;
|
||||
} else {
|
||||
if (mode == "nearest") {
|
||||
return g
|
||||
->addOp<ResizeObj>(std::move(input), output, std::move(axes),
|
||||
std::move(sizes), std::move(scales),
|
||||
std::move(roi), ratioPolicy_, nearestMode_,
|
||||
coordTransMode_)
|
||||
->getOutput();
|
||||
} else {
|
||||
return g
|
||||
->addOp<ResizeObj>(std::move(input), output, std::move(axes),
|
||||
std::move(sizes), std::move(scales),
|
||||
std::move(roi), mode_, ratioPolicy_,
|
||||
coordTransMode_)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<ConcatObj>(std::move(inputs), output, dim);
|
||||
|
@ -350,7 +269,7 @@ Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
|
|||
std::move(input_k_cache), std::move(input_v_cache),
|
||||
std::move(input_q), std::move(input_k), std::move(input_v),
|
||||
std::move(position_id), output_matmul);
|
||||
return output_matmul;
|
||||
return {output_matmul};
|
||||
} else {
|
||||
return g
|
||||
->addOp<AttentionKVCacheObj>(
|
||||
|
@ -361,40 +280,15 @@ Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::RoPE(Tensor pos, Tensor input, Tensor output) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<RoPEObj>(std::move(pos), std::move(input), output);
|
||||
return output;
|
||||
} else {
|
||||
return g->addOp<RoPEObj>(std::move(pos), std::move(input), output)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
|
||||
int axis,
|
||||
std::variant<int, vector<int>> numOrRatio) {
|
||||
int axis, int num_outputs) {
|
||||
if (outputs) {
|
||||
if (std::holds_alternative<int>(numOrRatio)) {
|
||||
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
|
||||
std::get<int>(numOrRatio));
|
||||
} else {
|
||||
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
|
||||
std::get<vector<int>>(numOrRatio));
|
||||
}
|
||||
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
|
||||
num_outputs);
|
||||
return *outputs;
|
||||
} else {
|
||||
if (std::holds_alternative<int>(numOrRatio)) {
|
||||
return g
|
||||
->addOp<SplitObj>(std::move(input), outputs, axis,
|
||||
std::get<int>(numOrRatio))
|
||||
->getOutputs();
|
||||
} else {
|
||||
return g
|
||||
->addOp<SplitObj>(std::move(input), outputs, axis,
|
||||
std::get<vector<int>>(numOrRatio))
|
||||
->getOutputs();
|
||||
}
|
||||
return g->addOp<SplitObj>(std::move(input), outputs, axis, num_outputs)
|
||||
->getOutputs();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -540,39 +434,6 @@ Tensor GraphHandlerObj::broadcast(Tensor input, Tensor output, int root) {
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::send(Tensor input, int source, int destination,
|
||||
Tensor output) {
|
||||
if (output) {
|
||||
|
||||
g->addOpWithOutputs<SendObj>(std::move(input), source, destination,
|
||||
output);
|
||||
|
||||
return output;
|
||||
} else {
|
||||
return g->addOp<SendObj>(std::move(input), source, destination, output)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::recv(Tensor output, int source, int destination,
|
||||
Shape dims, int outputType, Tensor input) {
|
||||
|
||||
if (output) {
|
||||
|
||||
g->addOpWithOutputs<RecvObj>(output, source, destination,
|
||||
std::move(dims), outputType,
|
||||
std::move(input));
|
||||
|
||||
return output;
|
||||
} else {
|
||||
|
||||
return g
|
||||
->addOp<RecvObj>(output, source, destination, std::move(dims),
|
||||
outputType, std::move(input))
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::cast(Tensor input, Tensor output, int to) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<CastObj>(std::move(input), output,
|
||||
|
@ -623,41 +484,6 @@ Tensor GraphHandlerObj::depthToSpace(Tensor input, Tensor output, int blocksize,
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::lrn(Tensor input, Tensor output, float alpha,
|
||||
float beta, float bias, int size) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<LRNObj>(std::move(input), output, alpha, beta, bias,
|
||||
size);
|
||||
return output;
|
||||
} else {
|
||||
return g
|
||||
->addOp<LRNObj>(std::move(input), output, alpha, beta, bias, size)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::squeeze(Tensor input, Tensor output, Shape axes) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<SqueezeObj>(std::move(input), output,
|
||||
std::move(axes));
|
||||
return output;
|
||||
} else {
|
||||
return g->addOp<SqueezeObj>(std::move(input), output, std::move(axes))
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::unsqueeze(Tensor input, Tensor output, Shape axes) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<UnsqueezeObj>(std::move(input), output,
|
||||
std::move(axes));
|
||||
return output;
|
||||
} else {
|
||||
return g->addOp<UnsqueezeObj>(std::move(input), output, std::move(axes))
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
static CastType inferCastType(Tensor input, int to) {
|
||||
auto iType = input->getDType();
|
||||
auto oType = DataType(to);
|
||||
|
@ -709,8 +535,6 @@ static CastType inferCastType(Tensor input, int to) {
|
|||
return CastType::Float162Float;
|
||||
} else if (iType == DataType::BFloat16 && oType == DataType::Float32) {
|
||||
return CastType::BFloat162Float;
|
||||
} else if (iType == DataType::Float32 && oType == DataType::Float32) {
|
||||
return CastType::Float2Float;
|
||||
} else {
|
||||
IT_TODO_HALT_MSG("Unsupported CastType : input_type is " +
|
||||
iType.toString() + " output_type is " +
|
||||
|
|
|
@ -6,10 +6,8 @@ namespace infini {
|
|||
|
||||
OperatorObj::OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs)
|
||||
: type(opType), inputs(inputs), outputs(outputs) {
|
||||
if (opType != OpType::Recv) {
|
||||
for (const auto &t : inputs)
|
||||
IT_ASSERT(t);
|
||||
}
|
||||
for (const auto &t : inputs)
|
||||
IT_ASSERT(t);
|
||||
}
|
||||
|
||||
void OperatorObj::removePredecessors(const Operator &op) {
|
||||
|
|
|
@ -17,7 +17,8 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
|
|||
std::map<OpType, int> opCnt;
|
||||
|
||||
for (auto &op : graph->getOperators()) {
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
@ -65,7 +66,8 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
|
|||
std::map<OpType, int> opCnt;
|
||||
|
||||
for (auto &op : graph->getOperators()) {
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
|
|
@ -66,36 +66,6 @@ void TensorObj::setShape(Shape shape_) {
|
|||
_size = size;
|
||||
}
|
||||
|
||||
void TensorObj::dumpData(std::ofstream &ofs) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
if (!runtime->isCpu())
|
||||
IT_TODO_HALT();
|
||||
|
||||
#define TRY_DUMP(N) \
|
||||
if (dtype == DataType(N)) \
|
||||
ofs << dataToString<DT<N>::t>() << std::endl;
|
||||
|
||||
TRY_DUMP(0) // fmt: new line
|
||||
else TRY_DUMP(1) //
|
||||
else TRY_DUMP(2) //
|
||||
else TRY_DUMP(3) //
|
||||
else TRY_DUMP(4) //
|
||||
else TRY_DUMP(5) //
|
||||
else TRY_DUMP(6) //
|
||||
else TRY_DUMP(7) //
|
||||
else TRY_DUMP(8) //
|
||||
else TRY_DUMP(9) //
|
||||
else TRY_DUMP(10) //
|
||||
else TRY_DUMP(11) //
|
||||
else TRY_DUMP(12) //
|
||||
else TRY_DUMP(13) //
|
||||
else TRY_DUMP(16) //
|
||||
else IT_TODO_HALT();
|
||||
ofs.flush();
|
||||
|
||||
#undef TRY_DUMP
|
||||
}
|
||||
|
||||
void TensorObj::printData() const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
if (!runtime->isCpu())
|
||||
|
|
|
@ -19,12 +19,14 @@ void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
|
|||
}
|
||||
|
||||
namespace infini {
|
||||
|
||||
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
for (auto &op : graph->getOperators()) {
|
||||
// HACK: set correct data type
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
@ -38,27 +40,6 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
|||
}
|
||||
}
|
||||
|
||||
void CudaRuntimeObj::runWithCudaGraph(const Graph &graph) {
|
||||
if (!isCudaGraphCreated) {
|
||||
CUDAStream::createStream();
|
||||
checkCudnnError(cudnnSetStream(cudnn, CUDAStream::getCurrentStream()));
|
||||
checkCublasError(
|
||||
cublasSetStream(cublas, CUDAStream::getCurrentStream()));
|
||||
checkCudaError(cudaStreamBeginCapture(CUDAStream::getCurrentStream(),
|
||||
cudaStreamCaptureModeGlobal));
|
||||
runWithoutSync(graph);
|
||||
checkCudaError(
|
||||
cudaStreamEndCapture(CUDAStream::getCurrentStream(), &cudaGraph));
|
||||
checkCudaError(
|
||||
cudaGraphInstantiate(&cudaGraphInstance, cudaGraph, NULL, NULL, 0));
|
||||
isCudaGraphCreated = true;
|
||||
} else {
|
||||
checkCudaError(
|
||||
cudaGraphLaunch(cudaGraphInstance, CUDAStream::getCurrentStream()));
|
||||
}
|
||||
checkCudaError(cudaStreamSynchronize(CUDAStream::getCurrentStream()));
|
||||
}
|
||||
|
||||
void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
|
@ -67,7 +48,8 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
|
|||
std::map<OpType, int> opCnt;
|
||||
for (auto &op : graph->getOperators()) {
|
||||
// HACK: set correct data type
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying(),
|
||||
DataType::Float32};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
@ -122,5 +104,4 @@ void CudaRuntimeObj::initComm(const string &name, int worldSize, int rank) {
|
|||
#endif
|
||||
}
|
||||
|
||||
cudaStream_t CUDAStream::_stream = 0;
|
||||
} // namespace infini
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
#include "core/data_type.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
#include <cstdio>
|
||||
|
||||
__global__ void cudaPrintFloatImpl(float *x, int len) {
|
||||
|
@ -16,60 +14,8 @@ __global__ void cudaPrintFloatImpl(float *x, int len) {
|
|||
namespace infini {
|
||||
|
||||
void cudaPrintFloat(float *x, int len) {
|
||||
cudaPrintFloatImpl
|
||||
<<<1, 1, 0, CUDAStream::getCurrentStream()>>>(x, len);
|
||||
cudaPrintFloatImpl<<<1, 1>>>(x, len);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void cudaPrintTensor(const Tensor &tensor) {
|
||||
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
|
||||
}
|
||||
|
||||
cudnnDataType_t cudnnDataTypeConvert(DataType dataType) {
|
||||
if (dataType == DataType::Float32) {
|
||||
return CUDNN_DATA_FLOAT;
|
||||
}
|
||||
if (dataType == DataType::Double) {
|
||||
return CUDNN_DATA_DOUBLE;
|
||||
}
|
||||
if (dataType == DataType::Float16) {
|
||||
return CUDNN_DATA_HALF;
|
||||
}
|
||||
if (dataType == DataType::Int8) {
|
||||
return CUDNN_DATA_INT8;
|
||||
}
|
||||
if (dataType == DataType::Int32) {
|
||||
return CUDNN_DATA_INT32;
|
||||
}
|
||||
if (dataType == DataType::UInt8) {
|
||||
return CUDNN_DATA_UINT8;
|
||||
}
|
||||
if (dataType == DataType::BFloat16) {
|
||||
return CUDNN_DATA_BFLOAT16;
|
||||
}
|
||||
if (dataType == DataType::Int64) {
|
||||
return CUDNN_DATA_INT64;
|
||||
}
|
||||
if (dataType == DataType::Bool) {
|
||||
return CUDNN_DATA_BOOLEAN;
|
||||
}
|
||||
IT_ASSERT(false, "Unsupported data type");
|
||||
}
|
||||
|
||||
cudaDataType cublasDataTypeConvert(DataType dataType) {
|
||||
switch (dataType.getIndex()) {
|
||||
case 1:
|
||||
return CUDA_R_32F;
|
||||
// case 3:
|
||||
// return CUDA_R_8I;
|
||||
case 10:
|
||||
return CUDA_R_16F;
|
||||
case 11:
|
||||
return CUDA_R_64F;
|
||||
// case 16:
|
||||
// return CUDA_R_16BF;
|
||||
default:
|
||||
IT_ASSERT(false, "MatMul Unsupported data type");
|
||||
}
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
|
@ -5,17 +5,14 @@
|
|||
#include "operators/conv.h"
|
||||
#include "operators/expand.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/lrn.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/pad.h"
|
||||
#include "operators/pooling.h"
|
||||
#include "operators/reduce.h"
|
||||
#include "operators/reshape.h"
|
||||
#include "operators/split.h"
|
||||
#include "operators/squeeze.h"
|
||||
#include "operators/transpose.h"
|
||||
#include "operators/unary.h"
|
||||
#include "operators/unsqueeze.h"
|
||||
#include <algorithm>
|
||||
#include <pybind11/numpy.h>
|
||||
#include <pybind11/pybind11.h>
|
||||
|
@ -95,8 +92,6 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, ReduceMean)
|
||||
.VALUE(OpType, ReduceSum)
|
||||
.VALUE(OpType, Reshape)
|
||||
.VALUE(OpType, Squeeze)
|
||||
.VALUE(OpType, Unsqueeze)
|
||||
.VALUE(OpType, Flatten)
|
||||
.VALUE(OpType, Identity)
|
||||
.VALUE(OpType, BatchNormalization)
|
||||
|
@ -118,7 +113,6 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, Erf)
|
||||
.VALUE(OpType, Where)
|
||||
.VALUE(OpType, DepthToSpace)
|
||||
.VALUE(OpType, LRN)
|
||||
.export_values();
|
||||
|
||||
#undef VALUE
|
||||
|
@ -260,24 +254,6 @@ static vector<int64_t> reshape_shape_of(Operator op) {
|
|||
return ans;
|
||||
}
|
||||
|
||||
static vector<int64_t> squeeze_axes_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Squeeze);
|
||||
auto axes = dynamic_cast<const SqueezeObj *>(op.get())->getAxes();
|
||||
vector<int64_t> ans(axes.size());
|
||||
std::transform(axes.begin(), axes.end(), ans.begin(),
|
||||
[](auto x) { return static_cast<int64_t>(x); });
|
||||
return ans;
|
||||
}
|
||||
|
||||
static vector<int64_t> unsqueeze_axes_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Unsqueeze);
|
||||
auto axes = dynamic_cast<const UnsqueezeObj *>(op.get())->getAxes();
|
||||
vector<int64_t> ans(axes.size());
|
||||
std::transform(axes.begin(), axes.end(), ans.begin(),
|
||||
[](auto x) { return static_cast<int64_t>(x); });
|
||||
return ans;
|
||||
}
|
||||
|
||||
static vector<int64_t> expand_shape_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Expand);
|
||||
auto shape = dynamic_cast<const ExpandObj *>(op.get())->getShape();
|
||||
|
@ -320,14 +296,6 @@ static std::tuple<int, std::string> depth_to_space_attrs_of(Operator op) {
|
|||
depth_to_space->getModeString());
|
||||
}
|
||||
|
||||
static std::tuple<float, float, float, int> lrn_attrs_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::LRN);
|
||||
auto lrn = dynamic_cast<const LRNObj *>(op.get());
|
||||
auto [alpha, beta, bias] = lrn->getAlphaBetaBias();
|
||||
auto size = lrn->getSize();
|
||||
return std::make_tuple(alpha, beta, bias, size);
|
||||
}
|
||||
|
||||
void export_functions(py::module &m) {
|
||||
#define FUNCTION(NAME) def(#NAME, &NAME)
|
||||
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
|
||||
|
@ -364,10 +332,7 @@ void export_functions(py::module &m) {
|
|||
.FUNCTION(gather_axis_of)
|
||||
.FUNCTION(flatten_axis_of)
|
||||
.FUNCTION(cast_to_of)
|
||||
.FUNCTION(depth_to_space_attrs_of)
|
||||
.FUNCTION(squeeze_axes_of)
|
||||
.FUNCTION(unsqueeze_axes_of)
|
||||
.FUNCTION(lrn_attrs_of);
|
||||
.FUNCTION(depth_to_space_attrs_of);
|
||||
#undef FUNCTION
|
||||
}
|
||||
|
||||
|
@ -423,15 +388,11 @@ void init_graph_builder(py::module &m) {
|
|||
#endif
|
||||
#ifdef USE_BANG
|
||||
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
|
||||
m, "BangRuntime")
|
||||
.def(py::init<int>(), py::arg("device") = 0)
|
||||
.def("init_comm", &BangRuntimeObj::initComm);
|
||||
m, "BangRuntime");
|
||||
#endif
|
||||
#ifdef USE_KUNLUN
|
||||
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
|
||||
m, "KUNLUNRuntime")
|
||||
.def(py::init<int>(), py::arg("device") = 0)
|
||||
.def("init_comm", &KUNLUNRuntimeObj::initComm);
|
||||
m, "KUNLUNRuntime");
|
||||
#endif
|
||||
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor",
|
||||
py::buffer_protocol())
|
||||
|
@ -506,7 +467,6 @@ void init_graph_builder(py::module &m) {
|
|||
.def("matmul", &Handler::matmul, policy::move)
|
||||
.def("batchNormalization", &Handler::batchNormalization, policy::move)
|
||||
.def("layerNormalization", &Handler::layerNormalization, policy::move)
|
||||
.def("RMSNorm", &Handler::rmsNorm, policy::move)
|
||||
.def("maxPool", &Handler::maxPool, policy::move)
|
||||
.def("avgPool", &Handler::avgPool, policy::move)
|
||||
.def("add", &Handler::add, policy::move)
|
||||
|
@ -518,7 +478,6 @@ void init_graph_builder(py::module &m) {
|
|||
.def("min", &Handler::min, policy::move)
|
||||
.def("max", &Handler::max, policy::move)
|
||||
.def("relu", &Handler::relu, policy::move)
|
||||
.def("silu", &Handler::silu, policy::move)
|
||||
.def("gelu", &Handler::gelu, policy::move)
|
||||
.def("sigmoid", &Handler::sigmoid, policy::move)
|
||||
.def("tanh", &Handler::tanh, policy::move)
|
||||
|
@ -536,12 +495,8 @@ void init_graph_builder(py::module &m) {
|
|||
.def("transpose", &Handler::transpose, policy::move)
|
||||
.def("depthToSpace", &Handler::depthToSpace, policy::move)
|
||||
.def("reshape", &Handler::reshape, policy::move)
|
||||
.def("resize", &Handler::resize, policy::move)
|
||||
.def("squeeze", &Handler::squeeze, policy::move)
|
||||
.def("unsqueeze", &Handler::unsqueeze, policy::move)
|
||||
.def("concat", &Handler::concat, policy::move)
|
||||
.def("attentionKVCache", &Handler::attentionKVCache, policy::move)
|
||||
.def("RoPE", &Handler::RoPE, policy::move)
|
||||
.def("split", &Handler::split, policy::move)
|
||||
.def("gather", &Handler::gather, policy::move)
|
||||
.def("gatherElements", &Handler::gatherElements, policy::move)
|
||||
|
@ -556,13 +511,10 @@ void init_graph_builder(py::module &m) {
|
|||
.def("allReduceAvg", &Handler::allReduceAvg, policy::move)
|
||||
.def("allGather", &Handler::allGather, policy::move)
|
||||
.def("broadcast", &Handler::broadcast, policy::move)
|
||||
.def("send", &Handler::send, policy::move)
|
||||
.def("recv", &Handler::recv, policy::move)
|
||||
.def("cast", &Handler::cast, policy::move)
|
||||
.def("expand", &Handler::expand, policy::move)
|
||||
.def("erf", &Handler::erf, policy::move)
|
||||
.def("where", &Handler::where, policy::move)
|
||||
.def("lrn", &Handler::lrn, policy::move)
|
||||
.def("topo_sort", &Handler::topo_sort, policy::automatic)
|
||||
.def("optimize", &Handler::optimize, policy::automatic)
|
||||
.def("operators", &Handler::operators, policy::move)
|
||||
|
@ -574,10 +526,6 @@ void init_graph_builder(py::module &m) {
|
|||
.def("get_perf_time", &Handler::get_perf_time, policy::automatic)
|
||||
.def("tune", &Handler::tune, policy::automatic)
|
||||
.def("run", &Handler::run, policy::automatic)
|
||||
#ifdef USE_CUDA
|
||||
.def("run_with_cudagraph", &Handler::run_with_cudagraph,
|
||||
policy::automatic)
|
||||
#endif
|
||||
.def("shape_infer", &Handler::shape_infer, policy::automatic)
|
||||
.def("change_shape", &Handler::change_shape, policy::automatic)
|
||||
.def("getDims", &Handler::getDims, policy::automatic)
|
||||
|
|
|
@ -2,17 +2,12 @@
|
|||
#include "bang/bang_runtime.h"
|
||||
#include "operators/softmax.h"
|
||||
#include "operators/unary.h"
|
||||
#include <iostream>
|
||||
|
||||
namespace infini {
|
||||
class UnaryCnnl : public BangKernelWithoutConfig {
|
||||
virtual cnnlActivationMode_t getOpType() const = 0;
|
||||
virtual float getCoef() const = 0;
|
||||
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
|
||||
virtual float getSlicedDim() const { return 0.0; }
|
||||
virtual float getGamma() const { return 0.0; }
|
||||
virtual float getScale() const { return 0.0; }
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
|
@ -26,19 +21,17 @@ class UnaryCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
cnnlActivationDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
|
||||
checkCnnlError(cnnlSetActivationDescriptor_v5(
|
||||
opDesc, getOpType(), CNNL_ACTIVATION_HIGH_PRECISION,
|
||||
CNNL_NOT_PROPAGATE_NAN, getCoef(), getSlicedDim(), getGamma(),
|
||||
getScale(), true));
|
||||
checkCnnlError(cnnlSetActivationDescriptor(
|
||||
opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));
|
||||
|
||||
auto [alpha, beta] = getAlphBeta();
|
||||
cnnlStatus_t stat =
|
||||
|
@ -66,13 +59,13 @@ class RoundCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
cnnlStatus_t stat =
|
||||
cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
|
@ -97,22 +90,18 @@ class PReluCnnl : public BangKernelWithoutConfig {
|
|||
auto bDim = op->getInputs(1)->getDims();
|
||||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
if (auto alignSize = aDim.size() - bDim.size(); alignSize) {
|
||||
bDim.insert(bDim.begin(), alignSize, 1);
|
||||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
bDim.size(), bDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, bDim.size(),
|
||||
bDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
|
||||
cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
|
||||
bDesc, bData, cDesc, cData);
|
||||
|
@ -142,68 +131,48 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
|
|||
std::vector<int> inDim = {1, 1, 1};
|
||||
std::vector<int> outDim = inDim;
|
||||
|
||||
if (aDim.size() >= 3) {
|
||||
if (axis == 0) {
|
||||
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
|
||||
inDim[0] = aDim[0];
|
||||
inDim[1] = aDim[1];
|
||||
for (size_t i = 2; i < aDim.size(); ++i) {
|
||||
inDim[2] *= aDim[i];
|
||||
}
|
||||
outDim = inDim;
|
||||
} else if (axis == aDim.size() - 1) {
|
||||
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
|
||||
inDim[0] = aDim[0];
|
||||
for (size_t i = 1; i < axis; ++i) {
|
||||
inDim[1] *= aDim[i];
|
||||
}
|
||||
inDim[2] = aDim[axis];
|
||||
outDim = inDim;
|
||||
} else {
|
||||
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
|
||||
for (size_t i = 0; i < axis; ++i) {
|
||||
inDim[0] *= aDim[i];
|
||||
}
|
||||
inDim[1] = aDim[axis];
|
||||
for (size_t i = axis + 1; i < aDim.size(); ++i) {
|
||||
inDim[2] *= aDim[i];
|
||||
}
|
||||
outDim = inDim;
|
||||
}
|
||||
} else if (aDim.size() == 2) {
|
||||
if (axis == 0) {
|
||||
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
|
||||
inDim = aDim;
|
||||
inDim.push_back(1);
|
||||
outDim = inDim;
|
||||
} else {
|
||||
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
|
||||
inDim = aDim;
|
||||
inDim.insert(inDim.begin(), 1);
|
||||
outDim = inDim;
|
||||
}
|
||||
} else {
|
||||
if (axis == 0) {
|
||||
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
|
||||
inDim = aDim;
|
||||
inDim.push_back(1);
|
||||
inDim.push_back(1);
|
||||
inDim[0] = aDim[0];
|
||||
inDim[1] = aDim[1];
|
||||
for (size_t i = 2; i < aDim.size(); ++i) {
|
||||
inDim[2] *= aDim[i];
|
||||
}
|
||||
outDim = inDim;
|
||||
} else if (axis == aDim.size() - 1) {
|
||||
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
|
||||
inDim[0] = aDim[0];
|
||||
for (size_t i = 1; i < axis; ++i) {
|
||||
inDim[1] *= aDim[i];
|
||||
}
|
||||
inDim[2] = aDim[axis];
|
||||
outDim = inDim;
|
||||
} else {
|
||||
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
|
||||
for (size_t i = 0; i < axis; ++i) {
|
||||
inDim[0] *= aDim[i];
|
||||
}
|
||||
inDim[1] = aDim[axis];
|
||||
for (size_t i = axis + 1; i < aDim.size(); ++i) {
|
||||
inDim[2] *= aDim[i];
|
||||
}
|
||||
outDim = inDim;
|
||||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
inDim.size(), inDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, inDim.size(),
|
||||
inDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
outDim.size(), outDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, outDim.size(),
|
||||
outDim.data()));
|
||||
float alpha = 1.0;
|
||||
float beta = 0.0;
|
||||
cnnlStatus_t stat =
|
||||
cnnlSoftmaxForward_v2(context->cnnlHandle(), CNNL_SOFTMAX_ACCURATE,
|
||||
mode, CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
|
||||
&alpha, aDesc, aData, &beta, cDesc, cData);
|
||||
mode, CNNL_COMPUTATION_HIGH_PRECISION, &alpha,
|
||||
aDesc, aData, &beta, cDesc, cData);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||
|
@ -225,32 +194,15 @@ class SigmoidCnnl : public UnaryCnnl {
|
|||
float getCoef() const override { return 0.0; }
|
||||
};
|
||||
|
||||
class HardSwishCnnl : public UnaryCnnl {
|
||||
cnnlActivationMode_t getOpType() const override {
|
||||
return CNNL_ACTIVATION_HARDSWISH;
|
||||
}
|
||||
float getCoef() const override { return 0.0; }
|
||||
};
|
||||
|
||||
class HardSigmoidCnnl : public UnaryCnnl {
|
||||
cnnlActivationMode_t getOpType() const override {
|
||||
return CNNL_ACTIVATION_HARDSIGMOID;
|
||||
}
|
||||
float getCoef() const override { return 0.0; }
|
||||
float getGamma() const override { return 1.f / 6.f; }
|
||||
float getScale() const override { return 0.5f; }
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Relu, ReluCnnl, "Relu_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
|
||||
"Sigmoid_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
|
||||
"Softmax_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
|
||||
"HardSigmoid_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
|
||||
"HardSwish_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Relu, DataType::Float32, ReluCnnl,
|
||||
"Relu_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::PRelu, DataType::Float32, PReluCnnl,
|
||||
"PRelu_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl,
|
||||
"Sigmoid_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl,
|
||||
"Round_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Softmax, DataType::Float32, SoftmaxCnnl,
|
||||
"Softmax_cnnl_BANG_Float32");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -24,21 +24,21 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
|
|||
auto diffxDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
yDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
yDim.size(), yDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, yDim.size(),
|
||||
yDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
diffYDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
diffyDim.size(), diffyDim.data()));
|
||||
diffYDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffyDim.size(),
|
||||
diffyDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
xDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
xDim.size(), xDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, xDim.size(),
|
||||
xDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
diffXDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
diffxDim.size(), diffxDim.data()));
|
||||
diffXDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffxDim.size(),
|
||||
diffxDim.data()));
|
||||
// get op descriptor
|
||||
cnnlActivationDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
|
||||
|
@ -81,11 +81,11 @@ class TanhBackwardCnnl : public ActivationBackwardCnnl {
|
|||
float getCoef() const override { return 0.0; }
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, ReluBackwardCnnl,
|
||||
"ReluBackward_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, SigmoidBackwardCnnl,
|
||||
"SigmoidBackward_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, TanhBackwardCnnl,
|
||||
"TanhBackward_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, DataType::Float32,
|
||||
ReluBackwardCnnl, "ReluBackward_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, DataType::Float32,
|
||||
SigmoidBackwardCnnl, "SigmoidBackward_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, DataType::Float32,
|
||||
TanhBackwardCnnl, "TanhBackward_cnnl_BANG_Float32");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
#ifdef INFINI_USE_CNCL
|
||||
#include "operators/all_gather.h"
|
||||
#include "bang/bang_kernel_without_config.h"
|
||||
#include "bang/bang_runtime.h"
|
||||
#include "bang/cncl_communicator.h"
|
||||
#include <thread>
|
||||
namespace infini {
|
||||
class AllGatherCNCL : public BangKernelWithoutConfig {
|
||||
public:
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<AllGatherObj>(_op);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
int world_size = op->getWorldSize();
|
||||
// Check if world size info in operator matches runtime
|
||||
IT_ASSERT(world_size == context->getCommunicator().getWorldSize());
|
||||
|
||||
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||
BangPtr output_temp =
|
||||
context->getWorkspace(op->getInputs(0)->getBytes() * world_size);
|
||||
// void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||
checkBangError(cnrtMalloc(&output_temp,
|
||||
op->getInputs(0)->getBytes() * world_size));
|
||||
size_t bytes = op->getInputs(0)->getBytes();
|
||||
size_t count = bytes / op->getDType().getSize();
|
||||
|
||||
cnclComm_t comm =
|
||||
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||
.getCnclComm();
|
||||
cnrtQueue_t queue = context->getBangQueue();
|
||||
CNCL_CHECK(cnclAllGather(input, output_temp, count,
|
||||
cnclDataTypeConvert(op->getDType()), comm,
|
||||
queue));
|
||||
checkBangError(cnrtQueueSync(queue));
|
||||
for (int i = 0; i < world_size; ++i) {
|
||||
Tensor output = op->getOutput(i);
|
||||
context->copyBlobInsideRuntime(
|
||||
output->getRawDataPtr<float *>(),
|
||||
static_cast<float *>(output_temp) + i * count, bytes);
|
||||
}
|
||||
checkBangError(cnrtFree(output_temp));
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllGather, AllGatherCNCL,
|
||||
"AllGather_CNCL_BANG");
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
|
@ -1,54 +0,0 @@
|
|||
#ifdef INFINI_USE_CNCL
|
||||
#include "operators/all_reduce.h"
|
||||
#include "bang/bang_kernel_without_config.h"
|
||||
#include "bang/bang_runtime.h"
|
||||
#include "bang/cncl_communicator.h"
|
||||
#include <thread>
|
||||
namespace infini {
|
||||
class AllReduceCNCL : public BangKernelWithoutConfig {
|
||||
public:
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<AllReduceBaseObj>(_op);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||
void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||
size_t bytes = op->getInputs(0)->getBytes();
|
||||
size_t count = bytes / op->getDType().getSize();
|
||||
cnclComm_t comm =
|
||||
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||
.getCnclComm();
|
||||
cnrtQueue_t queue = context->getBangQueue();
|
||||
// checkBangError(cnrtQueueSync(queue));
|
||||
CNCL_CHECK(cnclAllReduce(input, output, count,
|
||||
cnclDataTypeConvert(op->getDType()),
|
||||
getRedOp(), comm, queue));
|
||||
checkBangError(cnrtQueueSync(queue));
|
||||
}
|
||||
|
||||
virtual cnclReduceOp_t getRedOp() const = 0;
|
||||
};
|
||||
|
||||
class AllReduceSumCNCL : public AllReduceCNCL {
|
||||
cnclReduceOp_t getRedOp() const override { return cnclSum; }
|
||||
};
|
||||
class AllReduceProdCNCL : public AllReduceCNCL {
|
||||
cnclReduceOp_t getRedOp() const override { return cnclProd; }
|
||||
};
|
||||
class AllReduceMinCNCL : public AllReduceCNCL {
|
||||
cnclReduceOp_t getRedOp() const override { return cnclMin; }
|
||||
};
|
||||
class AllReduceMaxCNCL : public AllReduceCNCL {
|
||||
cnclReduceOp_t getRedOp() const override { return cnclMax; }
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, AllReduceSumCNCL,
|
||||
"AllReduce_Sum_CNCL_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, AllReduceProdCNCL,
|
||||
"AllReduce_Prod_CNCL_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, AllReduceMinCNCL,
|
||||
"AllReduce_Min_CNCL_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, AllReduceMaxCNCL,
|
||||
"AllReduce_Max_CNCL_BANG");
|
||||
} // namespace infini
|
||||
#endif
|
|
@ -17,91 +17,55 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
|
|||
void *const output = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
auto dims = op->getInputs(0)->getDims();
|
||||
auto outDims = op->getOutput()->getDims();
|
||||
|
||||
if (dims.size() != 4)
|
||||
IT_TODO_HALT();
|
||||
|
||||
int dimsTrans[4] = {dims[0], dims[2], dims[3], dims[1]};
|
||||
int dimsOutTrans[4] = {outDims[0], outDims[2], outDims[3], outDims[1]};
|
||||
int permute[4] = {0, 2, 3, 1};
|
||||
int permuteOut[4] = {0, 3, 1, 2};
|
||||
int dimArray[4], strideArray[4], dimPArray[1], stridePArray[1];
|
||||
|
||||
for (size_t i = 0; i < dims.size(); ++i) {
|
||||
dimArray[i] = dims[i];
|
||||
strideArray[i] = op->getInputs(0)->getStride()[i];
|
||||
}
|
||||
int w = dimArray[3];
|
||||
dimArray[3] = dimArray[1];
|
||||
int h = dimArray[2];
|
||||
dimArray[1] = h;
|
||||
dimArray[2] = w;
|
||||
|
||||
dimPArray[0] = op->getInputs(1)->getDims()[0];
|
||||
stridePArray[0] = op->getInputs(1)->getDims()[0];
|
||||
// get inputs
|
||||
cnnlTensorDescriptor_t inDesc, intransDesc, outDesc, outtransDesc;
|
||||
cnnlTensorDescriptor_t inDesc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&intransDesc));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&outtransDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
dims.size(), dims.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
intransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
dims.size(), dimsTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
outDims.size(), outDims.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
outtransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
outDims.size(), dimsOutTrans));
|
||||
cnnlTransposeDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
|
||||
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
|
||||
size_t wsSize;
|
||||
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), inDesc, opDesc,
|
||||
&wsSize);
|
||||
BangPtr wsData = context->getWorkspace(wsSize);
|
||||
BangPtr inputTrans = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
|
||||
BangPtr outputTrans = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
|
||||
cnnlStatus_t stat =
|
||||
cnnlTranspose_v2(context->cnnlHandle(), opDesc, inDesc, input,
|
||||
intransDesc, inputTrans, wsData, wsSize);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
checkCnnlError(cnnlSetTensorDescriptorEx(inDesc, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, dims.size(),
|
||||
dimArray, strideArray));
|
||||
|
||||
// get bnScaleBiasMeanVarDesc
|
||||
auto dimsScaleBiasMeanVar = op->getInputs(1)->getDims();
|
||||
cnnlTensorDescriptor_t paraDesc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(¶Desc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
paraDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimsScaleBiasMeanVar.size(), dimsScaleBiasMeanVar.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptorEx(paraDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, 1, dimPArray,
|
||||
stridePArray));
|
||||
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
// This mode is intended for use after convolutional layers
|
||||
stat = cnnlBatchNormForwardInference(
|
||||
context->cnnlHandle(), &alpha, &beta, intransDesc, inputTrans,
|
||||
paraDesc, scale, bias, mean, var, op->getEps(), outtransDesc,
|
||||
outputTrans);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
cnnlStatus_t stat = cnnlBatchNormForwardInference(
|
||||
context->cnnlHandle(), &alpha, &beta, inDesc, input, paraDesc,
|
||||
scale, bias, mean, var, op->getEps(), inDesc, output);
|
||||
|
||||
cnnlTransposeDescriptor_t op2Desc;
|
||||
checkCnnlError(cnnlCreateTransposeDescriptor(&op2Desc));
|
||||
checkCnnlError(cnnlSetTransposeDescriptor(op2Desc, 4, permuteOut));
|
||||
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), intransDesc,
|
||||
op2Desc, &wsSize);
|
||||
BangPtr ws2Data = context->getWorkspace(wsSize);
|
||||
stat = cnnlTranspose_v2(context->cnnlHandle(), op2Desc, outtransDesc,
|
||||
outputTrans, outDesc, output, ws2Data, wsSize);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
// Destories in BANG does not require sync. But cnnl does not state
|
||||
// whether sync is required before destories.
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(intransDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(outtransDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(paraDesc));
|
||||
checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
|
||||
checkCnnlError(cnnlDestroyTransposeDescriptor(op2Desc));
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BatchNormalization, BatchNormCnnl,
|
||||
"BatchNorm_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BatchNormalization, DataType::Float32,
|
||||
BatchNormCnnl, "BatchNorm_cnnl_BANG_Float32");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
#ifdef INFINI_USE_CNCL
|
||||
#include "operators/broadcast.h"
|
||||
#include "bang/bang_kernel_without_config.h"
|
||||
#include "bang/bang_runtime.h"
|
||||
#include "bang/cncl_communicator.h"
|
||||
#include <thread>
|
||||
namespace infini {
|
||||
class BroadcastCNCL : public BangKernelWithoutConfig {
|
||||
public:
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<BroadcastObj>(_op);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||
void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||
size_t bytes = op->getInputs(0)->getBytes();
|
||||
size_t count = bytes / op->getDType().getSize();
|
||||
|
||||
cnclComm_t comm =
|
||||
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||
.getCnclComm();
|
||||
cnrtQueue_t queue = context->getBangQueue();
|
||||
// TODO: Using default stream 0 for now.
|
||||
CNCL_CHECK(cnclBroadcast(input, output, count,
|
||||
cnclDataTypeConvert(op->getDType()),
|
||||
op->getRoot(), comm, queue));
|
||||
checkBangError(cnrtQueueSync(queue));
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, BroadcastCNCL,
|
||||
"Broadcast_CNCL_BANG");
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
|
@ -199,24 +199,6 @@ class CastCnnl : public BangKernelWithoutConfig {
|
|||
dim.data()));
|
||||
NlCastType = CNNL_CAST_UINT32_TO_INT64;
|
||||
break;
|
||||
case CastType::Float162Float:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_HALF, dim.size(),
|
||||
dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, dim.size(),
|
||||
dim.data()));
|
||||
NlCastType = CNNL_CAST_HALF_TO_FLOAT;
|
||||
break;
|
||||
case CastType::Float2Float16:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, dim.size(),
|
||||
dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_HALF, dim.size(),
|
||||
dim.data()));
|
||||
NlCastType = CNNL_CAST_FLOAT_TO_HALF;
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
@ -230,6 +212,7 @@ class CastCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Cast, CastCnnl, "Cast_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl,
|
||||
"Cast_cnnl_BANG_Float32");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -17,13 +17,13 @@ class CeilCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
|
||||
|
@ -35,6 +35,7 @@ class CeilCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Ceil, CeilCnnl, "Ceil_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Ceil, DataType::Float32, CeilCnnl,
|
||||
"Ceil_cnnl_BANG_Float32");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -18,9 +18,9 @@ class ClipCnnl : public BangKernelWithoutConfig {
|
|||
auto aDim = op->getInputs(0)->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
cnnlStatus_t stat =
|
||||
cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
|
@ -30,6 +30,7 @@ class ClipCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Clip, ClipCnnl, "Clip_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Clip, DataType::Float32, ClipCnnl,
|
||||
"Clip_cnnl_BANG_Float32");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -14,18 +14,17 @@ class ConcatCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
cnnlTensorDescriptor_t desc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&desc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
|
||||
cnnlTensorDescriptor_t descArray[num];
|
||||
for (int i = 0; i < num; ++i) {
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
|
||||
cnnlDataTypeConvert(op->getDType()),
|
||||
op->getInputs(i)->getDims().size(),
|
||||
op->getInputs(i)->getDims().data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
|
||||
op->getInputs(i)->getDims().size(),
|
||||
op->getInputs(i)->getDims().data()));
|
||||
}
|
||||
|
||||
void *argv[num];
|
||||
|
@ -51,5 +50,6 @@ class ConcatCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Concat, ConcatCnnl, "Concat_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Concat, DataType::Float32, ConcatCnnl,
|
||||
"Concat_cnnl_BANG_Float32");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -20,9 +20,8 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
cnnlConvolutionDescriptor_t convDesc;
|
||||
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
|
||||
checkCnnlError(
|
||||
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
|
||||
cnnlDataTypeConvert(op->getDType())));
|
||||
checkCnnlError(cnnlSetConvolutionDescriptor(
|
||||
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
@ -55,24 +54,20 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs0));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, inputs0));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs0Array));
|
||||
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs1));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, inputs1));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs1Array));
|
||||
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
|
||||
|
||||
int permute[4] = {0, 2, 3, 1};
|
||||
cnnlTransposeDescriptor_t opDesc;
|
||||
|
@ -84,7 +79,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
&wsSize);
|
||||
BangPtr wsData = context->getWorkspace(wsSize);
|
||||
BangPtr aDataOut = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(aInDesc) * op->getDType().getSize());
|
||||
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
|
||||
cnnlStatus_t stat =
|
||||
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
|
||||
aDesc, aDataOut, wsData, wsSize);
|
||||
|
@ -95,7 +90,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
&wsSize);
|
||||
wsData = context->getWorkspace(wsSize);
|
||||
BangPtr bDataOut = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(bInDesc) * op->getDType().getSize());
|
||||
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
|
||||
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
|
||||
bDesc, bDataOut, wsData, wsSize);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
|
@ -104,13 +99,11 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cInDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
outputArray));
|
||||
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
output));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, output));
|
||||
|
||||
cnnlConvolutionForwardAlgo_t algo;
|
||||
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
|
||||
|
@ -122,7 +115,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
algo, &wsSize);
|
||||
wsData = context->getWorkspace(wsSize);
|
||||
BangPtr cDataIn = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(cInDesc) * op->getDType().getSize());
|
||||
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
|
||||
|
||||
stat = cnnlConvolutionForward(
|
||||
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aDataOut, bDesc,
|
||||
|
@ -158,5 +151,6 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Conv, ConvCnnl, "Conv_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl,
|
||||
"Conv_cnnl_BANG_Float32");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -20,9 +20,8 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
cnnlConvolutionDescriptor_t convDesc;
|
||||
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
|
||||
checkCnnlError(
|
||||
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
|
||||
cnnlDataTypeConvert(op->getDType())));
|
||||
checkCnnlError(cnnlSetConvolutionDescriptor(
|
||||
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
@ -43,17 +42,14 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
|
|||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
dimInputs0.data()));
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs0.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
dimInputs1.data()));
|
||||
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs1.data()));
|
||||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
dimOutput.data()));
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimOutput.data()));
|
||||
|
||||
cnnlConvolutionBwdDataAlgo_t algo;
|
||||
cnnlGetConvolutionBackwardDataAlgorithm(
|
||||
|
@ -80,6 +76,6 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::ConvTranspose, ConvTransCnnl,
|
||||
"ConvTrans_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::ConvTranspose, DataType::Float32,
|
||||
ConvTransCnnl, "ConvTrans_cnnl_BANG_Float32");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -20,9 +20,8 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
cnnlConvolutionDescriptor_t convDesc;
|
||||
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
|
||||
checkCnnlError(
|
||||
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
|
||||
cnnlDataTypeConvert(op->getDType())));
|
||||
checkCnnlError(cnnlSetConvolutionDescriptor(
|
||||
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
@ -63,16 +62,15 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs0Array));
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
4, inputs0ArrayTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, 4,
|
||||
inputs0ArrayTrans));
|
||||
|
||||
size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
|
||||
dimInputs0[3] * op->getDType().getSize();
|
||||
dimInputs0[3] * sizeof(float);
|
||||
BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);
|
||||
|
||||
cnnlStatus_t stat =
|
||||
|
@ -83,16 +81,15 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs1Array));
|
||||
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
4, inputs1ArrayTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, 4,
|
||||
inputs1ArrayTrans));
|
||||
|
||||
size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
|
||||
dimInputs1[3] * op->getDType().getSize();
|
||||
dimInputs1[3] * sizeof(float);
|
||||
BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);
|
||||
|
||||
stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
|
||||
|
@ -103,16 +100,15 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
outputArray));
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
4, outputArrayTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, 4,
|
||||
outputArrayTrans));
|
||||
|
||||
size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
|
||||
dimOutput[3] * op->getDType().getSize();
|
||||
dimOutput[3] * sizeof(float);
|
||||
BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);
|
||||
|
||||
cnnlConvolutionBwdFilterAlgo_t algo;
|
||||
|
@ -158,6 +154,6 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter,
|
||||
ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter, DataType::Float32,
|
||||
ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG_Float32");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -23,14 +23,14 @@ class DetCnnl : public BangKernelWithoutConfig {
|
|||
auto dimout = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimin.size(), dimin.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dimin.size(),
|
||||
dimin.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimout.size(), dimout.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dimout.size(),
|
||||
dimout.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);
|
||||
|
@ -42,5 +42,6 @@ class DetCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Det, DetCnnl, "Det_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Det, DataType::Float32, DetCnnl,
|
||||
"Det_cnnl_BANG_Float32");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -12,7 +12,6 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
|
|||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
auto [aAlpha, bAlpha, beta] = getAlphBeta();
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
@ -31,33 +30,32 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
cnnlOpTensorDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateOpTensorDescriptor(&opDesc));
|
||||
checkCnnlError(cnnlSetOpTensorDescriptor(
|
||||
opDesc, getOpType(), cnnlDataTypeConvert(op->getDType()),
|
||||
CNNL_NOT_PROPAGATE_NAN));
|
||||
opDesc, getOpType(), CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetOpTensorWorkspaceSize_v2(context->cnnlHandle(), opDesc, &aAlpha,
|
||||
aDesc, aData, &bAlpha, bDesc, bData,
|
||||
&beta, cDesc, cData, &wsSize);
|
||||
cnnlGetOpTensorWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
&wsSize);
|
||||
|
||||
BangPtr wsData = context->getWorkspace(wsSize);
|
||||
|
||||
auto [aAlpha, bAlpha, beta] = getAlphBeta();
|
||||
cnnlStatus_t stat = cnnlOpTensor(context->cnnlHandle(), opDesc, &aAlpha,
|
||||
aDesc, aData, &bAlpha, bDesc, bData,
|
||||
wsData, wsSize, &beta, cDesc, cData);
|
||||
|
@ -95,17 +93,17 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -200,17 +198,17 @@ class DivCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -252,17 +250,17 @@ class MaximumCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
|
||||
|
@ -303,17 +301,17 @@ class MinimumCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
|
||||
|
@ -354,18 +352,18 @@ class MSELossCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
cnnlStatus_t stat;
|
||||
if (reduction == MSELossObj::None) {
|
||||
stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
|
||||
|
@ -411,17 +409,17 @@ class PowerCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -463,17 +461,17 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -515,17 +513,17 @@ class FloorModCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -567,17 +565,17 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,
|
||||
|
@ -660,48 +658,62 @@ class BitNotCnnl : public BitComputeCnnl {
|
|||
// CNNL_BLEFT_SHIFT_OP_V2; }
|
||||
// };
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Add, AddCnnl, "Add_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Sub, SubCnnl, "Sub_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Mul, MulCnnl, "Mul_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
|
||||
"Add_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
|
||||
"Sub_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
|
||||
"Mul_cnnl_BANG_Float32");
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Div, DivCnnl, "Div_cnnl");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Max, MaximumCnnl, "Maximum_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Min, MinimumCnnl, "Minimum_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::MSELoss, MSELossCnnl,
|
||||
"MSELoss_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Pow, PowerCnnl, "Power_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, FloorDivCnnl,
|
||||
"FloorDiv_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::FloorMod, FloorModCnnl,
|
||||
"FloorMod_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, SquaredDifferenceCnnl,
|
||||
"SquaredDifference_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Equal, EqualCnnl, "Equal_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Greater, GreaterThanCnnl,
|
||||
"GreaterThan_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::GreaterOrEqual, GreaterEqualCnnl,
|
||||
"GreaterEqual_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Less, LessThanCnnl, "LessThan_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::LessOrEqual, LessEqualCnnl,
|
||||
"LessEqual_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::And, AndCnnl, "And_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Or, OrCnnl, "Or_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Xor, XorCnnl, "Xor_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Not, NotCnnl, "Not_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BitwiseAnd, BitAndCnnl,
|
||||
"BitAnd_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BitwiseOr, BitOrCnnl, "BitOr_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BitwiseXor, BitXorCnnl,
|
||||
"BitXor_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BitwiseNot, BitNotCnnl,
|
||||
"BitNot_cnnl_BANG");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift,
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, DivCnnl,
|
||||
"Div_cnnl_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Max, DataType::Float32, MaximumCnnl,
|
||||
"Maximum_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Min, DataType::Float32, MinimumCnnl,
|
||||
"Minimum_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::MSELoss, DataType::Float32, MSELossCnnl,
|
||||
"MSELoss_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32, PowerCnnl,
|
||||
"Power_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl,
|
||||
"FloorDiv_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl,
|
||||
"FloorMod_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, DataType::Float32,
|
||||
SquaredDifferenceCnnl, "SquaredDifference_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Equal, DataType::Float32, EqualCnnl,
|
||||
"Equal_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Greater, DataType::Float32,
|
||||
GreaterThanCnnl, "GreaterThan_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::GreaterOrEqual, DataType::Float32,
|
||||
GreaterEqualCnnl, "GreaterEqual_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Less, DataType::Float32, LessThanCnnl,
|
||||
"LessThan_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::LessOrEqual, DataType::Float32,
|
||||
LessEqualCnnl, "LessEqual_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::And, DataType::Float32, AndCnnl,
|
||||
"And_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Or, DataType::Float32, OrCnnl,
|
||||
"Or_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Xor, DataType::Float32, XorCnnl,
|
||||
"Xor_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Not, DataType::Float32, NotCnnl,
|
||||
"Not_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BitwiseAnd, DataType::Float32, BitAndCnnl,
|
||||
"BitAnd_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BitwiseOr, DataType::Float32, BitOrCnnl,
|
||||
"BitOr_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BitwiseXor, DataType::Float32, BitXorCnnl,
|
||||
"BitXor_cnnl_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::BitwiseNot, DataType::Float32, BitNotCnnl,
|
||||
"BitNot_cnnl_BANG_Float32");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift, DataType::Float32,
|
||||
// BitLeftShiftCnnl,
|
||||
// "BitLeftShift_cnnl_BANG");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift,
|
||||
// "BitLeftShift_cnnl_BANG_Float32");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift, DataType::Float32,
|
||||
// BitRightShiftCnnl,
|
||||
// "BitRightShift_cnnl_BANG");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::Pow,
|
||||
// "BitRightShift_cnnl_BANG_Float32");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
|
||||
// ElementWiseBang,
|
||||
// "Pow_Bang");
|
||||
// "Pow_Bang_Float32");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -17,13 +17,13 @@ class ErfCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
|
||||
|
@ -36,6 +36,7 @@ class ErfCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Erf, ErfCnnl, "Erf_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Erf, DataType::Float32, ErfCnnl,
|
||||
"Erf_cnnl_BANG_Float32");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -17,13 +17,13 @@ class ExpCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
|
||||
|
@ -36,6 +36,7 @@ class ExpCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Exp, ExpCnnl, "Exp_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Exp, DataType::Float32, ExpCnnl,
|
||||
"Exp_cnnl_BANG_Float32");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue