Compare commits

..

3 Commits

Author SHA1 Message Date
learner2468 a68ac10107 Enrich dev doc 2023-12-05 17:14:28 +08:00
learner2468 6d62350631 Change function name and add dev doc 2023-12-05 17:10:46 +08:00
learner2468 57954fd523 Add paddle model and use InfiniTensor to infer 2023-12-05 16:53:28 +08:00
275 changed files with 3548 additions and 11190 deletions

View File

@ -14,10 +14,10 @@ env:
protobuf-version: "3.21.12"
python-version: "3.10"
resnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/resnet18-v2-7.onnx
inception-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/inception-v2-9.onnx
densenet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/densenet-12.onnx
efficientnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/efficientnet-lite4-11.onnx
resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
jobs:
build:

3
.gitmodules vendored
View File

@ -13,6 +13,3 @@
[submodule "example"]
path = examples/NNmodel
url = git@github.com:wanghailu0717/NNmodel.git
[submodule "examples/distributed/onnxsim_large_model"]
path = examples/distributed/onnxsim_large_model
url = git@github.com:luchangli03/onnxsim_large_model.git

View File

@ -13,7 +13,7 @@ if(USE_CUDA)
message("CMake 3.18 or higher is required for setting CUDAToolkit")
cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
else()
cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.12)
endif()
include(CMakeDependentOption)
@ -53,13 +53,11 @@ endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
add_compile_options(-Wno-error=unused-variable)
find_package(
Python
COMPONENTS Interpreter Development
REQUIRED)
# OpenMP
find_package(OpenMP)
if(OpenMP_C_FOUND)
@ -263,14 +261,7 @@ if(USE_BANG)
# BangC Kernels
################################################################################
if (BUILD_DIST)
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
add_compile_definitions(INFINI_USE_CNCL=1)
else()
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
endif()
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
endif()
if(USE_KUNLUN)
@ -284,9 +275,9 @@ if(USE_KUNLUN)
endif()
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
include_directories("${KUNLUN_HOME}/include/")
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
include_directories("${KUNLUN_HOME}/XTDK/include/")
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64")
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/XTDK/shlib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
@ -299,13 +290,6 @@ if(USE_KUNLUN)
endif()
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
if (BUILD_DIST)
message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(XCCL REQUIRED)
add_compile_definitions(INFINI_USE_XCCL=1)
target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
endif()
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
endif()
@ -340,11 +324,9 @@ if(BUILD_TEST)
endif()
if (USE_BANG)
build_test(test/kernels/bang/*.cc)
build_test(test/bang/*.cc)
endif()
if (USE_KUNLUN)
build_test(test/kernels/kunlun/*.cc)
build_test(test/kunlun/*.cc)
endif()
if (USE_INTELCPU)
build_test(test/kernels/intelcpu/*.cc)

View File

@ -7,9 +7,7 @@ KUNLUN ?= OFF
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
DIST ?= OFF
NNET ?= OFF
DIST ?= OFF
FORMAT_ORIGIN ?=
# Docker build options
DOCKER_NAME ?= infinitensor
@ -31,7 +29,6 @@ CMAKE_OPT += -DUSE_BANG=$(BANG)
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
CMAKE_OPT += -DBUILD_TEST=$(TEST)
CMAKE_OPT += -DBUILD_DIST=$(DIST)
CMAKE_OPT += -DBUILD_NNET=$(NNET)
ifeq ($(INTELCPU), ON)

View File

@ -33,14 +33,13 @@ There are several configurable CMake options, see the [CMakeLists.txt](/CMakeLis
## Roadmap
- [RefactorGraph](https://github.com/InfiniTensor/RefactorGraph) is a newly designed AI framework that is set to replace the current main branch.
- [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
- Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
- Supported hardware
- ✔ NVIDIA GPU
- ✔ Cambricon MLU
- ✔ Kunlunxin XPU
- ⬜ Ascend NPU
- ⬜ Kunlunxin XPU
## Contributor Guide

View File

@ -1,76 +0,0 @@
SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
if ($ENV{CNCL_ROOT_DIR})
message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
endif()
list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
find_path(CNCL_INCLUDE_DIRS
NAMES cncl.h
HINTS ${CNCL_INCLUDE_DIR})
if (USE_STATIC_CNCL)
MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
SET(CNCL_LIBNAME "CNCL_static")
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(CNCL_LIBNAME "cncl")
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()
find_library(CNCL_LIBRARIES
NAMES ${CNCL_LIBNAME}
HINTS ${CNCL_LIB_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
if(CNCL_FOUND) # obtaining CNCL version and some sanity checks
set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED)
if (CNCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <cncl.h>
int main()
{
std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
int x;
CNCLGetVersion(&x);
return x == CNCL_VERSION_CODE;
}
")
try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${CNCL_LIBRARIES})
if (NOT CNCL_VERSION_MATCHED)
message(FATAL_ERROR "Found CNCL header version and library version do not match! \
(include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
endif()
message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "CNCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
endif()

View File

@ -1,27 +0,0 @@
# Find the xccl libraries
set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
set(XCCL_LIB_DIR $ENV{KUNLUN_HOME} CACHE PATH "Folder contains KUNLUN XCCL libraries")
list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
find_path(XCCL_INCLUDE_DIRS # ${XCCL_INCLUDE_DIR}
NAMES xpu/bkcl.h
HINTS XCCL_INCLUDE_DIR)
find_library(XCCL_LIBRARIES # ${XCCL_LIB_DIR}
NAMES lib64/libbkcl.so
HINTS XCCL_LIB_DIR)
message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
if (XCCL_FOUND)
set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
message (STATUS "Determing XCCL version from ${XCCL_HEADER_FILE}...")
list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
endif()

View File

@ -2,7 +2,6 @@
## 目录
- [环境支持](#环境支持)
- [神经网络支持](#神经网络支持)
- [技术支持](#技术支持)
@ -20,10 +19,10 @@
目前已经验证过的神经网络模型有
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
## 技术支持

View File

@ -3,10 +3,9 @@
## 目录
- [使用方法](#使用方法)
- [python 前端应用指南](#python-前端应用指南)
- [导入 onnx 模型](#导入-onnx-模型)
- [优化](#优化)
- [导出 onnx 模型](#导出-onnx-模型)
- [python-前端应用指南](#python-前端应用指南)
- [导入-onnx-模型](#导入-onnx-模型)
- [导出-onnx-模型](#导出-onnx-模型)
- [执行推理](#执行推理)
- [样例代码](#样例代码)
- [技术支持](#技术支持)
@ -14,7 +13,7 @@
## 使用方法
项目管理功能已写到 [Makefile](../Makefile),支持下列功能:
项目管理功能已写到 [Makefile](Makefile),支持下列功能:
- 编译项目:`make`/`make build`
- 清理生成文件:`make clean`
@ -39,10 +38,10 @@
支持的模型:
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
```python
import onnx
@ -97,7 +96,7 @@ for name, tensor in stub.inputs.items():
print(name, tensor.shape(), tensor)
```
对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出:
对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出:
```plaintext
data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
@ -138,7 +137,7 @@ for name, tensor in stub.outputs.items():
### 样例代码
您可以参照[resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py)的样例代码进行了解,并尝试运行。在这个文件中,我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式:
您可以参照[./example/Resnet/resnet.py](./example/ResNet/resnet.py)的样例代码进行了解,并尝试运行。在这个文件中,我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式:
```python
python resnet.py -h

View File

@ -1,39 +0,0 @@
# 分布式脚本
## 英伟达平台运行方式
#### 1. 运行pytorch模型并生成输入和标准输出可选择导出onnx
使用 `--export_onnx` 设置导出onnx的目录默认为当前路径 `./`不使用这个flag则只进行计算和生成输入输出。
```bash
python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
```
会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`,目前只支持单一输入输出。
#### 2. 运行InfiniTensor分布式脚本
```bash
python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
```
## 寒武纪平台运行方式
**将上述运行脚本 `run_pytorch.py` 以及 `cuda_launch.py` 针对寒武纪平台做了相应的适配,具体见 `run_pytorch_mlu.py` 以及 `bang_launch.py`。**
#### 1. 运行pytorch模型并生成输入和标准输出可选择导出onnx
使用 `--export_onnx` 设置导出onnx的目录默认为当前路径 `./`不使用这个flag则只进行计算和生成输入输出。
```bash
python run_pytorch_mlu.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
```
会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`,目前只支持单一输入输出。
#### 2. 运行InfiniTensor分布式脚本
```bash
python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
```

View File

@ -1,187 +0,0 @@
import sys
sys.path.append('../')
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=1, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model", type=str, required=True, help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model,
args.batch_size,
args.length,
args.gen_std,
args.type,
)
def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
load_inputs(stub, world_size, rank)
# stub.tune()
stub.run()
# get outputs
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
for _ in range(n):
stub.run()
begin = time.time()
for _ in range(n * 2):
stub.run()
end = time.time()
avg_time = (end - begin) / (n * 2)
print(f"average time: {avg_time}")
return outputs
def load_inputs(stub, world_size=1, rank=0):
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = np.load(f"./data/input_{i}.npy")
if all(x == y for x,y in zip(input.shape,tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
results = np.load(f"./data/output.npy")
outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
print("outputs abs mean:", abs(outputs).mean())
print("max abs diff:", abs(outputs - results).max())
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
extern_path = f"./{dist_name}_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
onnx.save_model(
model,
f"./{dist_name}_rank{rank}.onnx",
save_as_external_data=True,
location=extern_path,
)
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.BangRuntime(local_rank)
# print("init comm")
runtime.init_comm(
dist_name,
world_size,
rank,
)
run_and_compare(name, model, runtime, world_size, rank, data_type)
def start_single(name, model, data_type):
runtime = backend.BangRuntime(0)
run_and_compare(name, model, runtime, data_type=data_type)
def generate_input_output(model):
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
runtime = backend.BangRuntime(0)
stub = OnnxStub(model, runtime)
position_id = 0
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = tensor.copyout_numpy()
if np.issubdtype(input.dtype, np.integer):
if input.size == 1:
# input = np.array([position_id])
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
else:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
elif input.dtype == np.bool_:
input = np.random.randint(0,2,size=input.shape) > 0
else:
if i == 0:
input = np.ones(input.shape).astype(input.dtype)
position_id = input.shape[-1] - 1
else:
input = np.random.rand(*input.shape).astype(input.dtype)
tensor.copyin_numpy(input)
np.save(f"./data/input_{i}", input)
stub.run()
time.sleep(0.01)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
if np.isnan(output).any():
print("Nan in output")
np.save(f"./data/output", output)
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
data_type = "default" if data_type == "fp32" else data_type
model = onnx.load(model_path)
# generate standart output
if gen_std:
print(f"generate standard data for {name}.")
# a small vocabulary size to fit all LLM.
generate_input_output(model)
return
if nproc_per_node == 1:
# run single process.
# use standalone process to isolate bang.
print("run model by single MLU.")
# p = mp.Process(target=start_single, args=(name, model, data_type))
# p.start()
# p.join()
start_single(name, model, data_type)
return
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"run model by {world_size} MLU in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()

View File

@ -1,249 +0,0 @@
import argparse
import torch
import torch_mlu
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
from transformers import AlbertModel, AlbertConfig
from transformers import LlamaModel, LlamaConfig
import time
import numpy as np
import onnx
import sys
import os
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
"--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--export_onnx",
type=str,
nargs="?",
default=None,
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.type
)
def get_model(modelname):
match modelname:
case "albert":
model = AlbertModel.from_pretrained("albert/albert-base-v2")
voc_size = AlbertConfig().vocab_size
case "bert":
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
voc_size = BertConfig().vocab_size
case "gpt2":
model = GPT2Model.from_pretrained("GPT2")
voc_size = GPT2Config().vocab_size
case "opt":
model = OPTModel.from_pretrained("facebook/opt-125m")
voc_size = OPTConfig().vocab_size
case "llama":
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
voc_size = LlamaConfig().vocab_size
case _:
raise KeyError(modelname)
model = model.eval()
return model, voc_size
def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
np.save("./data/input_0", data)
inputs = torch.from_numpy(data).to("mlu")
torch_model = torch_model.to("mlu")
if dtype == "fp16":
torch_model = torch_model.half()
n_iter = 20
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
torch.mlu.synchronize()
begin = time.time()
with torch.no_grad():
for _ in range(n_iter):
torch.mlu.synchronize()
outputs = torch_model(inputs)
torch.mlu.synchronize()
torch.mlu.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
# torch.mlu.memory.empty_cache()
np.save("./data/output", np.array(outputs))
print("Save input & output into ./data.")
def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
data = data.to("mlu")
model = model.to("mlu")
if dtype == "fp16":
model = model.half()
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
if modelname != "llama":
# use onnxsim to simplify
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
# onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
extern_path = extern_path.split("/")[-1]
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
else:
# use third party tool to simplify llama
# reference: https://github.com/luchangli03/onnxsim_large_model/
sys.path.append("onnxsim_large_model")
from onnx_utils import set_onnx_input_shape
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
in_model_path = path
out_model_path = path
if not out_model_path:
out_model_path = in_model_path[:-5] + ".sim.onnx"
if os.path.isdir(out_model_path):
out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
onnx_model = onnx.load(in_model_path)
print(f"load model from {in_model_path} success")
size_th_bytes = 1024 * 1024
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
print(f"compress model success")
onnx_model = set_onnx_input_shape(onnx_model, "")
tensor_size_threshold = f"1024KB"
skipped_optimizers = []
skipped_optimizers.append("eliminate_duplicate_initializer")
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
tensor_size_threshold=tensor_size_threshold)
if not check:
raise ValueError(f"simplify compressed model {in_model_path} failed")
print(f"simplify model success")
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
print(f"uncompress model success")
add_value_info_for_constants(onnx_model)
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def main():
torch.backends.mlu.matmul.allow_tf32 = False
torch.backends.cnnl.allow_tf32 = False
modelname, batchsize, seqlen, export_path, dtype = parse_args()
if dtype == "tf32":
torch.backends.mlu.matmul.allow_tf32 = True
else:
os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
model, voc_size = get_model(modelname)
if export_path is not None:
filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
path = os.path.join(export_path, filename)
if not os.path.exists(path):
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(modelname, model, param, path, True, dtype)
else:
print("Onnx path exists, skipping export.")
run_pytorch(model, voc_size, batchsize, seqlen, dtype)
if __name__ == "__main__":
main()

View File

@ -1,188 +0,0 @@
import argparse
import torch
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
import time
import numpy as np
import onnx
import os
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
"--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--export_onnx",
type=str,
nargs="?",
default=None,
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.type,
)
def get_model(modelname):
match modelname:
case "bert":
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
voc_size = BertConfig().vocab_size
case "gpt2":
model = GPT2Model.from_pretrained("gpt2")
voc_size = GPT2Config().vocab_size
case "opt":
model = model = OPTModel.from_pretrained("./opt-125m")
voc_size = OPTConfig().vocab_size
case _:
raise KeyError(modelname)
model = model.eval()
return model, voc_size
def run_pytorch(torch_model, voc_size, batchsize, len):
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
np.save("test_inputs", data)
inputs = torch.from_numpy(data).to("cuda")
torch_model = torch_model.to("cuda")
n_iter = 20
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
torch.cuda.synchronize()
begin = time.time()
with torch.no_grad():
for _ in range(n_iter):
torch.cuda.synchronize()
outputs = torch_model(inputs)
#
torch.cuda.synchronize()
torch.cuda.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
torch.cuda.memory.empty_cache()
np.save("test_results", np.array(outputs, dtype=np.float32))
print("Save input & output as test_inputs.npy and test_results.npy")
def export_onnx(model, data, path, extern=False):
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
#onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def main():
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
modelname, batchsize, seqlen, export_path, data_type = parse_args()
if data_type == "tf32":
torch.backends.cuda.matmul.allow_tf32 = True
else:
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
model, voc_size = get_model(modelname)
if export_path is not None:
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
path = os.path.join(export_path, filename)
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(model, param, path, True)
if data_type == "fp16":
model = model.half()
run_pytorch(model, voc_size, batchsize, seqlen)
if __name__ == "__main__":
main()

View File

@ -1,14 +0,0 @@
export HF_ENDPOINT=https://hf-mirror.com
models=("bert" "gpt2" "llama")
batch_size=(1 32)
seq_len=(100 500)
nproc=(1 2 4)
for model in "${models[@]}"; do
for bs in "${batch_size[@]}"; do
for len in "${seq_len[@]}"; do
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only
done
done
done

View File

@ -1,280 +0,0 @@
import sys
sys.path.append('../')
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
from functools import wraps
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=2, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
)
parser.add_argument(
"--model", type=str, default="", help="path to the ONNX model file."
)
parser.add_argument(
"--gen_std",
default=False,
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--run_single",
default=False,
action="store_true",
help="whether run model with single process with standard inputs"
)
parser.add_argument(
"--input_dir",
default="./",
help="path to save model input data"
)
parser.add_argument(
"--result_dir",
default="./",
help="path to save model standard output"
)
parser.add_argument(
"--internal_model_dir",
default="./",
help="path to save internal onnx model for parallel run"
)
args = parser.parse_args()
# check path, mkdir if not exist
check_exists(args.input_dir)
check_exists(args.result_dir)
check_exists(args.internal_model_dir)
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model,
args.gen_std,
args.run_single,
args.input_dir,
args.result_dir,
args.internal_model_dir
)
"""
utils function for this scripts
"""
def check_exists(path: str):
if not os.path.exists(path):
os.makedirs(path)
def np_assert(base, test, rtol=1e-2, atol=1e-1):
# np.testing.assert_allclose(test, base, rtol, atol)
print("max abs diff:", abs(base - test).max())
"""
Perf wrapper, run function n times
then average
"""
def perf_it(n):
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
# warmup
for _ in range(n):
func(*args, **kwargs)
t_total = 0
for _ in range(n):
t0 = time.time()
func(*args, **kwargs)
t1 = time.time()
t_total += t1 - t0
avg_time = (t_total) / n
print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
return avg_time
return wrapper
return decorator
"""
Run InfiniTensor model with Standard input
check=True: check with standard output gen by pytorch
perf=True: run n times to get avg time
"""
def run_model(task_name,
model,
runtime,
world_size=1,
rank=0,
n=10,
check=True,
perf=True):
stub = OnnxStub(model, runtime,
use_naive_allocator=True \
if task_name == "llama" else False)
# load in Onnx model inputs
def load_inputs(stub: OnnxStub):
# check exists
inputs = []
for i, (name, tensor) in enumerate(stub.inputs.items()):
input_path = os.path.join(input_dir, \
f"{task_name}_input_{i}.npy")
print(input_path)
if os.path.exists(input_path):
input = np.load(input_path)
else :
raise KeyError(f"{i} th input of model not exists")
# check shape
if all(x == y for x,y in zip(input.shape, tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
load_inputs(stub)
# stub.tune()
stub.run()
time.sleep(0.01)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
# check output results with standard output
if check:
st_output_path = os.path.join(result_dir, \
f"{task_name}_output.npy")
assert os.path.exists(st_output_path) , \
"standard output not exists"
st_output = np.load(st_output_path)
if np.isnan(output).any():
print("Nan in output")
exit()
np_assert(st_output, output)
# perf
if perf:
@perf_it(n)
def perf_infinitensor(stub: OnnxStub):
stub.run()
perf_infinitensor(stub)
return output
"""
Start a worker in Parallel
"""
def start_worker(name: str,
world_size: int,
rank: int,
local_rank: int,
model: onnx.ModelProto):
dist_name = name + "_dist"
# partial a onnx model to world_size part
model = parallel_model(model, world_size, rank)
onnx.save(model, os.path.join(internal_model_dir, \
f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
runtime = backend.KUNLUNRuntime(local_rank)
# print("init comm")
runtime.init_comm(
dist_name,
world_size,
rank,
)
run_model(name, model, runtime, world_size, rank)
"""
generate standard input/output with
sigle card run
"""
def gen_standard(task_name: str, model: onnx.ModelProto):
runtime = backend.KUNLUNRuntime(0)
stub = OnnxStub(model, runtime)
position_id = 0
# generate random input for model
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = tensor.copyout_numpy()
if np.issubdtype(input.dtype, np.integer):
if input.size == 1:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
else:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
elif input.dtype == np.bool_:
input = np.random.randint(0,2,size=input.shape) > 0
else:
if i == 0:
input = np.ones(input.shape).astype(input.dtype)
position_id = input.shape[-1] - 1
else:
input = np.random.rand(*input.shape).astype(input.dtype)
tensor.copyin_numpy(input)
np.save(os.path.join(input_dir, \
f"{task_name}_input_{i}.npy"), input)
stub.run()
# print(stub.outputs)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
if np.isnan(output).any():
print("Nan in output")
exit()
np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
def main():
global input_dir, result_dir, internal_model_dir
nnodes, nproc_per_node, task_name, \
model_path, gen_std, run_single, \
input_dir, result_dir, internal_model_dir = parse_args()
# load input onnx model
model = onnx.load(model_path)
# generate standart output
if gen_std:
print("Generate inputs and outputs.")
gen_standard(task_name, model)
return
if run_single:
print("Run model by one GPU card.")
runtime = backend.KUNLUNRuntime(0)
run_model(task_name, model, runtime)
return
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"Run model by {world_size} GPU in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(task_name, world_size, rank, rank % nproc_per_node, model),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()

View File

@ -1,36 +0,0 @@
export HF_ENDPOINT=https://hf-mirror.com
# models=("bert" "gpt2" "llama")
models=("bert" "gpt2")
batch_size=(1 32)
seq_len=(100 500)
nproc=(1 2 4)
results_dir="results"
if [ -d "$results_dir" ]; then
echo "directory ./$results_dir exists"
else
mkdir -p "$results_dir"
echo "mkdir $results_dir, logs saved there"
fi
for model in "${models[@]}"; do
for bs in "${batch_size[@]}"; do
for len in "${seq_len[@]}"; do
# run pytorch model
echo "Run pytorch $model with batch_size=$bs length=$len ."
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
for n in "${nproc[@]}"; do
# run infinitensor
echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini
# delete internal files
find ./ -type f -name "*.onnx" -delete
find ./ -type f -name "*.pb" -delete
done
find ./ -type f -name "*.npy" -delete
done
done
done

View File

@ -1,35 +0,0 @@
export HF_ENDPOINT=https://hf-mirror.com
# models=("bert" "gpt2" "llama")
models=("llama")
batch_size=(1 )
seq_len=(100 500)
nproc=(1 2 4)
results_dir="results"
if [ -d "$results_dir" ]; then
echo "directory ./$results_dir exists"
else
mkdir -p "$results_dir"
echo "mkdir $results_dir, logs saved there"
fi
for model in "${models[@]}"; do
for bs in "${batch_size[@]}"; do
for len in "${seq_len[@]}"; do
echo "Run pytorch llama with batch_size="$bs" and length="$len""
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
for n in "${nproc[@]}"; do
# run pytorch model
echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
# delete internal files
find ./ -type f -name "*.onnx" -delete
find ./ -type f -name "*0c" -delete
done
find ./ -type f -name "*.npy" -delete
done
done
done

View File

@ -1,245 +0,0 @@
import argparse
import torch
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
from transformers import LlamaModel, LlamaConfig
import time
import numpy as np
import onnx
import os
import sys
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
"--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--export_onnx",
type=str,
nargs="?",
default=None,
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--input_dir",
type=str,
default="./",
help="path to save pytorch model input data"
)
parser.add_argument(
"--result_dir",
type=str,
default="./",
help="path to save pytorch model output data"
)
parser.add_argument(
"--export_only",
action="store_true"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.input_dir,
args.result_dir,
args.export_only
)
def get_model(modelname):
if modelname == "bert":
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
voc_size = BertConfig().vocab_size
elif modelname == "gpt2":
model = GPT2Model.from_pretrained("gpt2")
voc_size = GPT2Config().vocab_size
elif modelname == "opt":
model = OPTModel.from_pretrained("./opt-125m")
voc_size = OPTConfig().vocab_size
elif modelname == "llama":
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
voc_size = LlamaConfig().vocab_size
else :
raise KeyError(modelname)
model = model.eval()
return model, voc_size
def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
inputs = torch.from_numpy(data).to("cuda")
torch_model = torch_model.to("cuda")
n_iter = 10
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
torch.cuda.synchronize()
begin = time.time()
with torch.no_grad():
for _ in range(n_iter):
torch.cuda.synchronize()
outputs = torch_model(inputs)
#
torch.cuda.synchronize()
torch.cuda.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
torch.cuda.memory.empty_cache()
np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
np.array(outputs))
print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
def export_onnx(model_name, model, data, path, extern=False):
# torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
if model_name != "llama":
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model,
skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
# skipped_optimizers=['fuse_qkv'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path.split("/")[-1],
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
else:
sys.path.append("onnxsim_large_model")
from onnx_utils import set_onnx_input_shape
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
in_model_path = path
out_model_path = in_model_path[:-5] + ".sim.onnx"
onnx_model = onnx.load(in_model_path)
print(f"load model from {in_model_path} success")
size_th_bytes = 1024 * 1024
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
print("compress model success")
onnx_model = set_onnx_input_shape(onnx_model, "")
tensor_size_threshold = f"1024KB"
skipped_optimizers = []
skipped_optimizers.append("eliminate_duplicate_initializer")
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
tensor_size_threshold=tensor_size_threshold)
if not check:
raise ValueError(f"simplify compressed model {in_model_path} failed")
print(f"simplify model success")
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
print(f"uncompress model success")
add_value_info_for_constants(onnx_model)
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def main():
global input_dir, result_dir
modelname, batchsize, seqlen, \
export_path, input_dir, result_dir, export_only = parse_args()
model, voc_size = get_model(modelname) # pytorch model
if export_path is not None:
os.makedirs(export_path, exist_ok=True)
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
path = os.path.join(export_path, filename)
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
if export_only:
return
run_pytorch(model, voc_size, batchsize, seqlen, modelname)
if __name__ == "__main__":
main()

View File

@ -10,6 +10,9 @@ import numpy as np
from parallel_opt import parallel_model
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
@ -29,9 +32,6 @@ def parse_args():
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
@ -42,13 +42,12 @@ def parse_args():
args.batch_size,
args.length,
args.gen_std,
args.type,
)
def run_model(model, runtime, inputs, n=10, data_type = "default"):
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
def run_model(model, runtime, inputs, n=10):
stub = OnnxStub(model, runtime)
for tensor, input in zip(stub.inputs.values(), inputs):
tensor.copyin_numpy(input)
# stub.tune()
stub.run()
@ -56,7 +55,7 @@ def run_model(model, runtime, inputs, n=10, data_type = "default"):
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
for tensor, input in zip(stub.inputs.values(), inputs):
tensor.copyin_numpy(input)
begin = time.time()
for _ in range(n):
@ -67,17 +66,17 @@ def run_model(model, runtime, inputs, n=10, data_type = "default"):
return outputs
def run_and_compare(name, model, runtime, data_type):
def run_and_compare(name, model, runtime):
input_ids = np.load(f"{name}_inputs.npy")
position_ids = np.arange(input_ids.shape[-1])
results = np.load(f"{name}_results.npy")
outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
outputs = run_model(model, runtime, (input_ids, position_ids))
print("outputs abs mean:", abs(outputs).mean())
print("max abs diff:", abs(outputs - results).max())
np.testing.assert_allclose(outputs, results, rtol=1e-6, atol=1e-3)
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
@ -90,7 +89,7 @@ def start_worker(
save_as_external_data=True,
location=extern_path,
)
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.CudaRuntime(local_rank)
# print("init comm")
runtime.init_comm(
@ -98,12 +97,12 @@ def start_worker(
world_size,
rank,
)
run_and_compare(name, model, runtime, data_type)
run_and_compare(name, model, runtime)
def start_single(name, model, data_type):
def start_single(name, model):
runtime = backend.CudaRuntime(0)
run_and_compare(name, model, runtime, data_type)
run_and_compare(name, model, runtime)
def gen_standard(name, model, voc_size, bs, len):
@ -118,10 +117,8 @@ def gen_standard(name, model, voc_size, bs, len):
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
data_type = "default" if data_type == "fp32" else data_type
if data_type != "tf32":
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
model = onnx.load(model_path)
# generate standart output
@ -135,7 +132,7 @@ def main():
# run single process.
# use standalone process to isolate cuda.
print("run model by single GPU.")
p = mp.Process(target=start_single, args=(name, model, data_type))
p = mp.Process(target=start_single, args=(name, model))
p.start()
p.join()
@ -145,7 +142,7 @@ def main():
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
args=(name, world_size, rank, rank % nproc_per_node, model),
)
for rank in range(world_size)
]

@ -1 +0,0 @@
Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65

View File

@ -110,11 +110,12 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
s_dim = 0
elif in_plc.dim == 2:
s_dim = 1
assert s_dim != -1
assert out_dims[s_dim] % tp_world_size == 0, out_dims
out_dims[s_dim] //= tp_world_size
# if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
node.input[1] = node.output[0] + "_shape"
# node.input[1] = node.output[0] + "_shape"
data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
place[node.output[0]] = Shard(s_dim)
@ -136,7 +137,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
place[node.output[0]] = Shard(list(perm).index(plc.dim))
def shard_node(node: NodeProto):
if node.op_type in ["Relu", "Tanh", "Softmax", "Cast"]:
if node.op_type in ["Relu", "Tanh", "Softmax"]:
place[node.output[0]] = place[node.input[0]]
elif node.op_type in ["Where"]:
place[node.output[0]] = place[node.input[1]]
@ -176,14 +177,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
input in data for input in node.input
):
# FIXME(constroy): the last MatMul should not be sharded as TP.
if (
node.output[0] in output
or (
index + 1 < len(model.graph.node)
and model.graph.node[index + 1].output[0]
)
in output
):
if node.output[0] in output:
continue
groups = 1
# If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups
@ -243,5 +237,5 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
if tt.HasField("shape"):
tt.ClearField("shape")
model = helper.make_model(graph)
#model = onnx.shape_inference.infer_shapes(model)
model = onnx.shape_inference.infer_shapes(model)
return model

View File

@ -1,145 +0,0 @@
import os
from pyinfinitensor.onnx import OnnxStub, backend
import numpy as np
import onnx
import torch
from transformers import LlamaModel, LlamaForCausalLM
from tqdm import tqdm
import onnx_graphsurgeon as gs
from onnxsim import simplify
import argparse
parser = argparse.ArgumentParser(description='')
parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
parser.add_argument('--layer', dest='n_layers', type=int, default=2)
parser.add_argument('--iter', dest='n_iter', type=int, default=1)
parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
default="/data1/shared/llama")
args = parser.parse_args()
ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
param = torch.zeros(
(args.batchsize, 1024), dtype=torch.long)
logits = model(param, past_key_values=None)
param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
"position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
do_constant_folding=True,)
onnx_model = onnx.load(ONNX_MODEL_PATH)
print("simplifing onnx model")
onnx_model, check = simplify(onnx_model, skipped_optimizers=[
'eliminate_duplicate_initializer'])
assert check
onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
print("simlifing finished.")
@gs.Graph.register()
def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
for inp in inputs:
inp.outputs.clear()
for out in outputs:
out.inputs.clear()
for inp in inputs_added:
inputs.append(inp)
for out in outputs_removed:
out.inputs.clear()
return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
def replace_onnx_with_attention_op():
graph = gs.import_onnx(
onnx.load(ONNX_MODEL_PATH))
tmap = graph.tensors()
for i in range(args.n_layers):
inputs = [
tmap["onnx::Concat_" + str((i+1)*2)],
tmap["onnx::Concat_" + str((i+1)*2+1)],
tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
outputs = [
tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
inputs_added = [graph.inputs[1]]
outputs_removed = []
graph.replace_with_attention(
inputs, outputs, inputs_added, outputs_removed)
graph.outputs = [tmap[graph.outputs[0].name]]
graph.cleanup(True).toposort()
onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
if __name__ == "__main__":
kvcache_torch = None
torch_model = LlamaForCausalLM.from_pretrained(
args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
n_heads = torch_model.config.num_attention_heads
n_dims = torch_model.config.hidden_size // n_heads
if not os.path.exists(ONNX_MODEL_PATH):
print("exporting onnx graph")
export_onnx(torch_model, ONNX_MODEL_PATH)
replace_onnx_with_attention_op()
else:
print("will use exsiting onnx graph")
onnx_model = onnx.load(ONNX_MODEL_PATH)
stub = OnnxStub(onnx_model, backend.cuda_runtime())
count_wrong = 0
for i in tqdm(range(0, args.n_max_length)):
query = np.random.randint(
torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
####################################
# pytorch
####################################
outputs_torch = torch_model(
torch.tensor(query), past_key_values=kvcache_torch)
logit_torch = outputs_torch['logits']
kvcache_torch = outputs_torch['past_key_values']
####################################
# infinitensor
####################################
# copyin input
(list(stub.inputs.items()))[0][1].copyin_int64(
query.reshape(-1).tolist())
(list(stub.inputs.items()))[1][1].copyin_int64(
position_id.reshape(-1).tolist())
stub.run()
####################################
# validation
####################################
# copyout output
logits_it = np.array((list(stub.outputs.items()))
[0][1].copyout_float())
try:
np.testing.assert_allclose(
logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
except Exception as e:
try:
np.testing.assert_allclose(
np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
except:
count_wrong = count_wrong + 1
result = "{}/{} failed.".format(count_wrong, args.n_max_length)
print(result)
del stub

View File

@ -1,5 +1,6 @@
import paddle
import numpy as np
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
@ -77,4 +78,4 @@ def run_cifar_train_and_infer():
if __name__ == "__main__":
run_cifar_train_and_infer()
run_cifar_train_and_infer()

View File

@ -1,4 +1,5 @@
import paddle
import numpy as np
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
@ -77,4 +78,4 @@ def run_cifar_train_and_infer():
if __name__ == "__main__":
run_cifar_train_and_infer()
run_cifar_train_and_infer()

View File

@ -1,5 +1,6 @@
import paddle
import numpy as np
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
@ -78,4 +79,4 @@ def run_cifar_train_and_infer():
if __name__ == "__main__":
run_cifar_train_and_infer()
run_cifar_train_and_infer()

View File

@ -2,10 +2,6 @@
#include "cnnl.h"
#include "cnrt.h"
#include "core/common.h"
#include "core/data_type.h"
#ifdef INFINI_USE_CNCL
#include "cncl.h"
#endif
#define checkBangError(call) \
{ \
@ -31,70 +27,4 @@ namespace infini {
using BangPtr = void *;
inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return CNNL_DTYPE_FLOAT;
}
if (dataType == DataType::Float16) {
return CNNL_DTYPE_HALF;
}
if (dataType == DataType::Double) {
return CNNL_DTYPE_DOUBLE;
}
if (dataType == DataType::Int8) {
return CNNL_DTYPE_INT8;
}
if (dataType == DataType::Int32) {
return CNNL_DTYPE_INT32;
}
if (dataType == DataType::UInt8) {
return CNNL_DTYPE_UINT8;
}
if (dataType == DataType::BFloat16) {
return CNNL_DTYPE_BFLOAT16;
}
if (dataType == DataType::Int64) {
return CNNL_DTYPE_INT64;
}
if (dataType == DataType::Bool) {
return CNNL_DTYPE_BOOL;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNNL.");
}
#ifdef INFINI_USE_CNCL
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return cnclFloat32;
}
if (dataType == DataType::Float16) {
return cnclHalf;
}
if (dataType == DataType::Int8) {
return cnclInt8;
}
if (dataType == DataType::Int16) {
return cnclInt16;
}
if (dataType == DataType::Int32) {
return cnclInt32;
}
if (dataType == DataType::UInt8) {
return cnclUint8;
}
if (dataType == DataType::UInt16) {
return cnclUint16;
}
if (dataType == DataType::UInt32) {
return cnclUint32;
}
if (dataType == DataType::BFloat16) {
return cnclBfloat16;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNCL.");
}
#endif
} // namespace infini

View File

@ -7,19 +7,17 @@ namespace infini {
class BangRuntimeObj : public RuntimeObj {
private:
cnnlHandle_t cnnl;
cnrtQueue_t queue;
std::unique_ptr<CommunicatorObj> comm;
BangPtr workspace;
size_t workspaceSize;
mutable size_t cursor;
public:
explicit BangRuntimeObj(int deviceId = 0)
: RuntimeObj(Device::BANG, deviceId) {
BangRuntimeObj() : RuntimeObj(Device::BANG) {
cnInit(0);
CNdev dev;
cnDeviceGet(&dev, deviceId);
cnDeviceGet(&dev, 0);
checkBangError(cnrtSetDevice(dev));
cnrtQueue_t queue;
checkBangError(cnrtQueueCreate(&queue));
checkCnnlError(cnnlCreate(&cnnl));
@ -32,7 +30,6 @@ class BangRuntimeObj : public RuntimeObj {
}
virtual ~BangRuntimeObj() {
dealloc(workspace);
checkBangError(cnrtQueueDestroy(queue));
checkCnnlError(cnnlDestroy(cnnl));
}
string toString() const override;
@ -76,9 +73,10 @@ class BangRuntimeObj : public RuntimeObj {
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
CNRT_MEM_TRANS_DIR_PEER2PEER));
}
void initComm(const string &name, int worldSize, int rank) final;
CommunicatorObj &getCommunicator() const override { return *comm; }
cnrtQueue_t getBangQueue() const { return queue; }
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;

View File

@ -1,79 +0,0 @@
#pragma once
#include "bang_common.h"
#include "core/communicator.h"
#include <chrono>
#include <cncl.h>
#include <cnrt.h>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <mutex>
#include <thread>
namespace infini {
class CnclCommunicatorObj final : public CommunicatorObj {
private:
cnclComm_t *comms;
public:
CnclCommunicatorObj(const string &name, int worldSize, int rank)
: CommunicatorObj(worldSize, rank) {
const std::string filePath("./" + name + "_cncl_id.bin");
cnclCliqueId clique_id;
if (rank == 0) {
CNCL_CHECK(cnclGetCliqueId(&clique_id));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
}
int num_comms = 1;
int *dev_list = new int[num_comms];
int *rank_list = new int[num_comms];
comms = new cnclComm_t[num_comms];
uint32_t num_dev = 0;
checkBangError(cnrtGetDeviceCount(&num_dev));
for (int i = 0; i < num_comms; i++) {
rank_list[i] = rank;
dev_list[i] = rank_list[i] % num_dev;
}
CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list, rank_list,
worldSize, &clique_id));
if (rank == 0) {
std::filesystem::remove(filePath);
}
delete[] dev_list;
delete[] rank_list;
}
~CnclCommunicatorObj() {
CNCL_CHECK(cnclDestroyComms(comms, 1));
delete[] comms;
}
// Get the actual cnclComm_t
cnclComm_t getCnclComm() { return comms[0]; }
virtual string toString() const final {
std::ostringstream oss;
oss << "CNCL communicator";
return oss.str();
}
};
} // namespace infini

View File

@ -61,30 +61,16 @@ template <typename T> auto enum_to_underlying(T e) {
}
template <typename T> std::string vecToString(const std::vector<T> &vec) {
std::stringstream ss;
ss << "[";
for (size_t i = 0; i < vec.size(); ++i) {
ss << vec.at(i);
if (i < vec.size() - 1) {
ss << ",";
}
std::string ret;
ret.append("[");
for (auto d : vec) {
ret.append(std::to_string(d));
ret.append(",");
}
ss << "]";
return ss.str();
}
template <typename T> std::string vecToString(const T *st, size_t length) {
std::stringstream ss;
ss << "[";
size_t i = 0;
for (i = 0; i < length; i++) {
ss << *(st + i);
if (i < length - 1) {
ss << ",";
}
}
ss << "]";
return ss.str();
if (!vec.empty())
ret.pop_back();
ret.append("]");
return ret;
}
double timeit(

View File

@ -5,10 +5,6 @@
#include <cstdint>
#include <iostream>
#ifdef USE_CUDA
#include "cuda/cuda_runtime.h"
#endif
namespace infini {
class GraphHandlerObj {
@ -30,14 +26,12 @@ class GraphHandlerObj {
int pw, int sh, int sw, int dh, int dw, int oph,
int opw);
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
Tensor bias, ActType act,
std::string matmul_compute_type = "default");
Tensor bias, ActType act);
Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
Tensor var, Tensor scale, Tensor bias,
float momentum, float eps, bool training);
Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
Tensor bias, float eps, int axis, int stash_type);
Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw, int ceilMode);
@ -53,7 +47,6 @@ class GraphHandlerObj {
Tensor max(Tensor a, Tensor b, Tensor c);
Tensor relu(Tensor x, Tensor y);
Tensor silu(Tensor x, Tensor y);
Tensor gelu(Tensor x, Tensor y);
Tensor sigmoid(Tensor x, Tensor y);
Tensor hardSigmoid(Tensor x, Tensor y);
@ -72,21 +65,12 @@ class GraphHandlerObj {
std::optional<float> max);
Tensor transpose(Tensor data, Tensor transposed, Shape perm);
Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
Tensor resize(Tensor input, Tensor output,
const std::optional<vector<int>> &axes, Tensor sizes,
Tensor scales, Tensor roi, vector<uint32_t> sizes_,
vector<float> scales_, vector<float> roi_, string mode,
string ratioPolicy, string nearestMode,
string coordTransMode);
Tensor squeeze(Tensor input, Tensor output, Shape axes);
Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
Tensor concat(TensorVec inputs, Tensor output, int dim);
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
Tensor input_q, Tensor input_k, Tensor input_v,
Tensor position_id, Tensor output_matmul);
Tensor RoPE(Tensor pos, Tensor input, Tensor output);
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
std::variant<int, vector<int>> numOrRatio);
int num_outputs);
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
Tensor reduceMean(Tensor data, Tensor reduced,
@ -110,13 +94,8 @@ class GraphHandlerObj {
Tensor allReduceAvg(Tensor input, Tensor output);
TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
Tensor broadcast(Tensor input, Tensor output, int root);
Tensor send(Tensor input, int source, int destination, Tensor output);
Tensor recv(Tensor output, int source, int destination, Shape dims,
int outputType, Tensor input);
Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
std::string mode);
Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
int size);
//------ modifiers
@ -143,12 +122,6 @@ class GraphHandlerObj {
inline void run() { g->getRuntime()->run(g); }
inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
#ifdef USE_CUDA
inline void run_with_cudagraph() {
(as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
}
#endif
};
} // namespace infini

View File

@ -2,11 +2,10 @@
#include "core/common.h"
#include "core/operator.h"
#include "core/tensor.h"
#include "utils/operator_utils.h"
#include <functional>
#include <nlohmann/json.hpp>
namespace infini {
using json = nlohmann::json;
namespace infini {
class RuntimeObj; // Forward declaration for Kernel::compute
@ -30,6 +29,7 @@ class Kernel {
public:
Kernel() {}
virtual ~Kernel() {}
/**
* @param op The operator to be executed.
* @param record The parameters for kernel execution. If extra parameters
@ -102,9 +102,11 @@ class KernelRegistry {
}
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
auto it = kernels.find(kernelAttrs);
IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
get_kernel_attrs_str(kernelAttrs) +
"}");
IT_ASSERT(it != kernels.end(),
"Kernel not found for key {" +
to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
", " + std::to_string(std::get<1>(kernelAttrs)) + ", " +
std::get<2>(kernelAttrs).toString() + "}");
return std::get<0>(it->second);
}
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@ -129,16 +131,15 @@ class CpuKernelWithoutConfig : public Kernel {
} // namespace infini
#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt) \
#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt) \
namespace infini { \
static const bool _CAT(_register_kernel_, cnt) = \
KernelRegistry::getInstance().registerKernel(KernelAttrs{device, \
opType}, \
new kernel(), name); \
KernelRegistry::getInstance().registerKernel( \
KernelAttrs{device, opType, dataType}, new kernel(), name); \
}
#define REGISTER_KERNEL(device, opType, kernel, name) \
_REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
#define REGISTER_KERNEL(device, opType, dataType, kernel, name) \
_REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
#define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt) \
namespace infini { \

View File

@ -151,14 +151,11 @@ struct OpType {
ReduceSum, // Reduce
ReduceSumSquare, // Reduce
Relu, // Unary
Silu, // Unary
Reshape,
Resize,
ReverseSequence,
RoiAlign,
RoPE, // Fusion
Round, // Unary
RMSNorm, // Fusion
Round, // Unary
STFT,
Scan,
Scatter,
@ -235,8 +232,6 @@ struct OpType {
AllReduceAvg,
AllGather,
Broadcast,
Send,
Recv,
} type;
constexpr OpType(decltype(type) t) : type(t) {}

View File

@ -4,7 +4,7 @@
#include "core/tensor.h"
namespace infini {
using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
using KernelAttrs = std::tuple<Device, OpType::underlying_t, DataType>;
struct OpPerfKey {
HashType hash;
@ -90,7 +90,6 @@ class OperatorObj : public Object {
OpType getOpType() const { return type; }
// HACK: set correct data type
DataType getDType() const { return getInputs(0)->getDType(); }
DataType getOutDType() const { return getOutput()->getDType(); }
virtual int numInputs() const = 0;
virtual int numOutputs() const = 0;

View File

@ -2,8 +2,8 @@
#include "core/graph.h"
#include "core/kernel.h"
#include <nlohmann/json_fwd.hpp>
namespace infini {
using json = nlohmann::json;
namespace infini {
class PerfEngine {
public:

View File

@ -15,7 +15,6 @@ class GraphObj;
class GraphHandlerObj;
class RuntimeObj;
class BlobObj;
template <typename T> class WorkspaceObj;
using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
@ -24,7 +23,6 @@ using Graph = Ref<GraphObj>;
using GraphHandler = Ref<GraphHandlerObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
using TensorVec = vector<Tensor>;
using OpVec = vector<Operator>;

View File

@ -4,14 +4,11 @@
#include "utils/data_convert.h"
#include <cmath>
#include <cstring>
#include <fstream>
#if USE_CUDA
#include "cuda/cuda_runtime.h"
#endif
#if USE_BANG
#include "bang/bang_runtime.h"
#endif
namespace infini {
// TODO: how to deal with this
@ -45,16 +42,8 @@ class TensorObj : public TensorBaseObj {
bool isOutput() const { return tensorType == TensorType::output; }
bool isOthers() const { return tensorType == TensorType::others; }
void setWeight() { tensorType = TensorType::weight; }
void setInput() {
if (!this->isWeight()) {
tensorType = TensorType::input;
}
}
void setOutput() {
if (!this->isWeight()) {
tensorType = TensorType::output;
}
}
void setInput() { tensorType = TensorType::input; }
void setOutput() { tensorType = TensorType::output; }
string tensorTypeToString() const {
switch (tensorType) {
case TensorType::weight:
@ -144,7 +133,6 @@ class TensorObj : public TensorBaseObj {
}
void printData() const;
void dumpData(std::ofstream &ofs) const;
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
template <typename T> bool equalData(const vector<T> &dataVector) {
@ -200,20 +188,13 @@ class TensorObj : public TensorBaseObj {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
fabs(a[i] - b[i]) > relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
} else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
fabs(a[i] - b[i]) /
std::max(fabs(a[i]), fabs(b[i])) >
relativeError) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}
} else {
} else
static_assert(!sizeof(T), "Unsupported data type");
}
}
return true;
}
@ -248,8 +229,8 @@ class TensorObj : public TensorBaseObj {
// // std::cerr << "Init beginned " << std::endl;
// #pragma omp parallel for
// for (size_t i = 0; i < iEnd; ++i)
// data[i] = fastrand(random_seed[omp_get_thread_num() *
// 16]) % 10000;
// data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
// 10000;
// // std::cerr << "Init finished" << std::endl;
// computed = ComputedFull;
// return true;
@ -294,8 +275,8 @@ class TensorObj : public TensorBaseObj {
// auto nDim = dims.size();
// auto nBroadcastDim = ds.size() - nDim;
// for (size_t i = 0; i < nDim; ++i)
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
// i] >= dims[i])
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
// dims[i])
// return (size_t)-1;
// size_t idx = 0;
// for (size_t i = 0; i < nDim; ++i)
@ -354,14 +335,12 @@ class TensorObj : public TensorBaseObj {
// return (g_seed >> 16) & 0x7FFF;
// }
// std::vector<std::vector<int>> const *getSplittingPoints()
// const {
// std::vector<std::vector<int>> const *getSplittingPoints() const {
// assert(!splittingPoints.empty());
// return &splittingPoints;
// }
// bool setSplittingPoints(std::vector<std::vector<int>> value)
// {
// bool setSplittingPoints(std::vector<std::vector<int>> value) {
// assert(!value.empty());
// splittingPoints = value;
// return true;

View File

@ -1,42 +0,0 @@
#pragma once
#include "core/runtime.h"
namespace infini {
template <class T> class WorkspaceObj {
private:
T workspace; // workspace pointer
size_t workspaceSize; // Size of workspace
size_t workspaceAlloc; // currently use workspace size
public:
WorkspaceObj(T workspace_, size_t workspaceSize_)
: workspace(workspace_), workspaceSize(workspaceSize_) {
workspaceAlloc = 0;
}
virtual ~WorkspaceObj() {
// Dealloc workspace in RuntimeObj
// Set workspace = nullptr here
workspace = nullptr;
}
size_t getWorkspaceSize() const { return workspaceSize; }
T getWorkspace(size_t size) {
// Get unused workspace
IT_ASSERT(size + workspaceAlloc <= workspaceSize);
auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
workspaceAlloc += size;
return ret;
}
T getWorkspace() {
// Override getWorkspace in order to dealloc in runtime
return workspace;
}
void resetWorkspace() {
// Reset workspaceAlloc every time end kernel
workspaceAlloc = 0;
}
size_t getWorkspaceAlloc() const { return workspaceAlloc; }
};
} // namespace infini

View File

@ -1,5 +1,4 @@
#pragma once
#include "core/common.h"
#include <cstdio>
struct AttentionKVCacheMetadata {
@ -11,7 +10,6 @@ namespace infini {
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
float *input_q, float *input_k, float *input_v,
int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta,
float *output_O_temp, float *output_sum_temp);
const AttentionKVCacheMetadata &compMeta);
} // namespace infini

View File

@ -5,7 +5,6 @@
#include <cuda_profiler_api.h>
#include <cudnn.h>
#include <curand.h>
#include <memory>
#define checkCudaError(call) \
if (auto err = call; err != cudaSuccess) \
@ -112,20 +111,4 @@ inline const char *curandGetErrorString(curandStatus_t error) {
using CudaPtr = void *;
class CUDAStream {
public:
CUDAStream(const CUDAStream &) = delete;
CUDAStream(CUDAStream &&) = delete;
void operator=(const CUDAStream &) = delete;
void operator=(CUDAStream &&) = delete;
static cudaStream_t getCurrentStream() { return _stream; }
static void Init() { CUDAStream::_stream = 0; };
static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
private:
CUDAStream(){};
static cudaStream_t _stream;
};
} // namespace infini

View File

@ -1,20 +1,13 @@
#pragma once
namespace infini {
void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
void div_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
void add_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
void pow_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
void less_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c3);
}; // namespace infini

View File

@ -3,10 +3,7 @@
#include "operators/unary.h"
#include "utils/small_array.h"
namespace infini {
void expandKernel(int dType, void *input, void *output, int nDims,
int outputsize, SmallArray inputShape,
SmallArray outputShape);
void expandKernel(float *input, float *output, int nDims, int outputsize,
SmallArray inputShape, SmallArray outputShape);
void expandRowKernel(int dType, void *input, void *output, int n_rows,
int row_len);
}; // namespace infini

View File

@ -8,10 +8,4 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
void LaynormKernel(const float *input, const float *scale, const float eps,
int size, int scaleSize, const int dimsize, const int stride,
float *output);
void LaynormKernel(const half *input, const half *scale, const half eps,
int size, int scaleSize, const int dimsize, const int stride,
half *output, const half *bias, int biasSize);
void LaynormKernel(const half *input, const half *scale, const half eps,
int size, int scaleSize, const int dimsize, const int stride,
half *output);
}; // namespace infini

View File

@ -1,10 +0,0 @@
#pragma once
#include "operators/rms_norm.h"
namespace infini {
void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
int num_tokens, int hidden_size);
}; // namespace infini

View File

@ -1,12 +0,0 @@
#pragma once
#include "operators/rope.h"
#include "utils/small_array.h"
namespace infini {
void rope_kernel(int dType, int *pos, void *input, void *output, int size,
int dim_model, int dim_head, int hidden_stride,
int pos_stride);
}; // namespace infini

View File

@ -14,9 +14,6 @@ class CudaRuntimeObj : public RuntimeObj {
std::unique_ptr<CommunicatorObj> comm;
CudaPtr workspace;
size_t workspaceSize;
bool isCudaGraphCreated;
cudaGraph_t cudaGraph;
cudaGraphExec_t cudaGraphInstance;
public:
explicit CudaRuntimeObj(int deviceId = 0)
@ -29,16 +26,9 @@ class CudaRuntimeObj : public RuntimeObj {
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
isCudaGraphCreated = false;
CUDAStream::Init();
}
virtual ~CudaRuntimeObj() {
try {
if (isCudaGraphCreated) {
checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
checkCudaError(cudaGraphDestroy(cudaGraph));
CUDAStream::destroyStream();
}
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
@ -85,8 +75,6 @@ class CudaRuntimeObj : public RuntimeObj {
void runWithoutSync(const Graph &graph) const;
void runWithCudaGraph(const Graph &graph);
// init communicator
void initComm(const string &name, int worldSize, int rank) final;

View File

@ -3,6 +3,4 @@
namespace infini {
void softmax_kernel(int num_blocks, float *input, float *output, int size,
int dimsize, int stride);
void softmax_kernel(int num_blocks, half *input, half *output, int size,
int dimsize, int stride);
} // namespace infini
}

View File

@ -8,8 +8,8 @@ const int DIM_MAX_SIZE = 8;
// Concat operator acts like element tensors composing to one big tensor,and
// split operator acts like one big tensor being composed by element
// tensors.
template <typename T> struct ElementTensorMetadata {
T *data[BATCH_SIZE];
struct ElementTensorMetadata {
float *data[BATCH_SIZE];
int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
// the composed tensor.
int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
@ -20,17 +20,16 @@ template <typename T> struct ElementTensorMetadata {
data[i], dimBgNo[i], dimSize[i], nElements[i]);
}
};
template <typename T> struct ComposedTensorMetadata {
struct ComposedTensorMetadata {
int dimSize[DIM_MAX_SIZE];
int stride[DIM_MAX_SIZE];
T *data;
float *data;
};
namespace infini {
void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
const ComposedTensorMetadata<float> &compMeta, int dim,
int batchSize, int nDims, bool isSplit);
void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
const ComposedTensorMetadata<half> &compMeta, int dim,
void split_concat_kernel(const ElementTensorMetadata &eleMeta,
const ComposedTensorMetadata &compMeta, int dim,
int batchSize, int nDims, bool isSplit);
} // namespace infini

View File

@ -5,7 +5,7 @@
namespace infini {
void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
void transpose_kernel(float *input, float *output, int nDims, int size,
SmallArray strides, SmallArray outputShape);
}; // namespace infini

View File

@ -3,22 +3,48 @@
#include "operators/unary.h"
namespace infini {
template <typename T> void softmax_kernel(T *input, T *output, size_t num);
template <typename T> void relu_kernel(T *input, T *output, size_t num);
template <typename T> void silu_kernel(T *input, T *output, size_t num);
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
template <typename T> void tanh_kernel(T *input, T *output, size_t num);
template <typename T> void abs_kernel(T *input, T *output, size_t num);
template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
template <typename T> void neg_kernel(T *input, T *output, size_t num);
template <typename T> void gelu_kernel(T *input, T *output, size_t num);
template <typename T> void erf_kernel(T *input, T *output, size_t num);
template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
void softmax_kernel(float *input, float *output, size_t num);
void relu_kernel(float *input, float *output, size_t num);
void sigmoid_kernel(float *input, float *output, size_t num);
void tanh_kernel(float *input, float *output, size_t num);
void abs_kernel(float *input, float *output, size_t num);
void sqrt_kernel(float *input, float *output, size_t num);
void neg_kernel(float *input, float *output, size_t num);
void gelu_kernel(float *input, float *output, size_t num);
void erf_kernel(float *input, float *output, size_t num);
void hard_sigmoid_kernel(float *input, float *output, size_t num);
void hard_swish_kernel(float *input, float *output, size_t num);
template <typename INPUT, typename OUTPUT>
void cast_kernel(INPUT *input, OUTPUT *output, size_t num);
void unary_kernel(const Operator &_op) {
auto op = as<UnaryObj>(_op);
float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
void unary_kernel(const Operator &_op);
size_t num = op->getOutput()->size();
if (op->getOpType() == OpType::Softmax)
softmax_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Relu)
relu_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Sigmoid)
sigmoid_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::HardSigmoid)
hard_sigmoid_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::HardSwish)
hard_swish_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Tanh)
tanh_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Abs)
abs_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Sqrt)
sqrt_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Gelu)
gelu_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Neg)
neg_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Erf)
erf_kernel(inputData, outputData, num);
else
IT_TODO_HALT();
}
}; // namespace infini

View File

@ -1,29 +1,11 @@
#pragma once
#include "core/tensor.h"
#include "cuda/cuda_common.h"
namespace infini {
void cudaPrintFloat(float *x, int len);
void cudaPrintTensor(const Tensor &tensor);
void cudaPrintTensor(const Tensor &tensor) {
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
}
cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
cudaDataType cublasDataTypeConvert(DataType);
template <int index> struct DT_CUDA {};
template <> struct DT_CUDA<0> { using t = bool; };
template <> struct DT_CUDA<1> { using t = float; };
template <> struct DT_CUDA<2> { using t = unsigned char; };
template <> struct DT_CUDA<3> { using t = char; };
template <> struct DT_CUDA<4> { using t = unsigned short; };
template <> struct DT_CUDA<5> { using t = short; };
template <> struct DT_CUDA<6> { using t = int; };
template <> struct DT_CUDA<7> { using t = long long; };
template <> struct DT_CUDA<9> { using t = bool; };
template <> struct DT_CUDA<10> { using t = half; };
template <> struct DT_CUDA<11> { using t = double; };
template <> struct DT_CUDA<12> { using t = unsigned int; };
template <> struct DT_CUDA<13> { using t = unsigned long long; };
template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
} // namespace infini
} // namespace infini

View File

@ -3,15 +3,10 @@
#include "utils/small_array.h"
namespace infini {
void whereKernel(const float *inputX, const float *inputY,
const uint8_t *condition, float *output, int nDims,
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
SmallArray conditionShape, SmallArray outputShape, int xSize,
int ySize, int cSize);
void whereKernel(const half *inputX, const half *inputY,
const uint8_t *condition, half *output, int nDims,
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
SmallArray conditionShape, SmallArray outputShape, int xSize,
int ySize, int cSize);
}; // namespace infini

View File

@ -53,8 +53,7 @@ inline void initGatherMetaData(GatherMetaData &metaData,
metaData.inStride[i] = in->getStride()[i];
}
}
template <typename T>
void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num);
void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
size_t num);

View File

@ -1,23 +0,0 @@
#include "core/op_type.h"
#include "kunlun/kunlun_common.h"
namespace infini {
using KunlunActType = xdnn::Activation_t;
KunlunActType parseActType(ActType act) {
switch (act) {
case ActType::None:
return KunlunActType::LINEAR;
case ActType::Tanh:
return KunlunActType::TANH;
case ActType::Sigmoid:
return KunlunActType::SIGMOID;
case ActType::Relu:
return KunlunActType::RELU6;
default:
fprintf(stderr, "Activation Type not support yet!\n");
break;
}
return KunlunActType::LINEAR;
}
}; // namespace infini

View File

@ -3,8 +3,6 @@
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xdnn = baidu::xpu::api;
#define checkKUNLUNError(call) \
{ \
auto err = call; \

View File

@ -1,35 +1,28 @@
#pragma once
#include "core/runtime.h"
#include "core/workspace.h"
#include "kunlun/kunlun_common.h"
#ifdef INFINI_USE_XCCL
#include "kunlun/xccl_communicator.h"
#endif
namespace infini {
class KUNLUNRuntimeObj : public RuntimeObj {
private:
xdnn::Context *ctx;
std::unique_ptr<CommunicatorObj> comm;
// KUNLUNPtr workspace;
// size_t workspaceSize;
Workspace<KUNLUNPtr> workspace;
baidu::xpu::api::Context *xdnn;
KUNLUNPtr workspace;
size_t workspaceSize;
public:
KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
xpu_set_device(deviceId);
ctx = xdnn::create_context();
KUNLUNRuntimeObj() : RuntimeObj(Device::KUNLUN) {
xdnn = baidu::xpu::api::create_context();
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
size_t workspaceSize = 2llu << 30; // 2 GB
KUNLUNPtr wkspacePtr = alloc(workspaceSize);
workspace =
make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
workspaceSize = 3ll << 30; // 3 GB
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
workspace = alloc(workspaceSize);
}
virtual ~KUNLUNRuntimeObj() {
KUNLUNPtr wkspacePtr = workspace->getWorkspace();
dealloc(wkspacePtr);
xdnn::destroy_context(ctx);
dealloc(workspace);
baidu::xpu::api::destroy_context(xdnn);
}
string toString() const override;
@ -38,41 +31,40 @@ class KUNLUNRuntimeObj : public RuntimeObj {
// double runEvaluation(const Graph &graph, int nWarmups,
// int nEvaluations) const;
void sync() const;
KUNLUNPtr alloc(size_t size) override {
void *ptr;
checkKUNLUNError(
xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
return ptr;
}
void dealloc(void *ptr) override { xpu_free(ptr); }
xdnn::Context *KUNLUNHandle() const { return ctx; }
// Get $size workspace by bytes
baidu::xpu::api::Context *KUNLUNHandle() const { return xdnn; }
KUNLUNPtr getWorkspace(size_t size) const {
auto ret = workspace->getWorkspace(size);
return ret;
IT_ASSERT(size <= workspaceSize);
return workspace;
}
Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
}
void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
}
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
}
void initComm(const string &name, int worldSize, int rank) final;
CommunicatorObj &getCommunicator() const final { return *comm; }
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;

View File

@ -1,60 +0,0 @@
#pragma once
#include "core/communicator.h"
#include "xpu/bkcl.h"
#include <chrono>
#include <filesystem>
#include <fstream>
#include <thread>
#define checkXcclError(call) \
{ \
auto err = call; \
if (BKCL_SUCCESS != err) { \
fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
}
namespace infini {
class XcclCommunicatorObj final : public CommunicatorObj {
private:
BKCLContext_t comm;
public:
XcclCommunicatorObj(const string &name, int worldSize, int rank)
: CommunicatorObj(worldSize, rank) {
const std::string filePath("./" + name + "_xccl_id.bin");
BKCLUniqueId commId;
if (rank == 0) {
checkXcclError(bkcl_get_unique_id(&commId));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *)&commId, sizeof(BKCLUniqueId));
} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(100),
"time limit (100s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *)&commId, sizeof(BKCLUniqueId));
}
checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
if (rank == 0) {
std::filesystem::remove(filePath);
}
}
BKCLContext_t getXcclComm() { return comm; }
~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
virtual string toString() const final {
std::ostringstream oss;
oss << "XCCL communicator";
return oss.str();
}
};
} // namespace infini

View File

@ -1,29 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
class LRNObj : public OperatorObj {
public:
LRNObj(GraphObj *graph, Tensor inputX, Tensor inputY, float alpha,
float beta, float bias, int size);
OP_CLONE(LRNObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
auto getAlphaBetaBias() const {
return tuple(alpha_value, beta_value, bias_value);
}
auto getSize() const { return size_value; }
private:
float alpha_value, beta_value, bias_value;
int size_value;
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -17,9 +17,6 @@ class MatmulObj : public OperatorObj {
// Auxiliary attributes which are not a part of operator attributes.
int b, m, n, k;
// Specifies the data precision for the matrix multiply.
std::string computeType = "default";
public:
/**
* @brief Matmul operator with batch broadcast and tensor transpose
@ -41,11 +38,10 @@ class MatmulObj : public OperatorObj {
* @param transB If matrix B should be transposed when computing.
* @param bias The bias tensor.
* @param act The activation function.
* @param computeType Specifies the data precision for the matrix multiply.
*/
MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
bool transA = false, bool transB = false, Tensor bias = nullptr,
ActType act = ActType::None, std::string computeType = "default");
ActType act = ActType::None);
OP_CLONE(MatmulObj);
std::string toString() const override;
@ -64,7 +60,6 @@ class MatmulObj : public OperatorObj {
int getN() const { return n; }
int getK() const { return k; }
auto getBMNK() const { return tuple{b, m, n, k}; }
std::string getComputeType() const { return computeType; }
private:
vector<int> getWorkloadVector() const override;

View File

@ -1,46 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
*
* https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2193/user-guide/docs/index.html
*/
class RecvObj : public OperatorObj {
public:
/**
* @brief Construct a new SendRecv object
*
* @param graph The computation graph that this operator belongs to.
* @param input default nullptr, because recv does not have input.
* @param output recv output
* @param source the send rank
* @param destination the recv rank
* @param dims The shape of the output tensor.
*/
RecvObj(GraphObj *graph, Tensor output, int source, int destination,
Shape dims, int outputType, Tensor input = nullptr);
OP_CLONE(RecvObj);
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
DataType getDType() const;
int getSourceRank() const { return source; }
int getDestinationRank() const { return destination; }
inline Shape getShape() const { return dims; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override;
protected:
int source;
int destination;
Shape dims;
int outputType;
};
} // namespace infini

View File

@ -27,60 +27,6 @@ class ResizeObj : public OperatorObj {
enum class EKeepAspectRatioPolicy { stretch, notLarger, notSmaller, none };
enum class ECoeffMode { nearest, linear, cubic };
static ECoordinateTransMode fromECoordinateTransModeStr(string mode) {
if (mode == "half_pixel") {
return ECoordinateTransMode::halfPixel;
} else if (mode == "asymmetric") {
return ECoordinateTransMode::asymmetric;
} else if (mode == "align_corners") {
return ECoordinateTransMode::alignCorners;
} else if (mode == "pytorch_half_pixel") {
return ECoordinateTransMode::pytorchHalfPixel;
} else if (mode == "tf_crop_and_resize") {
return ECoordinateTransMode::tfCropAndResize;
} else {
IT_TODO_HALT();
}
}
static ENearestMode fromENearestModeStr(string mode) {
if (mode == "round_prefer_floor") {
return ENearestMode::roundPreferFloor;
} else if (mode == "round_prefer_ceil") {
return ENearestMode::roundPreferCeil;
} else if (mode == "floor") {
return ENearestMode::floor;
} else if (mode == "ceil") {
return ENearestMode::ceil;
} else {
return ENearestMode::none;
}
}
static EKeepAspectRatioPolicy fromRatioPolicyStr(string ratioPolicyStr) {
if (ratioPolicyStr == "stretch") {
return EKeepAspectRatioPolicy::stretch;
} else if (ratioPolicyStr == "not_larger") {
return EKeepAspectRatioPolicy::notLarger;
} else if (ratioPolicyStr == "not_smaller") {
return EKeepAspectRatioPolicy::notSmaller;
} else {
return EKeepAspectRatioPolicy::none;
}
}
static ECoeffMode fromECoeffModeStr(string mode) {
if (mode == "nearest") {
return ECoeffMode::nearest;
} else if (mode == "linear") {
return ECoeffMode::linear;
} else if (mode == "cubic") {
return ECoeffMode::cubic;
} else {
IT_TODO_HALT();
}
}
private:
vector<int> axes;
vector<float> scales;

View File

@ -1,34 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Fused RMSNorm Operator
*
*/
class RMSNormObj : public OperatorObj {
int dim;
public:
/**
* @brief Construct a new RMSNorm object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
*/
RMSNormObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output);
OP_CLONE(RMSNormObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
int getDim() const { return dim; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -1,29 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
class RoPEObj : public OperatorObj {
public:
/**
* @brief Construct a new RotaryEmbedding object.
*
* @param graph The computation graph that this operator belongs to.
* @param pos The positon id of the query.
* @param input The input tensor.
* @param output The output tensor.
*/
RoPEObj(GraphObj *graph, Tensor pos, Tensor input, Tensor output);
OP_CLONE(RoPEObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
DataType getDType() const { return getInputs(1)->getDType(); }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -1,42 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
*
* https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2193/user-guide/docs/index.html
*/
class SendObj : public OperatorObj {
public:
/**
* @brief Construct a new SendRecv object
*
* @param graph The computation graph that this operator belongs to.
* @param input send input
* @param output recv output
* @param source the send rank
* @param destination the recv rank
*/
SendObj(GraphObj *graph, Tensor input, int source, int destination,
Tensor output = nullptr);
OP_CLONE(SendObj);
int numInputs() const override { return 1; }
int numOutputs() const override { return outputs.size(); }
std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
int getSourceRank() const { return source; }
int getDestinationRank() const { return destination; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override;
protected:
int source;
int destination;
};
} // namespace infini

View File

@ -1,39 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Remove single-dimensional entries from the shape of a tensor.
*
*/
class SqueezeObj : public OperatorObj {
Shape axes;
public:
/**
* @brief Construct a new Squeeze object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
* @param axes List of integers indicating the dimensions to squeeze.
*/
SqueezeObj(GraphObj *graph, Tensor input, Tensor output, Shape axes);
OP_CLONE(SqueezeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
inline Shape getAxes() const { return axes; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -159,7 +159,6 @@ enum class CastType {
Uint322Int64,
Float162Float,
BFloat162Float,
Float2Float,
};
class CastObj : public OperatorObj {
@ -259,7 +258,6 @@ class LogObj : public OperatorObj {
};
DEFINE_UNARY_OBJ(Relu, OpType::Relu)
DEFINE_UNARY_OBJ(Silu, OpType::Silu)
DEFINE_UNARY_OBJ(Gelu, OpType::Gelu)
DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)

View File

@ -1,38 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief nsert single-dimensional entries to the shape of an input tensor.
*
*/
class UnsqueezeObj : public OperatorObj {
Shape axes;
public:
/**
* @brief Construct a new Unsqueeze object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
* @param axes List of integers indicating the dimensions to be inserted.
*/
UnsqueezeObj(GraphObj *graph, Tensor input, Tensor output, Shape axes);
OP_CLONE(UnsqueezeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
inline Shape getAxes() const { return axes; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -0,0 +1,14 @@
#pragma once
namespace infini {
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
int nDims, int size) {
for (int i = nDims - size - 1; i >= 0; --i) {
modifyShape.data[i] = 1;
}
for (int i = nDims - 1; i >= nDims - size; --i) {
modifyShape.data[i] = originShape[i - nDims + size];
}
}
} // namespace infini

View File

@ -91,12 +91,6 @@ template <int val> class ValGenerator : public DataGenerator {
fill<uint32_t>(data, size);
}
void fill(float *data, size_t size) override { fill<float>(data, size); }
void fill_fp16(uint16_t *data, size_t size) {
for (size_t i = 0; i < size; i++) {
float x = 1.0f * val;
data[i] = float_to_fp16(x);
}
}
};
typedef ValGenerator<1> OneGenerator;
typedef ValGenerator<0> ZeroGenerator;

View File

@ -2,33 +2,16 @@
#ifndef OPERATOR_UTIL_H
#define OPERATOR_UTIL_H
#include "core/operator.h"
#include "core/tensor.h"
#include "utils/small_array.h"
#include <numeric>
namespace infini {
// Launch a broadcast shape based on the shape of input A and B
Shape infer_broadcast(const Shape &A, const Shape &B);
// Launch the real axis based on rank and current axis
int get_real_axis(const int &axis, const int &rank);
// Check if tensor B is unidirectional broadcastable to tensor A
// check if tensor B is unidirectional broadcastable to tensor A
bool is_unidirectional_broadcasting(const Shape &A, const Shape &B);
// Locate the index with size from Shape
Shape locate_index(size_t inputN, const Shape &shape);
// Delocate the ShapeIndex from Shape with broadcast
size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
const Shape &stride);
// Convert KernelAttrs to a string representation
std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
// VectorProd
int shapeProd(std::vector<int>::iterator start, std::vector<int>::iterator end);
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
int nDims, int size);
void broadcastShape(const Shape &tempShape, Shape &modifyShape);
} // namespace infini
#endif

View File

@ -4,14 +4,6 @@ namespace infini {
#define SMALL_ARRAY_SIZE 8
struct SmallArray {
int data[SMALL_ARRAY_SIZE];
int prod(int start, int end) {
int result = 1;
for (int i = start; i < end; ++i) {
result *= data[i];
}
return result;
}
};
} // namespace infini

File diff suppressed because it is too large Load Diff

View File

@ -295,36 +295,6 @@ class TestStringMethods(unittest.TestCase):
make_graph([reshape], "reshape", [data, shape], [reshaped], [shape_data])
)
def test_resize(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 128, 40, 40])
roi = make_tensor("roi", TensorProto.FLOAT, [0], [])
scales = make_tensor("scales", TensorProto.FLOAT, [4], [1, 1, 2, 2])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 128, 80, 80])
reshape = make_node("Resize", ["x", "roi", "scales"], ["y"], name="resize")
make_and_import_model(make_graph([reshape], "resize", [x], [y], [roi, scales]))
def test_squeeze(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 1, 5])
axes = make_tensor_value_info("axes", TensorProto.INT64, [2])
axes_data = make_tensor("axes", TensorProto.INT64, [2], [0, 2])
output = make_tensor_value_info("output", TensorProto.FLOAT, [3, 5])
squeeze = make_node("Squeeze", ["input", "axes"], ["output"], name="squeeze")
make_and_import_model(
make_graph([squeeze], "squeeze", [input, axes], [output], [axes_data])
)
def test_unsqueeze(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [2, 3, 4, 5])
axes = make_tensor_value_info("axes", TensorProto.INT64, [2])
axes_data = make_tensor("axes", TensorProto.INT64, [2], [0, 2])
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 2, 1, 3, 4, 5])
unsqueeze = make_node(
"Unsqueeze", ["input", "axes"], ["output"], name="unsqueeze"
)
make_and_import_model(
make_graph([unsqueeze], "unsqueeze", [input, axes], [output], [axes_data])
)
def test_concat(self):
input1 = make_tensor_value_info("input1", TensorProto.FLOAT, [1, 3, 2, 4])
input2 = make_tensor_value_info("input2", TensorProto.FLOAT, [1, 3, 2, 5])
@ -367,7 +337,7 @@ class TestStringMethods(unittest.TestCase):
"ReduceMean", ["data"], ["reduced"], keepdims=1, name="reduceMean"
)
make_and_import_model(make_graph([reduceMean], "reduceMean", [data], [reduced]))
def test_reduce_sum(self):
data = make_tensor_value_info("data", TensorProto.FLOAT, [2, 3, 3, 4])
reduced = make_tensor_value_info("reduced", TensorProto.FLOAT, [1, 1, 1, 1])
@ -463,20 +433,7 @@ class TestStringMethods(unittest.TestCase):
def test_split(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
split = make_node("Split", ["input"], ["output"], name="split", axis=0)
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
make_and_import_model(make_graph([split], "split", [input], [output]))
def test_split1(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
splitAttr = make_tensor("split", TensorProto.INT64, [2], [2, 1])
output1 = make_tensor_value_info("output1", TensorProto.FLOAT, [1, 2, 2, 4])
output2 = make_tensor_value_info("output2", TensorProto.FLOAT, [1, 1, 2, 4])
split = make_node(
"Split", ["input", "split"], ["output1", "output2"], name="split", axis=1
)
make_and_import_model(
make_graph([split], "split", [input], [output1, output2], [splitAttr])
)
make_and_import_model(make_graph([split], "split", [input], []))
def test_allBroadcast(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
@ -551,29 +508,6 @@ class TestStringMethods(unittest.TestCase):
where = make_node("Where", ["x", "y", "con"], ["output"], name="where")
make_and_import_model(make_graph([where], "where", [x, y, con], [output]))
def test_send(self):
sendInput = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 5, 7])
send = make_node("Send", ["input"], [], name="send", source=0, destination=1)
graph = make_graph([send], "send", [sendInput], [])
model = make_model(graph)
from_onnx(model, backend.cpu_runtime())
def test_recv(self):
recvOutput = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 5, 7])
recv = make_node(
"Recv",
[],
["output"],
name="recv",
source=0,
destination=1,
shape=[1, 3, 5, 7],
dataType=1,
)
graph = make_graph([recv], "recv", [], [recvOutput])
model = make_model(graph)
from_onnx(model, backend.cpu_runtime())
class TestDynamicTensor(unittest.TestCase):
def test_dynamic_tensor(self):
@ -583,7 +517,6 @@ class TestDynamicTensor(unittest.TestCase):
for root, dirs, files in os.walk(current_path):
if filename in files:
model_file = os.path.join(root, filename)
model = OnnxStub(onnx.load(model_file), backend.cpu_runtime())
output_key = list(model.outputs.keys())[0]
old_output_shape = model.getShape(output_key)

View File

@ -1,9 +1,6 @@
#include "bang/bang_runtime.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#ifdef INFINI_USE_CNCL
#include "bang/cncl_communicator.h"
#endif
namespace infini {
@ -16,7 +13,8 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
@ -61,15 +59,4 @@ void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
string BangRuntimeObj::toString() const { return "BANG Runtime"; }
void BangRuntimeObj::initComm(const string &name, int worldSize, int rank) {
IT_ASSERT(worldSize > 0);
IT_ASSERT(rank >= 0);
IT_ASSERT(rank < worldSize);
IT_ASSERT(!comm) << "communicator is already initialized.";
#ifdef INFINI_USE_CNCL
comm = std::make_unique<CnclCommunicatorObj>(name, worldSize, rank);
#else
IT_TODO_HALT_MSG("Not compiled with CNCL.");
#endif
}
} // namespace infini

View File

@ -11,33 +11,20 @@ GraphObj::GraphObj(Runtime runtime, OpVec ops_in)
map<UidBaseType, Tensor> tensorPool;
// Clone tensors
for (const auto &op : ops_in) {
for (const auto &t : op->getInputs()) {
if (t) {
if (tensorPool.find(t->getFuid()) == tensorPool.end())
tensorPool[t->getFuid()] = cloneTensor(t);
}
}
for (const auto &t : op->getOutputs()) {
if (t) {
if (tensorPool.find(t->getFuid()) == tensorPool.end())
tensorPool[t->getFuid()] = cloneTensor(t);
}
}
for (const auto &t : op->getInputs())
if (tensorPool.find(t->getFuid()) == tensorPool.end())
tensorPool[t->getFuid()] = cloneTensor(t);
for (const auto &t : op->getOutputs())
if (tensorPool.find(t->getFuid()) == tensorPool.end())
tensorPool[t->getFuid()] = cloneTensor(t);
}
// Clone operators and add connections
for (const auto &op : ops_in) {
TensorVec inputs, outputs;
for (const auto &t : op->getInputs()) {
if (t) {
inputs.emplace_back(tensorPool.at(t->getFuid()));
}
}
for (const auto &t : op->getOutputs()) {
if (t) {
outputs.emplace_back(tensorPool.at(t->getFuid()));
}
}
for (const auto &t : op->getInputs())
inputs.emplace_back(tensorPool.at(t->getFuid()));
for (const auto &t : op->getOutputs())
outputs.emplace_back(tensorPool.at(t->getFuid()));
addOperatorAndConnect(op->clone(inputs, outputs));
}
}
@ -46,21 +33,17 @@ void GraphObj::addOperatorAndConnect(const Operator &op) {
sorted = false;
ops.push_back(op);
for (auto &input : op->getInputs()) {
if (input) {
input->addTarget(op);
if (auto pred = input->getSource()) {
pred->addSuccessors(op);
op->addPredecessors(pred);
}
input->addTarget(op);
if (auto pred = input->getSource()) {
pred->addSuccessors(op);
op->addPredecessors(pred);
}
}
for (auto &output : op->getOutputs()) {
if (output) {
output->setSource(op);
for (auto &succ : output->getTargets()) {
succ->addPredecessors(op);
op->addSuccessors(succ);
}
output->setSource(op);
for (auto &succ : output->getTargets()) {
succ->addPredecessors(op);
op->addSuccessors(succ);
}
}
}
@ -87,33 +70,48 @@ string GraphObj::toString() const {
}
bool GraphObj::topo_sort() {
if (this->sorted) {
if (this->sorted)
return true;
}
// std::unordered_set<Tensor> inputs;
std::unordered_set<Operator> waiting(this->ops.begin(), this->ops.end());
std::vector<Operator> sorted;
std::unordered_set<OperatorObj *> flags;
sorted.reserve(ops.size());
flags.reserve(ops.size());
while (sorted.size() < ops.size()) {
while (!waiting.empty()) {
// Any node is move to sorted in this loop.
auto modified = false;
for (auto const &op : ops) {
if (auto const &inputs = op->getInputs();
flags.find(op.get()) == flags.end() &&
std::all_of(inputs.begin(), inputs.end(),
[&flags](auto const &input) {
auto ptr = input->getSource().get();
return !ptr || flags.find(ptr) != flags.end();
})) {
// Find head nodes.
for (auto it = waiting.begin(); it != waiting.end();) {
const auto &this_inputs = (*it)->getInputs();
// If none of the input tensors is in waiting list,
// this node is a head node.
const auto is_head = std::all_of(
this_inputs.begin(), this_inputs.end(), [&](const auto &input) {
auto src = input->getSource();
return src // If the source node is in the waiting list,
// means that this node is not the head node.
? waiting.find(src) == waiting.end()
// This tensor has no source node,
// it must be a input tensor.
: (/*inputs.insert(input),*/ true);
});
// Moves head node to sorted.
if (is_head) {
modified = true;
sorted.emplace_back(op);
flags.insert(op.get());
sorted.emplace_back(std::move(*it));
it = waiting.erase(it);
} else {
++it;
}
}
// Waiting list never modifies during a pass,
// sorting fails.
if (!modified) {
return false;
}
}
// Done.
this->ops = std::move(sorted);
return this->sorted = true;
}
@ -157,7 +155,6 @@ void GraphObj::shape_infer() {
void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
// topological sorting first
IT_ASSERT(topo_sort() == true);
if (useNaiveAllocator) {
// can not set memory pool when use naive allocator
@ -167,10 +164,7 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
// note: behavior may not match running in non-naive mode, and it may
// not reproduce the bug
for (auto &tensor : tensors) {
if (!tensor->isWeight() ||
(tensor->isWeight() && !weightAllocated)) {
tensor->dataMalloc();
}
tensor->dataMalloc();
}
return;
}
@ -228,28 +222,24 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
// memory should be allocated for the op's output first
auto outputs = op->getOutputs();
for (auto &tensor : outputs) {
if (tensor) {
if (tensor->isOthers()) {
tensorToOffset[tensor.get()] =
allocator.alloc(tensor->getBytes());
}
if (tensor->isOthers()) {
tensorToOffset[tensor.get()] =
allocator.alloc(tensor->getBytes());
}
}
auto inputs = op->getInputs();
for (auto &tensor : inputs) {
if (tensor) {
if (tensor->isOthers()) {
auto tensorIter = tensorToRefCount.find(tensor.get());
IT_ASSERT(tensorIter != tensorToRefCount.end());
IT_ASSERT(tensorToRefCount[tensor.get()] > 0);
tensorToRefCount[tensor.get()] -= 1;
if (tensorToRefCount[tensor.get()] == 0) {
// indicate that this tensor will no longer be used and
// perform memory free
tensorToRefCount.erase(tensor.get());
allocator.free(tensorToOffset[tensor.get()],
tensor->getBytes());
}
if (tensor->isOthers()) {
auto tensorIter = tensorToRefCount.find(tensor.get());
IT_ASSERT(tensorIter != tensorToRefCount.end());
IT_ASSERT(tensorToRefCount[tensor.get()] > 0);
tensorToRefCount[tensor.get()] -= 1;
if (tensorToRefCount[tensor.get()] == 0) {
// indicate that this tensor will no longer be used and
// perform memory free
tensorToRefCount.erase(tensor.get());
allocator.free(tensorToOffset[tensor.get()],
tensor->getBytes());
}
}
}

View File

@ -10,27 +10,18 @@
#include "operators/expand.h"
#include "operators/gather.h"
#include "operators/layer_norm.h"
#include "operators/lrn.h"
#include "operators/matmul.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/recv.h"
#include "operators/reduce.h"
#include "operators/reshape.h"
#include "operators/resize.h"
#include "operators/rms_norm.h"
#include "operators/rope.h"
#include "operators/send.h"
#include "operators/slice.h"
#include "operators/softmax.h"
#include "operators/split.h"
#include "operators/squeeze.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "operators/unsqueeze.h"
#include "operators/where.h"
#include <numeric>
#include <variant>
namespace infini {
@ -74,17 +65,15 @@ Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
}
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
bool transB, Tensor bias, ActType act,
std::string matmul_compute_type) {
bool transB, Tensor bias, ActType act) {
if (y) {
g->addOpWithOutputs<MatmulObj>(std::move(a), std::move(b), y, transA,
transB, std::move(bias), act,
matmul_compute_type);
transB, std::move(bias), act);
return y;
} else {
return g
->addOp<MatmulObj>(std::move(a), std::move(b), y, transA, transB,
std::move(bias), act, matmul_compute_type)
std::move(bias), act)
->getOutput();
}
}
@ -125,17 +114,6 @@ Tensor GraphHandlerObj::layerNormalization(Tensor input, Tensor scale,
}
}
Tensor GraphHandlerObj::rmsNorm(Tensor input, Tensor weight, Tensor output) {
if (output) {
g->addOpWithOutputs<RMSNormObj>(std::move(input), std::move(weight),
output);
return output;
} else {
return g->addOp<RMSNormObj>(std::move(input), std::move(weight), output)
->getOutput();
}
}
Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh, int sw,
int ceilMode) {
@ -196,7 +174,6 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
} \
}
DEFINE_UNARY_METHOD(silu, Silu)
DEFINE_UNARY_METHOD(relu, Relu)
DEFINE_UNARY_METHOD(gelu, Gelu)
DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
@ -273,64 +250,6 @@ Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) {
}
}
Tensor GraphHandlerObj::resize(Tensor input, Tensor output,
const std::optional<vector<int>> &axes,
Tensor sizes, Tensor scales, Tensor roi,
vector<uint32_t> sizes_, vector<float> scales_,
vector<float> roi_, string mode,
string ratioPolicy, string nearestMode,
string coordTransMode) {
if (sizes_.size() > 0) {
sizes->dataMalloc();
sizes->copyin<uint32_t>(sizes_);
}
if (scales_.size() > 0) {
scales->dataMalloc();
scales->copyin<float>(scales_);
}
if (roi_.size() > 0) {
roi->dataMalloc();
roi->copyin<float>(roi_);
}
ResizeObj::EKeepAspectRatioPolicy ratioPolicy_ =
ResizeObj::fromRatioPolicyStr(ratioPolicy);
ResizeObj::ENearestMode nearestMode_ =
ResizeObj::fromENearestModeStr(nearestMode);
ResizeObj::ECoordinateTransMode coordTransMode_ =
ResizeObj::fromECoordinateTransModeStr(coordTransMode);
ResizeObj::ECoeffMode mode_ = ResizeObj::fromECoeffModeStr(mode);
if (output) {
if (mode == "nearest") {
g->addOpWithOutputs<ResizeObj>(
std::move(input), output, std::move(axes), std::move(sizes),
std::move(scales), std::move(roi), ratioPolicy_, nearestMode_,
coordTransMode_);
} else {
g->addOpWithOutputs<ResizeObj>(
std::move(input), output, std::move(axes), std::move(sizes),
std::move(scales), std::move(roi), mode_, ratioPolicy_,
coordTransMode_);
}
return output;
} else {
if (mode == "nearest") {
return g
->addOp<ResizeObj>(std::move(input), output, std::move(axes),
std::move(sizes), std::move(scales),
std::move(roi), ratioPolicy_, nearestMode_,
coordTransMode_)
->getOutput();
} else {
return g
->addOp<ResizeObj>(std::move(input), output, std::move(axes),
std::move(sizes), std::move(scales),
std::move(roi), mode_, ratioPolicy_,
coordTransMode_)
->getOutput();
}
}
}
Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) {
if (output) {
g->addOpWithOutputs<ConcatObj>(std::move(inputs), output, dim);
@ -350,7 +269,7 @@ Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
std::move(input_k_cache), std::move(input_v_cache),
std::move(input_q), std::move(input_k), std::move(input_v),
std::move(position_id), output_matmul);
return output_matmul;
return {output_matmul};
} else {
return g
->addOp<AttentionKVCacheObj>(
@ -361,40 +280,15 @@ Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
}
}
Tensor GraphHandlerObj::RoPE(Tensor pos, Tensor input, Tensor output) {
if (output) {
g->addOpWithOutputs<RoPEObj>(std::move(pos), std::move(input), output);
return output;
} else {
return g->addOp<RoPEObj>(std::move(pos), std::move(input), output)
->getOutput();
}
}
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
int axis,
std::variant<int, vector<int>> numOrRatio) {
int axis, int num_outputs) {
if (outputs) {
if (std::holds_alternative<int>(numOrRatio)) {
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
std::get<int>(numOrRatio));
} else {
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
std::get<vector<int>>(numOrRatio));
}
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
num_outputs);
return *outputs;
} else {
if (std::holds_alternative<int>(numOrRatio)) {
return g
->addOp<SplitObj>(std::move(input), outputs, axis,
std::get<int>(numOrRatio))
->getOutputs();
} else {
return g
->addOp<SplitObj>(std::move(input), outputs, axis,
std::get<vector<int>>(numOrRatio))
->getOutputs();
}
return g->addOp<SplitObj>(std::move(input), outputs, axis, num_outputs)
->getOutputs();
}
}
@ -540,39 +434,6 @@ Tensor GraphHandlerObj::broadcast(Tensor input, Tensor output, int root) {
}
}
Tensor GraphHandlerObj::send(Tensor input, int source, int destination,
Tensor output) {
if (output) {
g->addOpWithOutputs<SendObj>(std::move(input), source, destination,
output);
return output;
} else {
return g->addOp<SendObj>(std::move(input), source, destination, output)
->getOutput();
}
}
Tensor GraphHandlerObj::recv(Tensor output, int source, int destination,
Shape dims, int outputType, Tensor input) {
if (output) {
g->addOpWithOutputs<RecvObj>(output, source, destination,
std::move(dims), outputType,
std::move(input));
return output;
} else {
return g
->addOp<RecvObj>(output, source, destination, std::move(dims),
outputType, std::move(input))
->getOutput();
}
}
Tensor GraphHandlerObj::cast(Tensor input, Tensor output, int to) {
if (output) {
g->addOpWithOutputs<CastObj>(std::move(input), output,
@ -623,41 +484,6 @@ Tensor GraphHandlerObj::depthToSpace(Tensor input, Tensor output, int blocksize,
}
}
Tensor GraphHandlerObj::lrn(Tensor input, Tensor output, float alpha,
float beta, float bias, int size) {
if (output) {
g->addOpWithOutputs<LRNObj>(std::move(input), output, alpha, beta, bias,
size);
return output;
} else {
return g
->addOp<LRNObj>(std::move(input), output, alpha, beta, bias, size)
->getOutput();
}
}
Tensor GraphHandlerObj::squeeze(Tensor input, Tensor output, Shape axes) {
if (output) {
g->addOpWithOutputs<SqueezeObj>(std::move(input), output,
std::move(axes));
return output;
} else {
return g->addOp<SqueezeObj>(std::move(input), output, std::move(axes))
->getOutput();
}
}
Tensor GraphHandlerObj::unsqueeze(Tensor input, Tensor output, Shape axes) {
if (output) {
g->addOpWithOutputs<UnsqueezeObj>(std::move(input), output,
std::move(axes));
return output;
} else {
return g->addOp<UnsqueezeObj>(std::move(input), output, std::move(axes))
->getOutput();
}
}
static CastType inferCastType(Tensor input, int to) {
auto iType = input->getDType();
auto oType = DataType(to);
@ -709,8 +535,6 @@ static CastType inferCastType(Tensor input, int to) {
return CastType::Float162Float;
} else if (iType == DataType::BFloat16 && oType == DataType::Float32) {
return CastType::BFloat162Float;
} else if (iType == DataType::Float32 && oType == DataType::Float32) {
return CastType::Float2Float;
} else {
IT_TODO_HALT_MSG("Unsupported CastType : input_type is " +
iType.toString() + " output_type is " +

View File

@ -6,10 +6,8 @@ namespace infini {
OperatorObj::OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs)
: type(opType), inputs(inputs), outputs(outputs) {
if (opType != OpType::Recv) {
for (const auto &t : inputs)
IT_ASSERT(t);
}
for (const auto &t : inputs)
IT_ASSERT(t);
}
void OperatorObj::removePredecessors(const Operator &op) {

View File

@ -17,7 +17,8 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
@ -65,7 +66,8 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);

View File

@ -66,36 +66,6 @@ void TensorObj::setShape(Shape shape_) {
_size = size;
}
void TensorObj::dumpData(std::ofstream &ofs) const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
#define TRY_DUMP(N) \
if (dtype == DataType(N)) \
ofs << dataToString<DT<N>::t>() << std::endl;
TRY_DUMP(0) // fmt: new line
else TRY_DUMP(1) //
else TRY_DUMP(2) //
else TRY_DUMP(3) //
else TRY_DUMP(4) //
else TRY_DUMP(5) //
else TRY_DUMP(6) //
else TRY_DUMP(7) //
else TRY_DUMP(8) //
else TRY_DUMP(9) //
else TRY_DUMP(10) //
else TRY_DUMP(11) //
else TRY_DUMP(12) //
else TRY_DUMP(13) //
else TRY_DUMP(16) //
else IT_TODO_HALT();
ofs.flush();
#undef TRY_DUMP
}
void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())

View File

@ -19,12 +19,14 @@ void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
}
namespace infini {
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
@ -38,27 +40,6 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
}
}
void CudaRuntimeObj::runWithCudaGraph(const Graph &graph) {
if (!isCudaGraphCreated) {
CUDAStream::createStream();
checkCudnnError(cudnnSetStream(cudnn, CUDAStream::getCurrentStream()));
checkCublasError(
cublasSetStream(cublas, CUDAStream::getCurrentStream()));
checkCudaError(cudaStreamBeginCapture(CUDAStream::getCurrentStream(),
cudaStreamCaptureModeGlobal));
runWithoutSync(graph);
checkCudaError(
cudaStreamEndCapture(CUDAStream::getCurrentStream(), &cudaGraph));
checkCudaError(
cudaGraphInstantiate(&cudaGraphInstance, cudaGraph, NULL, NULL, 0));
isCudaGraphCreated = true;
} else {
checkCudaError(
cudaGraphLaunch(cudaGraphInstance, CUDAStream::getCurrentStream()));
}
checkCudaError(cudaStreamSynchronize(CUDAStream::getCurrentStream()));
}
void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
@ -67,7 +48,8 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying(),
DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
@ -122,5 +104,4 @@ void CudaRuntimeObj::initComm(const string &name, int worldSize, int rank) {
#endif
}
cudaStream_t CUDAStream::_stream = 0;
} // namespace infini

View File

@ -1,6 +1,4 @@
#include "core/data_type.h"
#include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
#include <cstdio>
__global__ void cudaPrintFloatImpl(float *x, int len) {
@ -16,60 +14,8 @@ __global__ void cudaPrintFloatImpl(float *x, int len) {
namespace infini {
void cudaPrintFloat(float *x, int len) {
cudaPrintFloatImpl
<<<1, 1, 0, CUDAStream::getCurrentStream()>>>(x, len);
cudaPrintFloatImpl<<<1, 1>>>(x, len);
cudaDeviceSynchronize();
}
void cudaPrintTensor(const Tensor &tensor) {
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
}
cudnnDataType_t cudnnDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return CUDNN_DATA_FLOAT;
}
if (dataType == DataType::Double) {
return CUDNN_DATA_DOUBLE;
}
if (dataType == DataType::Float16) {
return CUDNN_DATA_HALF;
}
if (dataType == DataType::Int8) {
return CUDNN_DATA_INT8;
}
if (dataType == DataType::Int32) {
return CUDNN_DATA_INT32;
}
if (dataType == DataType::UInt8) {
return CUDNN_DATA_UINT8;
}
if (dataType == DataType::BFloat16) {
return CUDNN_DATA_BFLOAT16;
}
if (dataType == DataType::Int64) {
return CUDNN_DATA_INT64;
}
if (dataType == DataType::Bool) {
return CUDNN_DATA_BOOLEAN;
}
IT_ASSERT(false, "Unsupported data type");
}
cudaDataType cublasDataTypeConvert(DataType dataType) {
switch (dataType.getIndex()) {
case 1:
return CUDA_R_32F;
// case 3:
// return CUDA_R_8I;
case 10:
return CUDA_R_16F;
case 11:
return CUDA_R_64F;
// case 16:
// return CUDA_R_16BF;
default:
IT_ASSERT(false, "MatMul Unsupported data type");
}
}
} // namespace infini

View File

@ -5,17 +5,14 @@
#include "operators/conv.h"
#include "operators/expand.h"
#include "operators/gather.h"
#include "operators/lrn.h"
#include "operators/matmul.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce.h"
#include "operators/reshape.h"
#include "operators/split.h"
#include "operators/squeeze.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "operators/unsqueeze.h"
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
@ -95,8 +92,6 @@ void export_values(py::module &m) {
.VALUE(OpType, ReduceMean)
.VALUE(OpType, ReduceSum)
.VALUE(OpType, Reshape)
.VALUE(OpType, Squeeze)
.VALUE(OpType, Unsqueeze)
.VALUE(OpType, Flatten)
.VALUE(OpType, Identity)
.VALUE(OpType, BatchNormalization)
@ -118,7 +113,6 @@ void export_values(py::module &m) {
.VALUE(OpType, Erf)
.VALUE(OpType, Where)
.VALUE(OpType, DepthToSpace)
.VALUE(OpType, LRN)
.export_values();
#undef VALUE
@ -260,24 +254,6 @@ static vector<int64_t> reshape_shape_of(Operator op) {
return ans;
}
static vector<int64_t> squeeze_axes_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Squeeze);
auto axes = dynamic_cast<const SqueezeObj *>(op.get())->getAxes();
vector<int64_t> ans(axes.size());
std::transform(axes.begin(), axes.end(), ans.begin(),
[](auto x) { return static_cast<int64_t>(x); });
return ans;
}
static vector<int64_t> unsqueeze_axes_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Unsqueeze);
auto axes = dynamic_cast<const UnsqueezeObj *>(op.get())->getAxes();
vector<int64_t> ans(axes.size());
std::transform(axes.begin(), axes.end(), ans.begin(),
[](auto x) { return static_cast<int64_t>(x); });
return ans;
}
static vector<int64_t> expand_shape_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Expand);
auto shape = dynamic_cast<const ExpandObj *>(op.get())->getShape();
@ -320,14 +296,6 @@ static std::tuple<int, std::string> depth_to_space_attrs_of(Operator op) {
depth_to_space->getModeString());
}
static std::tuple<float, float, float, int> lrn_attrs_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::LRN);
auto lrn = dynamic_cast<const LRNObj *>(op.get());
auto [alpha, beta, bias] = lrn->getAlphaBetaBias();
auto size = lrn->getSize();
return std::make_tuple(alpha, beta, bias, size);
}
void export_functions(py::module &m) {
#define FUNCTION(NAME) def(#NAME, &NAME)
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
@ -364,10 +332,7 @@ void export_functions(py::module &m) {
.FUNCTION(gather_axis_of)
.FUNCTION(flatten_axis_of)
.FUNCTION(cast_to_of)
.FUNCTION(depth_to_space_attrs_of)
.FUNCTION(squeeze_axes_of)
.FUNCTION(unsqueeze_axes_of)
.FUNCTION(lrn_attrs_of);
.FUNCTION(depth_to_space_attrs_of);
#undef FUNCTION
}
@ -423,15 +388,11 @@ void init_graph_builder(py::module &m) {
#endif
#ifdef USE_BANG
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
m, "BangRuntime")
.def(py::init<int>(), py::arg("device") = 0)
.def("init_comm", &BangRuntimeObj::initComm);
m, "BangRuntime");
#endif
#ifdef USE_KUNLUN
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
m, "KUNLUNRuntime")
.def(py::init<int>(), py::arg("device") = 0)
.def("init_comm", &KUNLUNRuntimeObj::initComm);
m, "KUNLUNRuntime");
#endif
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor",
py::buffer_protocol())
@ -506,7 +467,6 @@ void init_graph_builder(py::module &m) {
.def("matmul", &Handler::matmul, policy::move)
.def("batchNormalization", &Handler::batchNormalization, policy::move)
.def("layerNormalization", &Handler::layerNormalization, policy::move)
.def("RMSNorm", &Handler::rmsNorm, policy::move)
.def("maxPool", &Handler::maxPool, policy::move)
.def("avgPool", &Handler::avgPool, policy::move)
.def("add", &Handler::add, policy::move)
@ -518,7 +478,6 @@ void init_graph_builder(py::module &m) {
.def("min", &Handler::min, policy::move)
.def("max", &Handler::max, policy::move)
.def("relu", &Handler::relu, policy::move)
.def("silu", &Handler::silu, policy::move)
.def("gelu", &Handler::gelu, policy::move)
.def("sigmoid", &Handler::sigmoid, policy::move)
.def("tanh", &Handler::tanh, policy::move)
@ -536,12 +495,8 @@ void init_graph_builder(py::module &m) {
.def("transpose", &Handler::transpose, policy::move)
.def("depthToSpace", &Handler::depthToSpace, policy::move)
.def("reshape", &Handler::reshape, policy::move)
.def("resize", &Handler::resize, policy::move)
.def("squeeze", &Handler::squeeze, policy::move)
.def("unsqueeze", &Handler::unsqueeze, policy::move)
.def("concat", &Handler::concat, policy::move)
.def("attentionKVCache", &Handler::attentionKVCache, policy::move)
.def("RoPE", &Handler::RoPE, policy::move)
.def("split", &Handler::split, policy::move)
.def("gather", &Handler::gather, policy::move)
.def("gatherElements", &Handler::gatherElements, policy::move)
@ -556,13 +511,10 @@ void init_graph_builder(py::module &m) {
.def("allReduceAvg", &Handler::allReduceAvg, policy::move)
.def("allGather", &Handler::allGather, policy::move)
.def("broadcast", &Handler::broadcast, policy::move)
.def("send", &Handler::send, policy::move)
.def("recv", &Handler::recv, policy::move)
.def("cast", &Handler::cast, policy::move)
.def("expand", &Handler::expand, policy::move)
.def("erf", &Handler::erf, policy::move)
.def("where", &Handler::where, policy::move)
.def("lrn", &Handler::lrn, policy::move)
.def("topo_sort", &Handler::topo_sort, policy::automatic)
.def("optimize", &Handler::optimize, policy::automatic)
.def("operators", &Handler::operators, policy::move)
@ -574,10 +526,6 @@ void init_graph_builder(py::module &m) {
.def("get_perf_time", &Handler::get_perf_time, policy::automatic)
.def("tune", &Handler::tune, policy::automatic)
.def("run", &Handler::run, policy::automatic)
#ifdef USE_CUDA
.def("run_with_cudagraph", &Handler::run_with_cudagraph,
policy::automatic)
#endif
.def("shape_infer", &Handler::shape_infer, policy::automatic)
.def("change_shape", &Handler::change_shape, policy::automatic)
.def("getDims", &Handler::getDims, policy::automatic)

View File

@ -2,17 +2,12 @@
#include "bang/bang_runtime.h"
#include "operators/softmax.h"
#include "operators/unary.h"
#include <iostream>
namespace infini {
class UnaryCnnl : public BangKernelWithoutConfig {
virtual cnnlActivationMode_t getOpType() const = 0;
virtual float getCoef() const = 0;
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
virtual float getSlicedDim() const { return 0.0; }
virtual float getGamma() const { return 0.0; }
virtual float getScale() const { return 0.0; }
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
@ -26,19 +21,17 @@ class UnaryCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
checkCnnlError(cnnlSetActivationDescriptor_v5(
opDesc, getOpType(), CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, getCoef(), getSlicedDim(), getGamma(),
getScale(), true));
checkCnnlError(cnnlSetActivationDescriptor(
opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));
auto [alpha, beta] = getAlphBeta();
cnnlStatus_t stat =
@ -66,13 +59,13 @@ class RoundCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
cnnlStatus_t stat =
cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
@ -97,22 +90,18 @@ class PReluCnnl : public BangKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
auto cDim = op->getOutput()->getDims();
if (auto alignSize = aDim.size() - bDim.size(); alignSize) {
bDim.insert(bDim.begin(), alignSize, 1);
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
bDim.size(), bDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, bDim.size(),
bDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
bDesc, bData, cDesc, cData);
@ -142,68 +131,48 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
std::vector<int> inDim = {1, 1, 1};
std::vector<int> outDim = inDim;
if (aDim.size() >= 3) {
if (axis == 0) {
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
inDim[0] = aDim[0];
inDim[1] = aDim[1];
for (size_t i = 2; i < aDim.size(); ++i) {
inDim[2] *= aDim[i];
}
outDim = inDim;
} else if (axis == aDim.size() - 1) {
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
inDim[0] = aDim[0];
for (size_t i = 1; i < axis; ++i) {
inDim[1] *= aDim[i];
}
inDim[2] = aDim[axis];
outDim = inDim;
} else {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
for (size_t i = 0; i < axis; ++i) {
inDim[0] *= aDim[i];
}
inDim[1] = aDim[axis];
for (size_t i = axis + 1; i < aDim.size(); ++i) {
inDim[2] *= aDim[i];
}
outDim = inDim;
}
} else if (aDim.size() == 2) {
if (axis == 0) {
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
inDim = aDim;
inDim.push_back(1);
outDim = inDim;
} else {
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
inDim = aDim;
inDim.insert(inDim.begin(), 1);
outDim = inDim;
}
} else {
if (axis == 0) {
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
inDim = aDim;
inDim.push_back(1);
inDim.push_back(1);
inDim[0] = aDim[0];
inDim[1] = aDim[1];
for (size_t i = 2; i < aDim.size(); ++i) {
inDim[2] *= aDim[i];
}
outDim = inDim;
} else if (axis == aDim.size() - 1) {
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
inDim[0] = aDim[0];
for (size_t i = 1; i < axis; ++i) {
inDim[1] *= aDim[i];
}
inDim[2] = aDim[axis];
outDim = inDim;
} else {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
for (size_t i = 0; i < axis; ++i) {
inDim[0] *= aDim[i];
}
inDim[1] = aDim[axis];
for (size_t i = axis + 1; i < aDim.size(); ++i) {
inDim[2] *= aDim[i];
}
outDim = inDim;
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
inDim.size(), inDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, inDim.size(),
inDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
outDim.size(), outDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, outDim.size(),
outDim.data()));
float alpha = 1.0;
float beta = 0.0;
cnnlStatus_t stat =
cnnlSoftmaxForward_v2(context->cnnlHandle(), CNNL_SOFTMAX_ACCURATE,
mode, CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
&alpha, aDesc, aData, &beta, cDesc, cData);
mode, CNNL_COMPUTATION_HIGH_PRECISION, &alpha,
aDesc, aData, &beta, cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
return;
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
@ -225,32 +194,15 @@ class SigmoidCnnl : public UnaryCnnl {
float getCoef() const override { return 0.0; }
};
class HardSwishCnnl : public UnaryCnnl {
cnnlActivationMode_t getOpType() const override {
return CNNL_ACTIVATION_HARDSWISH;
}
float getCoef() const override { return 0.0; }
};
class HardSigmoidCnnl : public UnaryCnnl {
cnnlActivationMode_t getOpType() const override {
return CNNL_ACTIVATION_HARDSIGMOID;
}
float getCoef() const override { return 0.0; }
float getGamma() const override { return 1.f / 6.f; }
float getScale() const override { return 0.5f; }
};
REGISTER_KERNEL(Device::BANG, OpType::Relu, ReluCnnl, "Relu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
"Sigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
"Softmax_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
"HardSigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
"HardSwish_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Relu, DataType::Float32, ReluCnnl,
"Relu_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::PRelu, DataType::Float32, PReluCnnl,
"PRelu_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl,
"Sigmoid_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl,
"Round_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Softmax, DataType::Float32, SoftmaxCnnl,
"Softmax_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -24,21 +24,21 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
auto diffxDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
checkCnnlError(cnnlSetTensorDescriptor(
yDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
yDim.size(), yDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, yDim.size(),
yDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
checkCnnlError(cnnlSetTensorDescriptor(
diffYDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
diffyDim.size(), diffyDim.data()));
diffYDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffyDim.size(),
diffyDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
checkCnnlError(cnnlSetTensorDescriptor(
xDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
xDim.size(), xDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, xDim.size(),
xDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
checkCnnlError(cnnlSetTensorDescriptor(
diffXDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
diffxDim.size(), diffxDim.data()));
diffXDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffxDim.size(),
diffxDim.data()));
// get op descriptor
cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
@ -81,11 +81,11 @@ class TanhBackwardCnnl : public ActivationBackwardCnnl {
float getCoef() const override { return 0.0; }
};
REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, ReluBackwardCnnl,
"ReluBackward_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, SigmoidBackwardCnnl,
"SigmoidBackward_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, TanhBackwardCnnl,
"TanhBackward_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, DataType::Float32,
ReluBackwardCnnl, "ReluBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, DataType::Float32,
SigmoidBackwardCnnl, "SigmoidBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, DataType::Float32,
TanhBackwardCnnl, "TanhBackward_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -1,49 +0,0 @@
#ifdef INFINI_USE_CNCL
#include "operators/all_gather.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "bang/cncl_communicator.h"
#include <thread>
namespace infini {
class AllGatherCNCL : public BangKernelWithoutConfig {
public:
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<AllGatherObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
int world_size = op->getWorldSize();
// Check if world size info in operator matches runtime
IT_ASSERT(world_size == context->getCommunicator().getWorldSize());
void *input = op->getInputs(0)->getRawDataPtr<void *>();
BangPtr output_temp =
context->getWorkspace(op->getInputs(0)->getBytes() * world_size);
// void *output = op->getOutput()->getRawDataPtr<void *>();
checkBangError(cnrtMalloc(&output_temp,
op->getInputs(0)->getBytes() * world_size));
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
CNCL_CHECK(cnclAllGather(input, output_temp, count,
cnclDataTypeConvert(op->getDType()), comm,
queue));
checkBangError(cnrtQueueSync(queue));
for (int i = 0; i < world_size; ++i) {
Tensor output = op->getOutput(i);
context->copyBlobInsideRuntime(
output->getRawDataPtr<float *>(),
static_cast<float *>(output_temp) + i * count, bytes);
}
checkBangError(cnrtFree(output_temp));
}
};
REGISTER_KERNEL(Device::BANG, OpType::AllGather, AllGatherCNCL,
"AllGather_CNCL_BANG");
} // namespace infini
#endif

View File

@ -1,54 +0,0 @@
#ifdef INFINI_USE_CNCL
#include "operators/all_reduce.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "bang/cncl_communicator.h"
#include <thread>
namespace infini {
class AllReduceCNCL : public BangKernelWithoutConfig {
public:
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<AllReduceBaseObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// checkBangError(cnrtQueueSync(queue));
CNCL_CHECK(cnclAllReduce(input, output, count,
cnclDataTypeConvert(op->getDType()),
getRedOp(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
virtual cnclReduceOp_t getRedOp() const = 0;
};
class AllReduceSumCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclSum; }
};
class AllReduceProdCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclProd; }
};
class AllReduceMinCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclMin; }
};
class AllReduceMaxCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclMax; }
};
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, AllReduceSumCNCL,
"AllReduce_Sum_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, AllReduceProdCNCL,
"AllReduce_Prod_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, AllReduceMinCNCL,
"AllReduce_Min_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, AllReduceMaxCNCL,
"AllReduce_Max_CNCL_BANG");
} // namespace infini
#endif

View File

@ -17,91 +17,55 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
void *const output = (op->getOutput()->getRawDataPtr<void *>());
auto dims = op->getInputs(0)->getDims();
auto outDims = op->getOutput()->getDims();
if (dims.size() != 4)
IT_TODO_HALT();
int dimsTrans[4] = {dims[0], dims[2], dims[3], dims[1]};
int dimsOutTrans[4] = {outDims[0], outDims[2], outDims[3], outDims[1]};
int permute[4] = {0, 2, 3, 1};
int permuteOut[4] = {0, 3, 1, 2};
int dimArray[4], strideArray[4], dimPArray[1], stridePArray[1];
for (size_t i = 0; i < dims.size(); ++i) {
dimArray[i] = dims[i];
strideArray[i] = op->getInputs(0)->getStride()[i];
}
int w = dimArray[3];
dimArray[3] = dimArray[1];
int h = dimArray[2];
dimArray[1] = h;
dimArray[2] = w;
dimPArray[0] = op->getInputs(1)->getDims()[0];
stridePArray[0] = op->getInputs(1)->getDims()[0];
// get inputs
cnnlTensorDescriptor_t inDesc, intransDesc, outDesc, outtransDesc;
cnnlTensorDescriptor_t inDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&intransDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&outtransDesc));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dims.size(), dims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
intransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
dims.size(), dimsTrans));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
outDims.size(), outDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
outtransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
outDims.size(), dimsOutTrans));
cnnlTransposeDescriptor_t opDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
size_t wsSize;
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), inDesc, opDesc,
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
BangPtr inputTrans = context->getWorkspace(
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
BangPtr outputTrans = context->getWorkspace(
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
cnnlStatus_t stat =
cnnlTranspose_v2(context->cnnlHandle(), opDesc, inDesc, input,
intransDesc, inputTrans, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
return;
checkCnnlError(cnnlSetTensorDescriptorEx(inDesc, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, dims.size(),
dimArray, strideArray));
// get bnScaleBiasMeanVarDesc
auto dimsScaleBiasMeanVar = op->getInputs(1)->getDims();
cnnlTensorDescriptor_t paraDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&paraDesc));
checkCnnlError(cnnlSetTensorDescriptor(
paraDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimsScaleBiasMeanVar.size(), dimsScaleBiasMeanVar.data()));
checkCnnlError(cnnlSetTensorDescriptorEx(paraDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, 1, dimPArray,
stridePArray));
float alpha = 1.f, beta = 0.f;
// This mode is intended for use after convolutional layers
stat = cnnlBatchNormForwardInference(
context->cnnlHandle(), &alpha, &beta, intransDesc, inputTrans,
paraDesc, scale, bias, mean, var, op->getEps(), outtransDesc,
outputTrans);
if (stat != CNNL_STATUS_SUCCESS)
return;
cnnlStatus_t stat = cnnlBatchNormForwardInference(
context->cnnlHandle(), &alpha, &beta, inDesc, input, paraDesc,
scale, bias, mean, var, op->getEps(), inDesc, output);
cnnlTransposeDescriptor_t op2Desc;
checkCnnlError(cnnlCreateTransposeDescriptor(&op2Desc));
checkCnnlError(cnnlSetTransposeDescriptor(op2Desc, 4, permuteOut));
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), intransDesc,
op2Desc, &wsSize);
BangPtr ws2Data = context->getWorkspace(wsSize);
stat = cnnlTranspose_v2(context->cnnlHandle(), op2Desc, outtransDesc,
outputTrans, outDesc, output, ws2Data, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
return;
// Destories in BANG does not require sync. But cnnl does not state
// whether sync is required before destories.
checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(intransDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(outtransDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(paraDesc));
checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
checkCnnlError(cnnlDestroyTransposeDescriptor(op2Desc));
}
};
REGISTER_KERNEL(Device::BANG, OpType::BatchNormalization, BatchNormCnnl,
"BatchNorm_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::BatchNormalization, DataType::Float32,
BatchNormCnnl, "BatchNorm_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -1,35 +0,0 @@
#ifdef INFINI_USE_CNCL
#include "operators/broadcast.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "bang/cncl_communicator.h"
#include <thread>
namespace infini {
class BroadcastCNCL : public BangKernelWithoutConfig {
public:
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<BroadcastObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// TODO: Using default stream 0 for now.
CNCL_CHECK(cnclBroadcast(input, output, count,
cnclDataTypeConvert(op->getDType()),
op->getRoot(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, BroadcastCNCL,
"Broadcast_CNCL_BANG");
} // namespace infini
#endif

View File

@ -199,24 +199,6 @@ class CastCnnl : public BangKernelWithoutConfig {
dim.data()));
NlCastType = CNNL_CAST_UINT32_TO_INT64;
break;
case CastType::Float162Float:
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_HALF, dim.size(),
dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, dim.size(),
dim.data()));
NlCastType = CNNL_CAST_HALF_TO_FLOAT;
break;
case CastType::Float2Float16:
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, dim.size(),
dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_HALF, dim.size(),
dim.data()));
NlCastType = CNNL_CAST_FLOAT_TO_HALF;
break;
default:
IT_TODO_HALT();
}
@ -230,6 +212,7 @@ class CastCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Cast, CastCnnl, "Cast_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl,
"Cast_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -17,13 +17,13 @@ class CeilCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
cnnlStatus_t stat =
cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
@ -35,6 +35,7 @@ class CeilCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Ceil, CeilCnnl, "Ceil_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Ceil, DataType::Float32, CeilCnnl,
"Ceil_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -18,9 +18,9 @@ class ClipCnnl : public BangKernelWithoutConfig {
auto aDim = op->getInputs(0)->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
cnnlStatus_t stat =
cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
if (stat != CNNL_STATUS_SUCCESS)
@ -30,6 +30,7 @@ class ClipCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Clip, ClipCnnl, "Clip_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Clip, DataType::Float32, ClipCnnl,
"Clip_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -14,18 +14,17 @@ class ConcatCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
cnnlTensorDescriptor_t desc;
checkCnnlError(cnnlCreateTensorDescriptor(&desc));
checkCnnlError(cnnlSetTensorDescriptor(
desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
cnnlTensorDescriptor_t descArray[num];
for (int i = 0; i < num; ++i) {
checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
checkCnnlError(
cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
cnnlDataTypeConvert(op->getDType()),
op->getInputs(i)->getDims().size(),
op->getInputs(i)->getDims().data()));
checkCnnlError(cnnlSetTensorDescriptor(
descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
op->getInputs(i)->getDims().size(),
op->getInputs(i)->getDims().data()));
}
void *argv[num];
@ -51,5 +50,6 @@ class ConcatCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Concat, ConcatCnnl, "Concat_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Concat, DataType::Float32, ConcatCnnl,
"Concat_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -20,9 +20,8 @@ class ConvCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -55,24 +54,20 @@ class ConvCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs0));
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs0));
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
inputs0Array));
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs1));
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs1));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
inputs1Array));
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
int permute[4] = {0, 2, 3, 1};
cnnlTransposeDescriptor_t opDesc;
@ -84,7 +79,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
BangPtr aDataOut = context->getWorkspace(
cnnlGetTensorElementNum(aInDesc) * op->getDType().getSize());
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
cnnlStatus_t stat =
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
aDesc, aDataOut, wsData, wsSize);
@ -95,7 +90,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
&wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr bDataOut = context->getWorkspace(
cnnlGetTensorElementNum(bInDesc) * op->getDType().getSize());
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
bDesc, bDataOut, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
@ -104,13 +99,11 @@ class ConvCnnl : public BangKernelWithoutConfig {
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cInDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
outputArray));
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
output));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, output));
cnnlConvolutionForwardAlgo_t algo;
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
@ -122,7 +115,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
algo, &wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr cDataIn = context->getWorkspace(
cnnlGetTensorElementNum(cInDesc) * op->getDType().getSize());
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
stat = cnnlConvolutionForward(
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aDataOut, bDesc,
@ -158,5 +151,6 @@ class ConvCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Conv, ConvCnnl, "Conv_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl,
"Conv_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -20,9 +20,8 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -43,17 +42,14 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimInputs0.data()));
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs0.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimInputs1.data()));
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs1.data()));
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimOutput.data()));
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimOutput.data()));
cnnlConvolutionBwdDataAlgo_t algo;
cnnlGetConvolutionBackwardDataAlgorithm(
@ -80,6 +76,6 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::ConvTranspose, ConvTransCnnl,
"ConvTrans_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::ConvTranspose, DataType::Float32,
ConvTransCnnl, "ConvTrans_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -20,9 +20,8 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -63,16 +62,15 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs0Array));
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(
aDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, inputs0ArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
inputs0ArrayTrans));
size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
dimInputs0[3] * op->getDType().getSize();
dimInputs0[3] * sizeof(float);
BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);
cnnlStatus_t stat =
@ -83,16 +81,15 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs1Array));
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(
bDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, inputs1ArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
inputs1ArrayTrans));
size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
dimInputs1[3] * op->getDType().getSize();
dimInputs1[3] * sizeof(float);
BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);
stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
@ -103,16 +100,15 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
outputArray));
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));
checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(
cDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, outputArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
outputArrayTrans));
size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
dimOutput[3] * op->getDType().getSize();
dimOutput[3] * sizeof(float);
BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);
cnnlConvolutionBwdFilterAlgo_t algo;
@ -158,6 +154,6 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter,
ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter, DataType::Float32,
ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -23,14 +23,14 @@ class DetCnnl : public BangKernelWithoutConfig {
auto dimout = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimin.size(), dimin.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimin.size(),
dimin.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimout.size(), dimout.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimout.size(),
dimout.data()));
cnnlStatus_t stat =
cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);
@ -42,5 +42,6 @@ class DetCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Det, DetCnnl, "Det_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Det, DataType::Float32, DetCnnl,
"Det_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -12,7 +12,6 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
auto [aAlpha, bAlpha, beta] = getAlphBeta();
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -31,33 +30,32 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
cnnlOpTensorDescriptor_t opDesc;
checkCnnlError(cnnlCreateOpTensorDescriptor(&opDesc));
checkCnnlError(cnnlSetOpTensorDescriptor(
opDesc, getOpType(), cnnlDataTypeConvert(op->getDType()),
CNNL_NOT_PROPAGATE_NAN));
opDesc, getOpType(), CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN));
size_t wsSize;
cnnlGetOpTensorWorkspaceSize_v2(context->cnnlHandle(), opDesc, &aAlpha,
aDesc, aData, &bAlpha, bDesc, bData,
&beta, cDesc, cData, &wsSize);
cnnlGetOpTensorWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
auto [aAlpha, bAlpha, beta] = getAlphBeta();
cnnlStatus_t stat = cnnlOpTensor(context->cnnlHandle(), opDesc, &aAlpha,
aDesc, aData, &bAlpha, bDesc, bData,
wsData, wsSize, &beta, cDesc, cData);
@ -95,17 +93,17 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
size_t wsSize;
cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -200,17 +198,17 @@ class DivCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
size_t wsSize;
cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -252,17 +250,17 @@ class MaximumCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
size_t wsSize;
cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
@ -303,17 +301,17 @@ class MinimumCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
size_t wsSize;
cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
@ -354,18 +352,18 @@ class MSELossCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
cnnlStatus_t stat;
if (reduction == MSELossObj::None) {
stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
@ -411,17 +409,17 @@ class PowerCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
size_t wsSize;
cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -463,17 +461,17 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
size_t wsSize;
cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -515,17 +513,17 @@ class FloorModCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
size_t wsSize;
cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -567,17 +565,17 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
size_t wsSize;
cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,
@ -660,48 +658,62 @@ class BitNotCnnl : public BitComputeCnnl {
// CNNL_BLEFT_SHIFT_OP_V2; }
// };
REGISTER_KERNEL(Device::BANG, OpType::Add, AddCnnl, "Add_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Sub, SubCnnl, "Sub_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Mul, MulCnnl, "Mul_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
"Add_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
"Sub_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
"Mul_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Div, DivCnnl, "Div_cnnl");
REGISTER_KERNEL(Device::BANG, OpType::Max, MaximumCnnl, "Maximum_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Min, MinimumCnnl, "Minimum_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::MSELoss, MSELossCnnl,
"MSELoss_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Pow, PowerCnnl, "Power_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, FloorDivCnnl,
"FloorDiv_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::FloorMod, FloorModCnnl,
"FloorMod_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, SquaredDifferenceCnnl,
"SquaredDifference_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Equal, EqualCnnl, "Equal_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Greater, GreaterThanCnnl,
"GreaterThan_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::GreaterOrEqual, GreaterEqualCnnl,
"GreaterEqual_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Less, LessThanCnnl, "LessThan_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::LessOrEqual, LessEqualCnnl,
"LessEqual_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::And, AndCnnl, "And_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Or, OrCnnl, "Or_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Xor, XorCnnl, "Xor_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Not, NotCnnl, "Not_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::BitwiseAnd, BitAndCnnl,
"BitAnd_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::BitwiseOr, BitOrCnnl, "BitOr_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::BitwiseXor, BitXorCnnl,
"BitXor_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::BitwiseNot, BitNotCnnl,
"BitNot_cnnl_BANG");
// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift,
REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, DivCnnl,
"Div_cnnl_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Max, DataType::Float32, MaximumCnnl,
"Maximum_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Min, DataType::Float32, MinimumCnnl,
"Minimum_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::MSELoss, DataType::Float32, MSELossCnnl,
"MSELoss_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32, PowerCnnl,
"Power_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl,
"FloorDiv_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl,
"FloorMod_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, DataType::Float32,
SquaredDifferenceCnnl, "SquaredDifference_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Equal, DataType::Float32, EqualCnnl,
"Equal_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Greater, DataType::Float32,
GreaterThanCnnl, "GreaterThan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::GreaterOrEqual, DataType::Float32,
GreaterEqualCnnl, "GreaterEqual_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Less, DataType::Float32, LessThanCnnl,
"LessThan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::LessOrEqual, DataType::Float32,
LessEqualCnnl, "LessEqual_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::And, DataType::Float32, AndCnnl,
"And_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Or, DataType::Float32, OrCnnl,
"Or_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Xor, DataType::Float32, XorCnnl,
"Xor_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Not, DataType::Float32, NotCnnl,
"Not_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitwiseAnd, DataType::Float32, BitAndCnnl,
"BitAnd_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitwiseOr, DataType::Float32, BitOrCnnl,
"BitOr_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitwiseXor, DataType::Float32, BitXorCnnl,
"BitXor_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitwiseNot, DataType::Float32, BitNotCnnl,
"BitNot_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift, DataType::Float32,
// BitLeftShiftCnnl,
// "BitLeftShift_cnnl_BANG");
// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift,
// "BitLeftShift_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift, DataType::Float32,
// BitRightShiftCnnl,
// "BitRightShift_cnnl_BANG");
// REGISTER_KERNEL(Device::BANG, OpType::Pow,
// "BitRightShift_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
// ElementWiseBang,
// "Pow_Bang");
// "Pow_Bang_Float32");
}; // namespace infini

View File

@ -17,13 +17,13 @@ class ErfCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
cnnlStatus_t stat =
cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
@ -36,6 +36,7 @@ class ErfCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Erf, ErfCnnl, "Erf_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Erf, DataType::Float32, ErfCnnl,
"Erf_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -17,13 +17,13 @@ class ExpCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
cnnlStatus_t stat =
cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
@ -36,6 +36,7 @@ class ExpCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Exp, ExpCnnl, "Exp_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Exp, DataType::Float32, ExpCnnl,
"Exp_cnnl_BANG_Float32");
}; // namespace infini

Some files were not shown because too many files have changed in this diff Show More