forked from jiuyuan/InfiniTensor
Merge branch 'ascend' of github.com:InfiniTensor/InfiniTensor into ascend
This commit is contained in:
commit
a765cd2a3d
|
@ -20,7 +20,6 @@ endif()
|
|||
include(CMakeDependentOption)
|
||||
project(InfiniTensor C CXX)
|
||||
|
||||
|
||||
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
|
||||
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
|
||||
|
||||
|
@ -55,11 +54,13 @@ endif()
|
|||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
|
||||
add_compile_options(-Wno-error=unused-variable)
|
||||
|
||||
find_package(
|
||||
Python
|
||||
COMPONENTS Interpreter Development
|
||||
REQUIRED)
|
||||
|
||||
# OpenMP
|
||||
find_package(OpenMP)
|
||||
if(OpenMP_C_FOUND)
|
||||
|
@ -252,7 +253,6 @@ if(USE_BANG)
|
|||
find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
|
||||
find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
|
||||
find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
|
||||
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
|
||||
|
||||
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
|
||||
|
@ -269,12 +269,13 @@ if(USE_BANG)
|
|||
# BangC Kernels
|
||||
################################################################################
|
||||
|
||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||
if (BUILD_DIST)
|
||||
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
|
||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
|
||||
|
||||
add_compile_definitions(INFINI_USE_CNCL=1)
|
||||
|
||||
else()
|
||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
@ -289,10 +290,11 @@ if(USE_KUNLUN)
|
|||
endif()
|
||||
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
|
||||
|
||||
include_directories("${KUNLUN_HOME}/XTDK/include/")
|
||||
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64")
|
||||
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/XTDK/shlib")
|
||||
include_directories("${KUNLUN_HOME}/include/")
|
||||
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
|
||||
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
|
||||
|
||||
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
|
||||
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
|
||||
|
@ -301,7 +303,16 @@ if(USE_KUNLUN)
|
|||
else()
|
||||
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
|
||||
endif()
|
||||
|
||||
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
|
||||
|
||||
if (BUILD_DIST)
|
||||
message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
|
||||
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
find_package(XCCL REQUIRED)
|
||||
add_compile_definitions(INFINI_USE_XCCL=1)
|
||||
target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
|
||||
endif()
|
||||
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
|
||||
endif()
|
||||
|
||||
|
@ -376,6 +387,7 @@ if(BUILD_TEST)
|
|||
endif()
|
||||
if (USE_KUNLUN)
|
||||
build_test(test/kernels/kunlun/*.cc)
|
||||
build_test(test/kunlun/*.cc)
|
||||
endif()
|
||||
if (USE_ASCEND)
|
||||
build_test(test/kernels/ascend/*.cc)
|
||||
|
@ -394,4 +406,4 @@ if(BUILD_TEST)
|
|||
add_executable(nnet_reader test/nnet/readlog.cc)
|
||||
target_link_libraries(nnet_reader InfiniTensor)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
3
Makefile
3
Makefile
|
@ -8,7 +8,9 @@ ASCEND ?= OFF
|
|||
INTELCPU ?= off
|
||||
BACKTRACE ?= ON
|
||||
TEST ?= ON
|
||||
DIST ?= OFF
|
||||
NNET ?= OFF
|
||||
DIST ?= OFF
|
||||
FORMAT_ORIGIN ?=
|
||||
# Docker build options
|
||||
DOCKER_NAME ?= infinitensor
|
||||
|
@ -31,6 +33,7 @@ CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
|
|||
CMAKE_OPT += -DUSE_ASCEND=$(ASCEND)
|
||||
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
|
||||
CMAKE_OPT += -DBUILD_TEST=$(TEST)
|
||||
CMAKE_OPT += -DBUILD_DIST=$(DIST)
|
||||
CMAKE_OPT += -DBUILD_NNET=$(NNET)
|
||||
|
||||
ifeq ($(INTELCPU), ON)
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
# Find the xccl libraries
|
||||
set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
|
||||
set(XCCL_LIB_DIR $ENV{KUNLUN_HOME} CACHE PATH "Folder contains KUNLUN XCCL libraries")
|
||||
|
||||
list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
|
||||
|
||||
find_path(XCCL_INCLUDE_DIRS # ${XCCL_INCLUDE_DIR}
|
||||
NAMES xpu/bkcl.h
|
||||
HINTS XCCL_INCLUDE_DIR)
|
||||
|
||||
find_library(XCCL_LIBRARIES # ${XCCL_LIB_DIR}
|
||||
NAMES lib64/libbkcl.so
|
||||
HINTS XCCL_LIB_DIR)
|
||||
|
||||
message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
|
||||
message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
|
||||
|
||||
if (XCCL_FOUND)
|
||||
set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
|
||||
message (STATUS "Determing XCCL version from ${XCCL_HEADER_FILE}...")
|
||||
list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
|
||||
message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
|
||||
mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
|
||||
endif()
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
## 目录
|
||||
|
||||
|
||||
- [环境支持](#环境支持)
|
||||
- [神经网络支持](#神经网络支持)
|
||||
- [技术支持](#技术支持)
|
||||
|
@ -19,10 +20,10 @@
|
|||
|
||||
目前已经验证过的神经网络模型有
|
||||
|
||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||
|
||||
## 技术支持
|
||||
|
||||
|
|
|
@ -3,9 +3,10 @@
|
|||
## 目录
|
||||
|
||||
- [使用方法](#使用方法)
|
||||
- [python-前端应用指南](#python-前端应用指南)
|
||||
- [导入-onnx-模型](#导入-onnx-模型)
|
||||
- [导出-onnx-模型](#导出-onnx-模型)
|
||||
- [python 前端应用指南](#python-前端应用指南)
|
||||
- [导入 onnx 模型](#导入-onnx-模型)
|
||||
- [优化](#优化)
|
||||
- [导出 onnx 模型](#导出-onnx-模型)
|
||||
- [执行推理](#执行推理)
|
||||
- [样例代码](#样例代码)
|
||||
- [技术支持](#技术支持)
|
||||
|
@ -13,7 +14,7 @@
|
|||
|
||||
## 使用方法
|
||||
|
||||
项目管理功能已写到 [Makefile](Makefile),支持下列功能:
|
||||
项目管理功能已写到 [Makefile](../Makefile),支持下列功能:
|
||||
|
||||
- 编译项目:`make`/`make build`
|
||||
- 清理生成文件:`make clean`
|
||||
|
@ -39,10 +40,10 @@
|
|||
|
||||
支持的模型:
|
||||
|
||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||
|
||||
```python
|
||||
import onnx
|
||||
|
@ -97,7 +98,7 @@ for name, tensor in stub.inputs.items():
|
|||
print(name, tensor.shape(), tensor)
|
||||
```
|
||||
|
||||
对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出:
|
||||
对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出:
|
||||
|
||||
```plaintext
|
||||
data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
|
||||
|
@ -138,7 +139,7 @@ for name, tensor in stub.outputs.items():
|
|||
|
||||
### 样例代码
|
||||
|
||||
您可以参照[./example/Resnet/resnet.py](./example/ResNet/resnet.py)的样例代码进行了解,并尝试运行。在这个文件中,我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式:
|
||||
您可以参照[resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py)的样例代码进行了解,并尝试运行。在这个文件中,我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式:
|
||||
|
||||
```python
|
||||
python resnet.py -h
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
# 分布式脚本
|
||||
|
||||
#### 1. 运行pytorch模型并生成输入和标准输出,可选择导出onnx
|
||||
|
||||
使用 `--export_onnx` 设置导出onnx的目录,默认为当前路径 `./`,不使用这个flag则只进行计算和生成输入输出。
|
||||
|
||||
```bash
|
||||
python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
|
||||
```
|
||||
|
||||
会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`,目前只支持单一输入输出。
|
||||
|
||||
#### 2. 运行InfiniTensor分布式脚本
|
||||
|
||||
```bash
|
||||
python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
|
||||
```
|
|
@ -10,9 +10,6 @@ import numpy as np
|
|||
from parallel_opt import parallel_model
|
||||
|
||||
|
||||
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||
|
@ -32,6 +29,9 @@ def parse_args():
|
|||
action="store_true",
|
||||
help="whether to generate the standard results.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
|
@ -42,12 +42,13 @@ def parse_args():
|
|||
args.batch_size,
|
||||
args.length,
|
||||
args.gen_std,
|
||||
args.type,
|
||||
)
|
||||
|
||||
|
||||
def run_model(model, runtime, inputs, n=10):
|
||||
stub = OnnxStub(model, runtime)
|
||||
for tensor, input in zip(stub.inputs.values(), inputs):
|
||||
def run_model(model, runtime, inputs, n=10, data_type = "default"):
|
||||
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
|
||||
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
|
||||
tensor.copyin_numpy(input)
|
||||
# stub.tune()
|
||||
stub.run()
|
||||
|
@ -55,7 +56,7 @@ def run_model(model, runtime, inputs, n=10):
|
|||
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||
|
||||
# bench
|
||||
for tensor, input in zip(stub.inputs.values(), inputs):
|
||||
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
|
||||
tensor.copyin_numpy(input)
|
||||
begin = time.time()
|
||||
for _ in range(n):
|
||||
|
@ -66,17 +67,17 @@ def run_model(model, runtime, inputs, n=10):
|
|||
return outputs
|
||||
|
||||
|
||||
def run_and_compare(name, model, runtime):
|
||||
def run_and_compare(name, model, runtime, data_type):
|
||||
input_ids = np.load(f"{name}_inputs.npy")
|
||||
position_ids = np.arange(input_ids.shape[-1])
|
||||
results = np.load(f"{name}_results.npy")
|
||||
outputs = run_model(model, runtime, (input_ids, position_ids))
|
||||
outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
|
||||
print("outputs abs mean:", abs(outputs).mean())
|
||||
np.testing.assert_allclose(outputs, results, rtol=1e-6, atol=1e-3)
|
||||
print("max abs diff:", abs(outputs - results).max())
|
||||
|
||||
|
||||
def start_worker(
|
||||
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
|
||||
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
|
||||
):
|
||||
dist_name = name + "_dist"
|
||||
model = parallel_model(model, world_size, rank)
|
||||
|
@ -89,7 +90,7 @@ def start_worker(
|
|||
save_as_external_data=True,
|
||||
location=extern_path,
|
||||
)
|
||||
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||
runtime = backend.CudaRuntime(local_rank)
|
||||
# print("init comm")
|
||||
runtime.init_comm(
|
||||
|
@ -97,12 +98,12 @@ def start_worker(
|
|||
world_size,
|
||||
rank,
|
||||
)
|
||||
run_and_compare(name, model, runtime)
|
||||
run_and_compare(name, model, runtime, data_type)
|
||||
|
||||
|
||||
def start_single(name, model):
|
||||
def start_single(name, model, data_type):
|
||||
runtime = backend.CudaRuntime(0)
|
||||
run_and_compare(name, model, runtime)
|
||||
run_and_compare(name, model, runtime, data_type)
|
||||
|
||||
|
||||
def gen_standard(name, model, voc_size, bs, len):
|
||||
|
@ -117,8 +118,10 @@ def gen_standard(name, model, voc_size, bs, len):
|
|||
|
||||
|
||||
def main():
|
||||
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
|
||||
|
||||
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
|
||||
data_type = "default" if data_type == "fp32" else data_type
|
||||
if data_type != "tf32":
|
||||
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||
model = onnx.load(model_path)
|
||||
|
||||
# generate standart output
|
||||
|
@ -132,7 +135,7 @@ def main():
|
|||
# run single process.
|
||||
# use standalone process to isolate cuda.
|
||||
print("run model by single GPU.")
|
||||
p = mp.Process(target=start_single, args=(name, model))
|
||||
p = mp.Process(target=start_single, args=(name, model, data_type))
|
||||
p.start()
|
||||
p.join()
|
||||
|
||||
|
@ -142,7 +145,7 @@ def main():
|
|||
workers = [
|
||||
mp.Process(
|
||||
target=start_worker,
|
||||
args=(name, world_size, rank, rank % nproc_per_node, model),
|
||||
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
|
||||
)
|
||||
for rank in range(world_size)
|
||||
]
|
|
@ -0,0 +1,213 @@
|
|||
import argparse
|
||||
import os
|
||||
import time
|
||||
import multiprocessing as mp
|
||||
from pyinfinitensor.onnx import OnnxStub, backend
|
||||
import onnx
|
||||
from onnx.external_data_helper import convert_model_to_external_data
|
||||
from onnx.shape_inference import infer_shapes_path
|
||||
import numpy as np
|
||||
from parallel_opt import parallel_model
|
||||
|
||||
st_input_dir = "standard/inputs/"
|
||||
st_output_dir = "standard/outputs/"
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||
parser.add_argument(
|
||||
"--nproc_per_node", type=int, default=2, help="number of processes per node"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--name", type=str, default="test", help="name of this instance."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model", type=str, default="/data1/shared/panzezhong/llama/fp32/my_llama_fp32.sim.onnx", help="path to the ONNX model file."
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||
parser.add_argument(
|
||||
"--gen_std",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="whether to generate the standard results.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--run_single",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="whether run model with single process with standard inputs"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
args.num_nodes,
|
||||
args.nproc_per_node,
|
||||
args.name,
|
||||
args.model,
|
||||
args.batch_size,
|
||||
args.length,
|
||||
args.gen_std,
|
||||
args.run_single
|
||||
)
|
||||
|
||||
|
||||
def run_model(model, runtime, world_size=1, rank=0, n=10):
|
||||
stub = OnnxStub(model, runtime)
|
||||
load_inputs(stub, world_size, rank)
|
||||
# stub.tune()
|
||||
stub.run()
|
||||
# get outputs
|
||||
time.sleep(0.01)
|
||||
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||
|
||||
# bench
|
||||
begin = time.time()
|
||||
for _ in range(n):
|
||||
stub.run()
|
||||
end = time.time()
|
||||
avg_time = (end - begin) / n
|
||||
print(f"average time: {avg_time}")
|
||||
return outputs
|
||||
|
||||
|
||||
|
||||
def run_and_compare(name, model, runtime, world_size=1, rank = 0):
|
||||
results = np.load(os.path.join(st_output_dir,f"output.npy"))
|
||||
outputs = run_model(model, runtime, world_size, rank)
|
||||
print(outputs[:100])
|
||||
if np.isnan(outputs).any():
|
||||
print("Nan in output")
|
||||
print("answer argmax:", np.argmax(results))
|
||||
print("output argmax:", np.argmax(outputs))
|
||||
#np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
|
||||
getDiff(results, outputs)
|
||||
|
||||
|
||||
def start_worker(
|
||||
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
|
||||
):
|
||||
dist_name = name + "_dist"
|
||||
model = parallel_model(model, world_size, rank)
|
||||
extern_path = f"./{dist_name}_rank{rank}.pb"
|
||||
if os.path.exists(extern_path):
|
||||
os.remove(extern_path)
|
||||
onnx.save_model(
|
||||
model,
|
||||
f"./{dist_name}_rank{rank}.onnx",
|
||||
save_as_external_data=True,
|
||||
location=extern_path,
|
||||
)
|
||||
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||
runtime = backend.KUNLUNRuntime(local_rank)
|
||||
# print("init comm")
|
||||
runtime.init_comm(
|
||||
dist_name,
|
||||
world_size,
|
||||
rank,
|
||||
)
|
||||
run_and_compare(name, model, runtime, world_size, rank)
|
||||
|
||||
|
||||
def start_single(name, model):
|
||||
runtime = backend.KUNLUNRuntime(0)
|
||||
run_and_compare(name, model, runtime)
|
||||
|
||||
|
||||
def generate_input_output(model):
|
||||
runtime = backend.KUNLUNRuntime(0)
|
||||
stub = OnnxStub(model, runtime)
|
||||
position_id = 0
|
||||
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||
input = tensor.copyout_numpy()
|
||||
if np.issubdtype(input.dtype, np.integer):
|
||||
if input.size == 1:
|
||||
# input = np.array([position_id])
|
||||
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||
else:
|
||||
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||
elif input.dtype == np.bool_:
|
||||
input = np.random.randint(0,2,size=input.shape) > 0
|
||||
else:
|
||||
if i == 0:
|
||||
input = np.ones(input.shape).astype(input.dtype)
|
||||
position_id = input.shape[-1] - 1
|
||||
else:
|
||||
input = np.random.rand(*input.shape).astype(input.dtype)
|
||||
tensor.copyin_numpy(input)
|
||||
np.save(os.path.join(st_input_dir, f"input_{i}"), input)
|
||||
stub.run()
|
||||
# print(stub.outputs)
|
||||
time.sleep(0.01)
|
||||
output = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||
print(output[:100])
|
||||
if np.isnan(output).any():
|
||||
print("Nan in output")
|
||||
np.save(os.path.join(st_output_dir, f"output"), output)
|
||||
|
||||
|
||||
def load_inputs(stub, world_size=1, rank=0):
|
||||
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||
input = np.load(os.path.join(st_input_dir, f"input_{i}.npy"))
|
||||
if all(x == y for x,y in zip(input.shape,tensor.shape())):
|
||||
tensor.copyin_numpy(input)
|
||||
else:
|
||||
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
|
||||
|
||||
|
||||
def getDiff(base, test):
|
||||
absolute_diff = np.abs(np.subtract(base, test))
|
||||
max_absolute_diff = np.max(absolute_diff)
|
||||
|
||||
baseCopy = base.astype(np.float64).ravel()
|
||||
testCopy = test.astype(np.float64).ravel()
|
||||
upValue = np.sum(np.abs(baseCopy - testCopy))
|
||||
downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
|
||||
max_relative_diff = upValue / downValue
|
||||
print(f"Max absolute difference: {max_absolute_diff}\nMax relative difference: {max_relative_diff}")
|
||||
|
||||
return max_absolute_diff, max_relative_diff
|
||||
|
||||
|
||||
def main():
|
||||
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, run_single = parse_args()
|
||||
|
||||
model = onnx.load(model_path)
|
||||
|
||||
# generate standart output
|
||||
if gen_std:
|
||||
print("Generate inputs and outputs.")
|
||||
p = mp.Process(target=generate_input_output, args=[model])
|
||||
p.start()
|
||||
p.join()
|
||||
return
|
||||
|
||||
# # run single process.
|
||||
# # use standalone process to isolate cuda.
|
||||
if run_single:
|
||||
print("run model by single GPU.")
|
||||
p = mp.Process(target=start_single, args=(name, model))
|
||||
p.start()
|
||||
p.join()
|
||||
return
|
||||
|
||||
# run distributed parallel.
|
||||
world_size = nnodes * nproc_per_node
|
||||
print(f"run model by {world_size} GPU in parallel.")
|
||||
workers = [
|
||||
mp.Process(
|
||||
target=start_worker,
|
||||
args=(name, world_size, rank, rank % nproc_per_node, model),
|
||||
)
|
||||
for rank in range(world_size)
|
||||
]
|
||||
|
||||
for w in workers:
|
||||
w.start()
|
||||
|
||||
for w in workers:
|
||||
w.join()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -244,5 +244,5 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
|
|||
if tt.HasField("shape"):
|
||||
tt.ClearField("shape")
|
||||
model = helper.make_model(graph)
|
||||
model = onnx.shape_inference.infer_shapes(model)
|
||||
#model = onnx.shape_inference.infer_shapes(model)
|
||||
return model
|
||||
|
|
|
@ -0,0 +1,188 @@
|
|||
import argparse
|
||||
import torch
|
||||
from transformers import BertModel, BertConfig
|
||||
from transformers import GPT2Model, GPT2Config
|
||||
from transformers import OPTModel, OPTConfig
|
||||
import time
|
||||
import numpy as np
|
||||
import onnx
|
||||
import os
|
||||
from onnx.external_data_helper import convert_model_to_external_data
|
||||
from onnxsim import simplify
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
|
||||
parser.add_argument(
|
||||
"--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||
parser.add_argument(
|
||||
"--export_onnx",
|
||||
type=str,
|
||||
nargs="?",
|
||||
default=None,
|
||||
const="./",
|
||||
help="whether and where to export onnx file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print("arg setting: ", args)
|
||||
return (
|
||||
args.model,
|
||||
args.batch_size,
|
||||
args.length,
|
||||
args.export_onnx,
|
||||
args.type,
|
||||
)
|
||||
|
||||
|
||||
def get_model(modelname):
|
||||
match modelname:
|
||||
case "bert":
|
||||
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
|
||||
voc_size = BertConfig().vocab_size
|
||||
case "gpt2":
|
||||
model = GPT2Model.from_pretrained("gpt2")
|
||||
voc_size = GPT2Config().vocab_size
|
||||
case "opt":
|
||||
model = model = OPTModel.from_pretrained("./opt-125m")
|
||||
voc_size = OPTConfig().vocab_size
|
||||
case _:
|
||||
raise KeyError(modelname)
|
||||
|
||||
model = model.eval()
|
||||
return model, voc_size
|
||||
|
||||
def run_pytorch(torch_model, voc_size, batchsize, len):
|
||||
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
|
||||
np.save("test_inputs", data)
|
||||
inputs = torch.from_numpy(data).to("cuda")
|
||||
torch_model = torch_model.to("cuda")
|
||||
|
||||
n_iter = 20
|
||||
with torch.no_grad():
|
||||
for _ in range(10):
|
||||
outputs = torch_model(inputs)
|
||||
torch.cuda.synchronize()
|
||||
begin = time.time()
|
||||
with torch.no_grad():
|
||||
for _ in range(n_iter):
|
||||
torch.cuda.synchronize()
|
||||
outputs = torch_model(inputs)
|
||||
#
|
||||
torch.cuda.synchronize()
|
||||
torch.cuda.synchronize()
|
||||
end = time.time()
|
||||
|
||||
avg_time = (end - begin) / n_iter
|
||||
outputs = outputs.last_hidden_state.to("cpu")
|
||||
print("outputs abs mean:", abs(np.array(outputs)).mean())
|
||||
print(f"average time: {avg_time}")
|
||||
torch.cuda.memory.empty_cache()
|
||||
np.save("test_results", np.array(outputs, dtype=np.float32))
|
||||
print("Save input & output as test_inputs.npy and test_results.npy")
|
||||
|
||||
|
||||
def export_onnx(model, data, path, extern=False):
|
||||
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
|
||||
onnx_model = onnx.load(path)
|
||||
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
|
||||
#onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
|
||||
assert check
|
||||
add_value_info_for_constants(onnx_model)
|
||||
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
|
||||
if extern:
|
||||
extern_path = path.replace('.onnx', '.pb')
|
||||
if os.path.exists(extern_path):
|
||||
os.remove(extern_path)
|
||||
convert_model_to_external_data(
|
||||
onnx_model,
|
||||
all_tensors_to_one_file=True,
|
||||
location=extern_path,
|
||||
size_threshold=1024,
|
||||
convert_attribute=False,
|
||||
)
|
||||
onnx.save(onnx_model, path)
|
||||
|
||||
def add_value_info_for_constants(model : onnx.ModelProto):
|
||||
"""
|
||||
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||
that info explicitly as ValueInfoProtos.
|
||||
Mutates the model.
|
||||
Args:
|
||||
model: The ModelProto to update.
|
||||
"""
|
||||
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||
if model.ir_version < 4:
|
||||
return
|
||||
|
||||
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
|
||||
inputs = {i.name for i in graph.input}
|
||||
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||
for init in graph.initializer:
|
||||
# Check it really is a constant, not an input
|
||||
if init.name in inputs:
|
||||
continue
|
||||
|
||||
# The details we want to add
|
||||
elem_type = init.data_type
|
||||
shape = init.dims
|
||||
|
||||
# Get existing or create new value info for this constant
|
||||
vi = existing_info.get(init.name)
|
||||
if vi is None:
|
||||
vi = graph.value_info.add()
|
||||
vi.name = init.name
|
||||
|
||||
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||
tt = vi.type.tensor_type
|
||||
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||
tt.elem_type = elem_type
|
||||
if not tt.HasField("shape"):
|
||||
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||
tt.shape.dim.extend([])
|
||||
for dim in shape:
|
||||
tt.shape.dim.add().dim_value = dim
|
||||
|
||||
# Handle subgraphs
|
||||
for node in graph.node:
|
||||
for attr in node.attribute:
|
||||
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||
if attr.ref_attr_name != "":
|
||||
continue
|
||||
|
||||
if attr.type == onnx.AttributeProto.GRAPH:
|
||||
add_const_value_infos_to_graph(attr.g)
|
||||
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||
for g in attr.graphs:
|
||||
add_const_value_infos_to_graph(g)
|
||||
|
||||
|
||||
return add_const_value_infos_to_graph(model.graph)
|
||||
|
||||
|
||||
def main():
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
modelname, batchsize, seqlen, export_path, data_type = parse_args()
|
||||
if data_type == "tf32":
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
else:
|
||||
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||
|
||||
model, voc_size = get_model(modelname)
|
||||
if export_path is not None:
|
||||
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
|
||||
path = os.path.join(export_path, filename)
|
||||
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
|
||||
export_onnx(model, param, path, True)
|
||||
|
||||
if data_type == "fp16":
|
||||
model = model.half()
|
||||
run_pytorch(model, voc_size, batchsize, seqlen)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -0,0 +1,145 @@
|
|||
import os
|
||||
from pyinfinitensor.onnx import OnnxStub, backend
|
||||
import numpy as np
|
||||
import onnx
|
||||
import torch
|
||||
from transformers import LlamaModel, LlamaForCausalLM
|
||||
from tqdm import tqdm
|
||||
import onnx_graphsurgeon as gs
|
||||
from onnxsim import simplify
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='')
|
||||
parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
|
||||
parser.add_argument('--layer', dest='n_layers', type=int, default=2)
|
||||
parser.add_argument('--iter', dest='n_iter', type=int, default=1)
|
||||
parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
|
||||
parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
|
||||
default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
|
||||
parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
|
||||
default="/data1/shared/llama")
|
||||
args = parser.parse_args()
|
||||
|
||||
ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
|
||||
ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
|
||||
|
||||
def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
|
||||
param = torch.zeros(
|
||||
(args.batchsize, 1024), dtype=torch.long)
|
||||
logits = model(param, past_key_values=None)
|
||||
param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
|
||||
|
||||
torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
|
||||
"position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
|
||||
do_constant_folding=True,)
|
||||
onnx_model = onnx.load(ONNX_MODEL_PATH)
|
||||
print("simplifing onnx model")
|
||||
onnx_model, check = simplify(onnx_model, skipped_optimizers=[
|
||||
'eliminate_duplicate_initializer'])
|
||||
assert check
|
||||
|
||||
onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
|
||||
print("simlifing finished.")
|
||||
|
||||
|
||||
@gs.Graph.register()
|
||||
def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
|
||||
for inp in inputs:
|
||||
inp.outputs.clear()
|
||||
for out in outputs:
|
||||
out.inputs.clear()
|
||||
for inp in inputs_added:
|
||||
inputs.append(inp)
|
||||
for out in outputs_removed:
|
||||
out.inputs.clear()
|
||||
return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
|
||||
|
||||
|
||||
def replace_onnx_with_attention_op():
|
||||
graph = gs.import_onnx(
|
||||
onnx.load(ONNX_MODEL_PATH))
|
||||
tmap = graph.tensors()
|
||||
for i in range(args.n_layers):
|
||||
inputs = [
|
||||
tmap["onnx::Concat_" + str((i+1)*2)],
|
||||
tmap["onnx::Concat_" + str((i+1)*2+1)],
|
||||
tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
|
||||
tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
|
||||
tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
|
||||
outputs = [
|
||||
tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
|
||||
|
||||
inputs_added = [graph.inputs[1]]
|
||||
outputs_removed = []
|
||||
|
||||
graph.replace_with_attention(
|
||||
inputs, outputs, inputs_added, outputs_removed)
|
||||
|
||||
graph.outputs = [tmap[graph.outputs[0].name]]
|
||||
graph.cleanup(True).toposort()
|
||||
onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
kvcache_torch = None
|
||||
torch_model = LlamaForCausalLM.from_pretrained(
|
||||
args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
|
||||
|
||||
n_heads = torch_model.config.num_attention_heads
|
||||
n_dims = torch_model.config.hidden_size // n_heads
|
||||
|
||||
if not os.path.exists(ONNX_MODEL_PATH):
|
||||
print("exporting onnx graph")
|
||||
export_onnx(torch_model, ONNX_MODEL_PATH)
|
||||
replace_onnx_with_attention_op()
|
||||
else:
|
||||
print("will use exsiting onnx graph")
|
||||
|
||||
onnx_model = onnx.load(ONNX_MODEL_PATH)
|
||||
stub = OnnxStub(onnx_model, backend.cuda_runtime())
|
||||
|
||||
count_wrong = 0
|
||||
for i in tqdm(range(0, args.n_max_length)):
|
||||
query = np.random.randint(
|
||||
torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
|
||||
position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
|
||||
|
||||
####################################
|
||||
# pytorch
|
||||
####################################
|
||||
outputs_torch = torch_model(
|
||||
torch.tensor(query), past_key_values=kvcache_torch)
|
||||
logit_torch = outputs_torch['logits']
|
||||
kvcache_torch = outputs_torch['past_key_values']
|
||||
|
||||
####################################
|
||||
# infinitensor
|
||||
####################################
|
||||
# copyin input
|
||||
(list(stub.inputs.items()))[0][1].copyin_int64(
|
||||
query.reshape(-1).tolist())
|
||||
(list(stub.inputs.items()))[1][1].copyin_int64(
|
||||
position_id.reshape(-1).tolist())
|
||||
|
||||
stub.run()
|
||||
|
||||
####################################
|
||||
# validation
|
||||
####################################
|
||||
# copyout output
|
||||
logits_it = np.array((list(stub.outputs.items()))
|
||||
[0][1].copyout_float())
|
||||
|
||||
try:
|
||||
np.testing.assert_allclose(
|
||||
logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
|
||||
except Exception as e:
|
||||
try:
|
||||
np.testing.assert_allclose(
|
||||
np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
|
||||
except:
|
||||
count_wrong = count_wrong + 1
|
||||
|
||||
result = "{}/{} failed.".format(count_wrong, args.n_max_length)
|
||||
print(result)
|
||||
del stub
|
|
@ -2,6 +2,10 @@
|
|||
#include "cnnl.h"
|
||||
#include "cnrt.h"
|
||||
#include "core/common.h"
|
||||
#include "core/data_type.h"
|
||||
#ifdef INFINI_USE_CNCL
|
||||
#include "cncl.h"
|
||||
#endif
|
||||
|
||||
#define checkBangError(call) \
|
||||
{ \
|
||||
|
@ -27,4 +31,70 @@ namespace infini {
|
|||
|
||||
using BangPtr = void *;
|
||||
|
||||
inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
|
||||
if (dataType == DataType::Float32) {
|
||||
return CNNL_DTYPE_FLOAT;
|
||||
}
|
||||
if (dataType == DataType::Float16) {
|
||||
return CNNL_DTYPE_HALF;
|
||||
}
|
||||
if (dataType == DataType::Double) {
|
||||
return CNNL_DTYPE_DOUBLE;
|
||||
}
|
||||
if (dataType == DataType::Int8) {
|
||||
return CNNL_DTYPE_INT8;
|
||||
}
|
||||
if (dataType == DataType::Int32) {
|
||||
return CNNL_DTYPE_INT32;
|
||||
}
|
||||
if (dataType == DataType::UInt8) {
|
||||
return CNNL_DTYPE_UINT8;
|
||||
}
|
||||
if (dataType == DataType::BFloat16) {
|
||||
return CNNL_DTYPE_BFLOAT16;
|
||||
}
|
||||
if (dataType == DataType::Int64) {
|
||||
return CNNL_DTYPE_INT64;
|
||||
}
|
||||
if (dataType == DataType::Bool) {
|
||||
return CNNL_DTYPE_BOOL;
|
||||
}
|
||||
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
|
||||
" not supported in CNNL.");
|
||||
}
|
||||
|
||||
#ifdef INFINI_USE_CNCL
|
||||
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
|
||||
if (dataType == DataType::Float32) {
|
||||
return cnclFloat32;
|
||||
}
|
||||
if (dataType == DataType::Float16) {
|
||||
return cnclHalf;
|
||||
}
|
||||
if (dataType == DataType::Int8) {
|
||||
return cnclInt8;
|
||||
}
|
||||
if (dataType == DataType::Int16) {
|
||||
return cnclInt16;
|
||||
}
|
||||
if (dataType == DataType::Int32) {
|
||||
return cnclInt32;
|
||||
}
|
||||
if (dataType == DataType::UInt8) {
|
||||
return cnclUint8;
|
||||
}
|
||||
if (dataType == DataType::UInt16) {
|
||||
return cnclUint16;
|
||||
}
|
||||
if (dataType == DataType::UInt32) {
|
||||
return cnclUint32;
|
||||
}
|
||||
if (dataType == DataType::BFloat16) {
|
||||
return cnclBfloat16;
|
||||
}
|
||||
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
|
||||
" not supported in CNCL.");
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -61,16 +61,30 @@ template <typename T> auto enum_to_underlying(T e) {
|
|||
}
|
||||
|
||||
template <typename T> std::string vecToString(const std::vector<T> &vec) {
|
||||
std::string ret;
|
||||
ret.append("[");
|
||||
for (auto d : vec) {
|
||||
ret.append(std::to_string(d));
|
||||
ret.append(",");
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (size_t i = 0; i < vec.size(); ++i) {
|
||||
ss << vec.at(i);
|
||||
if (i < vec.size() - 1) {
|
||||
ss << ",";
|
||||
}
|
||||
}
|
||||
if (!vec.empty())
|
||||
ret.pop_back();
|
||||
ret.append("]");
|
||||
return ret;
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
template <typename T> std::string vecToString(const T *st, size_t length) {
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
size_t i = 0;
|
||||
for (i = 0; i < length; i++) {
|
||||
ss << *(st + i);
|
||||
if (i < length - 1) {
|
||||
ss << ",";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
double timeit(
|
||||
|
|
|
@ -5,6 +5,10 @@
|
|||
#include <cstdint>
|
||||
#include <iostream>
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#endif
|
||||
|
||||
namespace infini {
|
||||
|
||||
class GraphHandlerObj {
|
||||
|
@ -26,12 +30,14 @@ class GraphHandlerObj {
|
|||
int pw, int sh, int sw, int dh, int dw, int oph,
|
||||
int opw);
|
||||
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
|
||||
Tensor bias, ActType act);
|
||||
Tensor bias, ActType act,
|
||||
std::string matmul_compute_type = "default");
|
||||
Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
|
||||
Tensor var, Tensor scale, Tensor bias,
|
||||
float momentum, float eps, bool training);
|
||||
Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
|
||||
Tensor bias, float eps, int axis, int stash_type);
|
||||
Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
|
||||
|
||||
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
|
||||
int ph, int pw, int sh, int sw, int ceilMode);
|
||||
|
@ -47,6 +53,7 @@ class GraphHandlerObj {
|
|||
Tensor max(Tensor a, Tensor b, Tensor c);
|
||||
|
||||
Tensor relu(Tensor x, Tensor y);
|
||||
Tensor silu(Tensor x, Tensor y);
|
||||
Tensor gelu(Tensor x, Tensor y);
|
||||
Tensor sigmoid(Tensor x, Tensor y);
|
||||
Tensor hardSigmoid(Tensor x, Tensor y);
|
||||
|
@ -77,6 +84,7 @@ class GraphHandlerObj {
|
|||
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
|
||||
Tensor input_q, Tensor input_k, Tensor input_v,
|
||||
Tensor position_id, Tensor output_matmul);
|
||||
Tensor RoPE(Tensor pos, Tensor input, Tensor output);
|
||||
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
|
||||
std::variant<int, vector<int>> numOrRatio);
|
||||
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
|
||||
|
@ -135,6 +143,12 @@ class GraphHandlerObj {
|
|||
inline void run() { g->getRuntime()->run(g); }
|
||||
|
||||
inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
|
||||
|
||||
#ifdef USE_CUDA
|
||||
inline void run_with_cudagraph() {
|
||||
(as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
#include "utils/operator_utils.h"
|
||||
#include <functional>
|
||||
#include <nlohmann/json.hpp>
|
||||
using json = nlohmann::json;
|
||||
namespace infini {
|
||||
using json = nlohmann::json;
|
||||
|
||||
class RuntimeObj; // Forward declaration for Kernel::compute
|
||||
|
||||
|
|
|
@ -151,11 +151,14 @@ struct OpType {
|
|||
ReduceSum, // Reduce
|
||||
ReduceSumSquare, // Reduce
|
||||
Relu, // Unary
|
||||
Silu, // Unary
|
||||
Reshape,
|
||||
Resize,
|
||||
ReverseSequence,
|
||||
RoiAlign,
|
||||
Round, // Unary
|
||||
RoPE, // Fusion
|
||||
Round, // Unary
|
||||
RMSNorm, // Fusion
|
||||
STFT,
|
||||
Scan,
|
||||
Scatter,
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include <nlohmann/json_fwd.hpp>
|
||||
using json = nlohmann::json;
|
||||
namespace infini {
|
||||
using json = nlohmann::json;
|
||||
|
||||
class PerfEngine {
|
||||
public:
|
||||
|
|
|
@ -15,6 +15,7 @@ class GraphObj;
|
|||
class GraphHandlerObj;
|
||||
class RuntimeObj;
|
||||
class BlobObj;
|
||||
template <typename T> class WorkspaceObj;
|
||||
|
||||
using TensorBase = Ref<TensorBaseObj>;
|
||||
using Tensor = Ref<TensorObj>;
|
||||
|
@ -23,6 +24,7 @@ using Graph = Ref<GraphObj>;
|
|||
using GraphHandler = Ref<GraphHandlerObj>;
|
||||
using Runtime = Ref<RuntimeObj>;
|
||||
using Blob = Ref<BlobObj>;
|
||||
template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
|
||||
|
||||
using TensorVec = vector<Tensor>;
|
||||
using OpVec = vector<Operator>;
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#include "utils/data_convert.h"
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
|
||||
#if USE_CUDA
|
||||
#include "cuda/cuda_runtime.h"
|
||||
|
@ -143,6 +144,7 @@ class TensorObj : public TensorBaseObj {
|
|||
}
|
||||
|
||||
void printData() const;
|
||||
void dumpData(std::ofstream &ofs) const;
|
||||
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
|
||||
|
||||
template <typename T> bool equalData(const vector<T> &dataVector) {
|
||||
|
@ -198,13 +200,20 @@ class TensorObj : public TensorBaseObj {
|
|||
if (a[i] != b[i])
|
||||
return false;
|
||||
} else if constexpr (std::is_floating_point_v<T>) {
|
||||
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
|
||||
relativeError) {
|
||||
if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
|
||||
fabs(a[i] - b[i]) > relativeError) {
|
||||
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
|
||||
return false;
|
||||
} else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
|
||||
fabs(a[i] - b[i]) /
|
||||
std::max(fabs(a[i]), fabs(b[i])) >
|
||||
relativeError) {
|
||||
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
|
||||
return false;
|
||||
}
|
||||
} else
|
||||
} else {
|
||||
static_assert(!sizeof(T), "Unsupported data type");
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -239,8 +248,8 @@ class TensorObj : public TensorBaseObj {
|
|||
// // std::cerr << "Init beginned " << std::endl;
|
||||
// #pragma omp parallel for
|
||||
// for (size_t i = 0; i < iEnd; ++i)
|
||||
// data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
|
||||
// 10000;
|
||||
// data[i] = fastrand(random_seed[omp_get_thread_num() *
|
||||
// 16]) % 10000;
|
||||
// // std::cerr << "Init finished" << std::endl;
|
||||
// computed = ComputedFull;
|
||||
// return true;
|
||||
|
@ -285,8 +294,8 @@ class TensorObj : public TensorBaseObj {
|
|||
// auto nDim = dims.size();
|
||||
// auto nBroadcastDim = ds.size() - nDim;
|
||||
// for (size_t i = 0; i < nDim; ++i)
|
||||
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
|
||||
// dims[i])
|
||||
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
|
||||
// i] >= dims[i])
|
||||
// return (size_t)-1;
|
||||
// size_t idx = 0;
|
||||
// for (size_t i = 0; i < nDim; ++i)
|
||||
|
@ -345,12 +354,14 @@ class TensorObj : public TensorBaseObj {
|
|||
// return (g_seed >> 16) & 0x7FFF;
|
||||
// }
|
||||
|
||||
// std::vector<std::vector<int>> const *getSplittingPoints() const {
|
||||
// std::vector<std::vector<int>> const *getSplittingPoints()
|
||||
// const {
|
||||
// assert(!splittingPoints.empty());
|
||||
// return &splittingPoints;
|
||||
// }
|
||||
|
||||
// bool setSplittingPoints(std::vector<std::vector<int>> value) {
|
||||
// bool setSplittingPoints(std::vector<std::vector<int>> value)
|
||||
// {
|
||||
// assert(!value.empty());
|
||||
// splittingPoints = value;
|
||||
// return true;
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
#pragma once
|
||||
#include "core/runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T> class WorkspaceObj {
|
||||
private:
|
||||
T workspace; // workspace pointer
|
||||
size_t workspaceSize; // Size of workspace
|
||||
size_t workspaceAlloc; // currently use workspace size
|
||||
|
||||
public:
|
||||
WorkspaceObj(T workspace_, size_t workspaceSize_)
|
||||
: workspace(workspace_), workspaceSize(workspaceSize_) {
|
||||
workspaceAlloc = 0;
|
||||
}
|
||||
virtual ~WorkspaceObj() {
|
||||
// Dealloc workspace in RuntimeObj
|
||||
// Set workspace = nullptr here
|
||||
workspace = nullptr;
|
||||
}
|
||||
size_t getWorkspaceSize() const { return workspaceSize; }
|
||||
|
||||
T getWorkspace(size_t size) {
|
||||
// Get unused workspace
|
||||
IT_ASSERT(size + workspaceAlloc <= workspaceSize);
|
||||
auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
|
||||
workspaceAlloc += size;
|
||||
return ret;
|
||||
}
|
||||
T getWorkspace() {
|
||||
// Override getWorkspace in order to dealloc in runtime
|
||||
return workspace;
|
||||
}
|
||||
void resetWorkspace() {
|
||||
// Reset workspaceAlloc every time end kernel
|
||||
workspaceAlloc = 0;
|
||||
}
|
||||
size_t getWorkspaceAlloc() const { return workspaceAlloc; }
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -1,4 +1,5 @@
|
|||
#pragma once
|
||||
#include "core/common.h"
|
||||
#include <cstdio>
|
||||
|
||||
struct AttentionKVCacheMetadata {
|
||||
|
@ -10,6 +11,7 @@ namespace infini {
|
|||
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
|
||||
float *input_q, float *input_k, float *input_v,
|
||||
int *position_id, float *output_matmul,
|
||||
const AttentionKVCacheMetadata &compMeta);
|
||||
const AttentionKVCacheMetadata &compMeta,
|
||||
float *output_O_temp, float *output_sum_temp);
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#include <cuda_profiler_api.h>
|
||||
#include <cudnn.h>
|
||||
#include <curand.h>
|
||||
#include <memory>
|
||||
|
||||
#define checkCudaError(call) \
|
||||
if (auto err = call; err != cudaSuccess) \
|
||||
|
@ -111,4 +112,20 @@ inline const char *curandGetErrorString(curandStatus_t error) {
|
|||
|
||||
using CudaPtr = void *;
|
||||
|
||||
class CUDAStream {
|
||||
public:
|
||||
CUDAStream(const CUDAStream &) = delete;
|
||||
CUDAStream(CUDAStream &&) = delete;
|
||||
void operator=(const CUDAStream &) = delete;
|
||||
void operator=(CUDAStream &&) = delete;
|
||||
static cudaStream_t getCurrentStream() { return _stream; }
|
||||
static void Init() { CUDAStream::_stream = 0; };
|
||||
static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
|
||||
static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
|
||||
|
||||
private:
|
||||
CUDAStream(){};
|
||||
static cudaStream_t _stream;
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -13,4 +13,8 @@ void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
|||
void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||
int c2, int c3);
|
||||
|
||||
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
|
||||
|
||||
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
|
||||
}; // namespace infini
|
||||
|
|
|
@ -7,4 +7,6 @@ void expandKernel(int dType, void *input, void *output, int nDims,
|
|||
int outputsize, SmallArray inputShape,
|
||||
SmallArray outputShape);
|
||||
|
||||
void expandRowKernel(int dType, void *input, void *output, int n_rows,
|
||||
int row_len);
|
||||
}; // namespace infini
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
#pragma once
|
||||
|
||||
#include "operators/rms_norm.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
|
||||
int num_tokens, int hidden_size);
|
||||
|
||||
}; // namespace infini
|
|
@ -0,0 +1,12 @@
|
|||
#pragma once
|
||||
|
||||
#include "operators/rope.h"
|
||||
#include "utils/small_array.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void rope_kernel(int dType, int *pos, void *input, void *output, int size,
|
||||
int dim_model, int dim_head, int hidden_stride,
|
||||
int pos_stride);
|
||||
|
||||
}; // namespace infini
|
|
@ -14,6 +14,9 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
std::unique_ptr<CommunicatorObj> comm;
|
||||
CudaPtr workspace;
|
||||
size_t workspaceSize;
|
||||
bool isCudaGraphCreated;
|
||||
cudaGraph_t cudaGraph;
|
||||
cudaGraphExec_t cudaGraphInstance;
|
||||
|
||||
public:
|
||||
explicit CudaRuntimeObj(int deviceId = 0)
|
||||
|
@ -26,9 +29,16 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
// size_t longformerNum = 3lu * (1 << 30);
|
||||
workspaceSize = 7ll << 30; // 7 GB
|
||||
workspace = alloc(workspaceSize);
|
||||
isCudaGraphCreated = false;
|
||||
CUDAStream::Init();
|
||||
}
|
||||
virtual ~CudaRuntimeObj() {
|
||||
try {
|
||||
if (isCudaGraphCreated) {
|
||||
checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
|
||||
checkCudaError(cudaGraphDestroy(cudaGraph));
|
||||
CUDAStream::destroyStream();
|
||||
}
|
||||
dealloc(workspace);
|
||||
checkCudnnError(cudnnDestroy(cudnn));
|
||||
checkCublasError(cublasDestroy(cublas));
|
||||
|
@ -75,6 +85,8 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
|
||||
void runWithoutSync(const Graph &graph) const;
|
||||
|
||||
void runWithCudaGraph(const Graph &graph);
|
||||
|
||||
// init communicator
|
||||
void initComm(const string &name, int worldSize, int rank) final;
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
namespace infini {
|
||||
template <typename T> void softmax_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void relu_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void silu_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void tanh_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void abs_kernel(T *input, T *output, size_t num);
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
#include "core/op_type.h"
|
||||
#include "kunlun/kunlun_common.h"
|
||||
|
||||
namespace infini {
|
||||
using KunlunActType = xdnn::Activation_t;
|
||||
KunlunActType parseActType(ActType act) {
|
||||
switch (act) {
|
||||
case ActType::None:
|
||||
return KunlunActType::LINEAR;
|
||||
case ActType::Tanh:
|
||||
return KunlunActType::TANH;
|
||||
case ActType::Sigmoid:
|
||||
return KunlunActType::SIGMOID;
|
||||
case ActType::Relu:
|
||||
return KunlunActType::RELU6;
|
||||
default:
|
||||
fprintf(stderr, "Activation Type not support yet!\n");
|
||||
break;
|
||||
}
|
||||
return KunlunActType::LINEAR;
|
||||
}
|
||||
|
||||
}; // namespace infini
|
|
@ -3,6 +3,8 @@
|
|||
#include "xpu/runtime_ex.h"
|
||||
#include "xpu/xdnn.h"
|
||||
|
||||
namespace xdnn = baidu::xpu::api;
|
||||
|
||||
#define checkKUNLUNError(call) \
|
||||
{ \
|
||||
auto err = call; \
|
||||
|
|
|
@ -1,28 +1,35 @@
|
|||
#pragma once
|
||||
#include "core/runtime.h"
|
||||
#include "core/workspace.h"
|
||||
#include "kunlun/kunlun_common.h"
|
||||
|
||||
#ifdef INFINI_USE_XCCL
|
||||
#include "kunlun/xccl_communicator.h"
|
||||
#endif
|
||||
namespace infini {
|
||||
|
||||
class KUNLUNRuntimeObj : public RuntimeObj {
|
||||
private:
|
||||
baidu::xpu::api::Context *xdnn;
|
||||
KUNLUNPtr workspace;
|
||||
size_t workspaceSize;
|
||||
xdnn::Context *ctx;
|
||||
std::unique_ptr<CommunicatorObj> comm;
|
||||
// KUNLUNPtr workspace;
|
||||
// size_t workspaceSize;
|
||||
Workspace<KUNLUNPtr> workspace;
|
||||
|
||||
public:
|
||||
KUNLUNRuntimeObj() : RuntimeObj(Device::KUNLUN) {
|
||||
xdnn = baidu::xpu::api::create_context();
|
||||
KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
|
||||
xpu_set_device(deviceId);
|
||||
ctx = xdnn::create_context();
|
||||
// 10GB for Longformer
|
||||
// size_t longformerNum = 3lu * (1 << 30);
|
||||
workspaceSize = 3ll << 30; // 3 GB
|
||||
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
|
||||
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
|
||||
workspace = alloc(workspaceSize);
|
||||
size_t workspaceSize = 3llu << 30; // 3 GB
|
||||
KUNLUNPtr wkspacePtr = alloc(workspaceSize);
|
||||
workspace =
|
||||
make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
|
||||
}
|
||||
virtual ~KUNLUNRuntimeObj() {
|
||||
dealloc(workspace);
|
||||
baidu::xpu::api::destroy_context(xdnn);
|
||||
KUNLUNPtr wkspacePtr = workspace->getWorkspace();
|
||||
dealloc(wkspacePtr);
|
||||
xdnn::destroy_context(ctx);
|
||||
}
|
||||
string toString() const override;
|
||||
|
||||
|
@ -31,6 +38,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
|
|||
// double runEvaluation(const Graph &graph, int nWarmups,
|
||||
// int nEvaluations) const;
|
||||
void sync() const;
|
||||
|
||||
KUNLUNPtr alloc(size_t size) override {
|
||||
void *ptr;
|
||||
checkKUNLUNError(
|
||||
|
@ -38,33 +46,33 @@ class KUNLUNRuntimeObj : public RuntimeObj {
|
|||
return ptr;
|
||||
}
|
||||
void dealloc(void *ptr) override { xpu_free(ptr); }
|
||||
baidu::xpu::api::Context *KUNLUNHandle() const { return xdnn; }
|
||||
|
||||
xdnn::Context *KUNLUNHandle() const { return ctx; }
|
||||
// Get $size workspace by bytes
|
||||
KUNLUNPtr getWorkspace(size_t size) const {
|
||||
IT_ASSERT(size <= workspaceSize);
|
||||
return workspace;
|
||||
auto ret = workspace->getWorkspace(size);
|
||||
return ret;
|
||||
}
|
||||
Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
|
||||
|
||||
void copyBlobFromCPU(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
|
||||
}
|
||||
|
||||
void copyBlobToCPU(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
|
||||
}
|
||||
|
||||
void copyBlobInsideRuntime(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
|
||||
}
|
||||
void initComm(const string &name, int worldSize, int rank) final;
|
||||
|
||||
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
|
||||
|
||||
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
|
||||
CommunicatorObj &getCommunicator() const final { return *comm; }
|
||||
|
||||
private:
|
||||
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
#pragma once
|
||||
#include "core/communicator.h"
|
||||
#include "xpu/bkcl.h"
|
||||
#include <chrono>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
|
||||
#define checkXcclError(call) \
|
||||
{ \
|
||||
auto err = call; \
|
||||
if (BKCL_SUCCESS != err) { \
|
||||
fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
|
||||
class XcclCommunicatorObj final : public CommunicatorObj {
|
||||
private:
|
||||
BKCLContext_t comm;
|
||||
|
||||
public:
|
||||
XcclCommunicatorObj(const string &name, int worldSize, int rank)
|
||||
: CommunicatorObj(worldSize, rank) {
|
||||
const std::string filePath("./" + name + "_xccl_id.bin");
|
||||
BKCLUniqueId commId;
|
||||
if (rank == 0) {
|
||||
checkXcclError(bkcl_get_unique_id(&commId));
|
||||
std::ofstream ofs(filePath, std::ios::binary);
|
||||
ofs.write((char *)&commId, sizeof(BKCLUniqueId));
|
||||
} else {
|
||||
auto begin = std::chrono::steady_clock::now();
|
||||
while (!std::filesystem::exists(filePath)) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
|
||||
"time limit (10s) exceeded.");
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
}
|
||||
std::ifstream ifs(filePath, std::ios::binary);
|
||||
ifs.read((char *)&commId, sizeof(BKCLUniqueId));
|
||||
}
|
||||
checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
|
||||
if (rank == 0) {
|
||||
std::filesystem::remove(filePath);
|
||||
}
|
||||
}
|
||||
|
||||
BKCLContext_t getXcclComm() { return comm; }
|
||||
|
||||
~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
|
||||
virtual string toString() const final {
|
||||
std::ostringstream oss;
|
||||
oss << "XCCL communicator";
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -17,6 +17,9 @@ class MatmulObj : public OperatorObj {
|
|||
// Auxiliary attributes which are not a part of operator attributes.
|
||||
int b, m, n, k;
|
||||
|
||||
// Specifies the data precision for the matrix multiply.
|
||||
std::string computeType = "default";
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Matmul operator with batch broadcast and tensor transpose
|
||||
|
@ -38,10 +41,11 @@ class MatmulObj : public OperatorObj {
|
|||
* @param transB If matrix B should be transposed when computing.
|
||||
* @param bias The bias tensor.
|
||||
* @param act The activation function.
|
||||
* @param computeType Specifies the data precision for the matrix multiply.
|
||||
*/
|
||||
MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
|
||||
bool transA = false, bool transB = false, Tensor bias = nullptr,
|
||||
ActType act = ActType::None);
|
||||
ActType act = ActType::None, std::string computeType = "default");
|
||||
OP_CLONE(MatmulObj);
|
||||
|
||||
std::string toString() const override;
|
||||
|
@ -60,6 +64,7 @@ class MatmulObj : public OperatorObj {
|
|||
int getN() const { return n; }
|
||||
int getK() const { return k; }
|
||||
auto getBMNK() const { return tuple{b, m, n, k}; }
|
||||
std::string getComputeType() const { return computeType; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* @brief Fused RMSNorm Operator
|
||||
*
|
||||
*/
|
||||
class RMSNormObj : public OperatorObj {
|
||||
int dim;
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new RMSNorm object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param input The input tensor.
|
||||
* @param output The output tensor.
|
||||
*/
|
||||
RMSNormObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output);
|
||||
OP_CLONE(RMSNormObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 2; }
|
||||
int numOutputs() const override { return 1; }
|
||||
int getDim() const { return dim; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,29 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
class RoPEObj : public OperatorObj {
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new RotaryEmbedding object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param pos The positon id of the query.
|
||||
* @param input The input tensor.
|
||||
* @param output The output tensor.
|
||||
*/
|
||||
RoPEObj(GraphObj *graph, Tensor pos, Tensor input, Tensor output);
|
||||
OP_CLONE(RoPEObj);
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 2; }
|
||||
int numOutputs() const override { return 1; }
|
||||
DataType getDType() const { return getInputs(1)->getDType(); }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -7,9 +7,7 @@ namespace infini {
|
|||
*
|
||||
*/
|
||||
class SliceObj : public OperatorObj {
|
||||
template <class T> struct range_t {
|
||||
T start, end, step;
|
||||
};
|
||||
template <class T> struct range_t { T start, end, step; };
|
||||
vector<range_t<int>> axes;
|
||||
|
||||
public:
|
||||
|
|
|
@ -159,6 +159,7 @@ enum class CastType {
|
|||
Uint322Int64,
|
||||
Float162Float,
|
||||
BFloat162Float,
|
||||
Float2Float,
|
||||
};
|
||||
|
||||
class CastObj : public OperatorObj {
|
||||
|
@ -258,6 +259,7 @@ class LogObj : public OperatorObj {
|
|||
};
|
||||
|
||||
DEFINE_UNARY_OBJ(Relu, OpType::Relu)
|
||||
DEFINE_UNARY_OBJ(Silu, OpType::Silu)
|
||||
DEFINE_UNARY_OBJ(Gelu, OpType::Gelu)
|
||||
DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
|
||||
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
namespace infini {
|
||||
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
|
||||
int nDims, int size) {
|
||||
for (int i = nDims - size - 1; i >= 0; --i) {
|
||||
modifyShape.data[i] = 1;
|
||||
}
|
||||
for (int i = nDims - 1; i >= nDims - size; --i) {
|
||||
modifyShape.data[i] = originShape[i - nDims + size];
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -5,6 +5,9 @@
|
|||
#include "core/operator.h"
|
||||
#include "core/tensor.h"
|
||||
|
||||
#include "utils/small_array.h"
|
||||
#include <numeric>
|
||||
|
||||
namespace infini {
|
||||
|
||||
// Launch a broadcast shape based on the shape of input A and B
|
||||
|
@ -20,6 +23,12 @@ size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
|
|||
const Shape &stride);
|
||||
// Convert KernelAttrs to a string representation
|
||||
std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
|
||||
// VectorProd
|
||||
int shapeProd(std::vector<int>::iterator start, std::vector<int>::iterator end);
|
||||
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
|
||||
int nDims, int size);
|
||||
void broadcastShape(const Shape &tempShape, Shape &modifyShape);
|
||||
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
||||
|
|
|
@ -4,6 +4,14 @@ namespace infini {
|
|||
#define SMALL_ARRAY_SIZE 8
|
||||
struct SmallArray {
|
||||
int data[SMALL_ARRAY_SIZE];
|
||||
|
||||
int prod(int start, int end) {
|
||||
int result = 1;
|
||||
for (int i = start; i < end; ++i) {
|
||||
result *= data[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import backend
|
||||
import backend
|
||||
from onnx import (
|
||||
ModelProto,
|
||||
TensorProto,
|
||||
|
@ -23,12 +23,13 @@ from onnx.checker import (
|
|||
ValidationError,
|
||||
)
|
||||
from onnx.shape_inference import infer_shapes
|
||||
from onnx.numpy_helper import to_array
|
||||
from onnx.numpy_helper import to_array, from_array
|
||||
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
|
||||
from functools import reduce
|
||||
from onnxsim import simplify
|
||||
import copy
|
||||
import warnings
|
||||
import numpy as np
|
||||
|
||||
|
||||
class OnnxStub:
|
||||
|
@ -37,7 +38,13 @@ class OnnxStub:
|
|||
It can be generated from an Onnx model object.
|
||||
"""
|
||||
|
||||
def __init__(self, model: ModelProto, runtime, use_naive_allocator: bool = False):
|
||||
def __init__(
|
||||
self,
|
||||
model: ModelProto,
|
||||
runtime,
|
||||
use_naive_allocator: bool = False,
|
||||
matmul_compute_type: str = "default",
|
||||
):
|
||||
# We use some user-defined operators for distributed inference
|
||||
try:
|
||||
# onnx simplifier performs inplace simplify
|
||||
|
@ -105,12 +112,6 @@ class OnnxStub:
|
|||
)
|
||||
tensors[input.name].set_input()
|
||||
|
||||
for output in model.graph.output:
|
||||
dims = _take_shape_dim(output.type.tensor_type.shape)
|
||||
tensors[output.name] = self.handler.tensor(
|
||||
dims, output.type.tensor_type.elem_type
|
||||
)
|
||||
tensors[output.name].set_output()
|
||||
|
||||
for node_idx in sorted_nodes:
|
||||
node = model.graph.node[node_idx]
|
||||
|
@ -252,13 +253,14 @@ class OnnxStub:
|
|||
)
|
||||
elif node.op_type == "MatMul":
|
||||
tensors[node.output[0]] = self.handler.matmul(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors[node.input[0]], # input
|
||||
tensors[node.input[1]], # weight
|
||||
tensors.get(node.output[0]),
|
||||
False,
|
||||
False,
|
||||
None,
|
||||
backend.ActType.Linear,
|
||||
matmul_compute_type,
|
||||
)
|
||||
elif node.op_type == "Gemm":
|
||||
attributes = _parse_attribute(
|
||||
|
@ -278,6 +280,7 @@ class OnnxStub:
|
|||
transB == 1,
|
||||
tensors[node.input[2]] if len(node.input) > 2 else None,
|
||||
backend.ActType.Linear,
|
||||
matmul_compute_type,
|
||||
)
|
||||
elif node.op_type == "BatchNormalization":
|
||||
(input, mean, var, scale, bias) = (
|
||||
|
@ -321,6 +324,12 @@ class OnnxStub:
|
|||
axis,
|
||||
stash_type,
|
||||
)
|
||||
elif node.op_type == "RMSNorm":
|
||||
tensors[node.output[0]] = self.handler.RMSNorm(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "MaxPool":
|
||||
attributes = _parse_attribute(
|
||||
node,
|
||||
|
@ -482,6 +491,11 @@ class OnnxStub:
|
|||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Silu":
|
||||
tensors[node.output[0]] = self.handler.silu(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Gelu":
|
||||
tensors[node.output[0]] = self.handler.gelu(
|
||||
tensors[node.input[0]],
|
||||
|
@ -657,7 +671,7 @@ class OnnxStub:
|
|||
keep_aspect_ratio_policy,
|
||||
nearest_mode,
|
||||
coordinate_transformation_mode,
|
||||
)
|
||||
)
|
||||
elif node.op_type == "Squeeze":
|
||||
axes = (
|
||||
_parse_data(data[node.input[1]])
|
||||
|
@ -713,6 +727,12 @@ class OnnxStub:
|
|||
tensors[node.input[5]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "RoPE":
|
||||
tensors[node.output[0]]= self.handler.RoPE(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Split":
|
||||
split = (
|
||||
_parse_data(data[node.input[1]])
|
||||
|
@ -966,6 +986,25 @@ class OnnxStub:
|
|||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Where":
|
||||
## If Y is single -inf, treat Where as Add
|
||||
## TODO: deal with cases where Y is single inf or 0
|
||||
if node.input[0] in data and node.input[2] in data:
|
||||
where_condition = to_array(data[node.input[0]])
|
||||
where_alt = to_array(data[node.input[2]])
|
||||
if where_alt.size == 1:
|
||||
if np.isneginf(where_alt) or np.all(where_alt < -3e38):
|
||||
node.input[0] = node.input[0] + "_alt"
|
||||
if node.input[0] not in data:
|
||||
where_value = np.where(where_condition, 0, -np.inf).astype(where_alt.dtype)
|
||||
data[node.input[0]] = from_array(where_value, node.input[0])
|
||||
tensors[node.input[0]] = self.handler.tensor(list(where_value.shape), data[node.input[0]].data_type)
|
||||
tensors[node.input[0]].set_weight()
|
||||
tensors[node.output[0]] = self.handler.add(
|
||||
tensors[node.input[1]],
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
continue
|
||||
tensors[node.output[0]] = self.handler.where(
|
||||
tensors[node.input[1]],
|
||||
tensors[node.input[2]],
|
||||
|
@ -995,10 +1034,12 @@ class OnnxStub:
|
|||
beta,
|
||||
bias,
|
||||
size,
|
||||
)
|
||||
)
|
||||
else:
|
||||
raise Exception('Unsupported operator "{}"'.format(node.op_type))
|
||||
|
||||
for output in model.graph.output:
|
||||
tensors[output.name].set_output()
|
||||
################################
|
||||
# Allocate memory space for data
|
||||
################################
|
||||
|
@ -1280,7 +1321,7 @@ class OnnxStub:
|
|||
axes,
|
||||
)
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpTypeId.Concat:
|
||||
axis = backend.concat_axis_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
|
||||
|
@ -1409,6 +1450,9 @@ class OnnxStub:
|
|||
def run(self) -> None:
|
||||
self.handler.run()
|
||||
|
||||
def run_with_cudagraph(self) -> None:
|
||||
self.handler.run_with_cudagraph()
|
||||
|
||||
def get_perf_time(self) -> float:
|
||||
self.handler.get_perf_time()
|
||||
|
||||
|
|
|
@ -463,13 +463,20 @@ class TestStringMethods(unittest.TestCase):
|
|||
def test_split(self):
|
||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
split = make_node("Split", ["input"], ["output"], name="split", axis=0)
|
||||
make_and_import_model(make_graph([split], "split", [input], []))
|
||||
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
make_and_import_model(make_graph([split], "split", [input], [output]))
|
||||
|
||||
def test_split1(self):
|
||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
splitAttr = make_tensor_value_info("split", TensorProto.INT64, [2, 1])
|
||||
split = make_node("Split", ["input", "split"], ["output"], name="split", axis=1)
|
||||
make_and_import_model(make_graph([split], "split", [input, splitAttr], []))
|
||||
splitAttr = make_tensor("split", TensorProto.INT64, [2], [2, 1])
|
||||
output1 = make_tensor_value_info("output1", TensorProto.FLOAT, [1, 2, 2, 4])
|
||||
output2 = make_tensor_value_info("output2", TensorProto.FLOAT, [1, 1, 2, 4])
|
||||
split = make_node(
|
||||
"Split", ["input", "split"], ["output1", "output2"], name="split", axis=1
|
||||
)
|
||||
make_and_import_model(
|
||||
make_graph([split], "split", [input], [output1, output2], [splitAttr])
|
||||
)
|
||||
|
||||
def test_allBroadcast(self):
|
||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
#include "operators/reduce.h"
|
||||
#include "operators/reshape.h"
|
||||
#include "operators/resize.h"
|
||||
#include "operators/rms_norm.h"
|
||||
#include "operators/rope.h"
|
||||
#include "operators/send.h"
|
||||
#include "operators/slice.h"
|
||||
#include "operators/softmax.h"
|
||||
|
@ -72,15 +74,17 @@ Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
|
|||
}
|
||||
|
||||
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
|
||||
bool transB, Tensor bias, ActType act) {
|
||||
bool transB, Tensor bias, ActType act,
|
||||
std::string matmul_compute_type) {
|
||||
if (y) {
|
||||
g->addOpWithOutputs<MatmulObj>(std::move(a), std::move(b), y, transA,
|
||||
transB, std::move(bias), act);
|
||||
transB, std::move(bias), act,
|
||||
matmul_compute_type);
|
||||
return y;
|
||||
} else {
|
||||
return g
|
||||
->addOp<MatmulObj>(std::move(a), std::move(b), y, transA, transB,
|
||||
std::move(bias), act)
|
||||
std::move(bias), act, matmul_compute_type)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
@ -121,6 +125,17 @@ Tensor GraphHandlerObj::layerNormalization(Tensor input, Tensor scale,
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::rmsNorm(Tensor input, Tensor weight, Tensor output) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<RMSNormObj>(std::move(input), std::move(weight),
|
||||
output);
|
||||
return output;
|
||||
} else {
|
||||
return g->addOp<RMSNormObj>(std::move(input), std::move(weight), output)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
|
||||
int dh, int dw, int ph, int pw, int sh, int sw,
|
||||
int ceilMode) {
|
||||
|
@ -181,6 +196,7 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
|
|||
} \
|
||||
}
|
||||
|
||||
DEFINE_UNARY_METHOD(silu, Silu)
|
||||
DEFINE_UNARY_METHOD(relu, Relu)
|
||||
DEFINE_UNARY_METHOD(gelu, Gelu)
|
||||
DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
|
||||
|
@ -345,6 +361,16 @@ Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::RoPE(Tensor pos, Tensor input, Tensor output) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<RoPEObj>(std::move(pos), std::move(input), output);
|
||||
return output;
|
||||
} else {
|
||||
return g->addOp<RoPEObj>(std::move(pos), std::move(input), output)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
|
||||
int axis,
|
||||
std::variant<int, vector<int>> numOrRatio) {
|
||||
|
@ -683,6 +709,8 @@ static CastType inferCastType(Tensor input, int to) {
|
|||
return CastType::Float162Float;
|
||||
} else if (iType == DataType::BFloat16 && oType == DataType::Float32) {
|
||||
return CastType::BFloat162Float;
|
||||
} else if (iType == DataType::Float32 && oType == DataType::Float32) {
|
||||
return CastType::Float2Float;
|
||||
} else {
|
||||
IT_TODO_HALT_MSG("Unsupported CastType : input_type is " +
|
||||
iType.toString() + " output_type is " +
|
||||
|
|
|
@ -66,6 +66,36 @@ void TensorObj::setShape(Shape shape_) {
|
|||
_size = size;
|
||||
}
|
||||
|
||||
void TensorObj::dumpData(std::ofstream &ofs) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
if (!runtime->isCpu())
|
||||
IT_TODO_HALT();
|
||||
|
||||
#define TRY_DUMP(N) \
|
||||
if (dtype == DataType(N)) \
|
||||
ofs << dataToString<DT<N>::t>() << std::endl;
|
||||
|
||||
TRY_DUMP(0) // fmt: new line
|
||||
else TRY_DUMP(1) //
|
||||
else TRY_DUMP(2) //
|
||||
else TRY_DUMP(3) //
|
||||
else TRY_DUMP(4) //
|
||||
else TRY_DUMP(5) //
|
||||
else TRY_DUMP(6) //
|
||||
else TRY_DUMP(7) //
|
||||
else TRY_DUMP(8) //
|
||||
else TRY_DUMP(9) //
|
||||
else TRY_DUMP(10) //
|
||||
else TRY_DUMP(11) //
|
||||
else TRY_DUMP(12) //
|
||||
else TRY_DUMP(13) //
|
||||
else TRY_DUMP(16) //
|
||||
else IT_TODO_HALT();
|
||||
ofs.flush();
|
||||
|
||||
#undef TRY_DUMP
|
||||
}
|
||||
|
||||
void TensorObj::printData() const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
if (!runtime->isCpu())
|
||||
|
|
|
@ -19,7 +19,6 @@ void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
|
|||
}
|
||||
|
||||
namespace infini {
|
||||
|
||||
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
|
@ -39,6 +38,27 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
|||
}
|
||||
}
|
||||
|
||||
void CudaRuntimeObj::runWithCudaGraph(const Graph &graph) {
|
||||
if (!isCudaGraphCreated) {
|
||||
CUDAStream::createStream();
|
||||
checkCudnnError(cudnnSetStream(cudnn, CUDAStream::getCurrentStream()));
|
||||
checkCublasError(
|
||||
cublasSetStream(cublas, CUDAStream::getCurrentStream()));
|
||||
checkCudaError(cudaStreamBeginCapture(CUDAStream::getCurrentStream(),
|
||||
cudaStreamCaptureModeGlobal));
|
||||
runWithoutSync(graph);
|
||||
checkCudaError(
|
||||
cudaStreamEndCapture(CUDAStream::getCurrentStream(), &cudaGraph));
|
||||
checkCudaError(
|
||||
cudaGraphInstantiate(&cudaGraphInstance, cudaGraph, NULL, NULL, 0));
|
||||
isCudaGraphCreated = true;
|
||||
} else {
|
||||
checkCudaError(
|
||||
cudaGraphLaunch(cudaGraphInstance, CUDAStream::getCurrentStream()));
|
||||
}
|
||||
checkCudaError(cudaStreamSynchronize(CUDAStream::getCurrentStream()));
|
||||
}
|
||||
|
||||
void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
|
@ -102,4 +122,5 @@ void CudaRuntimeObj::initComm(const string &name, int worldSize, int rank) {
|
|||
#endif
|
||||
}
|
||||
|
||||
cudaStream_t CUDAStream::_stream = 0;
|
||||
} // namespace infini
|
||||
|
|
|
@ -16,7 +16,8 @@ __global__ void cudaPrintFloatImpl(float *x, int len) {
|
|||
namespace infini {
|
||||
|
||||
void cudaPrintFloat(float *x, int len) {
|
||||
cudaPrintFloatImpl<<<1, 1>>>(x, len);
|
||||
cudaPrintFloatImpl
|
||||
<<<1, 1, 0, CUDAStream::getCurrentStream()>>>(x, len);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
|
|
@ -442,7 +442,9 @@ void init_graph_builder(py::module &m) {
|
|||
#endif
|
||||
#ifdef USE_KUNLUN
|
||||
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
|
||||
m, "KUNLUNRuntime");
|
||||
m, "KUNLUNRuntime")
|
||||
.def(py::init<int>(), py::arg("device") = 0)
|
||||
.def("init_comm", &KUNLUNRuntimeObj::initComm);
|
||||
#endif
|
||||
|
||||
#ifdef USE_ASCEND
|
||||
|
@ -525,6 +527,7 @@ void init_graph_builder(py::module &m) {
|
|||
.def("matmul", &Handler::matmul, policy::move)
|
||||
.def("batchNormalization", &Handler::batchNormalization, policy::move)
|
||||
.def("layerNormalization", &Handler::layerNormalization, policy::move)
|
||||
.def("RMSNorm", &Handler::rmsNorm, policy::move)
|
||||
.def("maxPool", &Handler::maxPool, policy::move)
|
||||
.def("avgPool", &Handler::avgPool, policy::move)
|
||||
.def("add", &Handler::add, policy::move)
|
||||
|
@ -536,6 +539,7 @@ void init_graph_builder(py::module &m) {
|
|||
.def("min", &Handler::min, policy::move)
|
||||
.def("max", &Handler::max, policy::move)
|
||||
.def("relu", &Handler::relu, policy::move)
|
||||
.def("silu", &Handler::silu, policy::move)
|
||||
.def("gelu", &Handler::gelu, policy::move)
|
||||
.def("sigmoid", &Handler::sigmoid, policy::move)
|
||||
.def("tanh", &Handler::tanh, policy::move)
|
||||
|
@ -558,6 +562,7 @@ void init_graph_builder(py::module &m) {
|
|||
.def("unsqueeze", &Handler::unsqueeze, policy::move)
|
||||
.def("concat", &Handler::concat, policy::move)
|
||||
.def("attentionKVCache", &Handler::attentionKVCache, policy::move)
|
||||
.def("RoPE", &Handler::RoPE, policy::move)
|
||||
.def("split", &Handler::split, policy::move)
|
||||
.def("gather", &Handler::gather, policy::move)
|
||||
.def("gatherElements", &Handler::gatherElements, policy::move)
|
||||
|
@ -590,6 +595,10 @@ void init_graph_builder(py::module &m) {
|
|||
.def("get_perf_time", &Handler::get_perf_time, policy::automatic)
|
||||
.def("tune", &Handler::tune, policy::automatic)
|
||||
.def("run", &Handler::run, policy::automatic)
|
||||
#ifdef USE_CUDA
|
||||
.def("run_with_cudagraph", &Handler::run_with_cudagraph,
|
||||
policy::automatic)
|
||||
#endif
|
||||
.def("shape_infer", &Handler::shape_infer, policy::automatic)
|
||||
.def("change_shape", &Handler::change_shape, policy::automatic)
|
||||
.def("getDims", &Handler::getDims, policy::automatic)
|
||||
|
|
|
@ -21,12 +21,12 @@ namespace infini {
|
|||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>()); \
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
|
||||
\
|
||||
auto a = op->getInputs(0) -> getDims(); \
|
||||
auto aS = op->getInputs(0) -> getStride(); \
|
||||
auto b = op->getInputs(1) -> getDims(); \
|
||||
auto bS = op->getInputs(1) -> getStride(); \
|
||||
auto c = op->getOutput() -> getDims(); \
|
||||
auto cS = op->getOutput() -> getStride(); \
|
||||
auto a = op->getInputs(0)->getDims(); \
|
||||
auto aS = op->getInputs(0)->getStride(); \
|
||||
auto b = op->getInputs(1)->getDims(); \
|
||||
auto bS = op->getInputs(1)->getStride(); \
|
||||
auto c = op->getOutput()->getDims(); \
|
||||
auto cS = op->getOutput()->getStride(); \
|
||||
\
|
||||
std::vector<int64_t> aDim = castTo64(a); \
|
||||
std::vector<int64_t> aStride = castTo64(aS); \
|
||||
|
|
|
@ -92,22 +92,22 @@ class ReluAclnn : public ASCENDKernelWithoutConfig {
|
|||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
|
||||
\
|
||||
auto a = op->getInputs(0) -> getDims(); \
|
||||
auto a = op->getInputs(0)->getDims(); \
|
||||
std::vector<int64_t> aDim(a.size(), 1); \
|
||||
for (size_t i = 0; i < a.size(); ++i) { \
|
||||
aDim[i] = int64_t(a[i]); \
|
||||
} \
|
||||
auto aS = op->getInputs(0) -> getStride(); \
|
||||
auto aS = op->getInputs(0)->getStride(); \
|
||||
std::vector<int64_t> aStride(aS.size(), 1); \
|
||||
for (size_t i = 0; i < aS.size(); ++i) { \
|
||||
aStride[i] = int64_t(aS[i]); \
|
||||
} \
|
||||
auto c = op->getInputs(0) -> getDims(); \
|
||||
auto c = op->getInputs(0)->getDims(); \
|
||||
std::vector<int64_t> cDim(c.size(), 1); \
|
||||
for (size_t i = 0; i < c.size(); ++i) { \
|
||||
cDim[i] = int64_t(c[i]); \
|
||||
} \
|
||||
auto cS = op->getInputs(0) -> getStride(); \
|
||||
auto cS = op->getInputs(0)->getStride(); \
|
||||
std::vector<int64_t> cStride(cS.size(), 1); \
|
||||
for (size_t i = 0; i < cS.size(); ++i) { \
|
||||
cStride[i] = int64_t(cS[i]); \
|
||||
|
|
|
@ -2,16 +2,20 @@
|
|||
#include "bang/bang_runtime.h"
|
||||
#include "operators/softmax.h"
|
||||
#include "operators/unary.h"
|
||||
#include <iostream>
|
||||
|
||||
namespace infini {
|
||||
class UnaryCnnl : public BangKernelWithoutConfig {
|
||||
virtual cnnlActivationMode_t getOpType() const = 0;
|
||||
virtual float getCoef() const = 0;
|
||||
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
|
||||
virtual float getSlicedDim() const { return 0.0; }
|
||||
virtual float getGamma() const { return 0.0; }
|
||||
virtual float getScale() const { return 0.0; }
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -22,18 +26,19 @@ class UnaryCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
cnnlActivationDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
|
||||
checkCnnlError(cnnlSetActivationDescriptor_v2(
|
||||
checkCnnlError(cnnlSetActivationDescriptor_v5(
|
||||
opDesc, getOpType(), CNNL_ACTIVATION_HIGH_PRECISION,
|
||||
CNNL_NOT_PROPAGATE_NAN, getCoef()));
|
||||
CNNL_NOT_PROPAGATE_NAN, getCoef(), getSlicedDim(), getGamma(),
|
||||
getScale(), true));
|
||||
|
||||
auto [alpha, beta] = getAlphBeta();
|
||||
cnnlStatus_t stat =
|
||||
|
@ -51,7 +56,6 @@ class RoundCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -62,13 +66,13 @@ class RoundCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
cnnlStatus_t stat =
|
||||
cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
|
@ -82,7 +86,6 @@ class PReluCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<PReluObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -94,18 +97,22 @@ class PReluCnnl : public BangKernelWithoutConfig {
|
|||
auto bDim = op->getInputs(1)->getDims();
|
||||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
if (auto alignSize = aDim.size() - bDim.size(); alignSize) {
|
||||
bDim.insert(bDim.begin(), alignSize, 1);
|
||||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, bDim.size(),
|
||||
bDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
bDim.size(), bDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
|
||||
bDesc, bData, cDesc, cData);
|
||||
|
@ -122,7 +129,6 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<SoftmaxObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -185,13 +191,13 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, inDim.size(),
|
||||
inDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
inDim.size(), inDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, outDim.size(),
|
||||
outDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
outDim.size(), outDim.data()));
|
||||
float alpha = 1.0;
|
||||
float beta = 0.0;
|
||||
cnnlStatus_t stat =
|
||||
|
@ -219,6 +225,22 @@ class SigmoidCnnl : public UnaryCnnl {
|
|||
float getCoef() const override { return 0.0; }
|
||||
};
|
||||
|
||||
class HardSwishCnnl : public UnaryCnnl {
|
||||
cnnlActivationMode_t getOpType() const override {
|
||||
return CNNL_ACTIVATION_HARDSWISH;
|
||||
}
|
||||
float getCoef() const override { return 0.0; }
|
||||
};
|
||||
|
||||
class HardSigmoidCnnl : public UnaryCnnl {
|
||||
cnnlActivationMode_t getOpType() const override {
|
||||
return CNNL_ACTIVATION_HARDSIGMOID;
|
||||
}
|
||||
float getCoef() const override { return 0.0; }
|
||||
float getGamma() const override { return 1.f / 6.f; }
|
||||
float getScale() const override { return 0.5f; }
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Relu, ReluCnnl, "Relu_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
|
||||
|
@ -226,5 +248,9 @@ REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
|
|||
REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
|
||||
"Softmax_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
|
||||
"HardSigmoid_cnnl_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
|
||||
"HardSwish_cnnl_BANG");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -10,7 +10,6 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ActivationBackwardObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const yData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -25,21 +24,21 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
|
|||
auto diffxDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, yDim.size(),
|
||||
yDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
yDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
yDim.size(), yDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
diffYDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffyDim.size(),
|
||||
diffyDim.data()));
|
||||
diffYDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
diffyDim.size(), diffyDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, xDim.size(),
|
||||
xDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
xDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
xDim.size(), xDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
diffXDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffxDim.size(),
|
||||
diffxDim.data()));
|
||||
diffXDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
diffxDim.size(), diffxDim.data()));
|
||||
// get op descriptor
|
||||
cnnlActivationDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
|
||||
|
|
|
@ -19,7 +19,6 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
|
|||
BangPtr output_temp =
|
||||
context->getWorkspace(op->getInputs(0)->getBytes() * world_size);
|
||||
// void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||
// IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
checkBangError(cnrtMalloc(&output_temp,
|
||||
op->getInputs(0)->getBytes() * world_size));
|
||||
size_t bytes = op->getInputs(0)->getBytes();
|
||||
|
@ -29,8 +28,9 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
|
|||
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||
.getCnclComm();
|
||||
cnrtQueue_t queue = context->getBangQueue();
|
||||
CNCL_CHECK(
|
||||
cnclAllGather(input, output_temp, count, cnclFloat32, comm, queue));
|
||||
CNCL_CHECK(cnclAllGather(input, output_temp, count,
|
||||
cnclDataTypeConvert(op->getDType()), comm,
|
||||
queue));
|
||||
checkBangError(cnrtQueueSync(queue));
|
||||
for (int i = 0; i < world_size; ++i) {
|
||||
Tensor output = op->getOutput(i);
|
||||
|
@ -42,8 +42,8 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllGather, DataType::Float32,
|
||||
AllGatherCNCL, "AllGather_CNCL_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllGather, AllGatherCNCL,
|
||||
"AllGather_CNCL_BANG");
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
||||
|
|
|
@ -13,15 +13,16 @@ class AllReduceCNCL : public BangKernelWithoutConfig {
|
|||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||
void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
size_t count = op->getInputs(0)->size();
|
||||
size_t bytes = op->getInputs(0)->getBytes();
|
||||
size_t count = bytes / op->getDType().getSize();
|
||||
cnclComm_t comm =
|
||||
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||
.getCnclComm();
|
||||
cnrtQueue_t queue = context->getBangQueue();
|
||||
// checkBangError(cnrtQueueSync(queue));
|
||||
CNCL_CHECK(cnclAllReduce(input, output, count, cnclFloat, getRedOp(),
|
||||
comm, queue));
|
||||
CNCL_CHECK(cnclAllReduce(input, output, count,
|
||||
cnclDataTypeConvert(op->getDType()),
|
||||
getRedOp(), comm, queue));
|
||||
checkBangError(cnrtQueueSync(queue));
|
||||
}
|
||||
|
||||
|
@ -41,13 +42,13 @@ class AllReduceMaxCNCL : public AllReduceCNCL {
|
|||
cnclReduceOp_t getRedOp() const override { return cnclMax; }
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, DataType::Float32,
|
||||
AllReduceSumCNCL, "AllReduce_Sum_CNCL_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, DataType::Float32,
|
||||
AllReduceProdCNCL, "AllReduce_Prod_CNCL_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, DataType::Float32,
|
||||
AllReduceMinCNCL, "AllReduce_Min_CNCL_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, DataType::Float32,
|
||||
AllReduceMaxCNCL, "AllReduce_Max_CNCL_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, AllReduceSumCNCL,
|
||||
"AllReduce_Sum_CNCL_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, AllReduceProdCNCL,
|
||||
"AllReduce_Prod_CNCL_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, AllReduceMinCNCL,
|
||||
"AllReduce_Min_CNCL_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, AllReduceMaxCNCL,
|
||||
"AllReduce_Max_CNCL_BANG");
|
||||
} // namespace infini
|
||||
#endif
|
||||
|
|
|
@ -7,7 +7,6 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<BatchNormObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const input = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -33,18 +32,18 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
|
|||
checkCnnlError(cnnlCreateTensorDescriptor(&intransDesc));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&outtransDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, dims.size(),
|
||||
dims.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(intransDesc, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, dims.size(),
|
||||
dimsTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, outDims.size(),
|
||||
outDims.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(outtransDesc, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, outDims.size(),
|
||||
dimsOutTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
dims.size(), dims.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
intransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
dims.size(), dimsTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
outDims.size(), outDims.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
outtransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
outDims.size(), dimsOutTrans));
|
||||
cnnlTransposeDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
|
||||
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
|
||||
|
@ -53,9 +52,9 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
|
|||
&wsSize);
|
||||
BangPtr wsData = context->getWorkspace(wsSize);
|
||||
BangPtr inputTrans = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(inDesc) * sizeof(float));
|
||||
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
|
||||
BangPtr outputTrans = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(inDesc) * sizeof(float));
|
||||
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
|
||||
cnnlStatus_t stat =
|
||||
cnnlTranspose_v2(context->cnnlHandle(), opDesc, inDesc, input,
|
||||
intransDesc, inputTrans, wsData, wsSize);
|
||||
|
@ -67,7 +66,7 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
|
|||
cnnlTensorDescriptor_t paraDesc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(¶Desc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
paraDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
|
||||
paraDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimsScaleBiasMeanVar.size(), dimsScaleBiasMeanVar.data()));
|
||||
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
|
|
|
@ -13,22 +13,23 @@ class BroadcastCNCL : public BangKernelWithoutConfig {
|
|||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||
void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
size_t count = op->getInputs(0)->getBytes() / op->getDType().getSize();
|
||||
size_t bytes = op->getInputs(0)->getBytes();
|
||||
size_t count = bytes / op->getDType().getSize();
|
||||
|
||||
cnclComm_t comm =
|
||||
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||
.getCnclComm();
|
||||
cnrtQueue_t queue = context->getBangQueue();
|
||||
// TODO: Using default stream 0 for now.
|
||||
CNCL_CHECK(cnclBroadcast(input, output, count, cnclFloat32,
|
||||
CNCL_CHECK(cnclBroadcast(input, output, count,
|
||||
cnclDataTypeConvert(op->getDType()),
|
||||
op->getRoot(), comm, queue));
|
||||
checkBangError(cnrtQueueSync(queue));
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, DataType::Float32,
|
||||
BroadcastCNCL, "Broadcast_CNCL_BANG_Float32");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, BroadcastCNCL,
|
||||
"Broadcast_CNCL_BANG");
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
||||
|
|
|
@ -7,7 +7,6 @@ class CeilCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class CeilCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
|
||||
|
|
|
@ -7,7 +7,6 @@ class ClipCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ClipObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -19,9 +18,9 @@ class ClipCnnl : public BangKernelWithoutConfig {
|
|||
auto aDim = op->getInputs(0)->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
cnnlStatus_t stat =
|
||||
cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
|
|
|
@ -7,7 +7,6 @@ class ConcatCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ConcatObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
int num = op->numInputs();
|
||||
int axis = op->getDim();
|
||||
|
@ -15,17 +14,18 @@ class ConcatCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
cnnlTensorDescriptor_t desc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&desc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlTensorDescriptor_t descArray[num];
|
||||
for (int i = 0; i < num; ++i) {
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
|
||||
op->getInputs(i)->getDims().size(),
|
||||
op->getInputs(i)->getDims().data()));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
|
||||
cnnlDataTypeConvert(op->getDType()),
|
||||
op->getInputs(i)->getDims().size(),
|
||||
op->getInputs(i)->getDims().data()));
|
||||
}
|
||||
|
||||
void *argv[num];
|
||||
|
|
|
@ -7,7 +7,6 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ConvObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
|
@ -21,8 +20,9 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
cnnlConvolutionDescriptor_t convDesc;
|
||||
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
|
||||
checkCnnlError(cnnlSetConvolutionDescriptor(
|
||||
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
|
||||
checkCnnlError(
|
||||
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
|
||||
cnnlDataTypeConvert(op->getDType())));
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
@ -55,20 +55,24 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, inputs0));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs0));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
|
||||
aDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs0Array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, inputs1));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs1));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
|
||||
bDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs1Array));
|
||||
|
||||
int permute[4] = {0, 2, 3, 1};
|
||||
cnnlTransposeDescriptor_t opDesc;
|
||||
|
@ -80,7 +84,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
&wsSize);
|
||||
BangPtr wsData = context->getWorkspace(wsSize);
|
||||
BangPtr aDataOut = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
|
||||
cnnlGetTensorElementNum(aInDesc) * op->getDType().getSize());
|
||||
cnnlStatus_t stat =
|
||||
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
|
||||
aDesc, aDataOut, wsData, wsSize);
|
||||
|
@ -91,7 +95,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
&wsSize);
|
||||
wsData = context->getWorkspace(wsSize);
|
||||
BangPtr bDataOut = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
|
||||
cnnlGetTensorElementNum(bInDesc) * op->getDType().getSize());
|
||||
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
|
||||
bDesc, bDataOut, wsData, wsSize);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
|
@ -100,11 +104,13 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
|
||||
cInDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
outputArray));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, output));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
output));
|
||||
|
||||
cnnlConvolutionForwardAlgo_t algo;
|
||||
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
|
||||
|
@ -116,7 +122,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
|
|||
algo, &wsSize);
|
||||
wsData = context->getWorkspace(wsSize);
|
||||
BangPtr cDataIn = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
|
||||
cnnlGetTensorElementNum(cInDesc) * op->getDType().getSize());
|
||||
|
||||
stat = cnnlConvolutionForward(
|
||||
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aDataOut, bDesc,
|
||||
|
|
|
@ -7,7 +7,6 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ConvBaseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
|
@ -21,8 +20,9 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
cnnlConvolutionDescriptor_t convDesc;
|
||||
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
|
||||
checkCnnlError(cnnlSetConvolutionDescriptor(
|
||||
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
|
||||
checkCnnlError(
|
||||
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
|
||||
cnnlDataTypeConvert(op->getDType())));
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
@ -43,14 +43,17 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
|
|||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs0.data()));
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
dimInputs0.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs1.data()));
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
dimInputs1.data()));
|
||||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimOutput.data()));
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
dimOutput.data()));
|
||||
|
||||
cnnlConvolutionBwdDataAlgo_t algo;
|
||||
cnnlGetConvolutionBackwardDataAlgorithm(
|
||||
|
|
|
@ -7,7 +7,6 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ConvBackwardFilterObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
|
@ -21,8 +20,9 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
cnnlConvolutionDescriptor_t convDesc;
|
||||
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
|
||||
checkCnnlError(cnnlSetConvolutionDescriptor(
|
||||
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
|
||||
checkCnnlError(
|
||||
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
|
||||
cnnlDataTypeConvert(op->getDType())));
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
@ -63,15 +63,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs0Array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, 4,
|
||||
inputs0ArrayTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
4, inputs0ArrayTrans));
|
||||
|
||||
size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
|
||||
dimInputs0[3] * sizeof(float);
|
||||
dimInputs0[3] * op->getDType().getSize();
|
||||
BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);
|
||||
|
||||
cnnlStatus_t stat =
|
||||
|
@ -82,15 +83,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inputs1Array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, 4,
|
||||
inputs1ArrayTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
4, inputs1ArrayTrans));
|
||||
|
||||
size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
|
||||
dimInputs1[3] * sizeof(float);
|
||||
dimInputs1[3] * op->getDType().getSize();
|
||||
BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);
|
||||
|
||||
stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
|
||||
|
@ -101,15 +103,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
|
|||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
outputArray));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
|
||||
CNNL_DTYPE_FLOAT, 4,
|
||||
outputArrayTrans));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
|
||||
4, outputArrayTrans));
|
||||
|
||||
size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
|
||||
dimOutput[3] * sizeof(float);
|
||||
dimOutput[3] * op->getDType().getSize();
|
||||
BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);
|
||||
|
||||
cnnlConvolutionBwdFilterAlgo_t algo;
|
||||
|
|
|
@ -7,7 +7,6 @@ class DetCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<DetObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -24,14 +23,14 @@ class DetCnnl : public BangKernelWithoutConfig {
|
|||
auto dimout = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dimin.size(),
|
||||
dimin.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimin.size(), dimin.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dimout.size(),
|
||||
dimout.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimout.size(), dimout.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);
|
||||
|
|
|
@ -11,8 +11,8 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
auto [aAlpha, bAlpha, beta] = getAlphBeta();
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
@ -31,32 +31,33 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
cnnlOpTensorDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateOpTensorDescriptor(&opDesc));
|
||||
checkCnnlError(cnnlSetOpTensorDescriptor(
|
||||
opDesc, getOpType(), CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN));
|
||||
opDesc, getOpType(), cnnlDataTypeConvert(op->getDType()),
|
||||
CNNL_NOT_PROPAGATE_NAN));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetOpTensorWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
&wsSize);
|
||||
cnnlGetOpTensorWorkspaceSize_v2(context->cnnlHandle(), opDesc, &aAlpha,
|
||||
aDesc, aData, &bAlpha, bDesc, bData,
|
||||
&beta, cDesc, cData, &wsSize);
|
||||
|
||||
BangPtr wsData = context->getWorkspace(wsSize);
|
||||
|
||||
auto [aAlpha, bAlpha, beta] = getAlphBeta();
|
||||
cnnlStatus_t stat = cnnlOpTensor(context->cnnlHandle(), opDesc, &aAlpha,
|
||||
aDesc, aData, &bAlpha, bDesc, bData,
|
||||
wsData, wsSize, &beta, cDesc, cData);
|
||||
|
@ -75,7 +76,6 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -95,17 +95,17 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -129,7 +129,6 @@ class BitComputeCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -182,7 +181,6 @@ class DivCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -202,17 +200,17 @@ class DivCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -235,7 +233,6 @@ class MaximumCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -255,17 +252,17 @@ class MaximumCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
|
||||
|
@ -287,7 +284,6 @@ class MinimumCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -307,17 +303,17 @@ class MinimumCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
|
||||
|
@ -339,7 +335,6 @@ class MSELossCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<MSELossObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -359,18 +354,18 @@ class MSELossCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
cnnlStatus_t stat;
|
||||
if (reduction == MSELossObj::None) {
|
||||
stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
|
||||
|
@ -396,7 +391,6 @@ class PowerCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -417,17 +411,17 @@ class PowerCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -450,7 +444,6 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -470,17 +463,17 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -503,7 +496,6 @@ class FloorModCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -523,17 +515,17 @@ class FloorModCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
|
@ -556,7 +548,6 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -576,17 +567,17 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, a_dim.size(),
|
||||
a_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
a_dim.size(), a_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, b_dim.size(),
|
||||
b_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
b_dim.size(), b_dim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, c_dim.size(),
|
||||
c_dim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
c_dim.size(), c_dim.data()));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,
|
||||
|
|
|
@ -7,7 +7,6 @@ class ErfCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class ErfCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
|
||||
|
|
|
@ -7,7 +7,6 @@ class ExpCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class ExpCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
|
||||
|
|
|
@ -7,7 +7,6 @@ class FillCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<FillObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
@ -17,9 +16,9 @@ class FillCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlFill(context->cnnlHandle(), value, cDesc, cData);
|
||||
|
|
|
@ -7,7 +7,6 @@ class FloorCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class FloorCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
|
||||
|
|
|
@ -7,7 +7,6 @@ class GatherCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<GatherObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -20,27 +19,56 @@ class GatherCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptorPointerMode(bDesc, CNNL_POINTER_MODE_HOST));
|
||||
|
||||
if (bDim.size() == 0) {
|
||||
bDim.push_back(1);
|
||||
}
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_INT32, bDim.size(),
|
||||
bDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
|
||||
BangPtr wsData = context->getWorkspace(aDim.size() * 4);
|
||||
context->copyBlobFromCPU(wsData, aDim.data(), aDim.size() * 4);
|
||||
BangPtr indices;
|
||||
DataType indicesDataType = op->getInputs(1)->getDType();
|
||||
if (indicesDataType == DataType::Int64) {
|
||||
// cnnlGatherV2 does not support int64 indices
|
||||
int indicesSize =
|
||||
op->getInputs(1)->getBytes() / indicesDataType.getSize();
|
||||
indices = context->getWorkspace(indicesSize * sizeof(int));
|
||||
cnnlTensorDescriptor_t bDescInt64;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDescInt64));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDescInt64, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT64, bDim.size(),
|
||||
bDim.data()));
|
||||
checkCnnlError(cnnlCastDataType(context->cnnlHandle(), bDescInt64,
|
||||
bData, CNNL_CAST_INT64_TO_INT32,
|
||||
bDesc, indices));
|
||||
cnrtQueueSync(context->getBangQueue());
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(bDescInt64));
|
||||
} else if (indicesDataType == DataType::Int32) {
|
||||
indices = bData;
|
||||
} else {
|
||||
IT_TODO_HALT_MSG("Unsupported data type of indices: " +
|
||||
indicesDataType.toString());
|
||||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
BangPtr wsData = context->getWorkspace(aDim.size() * sizeof(int));
|
||||
context->copyBlobFromCPU(wsData, aDim.data(),
|
||||
aDim.size() * sizeof(int));
|
||||
|
||||
auto axis = op->getAxis();
|
||||
cnnlStatus_t stat =
|
||||
cnnlGatherV2(context->cnnlHandle(), axis, aDesc, aData,
|
||||
(int *)wsData, bDesc, (int *)bData, cDesc, cData);
|
||||
reinterpret_cast<const int *>(wsData), bDesc,
|
||||
reinterpret_cast<const int *>(indices), cDesc, cData);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
|
|
|
@ -7,7 +7,6 @@ class HardtanhCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<HardtanhObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -20,7 +19,8 @@ class HardtanhCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
dim.size(), dim.data()));
|
||||
|
||||
cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData,
|
||||
max, min, aDesc, cData);
|
||||
|
|
|
@ -7,7 +7,6 @@ class L2LossCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<L2LossObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,7 +17,8 @@ class L2LossCnnl : public BangKernelWithoutConfig {
|
|||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
dim.size(), dim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData);
|
||||
|
|
|
@ -8,7 +8,6 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<LayerNormObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -29,17 +28,17 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
|
|||
cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc;
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, inDims.size(),
|
||||
inDims.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
inDims.size(), inDims.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&fiterDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
fiterDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, fiterDims.size(),
|
||||
fiterDims.data()));
|
||||
fiterDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
fiterDims.size(), fiterDims.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, outDims.size(),
|
||||
outDims.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
outDims.size(), outDims.data()));
|
||||
size_t wsSize;
|
||||
cnnlGetLayerNormOpWorkspaceSize(context->cnnlHandle(), axis, inDesc,
|
||||
&wsSize);
|
||||
|
|
|
@ -7,7 +7,6 @@ class LogCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<LogObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -33,13 +32,13 @@ class LogCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
|
||||
|
|
|
@ -7,7 +7,6 @@ class LRNCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<LRNObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -20,13 +19,13 @@ class LRNCnnl : public BangKernelWithoutConfig {
|
|||
auto size = op->getSize();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
size_t extra_size;
|
||||
cnnlGetLrnExtraInputSize_v2(context->cnnlHandle(), cDesc,
|
||||
|
|
|
@ -8,7 +8,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<MatmulObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
auto input_num = op->numInputs();
|
||||
|
@ -38,25 +37,26 @@ class MatmulCnnl : public BangKernelWithoutConfig {
|
|||
int32_t transB = op->getTransB();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
|
||||
dimInputs0.size(), dimInputs0.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimInputs0.size(), dimInputs0.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
|
||||
dimInputs1.size(), dimInputs1.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimInputs1.size(), dimInputs1.data()));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
|
||||
dimOutput.size(), dimOutput.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimOutput.size(), dimOutput.data()));
|
||||
|
||||
if (input_num > 2) {
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&biasDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
biasDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dimBias.size(),
|
||||
dimBias.data()));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptor(biasDesc, CNNL_LAYOUT_ARRAY,
|
||||
cnnlDataTypeConvert(op->getDType()),
|
||||
dimBias.size(), dimBias.data()));
|
||||
}
|
||||
|
||||
cnnlMatMulDescriptor_t bmm_desc;
|
||||
|
|
|
@ -7,7 +7,6 @@ class NegTensorCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class NegTensorCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
|
||||
|
|
|
@ -7,7 +7,6 @@ class PadCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<PadObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -37,14 +36,14 @@ class PadCnnl : public BangKernelWithoutConfig {
|
|||
float paddingValue = 0.0;
|
||||
// input
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dimIn.size(),
|
||||
dimIn.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimIn.size(), dimIn.data()));
|
||||
// output
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dimOut.size(),
|
||||
dimOut.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimOut.size(), dimOut.data()));
|
||||
|
||||
cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
|
||||
paddings, &paddingValue, cDesc, cData);
|
||||
|
|
|
@ -8,7 +8,6 @@ class PoolingCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<PoolingObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
@ -20,8 +19,9 @@ class PoolingCnnl : public BangKernelWithoutConfig {
|
|||
int inArray[4] = {n, c, h, w};
|
||||
cnnlTensorDescriptor_t inDesc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, inArray));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
inArray));
|
||||
bool mode = op->getCeilMode();
|
||||
|
||||
// get maxpool descriptor
|
||||
|
@ -37,8 +37,9 @@ class PoolingCnnl : public BangKernelWithoutConfig {
|
|||
int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
|
||||
cnnlTensorDescriptor_t outDesc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, outArray));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
|
||||
outArray));
|
||||
size_t wsSize;
|
||||
cnnlGetPoolingWorkspaceSize(context->cnnlHandle(), getPoolingMode(),
|
||||
outVec[3], outVec[2], &wsSize);
|
||||
|
|
|
@ -7,7 +7,6 @@ class ReciprocalCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class ReciprocalCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData);
|
||||
|
|
|
@ -9,7 +9,6 @@ class ReduceCnnlBase : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ReduceBaseObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
@ -26,20 +25,20 @@ class ReduceCnnlBase : public BangKernelWithoutConfig {
|
|||
cnnlTensorDescriptor_t inDesc, outDesc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, bDim.size(),
|
||||
bDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
bDim.size(), bDim.data()));
|
||||
|
||||
// get reduce descriptor
|
||||
cnnlReduceDescriptor_t reduceDesc;
|
||||
checkCnnlError(cnnlCreateReduceDescriptor(&reduceDesc));
|
||||
checkCnnlError(cnnlSetReduceDescriptor_v2(
|
||||
reduceDesc, axes.data(), axes.size(), getReduceOp(),
|
||||
CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES,
|
||||
CNNL_32BIT_INDICES, 0.0));
|
||||
cnnlDataTypeConvert(op->getDType()), CNNL_NOT_PROPAGATE_NAN,
|
||||
CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0));
|
||||
|
||||
// get workspace
|
||||
size_t workspaceSize = 0;
|
||||
|
|
|
@ -14,8 +14,8 @@ class CopyBang : public BangKernelWithoutConfig {
|
|||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT8,
|
||||
dim.size() * op->getDType().getSize(), dim.data()));
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dim.size(), dim.data()));
|
||||
cnnlStatus_t stat =
|
||||
cnnlCopy(context->cnnlHandle(), aDesc, inData, aDesc, outData);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
|
@ -28,5 +28,7 @@ class CopyBang : public BangKernelWithoutConfig {
|
|||
REGISTER_KERNEL(Device::BANG, OpType::Reshape, CopyBang, "Reshape_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Flatten, CopyBang, "Flatten_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Identity, CopyBang, "Identity_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Squeeze, CopyBang, "Squeeze_BANG");
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Unsqueeze, CopyBang, "Unsqueeze_BANG");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -7,7 +7,6 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
|
||||
|
|
|
@ -42,11 +42,13 @@ class SliceCnnl : public BangKernelWithoutConfig {
|
|||
// input
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, aDim_size, aDim_array));
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim_size, aDim_array));
|
||||
// output
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, cDim_size, cDim_array));
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim_size, cDim_array));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlStridedSlice(context->cnnlHandle(), aDesc, aData, starts_array,
|
||||
|
@ -59,6 +61,6 @@ class SliceCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Slice, DataType::Float32, SliceCnnl,
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Slice, SliceCnnl,
|
||||
"Slice_cnnl_BANG_Float32");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -7,7 +7,6 @@ class SplitCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<SplitObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
int num = op->numOutputs();
|
||||
int axis = op->getDim();
|
||||
|
@ -16,15 +15,17 @@ class SplitCnnl : public BangKernelWithoutConfig {
|
|||
cnnlTensorDescriptor_t desc;
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&desc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
|
||||
desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
dim.size(), dim.data()));
|
||||
|
||||
cnnlTensorDescriptor_t descArray[num];
|
||||
for (int i = 0; i < num; ++i) {
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
|
||||
op->getOutput(i)->getDims().size(),
|
||||
op->getOutput(i)->getDims().data()));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
|
||||
cnnlDataTypeConvert(op->getDType()),
|
||||
op->getOutput(i)->getDims().size(),
|
||||
op->getOutput(i)->getDims().data()));
|
||||
}
|
||||
|
||||
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
|
|
@ -7,7 +7,6 @@ class SqrtCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class SqrtCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
|
||||
|
|
|
@ -7,7 +7,6 @@ class TransposeCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<TransposeObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -18,13 +17,13 @@ class TransposeCnnl : public BangKernelWithoutConfig {
|
|||
auto dimout = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dimin.size(),
|
||||
dimin.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimin.size(), dimin.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dimout.size(),
|
||||
dimout.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dimout.size(), dimout.data()));
|
||||
|
||||
auto permute = op->getPermute();
|
||||
cnnlTransposeDescriptor_t opDesc;
|
||||
|
@ -53,7 +52,6 @@ class DepthToSpaceCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<DepthToSpaceObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -73,13 +71,13 @@ class DepthToSpaceCnnl : public BangKernelWithoutConfig {
|
|||
auto dimout = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, reshape.size(),
|
||||
reshape.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
reshape.size(), reshape.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(
|
||||
cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
|
||||
transpose.size(), transpose.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
transpose.size(), transpose.data()));
|
||||
|
||||
cnnlTransposeDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
|
||||
|
|
|
@ -9,7 +9,6 @@ class TrigonCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -20,17 +19,18 @@ class TrigonCnnl : public BangKernelWithoutConfig {
|
|||
auto cDim = op->getOutput()->getDims();
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
|
||||
cDim.size(), cDim.data()));
|
||||
|
||||
cnnlTrigonDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
|
||||
checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));
|
||||
checkCnnlError(
|
||||
cnnlSetTrigonDescriptor_v2(opDesc, getOpType(), getPrefer()));
|
||||
|
||||
cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc,
|
||||
aDesc, aData, cDesc, cData);
|
||||
|
|
|
@ -7,7 +7,6 @@ class WhereCnnl : public BangKernelWithoutConfig {
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<WhereObj>(_op);
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
|
@ -35,21 +34,21 @@ class WhereCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||
aDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
aDim.size(), aDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, bDim.size(),
|
||||
bDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
bDim.size(), bDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_BOOL, cDim.size(),
|
||||
cDim.data()));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&dDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(dDesc, CNNL_LAYOUT_ARRAY,
|
||||
CNNL_DTYPE_FLOAT, dDim.size(),
|
||||
dDim.data()));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
dDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
|
||||
dDim.size(), dDim.data()));
|
||||
size_t wsSize;
|
||||
cnnlGetSelectV2WorkspaceSize(context->cnnlHandle(), cDesc, aDesc, bDesc,
|
||||
&wsSize);
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
#include "operators/reshape.h"
|
||||
#include "core/kernel.h"
|
||||
#include "operators/squeeze.h"
|
||||
#include "operators/unsqueeze.h"
|
||||
|
||||
namespace infini {
|
||||
class NaiveIdentity : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
auto size = _op->getInputs()[0]->getBytes();
|
||||
void *inptr = _op->getInputs(0)->getRawDataPtr<void *>();
|
||||
void *outptr = _op->getOutput()->getRawDataPtr<void *>();
|
||||
|
||||
std::memcpy(outptr, inptr, size);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Reshape, NaiveIdentity,
|
||||
"ReshapeNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Identity, NaiveIdentity,
|
||||
"IdentityNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Unsqueeze, NaiveIdentity,
|
||||
"UnsqueezeNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Squeeze, NaiveIdentity,
|
||||
"SqueezeNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Flatten, NaiveIdentity,
|
||||
"FlattenNaive_CPU");
|
||||
|
||||
} // namespace infini
|
|
@ -47,6 +47,10 @@ class NativeUnary : public CpuKernelWithoutConfig {
|
|||
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
|
||||
}
|
||||
|
||||
template <typename T> static T siluCompute(T val) {
|
||||
return val / (1 + pow(E_CONSTANT, -val));
|
||||
}
|
||||
|
||||
template <typename T> static T erfCompute(T val) { return std::erf(val); }
|
||||
|
||||
template <typename T> static T aCosCompute(T val) { return std::acos(val); }
|
||||
|
@ -84,6 +88,9 @@ class NativeUnary : public CpuKernelWithoutConfig {
|
|||
case OpType::Gelu:
|
||||
_doCompute = geluCompute<T>;
|
||||
break;
|
||||
case OpType::Silu:
|
||||
_doCompute = siluCompute<T>;
|
||||
break;
|
||||
case OpType::Sigmoid:
|
||||
_doCompute = sigmoidCompute<T>;
|
||||
break;
|
||||
|
@ -138,6 +145,9 @@ class NativeUnary : public CpuKernelWithoutConfig {
|
|||
case OpType::Atanh:
|
||||
_doCompute = aTanhCompute<T>;
|
||||
break;
|
||||
case OpType::Acosh:
|
||||
_doCompute = aCoshCompute<T>;
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
@ -289,6 +299,7 @@ class Log : public CpuKernelWithoutConfig {
|
|||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Relu, NativeUnary, "reluNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Gelu, NativeUnary, "geluNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Silu, NativeUnary, "siluNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, NativeUnary, "sigmoidNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::HardSigmoid, NativeUnary,
|
||||
"hardSigmoidNaive_CPU");
|
||||
|
|
|
@ -28,9 +28,8 @@ class AllReduceNCCL : public CudaKernelWithoutConfig {
|
|||
ncclComm_t comm =
|
||||
dynamic_cast<NcclCommunicatorObj &>(context->getCommunicator())
|
||||
.getNcclComm();
|
||||
// TODO: Using default stream 0 for now.
|
||||
checkNcclError(
|
||||
ncclAllReduce(input, output, count, ncclType, getRedOp(), comm, 0));
|
||||
checkNcclError(ncclAllReduce(input, output, count, ncclType, getRedOp(),
|
||||
comm, CUDAStream::getCurrentStream()));
|
||||
}
|
||||
|
||||
virtual ncclRedOp_t getRedOp() const = 0;
|
||||
|
|
|
@ -21,7 +21,7 @@ class AttentionKVCacheCompute {
|
|||
public:
|
||||
void do_compute(Tensor input_k_cache, Tensor input_v_cache, Tensor input_q,
|
||||
Tensor input_k, Tensor input_v, Tensor position_id,
|
||||
Tensor output_matmul) const {
|
||||
Tensor output_matmul, CudaPtr p_workspace) const {
|
||||
AttentionKVCacheMetadata metadata;
|
||||
initAttentionKVCacheMetadata(metadata, input_v_cache);
|
||||
|
||||
|
@ -32,7 +32,8 @@ class AttentionKVCacheCompute {
|
|||
input_v->getRawDataPtr<float *>(),
|
||||
position_id->getRawDataPtr<int *>(),
|
||||
output_matmul->getRawDataPtr<float *>(),
|
||||
metadata);
|
||||
metadata, (float *)p_workspace,
|
||||
(float *)(p_workspace + (1ll << 30)));
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -41,10 +42,14 @@ class AttentionKVCacheCuda : private AttentionKVCacheCompute,
|
|||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
IT_ASSERT(_op->getDType() == DataType::Float32);
|
||||
|
||||
size_t workspaceSize = 2ll << 30;
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
CudaPtr idxWsData = context->getWorkspace(workspaceSize);
|
||||
do_compute(_op->getInputs()[0], _op->getInputs()[1],
|
||||
_op->getInputs()[2], _op->getInputs()[3],
|
||||
_op->getInputs()[4], _op->getInputs()[5],
|
||||
_op->getOutputs()[0]);
|
||||
_op->getOutputs()[0], idxWsData);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -2,127 +2,170 @@
|
|||
#include "cuda/cuda_attention_kvcache.h"
|
||||
#define WARP_SIZE 32
|
||||
#define BLOCKSIZE WARP_SIZE
|
||||
#define SEQ_UNIT 64
|
||||
#define SEQ_UNIT 16
|
||||
|
||||
__global__ void _attention_kvcache_kernel(float* input_k_cache,
|
||||
// ASSUME SEQ_LEN OF Q IS 1
|
||||
__global__ void _attention_kvcache_kernel_128_1(float* input_k_cache,
|
||||
float* input_v_cache,
|
||||
float* input_q,
|
||||
float* input_k,
|
||||
float* input_v,
|
||||
int* position_id,
|
||||
float* output_matmul,
|
||||
AttentionKVCacheMetadata compMeta) {
|
||||
AttentionKVCacheMetadata compMeta,
|
||||
float* output_O_temp,
|
||||
float* output_sum_temp) {
|
||||
int seq_length = position_id[0] + 1;
|
||||
int stride = (seq_length + SEQ_UNIT - 1) / SEQ_UNIT;
|
||||
if(blockIdx.y >= stride)
|
||||
return;
|
||||
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
int group_id = threadIdx.x / WARP_SIZE;
|
||||
int parallel_idx = blockIdx.x * (blockDim.x / WARP_SIZE) + group_id;
|
||||
int idx_seq = blockIdx.y * SEQ_UNIT;
|
||||
|
||||
if(parallel_idx >= compMeta.dimSize[0] * compMeta.dimSize[1])
|
||||
return;
|
||||
|
||||
float ptr_V[SEQ_UNIT*2];
|
||||
float ptr_K[SEQ_UNIT*2];
|
||||
float ptr_Q[2];
|
||||
float ptr_P[SEQ_UNIT];
|
||||
float ptr_V[SEQ_UNIT*4]; // V
|
||||
float ptr_K[SEQ_UNIT*4]; // K
|
||||
float ptr_Q[4]; // Q
|
||||
float ptr_P[SEQ_UNIT] = {0};
|
||||
|
||||
float ptr_O[2];
|
||||
float ptr_max[1];
|
||||
float ptr_sum[1];
|
||||
float ptr_O[4] = {0};
|
||||
float ptr_sum[1] = {0};
|
||||
|
||||
float ptr_max_last[1];
|
||||
float ptr_sum_last[1];
|
||||
float ptr_O_last[2];
|
||||
// readin Q
|
||||
(float4 &)ptr_Q[0] = (float4 &)input_q[(lane_id * 4) + (parallel_idx * 128)];
|
||||
int common_idx = (lane_id * 4) + (parallel_idx * compMeta.stride[1]);
|
||||
|
||||
(float2 &)ptr_Q[0] = (float2 &)input_q[(lane_id * 2) + (parallel_idx * 64)];
|
||||
|
||||
int SEQ_LENGTH = position_id[0] + 1;
|
||||
|
||||
int common_idx = (lane_id * 2) + (parallel_idx * compMeta.stride[1]);
|
||||
|
||||
|
||||
for (int idx_seq = 0; idx_seq < SEQ_LENGTH; idx_seq += SEQ_UNIT){
|
||||
ptr_max_last[0] = ptr_max[0];
|
||||
ptr_sum_last[0] = ptr_sum[0];
|
||||
(float2 &)ptr_O_last[0] = (float2 &)ptr_O[0];
|
||||
// Q*K
|
||||
#pragma unroll
|
||||
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
|
||||
if(idx_SEQ_UNIT + idx_seq < seq_length - 1){
|
||||
(float4 &)ptr_K[idx_SEQ_UNIT * 4]
|
||||
= (float4 &) input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
|
||||
}
|
||||
else{
|
||||
(float4 &)ptr_K[idx_SEQ_UNIT * 4]
|
||||
= (float4 &) input_k[((lane_id * 4) + parallel_idx * compMeta.stride[2])];
|
||||
(float4 &)input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
|
||||
(float4 &)ptr_K[idx_SEQ_UNIT * 4];
|
||||
}
|
||||
|
||||
|
||||
#pragma unroll
|
||||
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
|
||||
if(idx_SEQ_UNIT + idx_seq < SEQ_LENGTH - 1){
|
||||
(float2 &)ptr_K[idx_SEQ_UNIT * 2]
|
||||
= (float2 &) input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
|
||||
}
|
||||
else{
|
||||
(float2 &)ptr_K[idx_SEQ_UNIT * 2]
|
||||
= (float2 &) input_k[((lane_id * 2) + parallel_idx * compMeta.stride[2])];
|
||||
(float2 &)input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
|
||||
(float2 &)ptr_K[idx_SEQ_UNIT * 2];
|
||||
}
|
||||
ptr_K[idx_SEQ_UNIT * 2] = ptr_Q[0] * ptr_K[idx_SEQ_UNIT * 2];
|
||||
ptr_K[idx_SEQ_UNIT * 2 + 1] = ptr_Q[1] * ptr_K[idx_SEQ_UNIT * 2 + 1];
|
||||
|
||||
for (int i = 0; i < 4; i ++){
|
||||
ptr_K[idx_SEQ_UNIT * 4 + i] = ptr_Q[i] * ptr_K[idx_SEQ_UNIT * 4 + i];
|
||||
#pragma unroll
|
||||
for (int offset = 16; offset > 0; offset /= 2) {
|
||||
ptr_K[idx_SEQ_UNIT * 2] += __shfl_down_sync(0xffffffff, ptr_K[idx_SEQ_UNIT * 2], offset);
|
||||
ptr_K[idx_SEQ_UNIT * 4 + i] += __shfl_down_sync(0xffffffff, ptr_K[idx_SEQ_UNIT * 4 + i], offset);
|
||||
}
|
||||
ptr_P[idx_SEQ_UNIT] = ptr_K[idx_SEQ_UNIT * 2];
|
||||
#pragma unroll
|
||||
for (int offset = 16; offset > 0; offset /= 2){
|
||||
ptr_K[((idx_SEQ_UNIT * 2) + 1)] += __shfl_down_sync(0xffffffff, ptr_K[((idx_SEQ_UNIT * 2) + 1)], offset);
|
||||
}
|
||||
ptr_P[idx_SEQ_UNIT] += ptr_K[((idx_SEQ_UNIT * 2) + 1)];
|
||||
ptr_P[idx_SEQ_UNIT] += ptr_K[idx_SEQ_UNIT * 4 + i];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
|
||||
ptr_P[idx_SEQ_UNIT] = __shfl_sync(0xffffffff, ptr_P[idx_SEQ_UNIT], 0);
|
||||
ptr_P[idx_SEQ_UNIT] /= 8;
|
||||
ptr_max[0] = (idx_SEQ_UNIT == 0) ? ptr_P[0] : max(ptr_max[0], ptr_P[idx_SEQ_UNIT]);
|
||||
}
|
||||
ptr_max[0] = (idx_seq == 0) ? ptr_max[0] : max(ptr_max[0], ptr_max_last[0]);
|
||||
|
||||
ptr_sum[0] = 0;
|
||||
#pragma unroll
|
||||
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
|
||||
ptr_P[idx_SEQ_UNIT] = expf(ptr_P[idx_SEQ_UNIT] - ptr_max[0]);
|
||||
ptr_sum[0] += ptr_P[idx_SEQ_UNIT];
|
||||
}
|
||||
ptr_sum[0] = (idx_seq == 0) ? ptr_sum[0] : expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] + ptr_sum[0];
|
||||
|
||||
ptr_O[0] = 0;
|
||||
ptr_O[1] = 0;
|
||||
#pragma unroll
|
||||
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
|
||||
if(idx_SEQ_UNIT + idx_seq < SEQ_LENGTH - 1){
|
||||
(float2 &)ptr_V[idx_SEQ_UNIT * 2]
|
||||
= (float2 &) input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
|
||||
}
|
||||
else{
|
||||
(float2 &)ptr_V[idx_SEQ_UNIT * 2]
|
||||
= (float2 &) input_v[((lane_id * 2) + parallel_idx * compMeta.stride[2])];
|
||||
(float2 &)input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
|
||||
(float2 &)ptr_V[idx_SEQ_UNIT * 2];
|
||||
}
|
||||
|
||||
ptr_P[idx_SEQ_UNIT] /= ptr_sum[0];
|
||||
|
||||
ptr_O[0] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 2)], ptr_O[0]);
|
||||
ptr_O[1] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 2) + 1], ptr_O[1]);
|
||||
}
|
||||
ptr_O[0] = (idx_seq == 0) ? ptr_O[0] : ptr_O[0] + ptr_O_last[0] * expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] / ptr_sum[0];
|
||||
ptr_O[1] = (idx_seq == 0) ? ptr_O[1] : ptr_O[1] + ptr_O_last[1] * expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] / ptr_sum[0];
|
||||
}
|
||||
(float2 &)output_matmul[(lane_id * 2) + (parallel_idx * compMeta.dimSize[3])] = (float2 &)ptr_O[0];
|
||||
|
||||
// div sqrt(d)
|
||||
#pragma unroll
|
||||
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
|
||||
ptr_P[idx_SEQ_UNIT] = __shfl_sync(0xffffffff, ptr_P[idx_SEQ_UNIT], 0);
|
||||
ptr_P[idx_SEQ_UNIT] /= sqrt(128.0);
|
||||
}
|
||||
|
||||
// softmax
|
||||
#pragma unroll
|
||||
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
|
||||
ptr_P[idx_SEQ_UNIT] = expf(ptr_P[idx_SEQ_UNIT]);
|
||||
ptr_sum[0] += ptr_P[idx_SEQ_UNIT];
|
||||
}
|
||||
|
||||
// * V
|
||||
#pragma unroll
|
||||
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
|
||||
if(idx_SEQ_UNIT + idx_seq < seq_length - 1){
|
||||
(float4 &)ptr_V[idx_SEQ_UNIT * 4]
|
||||
= (float4 &) input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
|
||||
}
|
||||
else{
|
||||
(float4 &)ptr_V[idx_SEQ_UNIT * 4]
|
||||
= (float4 &) input_v[((lane_id * 4) + parallel_idx * compMeta.stride[2])];
|
||||
(float4 &)input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])]
|
||||
= (float4 &)ptr_V[idx_SEQ_UNIT * 4];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i ++)
|
||||
ptr_O[i] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 4 + i)], ptr_O[i]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i ++)
|
||||
ptr_O[i] /= ptr_sum[0];
|
||||
|
||||
(float4 &)output_O_temp[(lane_id * 4) + (blockIdx.y * compMeta.dimSize[3]) + (parallel_idx * compMeta.dimSize[3] * stride)] = (float4 &)ptr_O[0];
|
||||
if(lane_id == 0){
|
||||
output_sum_temp[blockIdx.y + parallel_idx * stride] = ptr_sum[0];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__global__ void _attention_kvcache_kernel_128_2(int* position_id,
|
||||
float* output_matmul,
|
||||
AttentionKVCacheMetadata compMeta,
|
||||
float* output_O_temp,
|
||||
float* output_sum_temp) {
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
int group_id = threadIdx.x / WARP_SIZE;
|
||||
int parallel_idx = blockIdx.x * (blockDim.x / WARP_SIZE) + group_id;
|
||||
|
||||
float ptr_O[4] = {0};
|
||||
float ptr_O_sum[4] = {0};
|
||||
float ptr_sum = 0;
|
||||
float ptr_sum_temp;
|
||||
int size = (position_id[0] + SEQ_UNIT) / SEQ_UNIT;
|
||||
|
||||
#pragma unroll
|
||||
for(int i = 0; i < size; i ++){
|
||||
(float4 &)ptr_O[0]
|
||||
= (float4 &)output_O_temp[(lane_id * 4) + (i * compMeta.dimSize[3]) + parallel_idx * compMeta.dimSize[3] * size];
|
||||
ptr_sum_temp = output_sum_temp[i + parallel_idx * size];
|
||||
|
||||
#pragma unroll
|
||||
for(int k = 0; k < 4; k ++)
|
||||
ptr_O_sum[k] += ptr_O[k] * ptr_sum_temp;
|
||||
ptr_sum += ptr_sum_temp;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for(int k = 0; k < 4; k ++)
|
||||
ptr_O_sum[k] = ptr_O_sum[k] / ptr_sum;
|
||||
|
||||
(float4 &)output_matmul[(lane_id * 4) + (parallel_idx * compMeta.dimSize[3])] = (float4 &)ptr_O_sum[0];
|
||||
|
||||
}
|
||||
|
||||
|
||||
namespace infini {
|
||||
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache, float *input_q, float *input_k,
|
||||
float *input_v, int *position_id, float *output_matmul,
|
||||
const AttentionKVCacheMetadata &compMeta) {
|
||||
IT_ASSERT(compMeta.dimSize[3] == 64);
|
||||
dim3 gridDim(compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), 1);
|
||||
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
|
||||
float *input_q, float *input_k,
|
||||
float *input_v, int *position_id, float *output_matmul,
|
||||
const AttentionKVCacheMetadata &compMeta,
|
||||
float *output_O_temp, float *output_sum_temp) {
|
||||
IT_ASSERT(compMeta.dimSize[3] == 128);
|
||||
|
||||
int gridsize_y = (compMeta.dimSize[2] - 1 + SEQ_UNIT) / SEQ_UNIT;
|
||||
dim3 gridDim(compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), gridsize_y);
|
||||
dim3 blockDim(BLOCKSIZE, 1);
|
||||
|
||||
_attention_kvcache_kernel<<<gridDim, blockDim>>>(
|
||||
input_k_cache, input_v_cache, input_q, input_k, input_v, position_id, output_matmul, compMeta);
|
||||
_attention_kvcache_kernel_128_1
|
||||
<<<gridDim, blockDim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input_k_cache, input_v_cache, input_q, input_k, input_v, position_id,
|
||||
compMeta, output_O_temp, output_sum_temp);
|
||||
|
||||
_attention_kvcache_kernel_128_2
|
||||
<<<compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), WARP_SIZE,
|
||||
0, CUDAStream::getCurrentStream()>>>
|
||||
(position_id, output_matmul, compMeta, output_O_temp, output_sum_temp);
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -25,8 +25,9 @@ void clip_kernel(float *input, float *output, int num, float minValue,
|
|||
float maxValue) {
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
|
||||
maxValue);
|
||||
_clip_kernel
|
||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
input, output, num, minValue, maxValue);
|
||||
}
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -115,6 +115,20 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
|
|||
auto a_dim = op->getInputs(0)->getDims();
|
||||
auto b_dim = op->getInputs(1)->getDims();
|
||||
auto c_dim = op->getOutput()->getDims();
|
||||
const int dType = _op->getDType().getIndex();
|
||||
|
||||
// Use optimized kernel if b is constant
|
||||
if (b_dim.size() == 0) {
|
||||
if (op->getOpType() == OpType::Div) {
|
||||
div_const_kernel(dType, aData, bData, cData,
|
||||
op->getOutput()->size());
|
||||
return;
|
||||
} else if (op->getOpType() == OpType::Pow) {
|
||||
pow_const_kernel(dType, aData, bData, cData,
|
||||
op->getOutput()->size());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4)
|
||||
IT_TODO_HALT();
|
||||
|
@ -127,7 +141,6 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
|
|||
std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
|
||||
std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
|
||||
|
||||
const int dType = _op->getDType().getIndex();
|
||||
if (op->getOpType() == OpType::Div) {
|
||||
div_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
|
||||
b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
|
||||
|
|
|
@ -131,8 +131,9 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
|
|||
}
|
||||
|
||||
#define CASE(OP, T) \
|
||||
_##OP##_kernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
|
||||
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
_##OP##_kernel<DT_CUDA<T>::t> \
|
||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
|
||||
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
|
||||
#define SWITCH_DTYPE(OP, DTYPE) \
|
||||
switch (DTYPE) { \
|
||||
|
@ -176,7 +177,92 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
|
|||
IT_TODO_HALT(); \
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void _div_const_kernel(void const *__restrict__ x,
|
||||
void const *__restrict__ y,
|
||||
void *__restrict__ z, const size_t n) {
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < n) {
|
||||
((T *)z)[tid] = ((T *)x)[tid] / *((T *)y);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void _pow_const_kernel(void const *__restrict__ x,
|
||||
void const *__restrict__ y,
|
||||
void *__restrict__ z, const size_t n) {
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < n) {
|
||||
((T *)z)[tid] = pow(((T *)x)[tid], *((T *)y));
|
||||
}
|
||||
}
|
||||
template <>
|
||||
__global__ void _pow_const_kernel<half>(void const *__restrict__ x,
|
||||
void const *__restrict__ y,
|
||||
void *__restrict__ z, const size_t n) {
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < n) {
|
||||
((half *)z)[tid] = pow(((float)((half *)x)[tid]), *((half *)y));
|
||||
}
|
||||
}
|
||||
|
||||
#define CASE_CONST(OP, T) \
|
||||
_##OP##_const_kernel<DT_CUDA<T>::t> \
|
||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(a, b, c, \
|
||||
n);
|
||||
|
||||
#define SWITCH_DTYPE_CONST(OP, DTYPE) \
|
||||
switch (DTYPE) { \
|
||||
case 1: \
|
||||
CASE_CONST(OP, 1) \
|
||||
break; \
|
||||
case 2: \
|
||||
CASE_CONST(OP, 2) \
|
||||
break; \
|
||||
case 3: \
|
||||
CASE_CONST(OP, 3) \
|
||||
break; \
|
||||
case 4: \
|
||||
CASE_CONST(OP, 4) \
|
||||
break; \
|
||||
case 5: \
|
||||
CASE_CONST(OP, 5) \
|
||||
break; \
|
||||
case 6: \
|
||||
CASE_CONST(OP, 6) \
|
||||
break; \
|
||||
case 7: \
|
||||
CASE_CONST(OP, 7) \
|
||||
break; \
|
||||
case 10: \
|
||||
CASE_CONST(OP, 10) \
|
||||
break; \
|
||||
case 11: \
|
||||
CASE_CONST(OP, 11) \
|
||||
break; \
|
||||
case 12: \
|
||||
CASE_CONST(OP, 12) \
|
||||
break; \
|
||||
case 13: \
|
||||
CASE_CONST(OP, 13) \
|
||||
break; \
|
||||
default: \
|
||||
IT_TODO_HALT(); \
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n) {
|
||||
size_t blocksize = block_work_size();
|
||||
size_t gridsize = (n + block_work_size() - 1) / block_work_size();
|
||||
SWITCH_DTYPE_CONST(div, dType);
|
||||
}
|
||||
|
||||
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n) {
|
||||
size_t blocksize = block_work_size();
|
||||
size_t gridsize = (n + block_work_size() - 1) / block_work_size();
|
||||
SWITCH_DTYPE_CONST(pow, dType);
|
||||
}
|
||||
|
||||
void div_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
|
||||
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
|
@ -202,11 +288,13 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
|
|||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
if (dType == 1) {
|
||||
_pow_kernel<float><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0,
|
||||
b1, b2, b3, c0, c1, c2, c3);
|
||||
_pow_kernel<float>
|
||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
} else if (dType == 3) {
|
||||
_pow_kernel<int8_t><<<gridsize, blocksize>>>(
|
||||
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
_pow_kernel<int8_t>
|
||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
} else if (dType == 10) {
|
||||
int a_size = a0 * a1 * a2 * a3;
|
||||
int b_size = b0 * b1 * b2 * b3;
|
||||
|
@ -220,9 +308,10 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
|
|||
for (int i = 0; i < b_size; ++i) {
|
||||
b_float[i] = __half2float(((half *)b)[i]);
|
||||
}
|
||||
_pow_kernel<float><<<gridsize, blocksize>>>(
|
||||
a_float.data(), b_float.data(), c_float.data(), a0, a1, a2, a3, b0,
|
||||
b1, b2, b3, c0, c1, c2, c3);
|
||||
_pow_kernel<float>
|
||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
a_float.data(), b_float.data(), c_float.data(), a0, a1, a2, a3,
|
||||
b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
for (int i = 0; i < c_size; ++i) {
|
||||
((half *)c)[i] = __float2half(c_float[i]);
|
||||
}
|
||||
|
|
|
@ -39,10 +39,19 @@ __global__ void _expandKernel(void *input, void *output, int nDims,
|
|||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static __global__ void _expandRowKernel(void *__restrict__ dst,
|
||||
void const *__restrict__ src) {
|
||||
auto da = gridDim.x, db = blockDim.y, dx = blockDim.x, n = blockIdx.y,
|
||||
a = blockIdx.x, b = threadIdx.y, x = threadIdx.x;
|
||||
auto i = ((n * da + a) * db + b) * dx + x, j = (a * db + b) * dx + x;
|
||||
reinterpret_cast<T *>(dst)[i] = reinterpret_cast<T const *>(src)[j];
|
||||
}
|
||||
namespace infini {
|
||||
|
||||
#define CASE(T) \
|
||||
_expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
|
||||
_expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize, \
|
||||
0, CUDAStream::getCurrentStream()>>>( \
|
||||
input, output, nDims, outputsize, inputShape, outputShape);
|
||||
|
||||
#define SWITCH_DTYPE(DTYPE) \
|
||||
|
@ -95,4 +104,67 @@ void expandKernel(int dType, void *input, void *output, int nDims,
|
|||
SWITCH_DTYPE(dType)
|
||||
}
|
||||
|
||||
#define CASE_ROW(T) \
|
||||
_expandRowKernel<float> \
|
||||
<<<grid, block, 0, CUDAStream::getCurrentStream()>>>(output, input);
|
||||
|
||||
#define SWITCH_DTYPE_ROW(DTYPE) \
|
||||
switch (DTYPE) { \
|
||||
case 1: \
|
||||
CASE_ROW(1) \
|
||||
break; \
|
||||
case 2: \
|
||||
CASE_ROW(2) \
|
||||
break; \
|
||||
case 3: \
|
||||
CASE_ROW(3) \
|
||||
break; \
|
||||
case 4: \
|
||||
CASE_ROW(4) \
|
||||
break; \
|
||||
case 5: \
|
||||
CASE_ROW(5) \
|
||||
break; \
|
||||
case 6: \
|
||||
CASE_ROW(6) \
|
||||
break; \
|
||||
case 7: \
|
||||
CASE_ROW(7) \
|
||||
break; \
|
||||
case 10: \
|
||||
CASE_ROW(10) \
|
||||
break; \
|
||||
case 11: \
|
||||
CASE_ROW(11) \
|
||||
break; \
|
||||
case 12: \
|
||||
CASE_ROW(12) \
|
||||
break; \
|
||||
case 13: \
|
||||
CASE_ROW(13) \
|
||||
break; \
|
||||
case 16: \
|
||||
CASE_ROW(16) \
|
||||
break; \
|
||||
default: \
|
||||
IT_TODO_HALT(); \
|
||||
}
|
||||
|
||||
// Optimization for expanding a row vector. The row length must be a multiple of 32
|
||||
void expandRowKernel(int dType, void *input, void *output, int n_rows,
|
||||
int row_len) {
|
||||
// Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32
|
||||
// input: 1 x (a x b x 32 x sizeT)
|
||||
// output: n_rows x (a x b x 32 x sizeT)
|
||||
// grid: n_rows x a
|
||||
// block: b x 32
|
||||
auto c = row_len / 32, b = c;
|
||||
if (b > 32) {
|
||||
for (b = 32; c % b != 0; --b);
|
||||
}
|
||||
auto a = c / b;
|
||||
dim3 grid(a, n_rows), block(32, b);
|
||||
SWITCH_DTYPE_ROW(dType)
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -19,7 +19,8 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
|
|||
int oSize) {
|
||||
int blocksize = 32 * 16;
|
||||
int gridsize = (oSize + blocksize - 1) / blocksize;
|
||||
_extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
|
||||
oSize);
|
||||
_extend_kernel
|
||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
in, out, blockSize, blockSizeOuter, oSize);
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
|
@ -45,9 +45,12 @@ void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num) {
|
|||
int gridSize = (num + blockSize - 1) / blockSize;
|
||||
if (metaData.indexType == DataType::Int64) {
|
||||
_gather_kernel<T, int64_t>
|
||||
<<<gridSize, blockSize>>>(in, out, metaData, num);
|
||||
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>
|
||||
(in, out, metaData, num);
|
||||
} else {
|
||||
_gather_kernel<T, int><<<gridSize, blockSize>>>(in, out, metaData, num);
|
||||
_gather_kernel<T, int>
|
||||
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>
|
||||
(in, out, metaData, num);
|
||||
}
|
||||
}
|
||||
template void gather_kernel<float>(float *in, float *out,
|
||||
|
|
|
@ -40,22 +40,26 @@ void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
|
|||
int gridSize = (num + blockSize - 1) / blockSize;
|
||||
if (metaData.dataType == DataType::Float32 &&
|
||||
metaData.indexType == DataType::Int64) {
|
||||
_gather_elements_kernel<float, int64_t><<<gridSize, blockSize>>>(
|
||||
_gather_elements_kernel<float, int64_t>
|
||||
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
reinterpret_cast<float *>(in), reinterpret_cast<float *>(out),
|
||||
metaData, num);
|
||||
} else if (metaData.dataType == DataType::Int32 &&
|
||||
metaData.indexType == DataType::Int64) {
|
||||
_gather_elements_kernel<int, int64_t><<<gridSize, blockSize>>>(
|
||||
_gather_elements_kernel<int, int64_t>
|
||||
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
reinterpret_cast<int *>(in), reinterpret_cast<int *>(out), metaData,
|
||||
num);
|
||||
} else if (metaData.dataType == DataType::Float32 &&
|
||||
metaData.indexType == DataType::Int32) {
|
||||
_gather_elements_kernel<float, int><<<gridSize, blockSize>>>(
|
||||
_gather_elements_kernel<float, int>
|
||||
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
reinterpret_cast<float *>(in), reinterpret_cast<float *>(out),
|
||||
metaData, num);
|
||||
} else if (metaData.dataType == DataType::Int32 &&
|
||||
metaData.indexType == DataType::Int32) {
|
||||
_gather_elements_kernel<int, int><<<gridSize, blockSize>>>(
|
||||
_gather_elements_kernel<int, int>
|
||||
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
|
||||
reinterpret_cast<int *>(in), reinterpret_cast<int *>(out), metaData,
|
||||
num);
|
||||
} else {
|
||||
|
|
|
@ -344,8 +344,8 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockLaynormKernel<float, 1024>
|
||||
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
|
||||
eps, scaleSize, bias, biasSize);
|
||||
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, bias, biasSize);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
|
@ -353,9 +353,10 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
warpLaynormKernel<float, 32, 32>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
|
@ -363,8 +364,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
warpLaynormKernel<float, 16, 64>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
|
@ -373,8 +375,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
warpLaynormKernel<float, 8, 128>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
|
@ -383,8 +386,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
warpLaynormKernel<float, 4, 256>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
}
|
||||
}
|
||||
|
@ -396,8 +400,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
if (dimsize > 1024) {
|
||||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockLaynormKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize);
|
||||
blockLaynormKernel<float, 1024>
|
||||
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
|
@ -405,8 +410,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
warpLaynormKernel<float, 32, 32>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
|
@ -414,8 +420,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
warpLaynormKernel<float, 16, 64>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
int BLOCK_DIM_y = 128;
|
||||
|
@ -423,8 +430,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
warpLaynormKernel<float, 8, 128>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
int BLOCK_DIM_y = 256;
|
||||
|
@ -432,8 +440,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
warpLaynormKernel<float, 4, 256>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
}
|
||||
}
|
||||
//-----------------
|
||||
|
@ -445,8 +454,8 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockLaynormKernel<half, 1024>
|
||||
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
|
||||
eps, scaleSize, bias, biasSize);
|
||||
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, bias, biasSize);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
|
@ -454,8 +463,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
warpLaynormKernel<half, 32, 32>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
|
@ -464,8 +474,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
warpLaynormKernel<half, 16, 64>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
|
@ -474,8 +485,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
warpLaynormKernel<half, 8, 128>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
|
@ -484,8 +496,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
warpLaynormKernel<half, 4, 256>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
}
|
||||
}
|
||||
|
@ -497,8 +510,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
if (dimsize > 1024) {
|
||||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockLaynormKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize);
|
||||
blockLaynormKernel<half, 1024>
|
||||
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
|
@ -506,8 +520,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
warpLaynormKernel<half, 32, 32>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
|
@ -515,8 +530,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
warpLaynormKernel<half, 16, 64>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
int BLOCK_DIM_y = 128;
|
||||
|
@ -524,8 +540,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
warpLaynormKernel<half, 8, 128>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
int BLOCK_DIM_y = 256;
|
||||
|
@ -533,8 +550,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
warpLaynormKernel<half, 4, 256>
|
||||
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
|
||||
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
}
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue