Merge branch 'ascend' of github.com:InfiniTensor/InfiniTensor into ascend

OdinaryWord 2024-04-25 17:28:18 +08:00
commit a765cd2a3d
155 changed files with 4262 additions and 1098 deletions


@ -20,7 +20,6 @@ endif()
include(CMakeDependentOption)
project(InfiniTensor C CXX)
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
@ -55,11 +54,13 @@ endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
add_compile_options(-Wno-error=unused-variable)
find_package(
Python
COMPONENTS Interpreter Development
REQUIRED)
# OpenMP
find_package(OpenMP)
if(OpenMP_C_FOUND)
@ -252,7 +253,6 @@ if(USE_BANG)
find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
@ -269,12 +269,13 @@ if(USE_BANG)
# BangC Kernels
################################################################################
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
if (BUILD_DIST)
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
add_compile_definitions(INFINI_USE_CNCL=1)
else()
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
endif()
endif()
@ -289,10 +290,11 @@ if(USE_KUNLUN)
endif()
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
include_directories("${KUNLUN_HOME}/XTDK/include/")
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64")
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/XTDK/shlib")
include_directories("${KUNLUN_HOME}/include/")
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
@ -301,7 +303,16 @@ if(USE_KUNLUN)
else()
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
endif()
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
if (BUILD_DIST)
message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(XCCL REQUIRED)
add_compile_definitions(INFINI_USE_XCCL=1)
target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
endif()
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
endif()
@ -376,6 +387,7 @@ if(BUILD_TEST)
endif()
if (USE_KUNLUN)
build_test(test/kernels/kunlun/*.cc)
build_test(test/kunlun/*.cc)
endif()
if (USE_ASCEND)
build_test(test/kernels/ascend/*.cc)
@ -394,4 +406,4 @@ if(BUILD_TEST)
add_executable(nnet_reader test/nnet/readlog.cc)
target_link_libraries(nnet_reader InfiniTensor)
endif()
endif()
endif()


@ -8,7 +8,9 @@ ASCEND ?= OFF
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
DIST ?= OFF
NNET ?= OFF
DIST ?= OFF
FORMAT_ORIGIN ?=
# Docker build options
DOCKER_NAME ?= infinitensor
@ -31,6 +33,7 @@ CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
CMAKE_OPT += -DUSE_ASCEND=$(ASCEND)
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
CMAKE_OPT += -DBUILD_TEST=$(TEST)
CMAKE_OPT += -DBUILD_DIST=$(DIST)
CMAKE_OPT += -DBUILD_NNET=$(NNET)
ifeq ($(INTELCPU), ON)

cmake/FindXCCL.cmake Normal file

@ -0,0 +1,27 @@
# Find the xccl libraries
set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
set(XCCL_LIB_DIR $ENV{KUNLUN_HOME} CACHE PATH "Folder contains KUNLUN XCCL libraries")
list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
find_path(XCCL_INCLUDE_DIRS # ${XCCL_INCLUDE_DIR}
NAMES xpu/bkcl.h
HINTS XCCL_INCLUDE_DIR)
find_library(XCCL_LIBRARIES # ${XCCL_LIB_DIR}
NAMES lib64/libbkcl.so
HINTS XCCL_LIB_DIR)
message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
if (XCCL_FOUND)
set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
message (STATUS "Determining XCCL version from ${XCCL_HEADER_FILE}...")
list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
endif()


@ -2,6 +2,7 @@
## Table of Contents
- [Environment Support](#环境支持)
- [Supported Neural Networks](#神经网络支持)
- [Technical Support](#技术支持)
@ -19,10 +20,10 @@
The neural network models verified so far are:
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
## Technical Support


@ -3,9 +3,10 @@
## Table of Contents
- [Usage](#使用方法)
- [Python Frontend Guide](#python-前端应用指南)
- [Importing ONNX Models](#导入-onnx-模型)
- [Exporting ONNX Models](#导出-onnx-模型)
- [Python Frontend Guide](#python-前端应用指南)
- [Importing ONNX Models](#导入-onnx-模型)
- [Optimization](#优化)
- [Exporting ONNX Models](#导出-onnx-模型)
- [Running Inference](#执行推理)
- [Sample Code](#样例代码)
- [Technical Support](#技术支持)
@ -13,7 +14,7 @@
## Usage
The project management functions are provided in the [Makefile](Makefile) and support the following:
The project management functions are provided in the [Makefile](../Makefile) and support the following:
- Build the project: `make`/`make build`
- Clean generated files: `make clean`
@ -39,10 +40,10 @@
Supported models:
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
```python
import onnx
@ -97,7 +98,7 @@ for name, tensor in stub.inputs.items():
print(name, tensor.shape(), tensor)
```
For [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx), this prints:
For [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx), this prints:
```plaintext
data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
@ -138,7 +139,7 @@ for name, tensor in stub.outputs.items():
### Sample Code
You can refer to the sample code in [./example/Resnet/resnet.py](./example/ResNet/resnet.py) to learn more, and try running it. In that file we build a ResNet network with PyTorch. You can check the script's usage with:
You can refer to the sample code in [resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py) to learn more, and try running it. In that file we build a ResNet network with PyTorch. You can check the script's usage with:
```python
python resnet.py -h


@ -0,0 +1,17 @@
# Distributed Scripts
#### 1. Run the PyTorch model and generate inputs and reference outputs (optionally exporting ONNX)
Use `--export_onnx` to set the directory the ONNX model is exported to (defaults to the current path `./`); without this flag, the script only runs the computation and generates the inputs and outputs.
```bash
python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
```
This generates the input and output files `test_inputs.npy` and `test_results.npy` in the current directory; currently only a single input and a single output are supported.
#### 2. Run the InfiniTensor distributed script
```bash
python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
```
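
The reference pair written by step 1 can also be checked by hand against any backend's output. A minimal sketch, assuming the single-input/single-output convention above; `my_outputs.npy` is a hypothetical result file from whatever backend you ran:

```python
import numpy as np

reference = np.load("test_results.npy")  # written by run_pytorch.py
outputs = np.load("my_outputs.npy")      # hypothetical: your backend's output

# same tolerances the CUDA launcher uses for its comparison
print("max abs diff:", np.abs(outputs - reference).max())
np.testing.assert_allclose(outputs, reference, rtol=1e-6, atol=1e-3)
```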


@ -10,9 +10,6 @@ import numpy as np
from parallel_opt import parallel_model
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
@ -32,6 +29,9 @@ def parse_args():
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
@ -42,12 +42,13 @@ def parse_args():
args.batch_size,
args.length,
args.gen_std,
args.type,
)
def run_model(model, runtime, inputs, n=10):
stub = OnnxStub(model, runtime)
for tensor, input in zip(stub.inputs.values(), inputs):
def run_model(model, runtime, inputs, n=10, data_type = "default"):
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
tensor.copyin_numpy(input)
# stub.tune()
stub.run()
@ -55,7 +56,7 @@ def run_model(model, runtime, inputs, n=10):
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
for tensor, input in zip(stub.inputs.values(), inputs):
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
tensor.copyin_numpy(input)
begin = time.time()
for _ in range(n):
@ -66,17 +67,17 @@ def run_model(model, runtime, inputs, n=10):
return outputs
def run_and_compare(name, model, runtime):
def run_and_compare(name, model, runtime, data_type):
input_ids = np.load(f"{name}_inputs.npy")
position_ids = np.arange(input_ids.shape[-1])
results = np.load(f"{name}_results.npy")
outputs = run_model(model, runtime, (input_ids, position_ids))
outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
print("outputs abs mean:", abs(outputs).mean())
np.testing.assert_allclose(outputs, results, rtol=1e-6, atol=1e-3)
print("max abs diff:", abs(outputs - results).max())
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
@ -89,7 +90,7 @@ def start_worker(
save_as_external_data=True,
location=extern_path,
)
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.CudaRuntime(local_rank)
# print("init comm")
runtime.init_comm(
@ -97,12 +98,12 @@ def start_worker(
world_size,
rank,
)
run_and_compare(name, model, runtime)
run_and_compare(name, model, runtime, data_type)
def start_single(name, model):
def start_single(name, model, data_type):
runtime = backend.CudaRuntime(0)
run_and_compare(name, model, runtime)
run_and_compare(name, model, runtime, data_type)
def gen_standard(name, model, voc_size, bs, len):
@ -117,8 +118,10 @@ def gen_standard(name, model, voc_size, bs, len):
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
data_type = "default" if data_type == "fp32" else data_type
if data_type != "tf32":
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
model = onnx.load(model_path)
# generate standard output
@ -132,7 +135,7 @@ def main():
# run single process.
# use standalone process to isolate cuda.
print("run model by single GPU.")
p = mp.Process(target=start_single, args=(name, model))
p = mp.Process(target=start_single, args=(name, model, data_type))
p.start()
p.join()
@ -142,7 +145,7 @@ def main():
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model),
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
)
for rank in range(world_size)
]


@ -0,0 +1,213 @@
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
st_input_dir = "standard/inputs/"
st_output_dir = "standard/outputs/"
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=2, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model", type=str, default="/data1/shared/panzezhong/llama/fp32/my_llama_fp32.sim.onnx", help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
default=False,
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--run_single",
default=False,
action="store_true",
help="whether run model with single process with standard inputs"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model,
args.batch_size,
args.length,
args.gen_std,
args.run_single
)
def run_model(model, runtime, world_size=1, rank=0, n=10):
stub = OnnxStub(model, runtime)
load_inputs(stub, world_size, rank)
# stub.tune()
stub.run()
# get outputs
time.sleep(0.01)
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
begin = time.time()
for _ in range(n):
stub.run()
end = time.time()
avg_time = (end - begin) / n
print(f"average time: {avg_time}")
return outputs
def run_and_compare(name, model, runtime, world_size=1, rank = 0):
results = np.load(os.path.join(st_output_dir,f"output.npy"))
outputs = run_model(model, runtime, world_size, rank)
print(outputs[:100])
if np.isnan(outputs).any():
print("Nan in output")
print("answer argmax:", np.argmax(results))
print("output argmax:", np.argmax(outputs))
#np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
getDiff(results, outputs)
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
extern_path = f"./{dist_name}_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
onnx.save_model(
model,
f"./{dist_name}_rank{rank}.onnx",
save_as_external_data=True,
location=extern_path,
)
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.KUNLUNRuntime(local_rank)
# print("init comm")
runtime.init_comm(
dist_name,
world_size,
rank,
)
run_and_compare(name, model, runtime, world_size, rank)
def start_single(name, model):
runtime = backend.KUNLUNRuntime(0)
run_and_compare(name, model, runtime)
def generate_input_output(model):
runtime = backend.KUNLUNRuntime(0)
stub = OnnxStub(model, runtime)
position_id = 0
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = tensor.copyout_numpy()
if np.issubdtype(input.dtype, np.integer):
if input.size == 1:
# input = np.array([position_id])
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
else:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
elif input.dtype == np.bool_:
input = np.random.randint(0,2,size=input.shape) > 0
else:
if i == 0:
input = np.ones(input.shape).astype(input.dtype)
position_id = input.shape[-1] - 1
else:
input = np.random.rand(*input.shape).astype(input.dtype)
tensor.copyin_numpy(input)
np.save(os.path.join(st_input_dir, f"input_{i}"), input)
stub.run()
# print(stub.outputs)
time.sleep(0.01)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
print(output[:100])
if np.isnan(output).any():
print("Nan in output")
np.save(os.path.join(st_output_dir, f"output"), output)
def load_inputs(stub, world_size=1, rank=0):
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = np.load(os.path.join(st_input_dir, f"input_{i}.npy"))
if all(x == y for x,y in zip(input.shape,tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
def getDiff(base, test):
absolute_diff = np.abs(np.subtract(base, test))
max_absolute_diff = np.max(absolute_diff)
baseCopy = base.astype(np.float64).ravel()
testCopy = test.astype(np.float64).ravel()
upValue = np.sum(np.abs(baseCopy - testCopy))
downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
max_relative_diff = upValue / downValue
print(f"Max absolute difference: {max_absolute_diff}\nMax relative difference: {max_relative_diff}")
return max_absolute_diff, max_relative_diff
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, run_single = parse_args()
model = onnx.load(model_path)
# generate standard output
if gen_std:
print("Generate inputs and outputs.")
p = mp.Process(target=generate_input_output, args=[model])
p.start()
p.join()
return
# # run single process.
# # use standalone process to isolate cuda.
if run_single:
print("run model by single GPU.")
p = mp.Process(target=start_single, args=(name, model))
p.start()
p.join()
return
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"run model by {world_size} GPU in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()
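
Note that `load_inputs` above shards a full-size saved input across ranks with `np.hsplit` whenever the local tensor is narrower than the array on disk. A small illustration of that splitting (shapes chosen arbitrarily):

```python
import numpy as np

full = np.arange(8).reshape(2, 4)   # the saved, full-size input
world_size, rank = 2, 1             # two ranks; this process is rank 1
shard = np.hsplit(full, world_size)[rank]
print(shard.shape)                  # (2, 2): columns 2..3 go to rank 1
```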


@ -244,5 +244,5 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
if tt.HasField("shape"):
tt.ClearField("shape")
model = helper.make_model(graph)
model = onnx.shape_inference.infer_shapes(model)
#model = onnx.shape_inference.infer_shapes(model)
return model


@ -0,0 +1,188 @@
import argparse
import torch
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
import time
import numpy as np
import onnx
import os
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
"--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--export_onnx",
type=str,
nargs="?",
default=None,
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.type,
)
def get_model(modelname):
match modelname:
case "bert":
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
voc_size = BertConfig().vocab_size
case "gpt2":
model = GPT2Model.from_pretrained("gpt2")
voc_size = GPT2Config().vocab_size
case "opt":
model = OPTModel.from_pretrained("./opt-125m")
voc_size = OPTConfig().vocab_size
case _:
raise KeyError(modelname)
model = model.eval()
return model, voc_size
def run_pytorch(torch_model, voc_size, batchsize, len):
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
np.save("test_inputs", data)
inputs = torch.from_numpy(data).to("cuda")
torch_model = torch_model.to("cuda")
n_iter = 20
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
torch.cuda.synchronize()
begin = time.time()
with torch.no_grad():
for _ in range(n_iter):
torch.cuda.synchronize()
outputs = torch_model(inputs)
#
torch.cuda.synchronize()
torch.cuda.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
torch.cuda.memory.empty_cache()
np.save("test_results", np.array(outputs, dtype=np.float32))
print("Save input & output as test_inputs.npy and test_results.npy")
def export_onnx(model, data, path, extern=False):
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
#onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def main():
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
modelname, batchsize, seqlen, export_path, data_type = parse_args()
if data_type == "tf32":
torch.backends.cuda.matmul.allow_tf32 = True
else:
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
model, voc_size = get_model(modelname)
if export_path is not None:
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
path = os.path.join(export_path, filename)
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(model, param, path, True)
if data_type == "fp16":
model = model.half()
run_pytorch(model, voc_size, batchsize, seqlen)
if __name__ == "__main__":
main()


@ -0,0 +1,145 @@
import os
from pyinfinitensor.onnx import OnnxStub, backend
import numpy as np
import onnx
import torch
from transformers import LlamaModel, LlamaForCausalLM
from tqdm import tqdm
import onnx_graphsurgeon as gs
from onnxsim import simplify
import argparse
parser = argparse.ArgumentParser(description='')
parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
parser.add_argument('--layer', dest='n_layers', type=int, default=2)
parser.add_argument('--iter', dest='n_iter', type=int, default=1)
parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
default="/data1/shared/llama")
args = parser.parse_args()
ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
param = torch.zeros(
(args.batchsize, 1024), dtype=torch.long)
logits = model(param, past_key_values=None)
param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
"position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
do_constant_folding=True,)
onnx_model = onnx.load(ONNX_MODEL_PATH)
print("simplifing onnx model")
onnx_model, check = simplify(onnx_model, skipped_optimizers=[
'eliminate_duplicate_initializer'])
assert check
onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
print("simlifing finished.")
@gs.Graph.register()
def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
for inp in inputs:
inp.outputs.clear()
for out in outputs:
out.inputs.clear()
for inp in inputs_added:
inputs.append(inp)
for out in outputs_removed:
out.inputs.clear()
return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
def replace_onnx_with_attention_op():
graph = gs.import_onnx(
onnx.load(ONNX_MODEL_PATH))
tmap = graph.tensors()
for i in range(args.n_layers):
inputs = [
tmap["onnx::Concat_" + str((i+1)*2)],
tmap["onnx::Concat_" + str((i+1)*2+1)],
tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
outputs = [
tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
inputs_added = [graph.inputs[1]]
outputs_removed = []
graph.replace_with_attention(
inputs, outputs, inputs_added, outputs_removed)
graph.outputs = [tmap[graph.outputs[0].name]]
graph.cleanup(True).toposort()
onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
if __name__ == "__main__":
kvcache_torch = None
torch_model = LlamaForCausalLM.from_pretrained(
args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
n_heads = torch_model.config.num_attention_heads
n_dims = torch_model.config.hidden_size // n_heads
if not os.path.exists(ONNX_MODEL_PATH):
print("exporting onnx graph")
export_onnx(torch_model, ONNX_MODEL_PATH)
replace_onnx_with_attention_op()
else:
print("will use exsiting onnx graph")
onnx_model = onnx.load(ONNX_MODEL_PATH)
stub = OnnxStub(onnx_model, backend.cuda_runtime())
count_wrong = 0
for i in tqdm(range(0, args.n_max_length)):
query = np.random.randint(
torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
####################################
# pytorch
####################################
outputs_torch = torch_model(
torch.tensor(query), past_key_values=kvcache_torch)
logit_torch = outputs_torch['logits']
kvcache_torch = outputs_torch['past_key_values']
####################################
# infinitensor
####################################
# copyin input
(list(stub.inputs.items()))[0][1].copyin_int64(
query.reshape(-1).tolist())
(list(stub.inputs.items()))[1][1].copyin_int64(
position_id.reshape(-1).tolist())
stub.run()
####################################
# validation
####################################
# copyout output
logits_it = np.array((list(stub.outputs.items()))
[0][1].copyout_float())
try:
np.testing.assert_allclose(
logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
except Exception as e:
try:
np.testing.assert_allclose(
np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
except:
count_wrong = count_wrong + 1
result = "{}/{} failed.".format(count_wrong, args.n_max_length)
print(result)
del stub


@ -2,6 +2,10 @@
#include "cnnl.h"
#include "cnrt.h"
#include "core/common.h"
#include "core/data_type.h"
#ifdef INFINI_USE_CNCL
#include "cncl.h"
#endif
#define checkBangError(call) \
{ \
@ -27,4 +31,70 @@ namespace infini {
using BangPtr = void *;
inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return CNNL_DTYPE_FLOAT;
}
if (dataType == DataType::Float16) {
return CNNL_DTYPE_HALF;
}
if (dataType == DataType::Double) {
return CNNL_DTYPE_DOUBLE;
}
if (dataType == DataType::Int8) {
return CNNL_DTYPE_INT8;
}
if (dataType == DataType::Int32) {
return CNNL_DTYPE_INT32;
}
if (dataType == DataType::UInt8) {
return CNNL_DTYPE_UINT8;
}
if (dataType == DataType::BFloat16) {
return CNNL_DTYPE_BFLOAT16;
}
if (dataType == DataType::Int64) {
return CNNL_DTYPE_INT64;
}
if (dataType == DataType::Bool) {
return CNNL_DTYPE_BOOL;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNNL.");
}
#ifdef INFINI_USE_CNCL
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return cnclFloat32;
}
if (dataType == DataType::Float16) {
return cnclHalf;
}
if (dataType == DataType::Int8) {
return cnclInt8;
}
if (dataType == DataType::Int16) {
return cnclInt16;
}
if (dataType == DataType::Int32) {
return cnclInt32;
}
if (dataType == DataType::UInt8) {
return cnclUint8;
}
if (dataType == DataType::UInt16) {
return cnclUint16;
}
if (dataType == DataType::UInt32) {
return cnclUint32;
}
if (dataType == DataType::BFloat16) {
return cnclBfloat16;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNCL.");
}
#endif
} // namespace infini


@ -61,16 +61,30 @@ template <typename T> auto enum_to_underlying(T e) {
}
template <typename T> std::string vecToString(const std::vector<T> &vec) {
std::string ret;
ret.append("[");
for (auto d : vec) {
ret.append(std::to_string(d));
ret.append(",");
std::stringstream ss;
ss << "[";
for (size_t i = 0; i < vec.size(); ++i) {
ss << vec.at(i);
if (i < vec.size() - 1) {
ss << ",";
}
}
if (!vec.empty())
ret.pop_back();
ret.append("]");
return ret;
ss << "]";
return ss.str();
}
template <typename T> std::string vecToString(const T *st, size_t length) {
std::stringstream ss;
ss << "[";
size_t i = 0;
for (i = 0; i < length; i++) {
ss << *(st + i);
if (i < length - 1) {
ss << ",";
}
}
ss << "]";
return ss.str();
}
double timeit(


@ -5,6 +5,10 @@
#include <cstdint>
#include <iostream>
#ifdef USE_CUDA
#include "cuda/cuda_runtime.h"
#endif
namespace infini {
class GraphHandlerObj {
@ -26,12 +30,14 @@ class GraphHandlerObj {
int pw, int sh, int sw, int dh, int dw, int oph,
int opw);
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
Tensor bias, ActType act);
Tensor bias, ActType act,
std::string matmul_compute_type = "default");
Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
Tensor var, Tensor scale, Tensor bias,
float momentum, float eps, bool training);
Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
Tensor bias, float eps, int axis, int stash_type);
Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw, int ceilMode);
@ -47,6 +53,7 @@ class GraphHandlerObj {
Tensor max(Tensor a, Tensor b, Tensor c);
Tensor relu(Tensor x, Tensor y);
Tensor silu(Tensor x, Tensor y);
Tensor gelu(Tensor x, Tensor y);
Tensor sigmoid(Tensor x, Tensor y);
Tensor hardSigmoid(Tensor x, Tensor y);
@ -77,6 +84,7 @@ class GraphHandlerObj {
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
Tensor input_q, Tensor input_k, Tensor input_v,
Tensor position_id, Tensor output_matmul);
Tensor RoPE(Tensor pos, Tensor input, Tensor output);
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
std::variant<int, vector<int>> numOrRatio);
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
@ -135,6 +143,12 @@ class GraphHandlerObj {
inline void run() { g->getRuntime()->run(g); }
inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
#ifdef USE_CUDA
inline void run_with_cudagraph() {
(as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
}
#endif
};
} // namespace infini


@ -5,8 +5,8 @@
#include "utils/operator_utils.h"
#include <functional>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace infini {
using json = nlohmann::json;
class RuntimeObj; // Forward declaration for Kernel::compute


@ -151,11 +151,14 @@ struct OpType {
ReduceSum, // Reduce
ReduceSumSquare, // Reduce
Relu, // Unary
Silu, // Unary
Reshape,
Resize,
ReverseSequence,
RoiAlign,
Round, // Unary
RoPE, // Fusion
Round, // Unary
RMSNorm, // Fusion
STFT,
Scan,
Scatter,


@ -2,8 +2,8 @@
#include "core/graph.h"
#include "core/kernel.h"
#include <nlohmann/json_fwd.hpp>
using json = nlohmann::json;
namespace infini {
using json = nlohmann::json;
class PerfEngine {
public:


@ -15,6 +15,7 @@ class GraphObj;
class GraphHandlerObj;
class RuntimeObj;
class BlobObj;
template <typename T> class WorkspaceObj;
using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
@ -23,6 +24,7 @@ using Graph = Ref<GraphObj>;
using GraphHandler = Ref<GraphHandlerObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
using TensorVec = vector<Tensor>;
using OpVec = vector<Operator>;


@ -4,6 +4,7 @@
#include "utils/data_convert.h"
#include <cmath>
#include <cstring>
#include <fstream>
#if USE_CUDA
#include "cuda/cuda_runtime.h"
@ -143,6 +144,7 @@ class TensorObj : public TensorBaseObj {
}
void printData() const;
void dumpData(std::ofstream &ofs) const;
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
template <typename T> bool equalData(const vector<T> &dataVector) {
@ -198,13 +200,20 @@ class TensorObj : public TensorBaseObj {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
relativeError) {
if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
fabs(a[i] - b[i]) > relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
} else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
fabs(a[i] - b[i]) /
std::max(fabs(a[i]), fabs(b[i])) >
relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}
} else
} else {
static_assert(!sizeof(T), "Unsupported data type");
}
}
return true;
}
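
The reworked tolerance check above falls back to a plain absolute-error comparison whenever either value is zero, because relative error degenerates there. The same rule, sketched in Python purely for illustration:

```python
def values_close(a: float, b: float, relative_error: float = 1e-6) -> bool:
    # when either value is 0, |a - b| / max(|a|, |b|) is useless as a
    # relative measure, so compare the absolute difference instead
    if min(abs(a), abs(b)) == 0.0:
        return abs(a - b) <= relative_error
    return abs(a - b) / max(abs(a), abs(b)) <= relative_error
```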
@ -239,8 +248,8 @@ class TensorObj : public TensorBaseObj {
// // std::cerr << "Init beginned " << std::endl;
// #pragma omp parallel for
// for (size_t i = 0; i < iEnd; ++i)
// data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
// 10000;
// data[i] = fastrand(random_seed[omp_get_thread_num() *
// 16]) % 10000;
// // std::cerr << "Init finished" << std::endl;
// computed = ComputedFull;
// return true;
@ -285,8 +294,8 @@ class TensorObj : public TensorBaseObj {
// auto nDim = dims.size();
// auto nBroadcastDim = ds.size() - nDim;
// for (size_t i = 0; i < nDim; ++i)
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
// dims[i])
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
// i] >= dims[i])
// return (size_t)-1;
// size_t idx = 0;
// for (size_t i = 0; i < nDim; ++i)
@ -345,12 +354,14 @@ class TensorObj : public TensorBaseObj {
// return (g_seed >> 16) & 0x7FFF;
// }
// std::vector<std::vector<int>> const *getSplittingPoints() const {
// std::vector<std::vector<int>> const *getSplittingPoints()
// const {
// assert(!splittingPoints.empty());
// return &splittingPoints;
// }
// bool setSplittingPoints(std::vector<std::vector<int>> value) {
// bool setSplittingPoints(std::vector<std::vector<int>> value)
// {
// assert(!value.empty());
// splittingPoints = value;
// return true;

include/core/workspace.h Normal file

@ -0,0 +1,42 @@
#pragma once
#include "core/runtime.h"
namespace infini {
template <class T> class WorkspaceObj {
private:
T workspace; // workspace pointer
size_t workspaceSize; // Size of workspace
size_t workspaceAlloc; // currently use workspace size
public:
WorkspaceObj(T workspace_, size_t workspaceSize_)
: workspace(workspace_), workspaceSize(workspaceSize_) {
workspaceAlloc = 0;
}
virtual ~WorkspaceObj() {
// Dealloc workspace in RuntimeObj
// Set workspace = nullptr here
workspace = nullptr;
}
size_t getWorkspaceSize() const { return workspaceSize; }
T getWorkspace(size_t size) {
// Get unused workspace
IT_ASSERT(size + workspaceAlloc <= workspaceSize);
auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
workspaceAlloc += size;
return ret;
}
T getWorkspace() {
// Override getWorkspace in order to dealloc in runtime
return workspace;
}
void resetWorkspace() {
// Reset workspaceAlloc every time end kernel
workspaceAlloc = 0;
}
size_t getWorkspaceAlloc() const { return workspaceAlloc; }
};
} // namespace infini


@ -1,4 +1,5 @@
#pragma once
#include "core/common.h"
#include <cstdio>
struct AttentionKVCacheMetadata {
@ -10,6 +11,7 @@ namespace infini {
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
float *input_q, float *input_k, float *input_v,
int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta);
const AttentionKVCacheMetadata &compMeta,
float *output_O_temp, float *output_sum_temp);
} // namespace infini


@ -5,6 +5,7 @@
#include <cuda_profiler_api.h>
#include <cudnn.h>
#include <curand.h>
#include <memory>
#define checkCudaError(call) \
if (auto err = call; err != cudaSuccess) \
@ -111,4 +112,20 @@ inline const char *curandGetErrorString(curandStatus_t error) {
using CudaPtr = void *;
class CUDAStream {
public:
CUDAStream(const CUDAStream &) = delete;
CUDAStream(CUDAStream &&) = delete;
void operator=(const CUDAStream &) = delete;
void operator=(CUDAStream &&) = delete;
static cudaStream_t getCurrentStream() { return _stream; }
static void Init() { CUDAStream::_stream = 0; };
static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
private:
CUDAStream(){};
static cudaStream_t _stream;
};
} // namespace infini


@ -13,4 +13,8 @@ void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
}; // namespace infini


@ -7,4 +7,6 @@ void expandKernel(int dType, void *input, void *output, int nDims,
int outputsize, SmallArray inputShape,
SmallArray outputShape);
void expandRowKernel(int dType, void *input, void *output, int n_rows,
int row_len);
}; // namespace infini


@ -0,0 +1,10 @@
#pragma once
#include "operators/rms_norm.h"
namespace infini {
void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
int num_tokens, int hidden_size);
}; // namespace infini

include/cuda/cuda_rope.h Normal file

@ -0,0 +1,12 @@
#pragma once
#include "operators/rope.h"
#include "utils/small_array.h"
namespace infini {
void rope_kernel(int dType, int *pos, void *input, void *output, int size,
int dim_model, int dim_head, int hidden_stride,
int pos_stride);
}; // namespace infini


@ -14,6 +14,9 @@ class CudaRuntimeObj : public RuntimeObj {
std::unique_ptr<CommunicatorObj> comm;
CudaPtr workspace;
size_t workspaceSize;
bool isCudaGraphCreated;
cudaGraph_t cudaGraph;
cudaGraphExec_t cudaGraphInstance;
public:
explicit CudaRuntimeObj(int deviceId = 0)
@ -26,9 +29,16 @@ class CudaRuntimeObj : public RuntimeObj {
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
isCudaGraphCreated = false;
CUDAStream::Init();
}
virtual ~CudaRuntimeObj() {
try {
if (isCudaGraphCreated) {
checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
checkCudaError(cudaGraphDestroy(cudaGraph));
CUDAStream::destroyStream();
}
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
@ -75,6 +85,8 @@ class CudaRuntimeObj : public RuntimeObj {
void runWithoutSync(const Graph &graph) const;
void runWithCudaGraph(const Graph &graph);
// init communicator
void initComm(const string &name, int worldSize, int rank) final;


@ -5,6 +5,7 @@
namespace infini {
template <typename T> void softmax_kernel(T *input, T *output, size_t num);
template <typename T> void relu_kernel(T *input, T *output, size_t num);
template <typename T> void silu_kernel(T *input, T *output, size_t num);
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
template <typename T> void tanh_kernel(T *input, T *output, size_t num);
template <typename T> void abs_kernel(T *input, T *output, size_t num);


@ -0,0 +1,23 @@
#include "core/op_type.h"
#include "kunlun/kunlun_common.h"
namespace infini {
using KunlunActType = xdnn::Activation_t;
KunlunActType parseActType(ActType act) {
switch (act) {
case ActType::None:
return KunlunActType::LINEAR;
case ActType::Tanh:
return KunlunActType::TANH;
case ActType::Sigmoid:
return KunlunActType::SIGMOID;
case ActType::Relu:
return KunlunActType::RELU6;
default:
fprintf(stderr, "Activation type not supported yet!\n");
break;
}
return KunlunActType::LINEAR;
}
}; // namespace infini


@ -3,6 +3,8 @@
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xdnn = baidu::xpu::api;
#define checkKUNLUNError(call) \
{ \
auto err = call; \


@ -1,28 +1,35 @@
#pragma once
#include "core/runtime.h"
#include "core/workspace.h"
#include "kunlun/kunlun_common.h"
#ifdef INFINI_USE_XCCL
#include "kunlun/xccl_communicator.h"
#endif
namespace infini {
class KUNLUNRuntimeObj : public RuntimeObj {
private:
baidu::xpu::api::Context *xdnn;
KUNLUNPtr workspace;
size_t workspaceSize;
xdnn::Context *ctx;
std::unique_ptr<CommunicatorObj> comm;
// KUNLUNPtr workspace;
// size_t workspaceSize;
Workspace<KUNLUNPtr> workspace;
public:
KUNLUNRuntimeObj() : RuntimeObj(Device::KUNLUN) {
xdnn = baidu::xpu::api::create_context();
KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
xpu_set_device(deviceId);
ctx = xdnn::create_context();
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 3ll << 30; // 3 GB
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
workspace = alloc(workspaceSize);
size_t workspaceSize = 3llu << 30; // 3 GB
KUNLUNPtr wkspacePtr = alloc(workspaceSize);
workspace =
make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
}
virtual ~KUNLUNRuntimeObj() {
dealloc(workspace);
baidu::xpu::api::destroy_context(xdnn);
KUNLUNPtr wkspacePtr = workspace->getWorkspace();
dealloc(wkspacePtr);
xdnn::destroy_context(ctx);
}
string toString() const override;
@ -31,6 +38,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
// double runEvaluation(const Graph &graph, int nWarmups,
// int nEvaluations) const;
void sync() const;
KUNLUNPtr alloc(size_t size) override {
void *ptr;
checkKUNLUNError(
@ -38,33 +46,33 @@ class KUNLUNRuntimeObj : public RuntimeObj {
return ptr;
}
void dealloc(void *ptr) override { xpu_free(ptr); }
baidu::xpu::api::Context *KUNLUNHandle() const { return xdnn; }
xdnn::Context *KUNLUNHandle() const { return ctx; }
// Get `size` bytes of unused workspace
KUNLUNPtr getWorkspace(size_t size) const {
IT_ASSERT(size <= workspaceSize);
return workspace;
auto ret = workspace->getWorkspace(size);
return ret;
}
Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
}
void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
}
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
}
void initComm(const string &name, int worldSize, int rank) final;
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const final { return *comm; }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;


@ -0,0 +1,60 @@
#pragma once
#include "core/communicator.h"
#include "xpu/bkcl.h"
#include <chrono>
#include <filesystem>
#include <fstream>
#include <thread>
#define checkXcclError(call) \
{ \
auto err = call; \
if (BKCL_SUCCESS != err) { \
fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
}
namespace infini {
class XcclCommunicatorObj final : public CommunicatorObj {
private:
BKCLContext_t comm;
public:
XcclCommunicatorObj(const string &name, int worldSize, int rank)
: CommunicatorObj(worldSize, rank) {
const std::string filePath("./" + name + "_xccl_id.bin");
BKCLUniqueId commId;
if (rank == 0) {
checkXcclError(bkcl_get_unique_id(&commId));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *)&commId, sizeof(BKCLUniqueId));
} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *)&commId, sizeof(BKCLUniqueId));
}
checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
if (rank == 0) {
std::filesystem::remove(filePath);
}
}
BKCLContext_t getXcclComm() { return comm; }
~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
virtual string toString() const final {
std::ostringstream oss;
oss << "XCCL communicator";
return oss.str();
}
};
} // namespace infini


@ -17,6 +17,9 @@ class MatmulObj : public OperatorObj {
// Auxiliary attributes which are not a part of operator attributes.
int b, m, n, k;
// Specifies the data precision for the matrix multiply.
std::string computeType = "default";
public:
/**
* @brief Matmul operator with batch broadcast and tensor transpose
@ -38,10 +41,11 @@ class MatmulObj : public OperatorObj {
* @param transB If matrix B should be transposed when computing.
* @param bias The bias tensor.
* @param act The activation function.
* @param computeType Specifies the data precision for the matrix multiply.
*/
MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
bool transA = false, bool transB = false, Tensor bias = nullptr,
ActType act = ActType::None);
ActType act = ActType::None, std::string computeType = "default");
OP_CLONE(MatmulObj);
std::string toString() const override;
@ -60,6 +64,7 @@ class MatmulObj : public OperatorObj {
int getN() const { return n; }
int getK() const { return k; }
auto getBMNK() const { return tuple{b, m, n, k}; }
std::string getComputeType() const { return computeType; }
private:
vector<int> getWorkloadVector() const override;


@ -0,0 +1,34 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Fused RMSNorm Operator
*
*/
class RMSNormObj : public OperatorObj {
int dim;
public:
/**
* @brief Construct a new RMSNorm object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
*/
RMSNormObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output);
OP_CLONE(RMSNormObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
int getDim() const { return dim; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

include/operators/rope.h Normal file

@ -0,0 +1,29 @@
#pragma once
#include "core/operator.h"
namespace infini {
class RoPEObj : public OperatorObj {
public:
/**
* @brief Construct a new RotaryEmbedding object.
*
* @param graph The computation graph that this operator belongs to.
* @param pos The position id of the query.
* @param input The input tensor.
* @param output The output tensor.
*/
RoPEObj(GraphObj *graph, Tensor pos, Tensor input, Tensor output);
OP_CLONE(RoPEObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
DataType getDType() const { return getInputs(1)->getDType(); }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini


@ -7,9 +7,7 @@ namespace infini {
*
*/
class SliceObj : public OperatorObj {
template <class T> struct range_t {
T start, end, step;
};
template <class T> struct range_t { T start, end, step; };
vector<range_t<int>> axes;
public:


@ -159,6 +159,7 @@ enum class CastType {
Uint322Int64,
Float162Float,
BFloat162Float,
Float2Float,
};
class CastObj : public OperatorObj {
@ -258,6 +259,7 @@ class LogObj : public OperatorObj {
};
DEFINE_UNARY_OBJ(Relu, OpType::Relu)
DEFINE_UNARY_OBJ(Silu, OpType::Silu)
DEFINE_UNARY_OBJ(Gelu, OpType::Gelu)
DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)


@ -1,14 +0,0 @@
#pragma once
namespace infini {
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
int nDims, int size) {
for (int i = nDims - size - 1; i >= 0; --i) {
modifyShape.data[i] = 1;
}
for (int i = nDims - 1; i >= nDims - size; --i) {
modifyShape.data[i] = originShape[i - nDims + size];
}
}
} // namespace infini


@ -5,6 +5,9 @@
#include "core/operator.h"
#include "core/tensor.h"
#include "utils/small_array.h"
#include <numeric>
namespace infini {
// Launch a broadcast shape based on the shape of input A and B
@ -20,6 +23,12 @@ size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
const Shape &stride);
// Convert KernelAttrs to a string representation
std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
// VectorProd
int shapeProd(std::vector<int>::iterator start, std::vector<int>::iterator end);
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
int nDims, int size);
void broadcastShape(const Shape &tempShape, Shape &modifyShape);
} // namespace infini
#endif


@ -4,6 +4,14 @@ namespace infini {
#define SMALL_ARRAY_SIZE 8
struct SmallArray {
int data[SMALL_ARRAY_SIZE];
int prod(int start, int end) {
int result = 1;
for (int i = start; i < end; ++i) {
result *= data[i];
}
return result;
}
};
} // namespace infini


@ -1,4 +1,4 @@
import backend
import backend
from onnx import (
ModelProto,
TensorProto,
@ -23,12 +23,13 @@ from onnx.checker import (
ValidationError,
)
from onnx.shape_inference import infer_shapes
from onnx.numpy_helper import to_array
from onnx.numpy_helper import to_array, from_array
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce
from onnxsim import simplify
import copy
import warnings
import numpy as np
class OnnxStub:
@ -37,7 +38,13 @@ class OnnxStub:
It can be generated from an Onnx model object.
"""
def __init__(self, model: ModelProto, runtime, use_naive_allocator: bool = False):
def __init__(
self,
model: ModelProto,
runtime,
use_naive_allocator: bool = False,
matmul_compute_type: str = "default",
):
# We use some user-defined operators for distributed inference
try:
# onnx simplifier performs inplace simplify
@ -105,12 +112,6 @@ class OnnxStub:
)
tensors[input.name].set_input()
for output in model.graph.output:
dims = _take_shape_dim(output.type.tensor_type.shape)
tensors[output.name] = self.handler.tensor(
dims, output.type.tensor_type.elem_type
)
tensors[output.name].set_output()
for node_idx in sorted_nodes:
node = model.graph.node[node_idx]
@ -252,13 +253,14 @@ class OnnxStub:
)
elif node.op_type == "MatMul":
tensors[node.output[0]] = self.handler.matmul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors[node.input[0]], # input
tensors[node.input[1]], # weight
tensors.get(node.output[0]),
False,
False,
None,
backend.ActType.Linear,
matmul_compute_type,
)
elif node.op_type == "Gemm":
attributes = _parse_attribute(
@ -278,6 +280,7 @@ class OnnxStub:
transB == 1,
tensors[node.input[2]] if len(node.input) > 2 else None,
backend.ActType.Linear,
matmul_compute_type,
)
elif node.op_type == "BatchNormalization":
(input, mean, var, scale, bias) = (
@ -321,6 +324,12 @@ class OnnxStub:
axis,
stash_type,
)
elif node.op_type == "RMSNorm":
tensors[node.output[0]] = self.handler.RMSNorm(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "MaxPool":
attributes = _parse_attribute(
node,
@ -482,6 +491,11 @@ class OnnxStub:
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Silu":
tensors[node.output[0]] = self.handler.silu(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Gelu":
tensors[node.output[0]] = self.handler.gelu(
tensors[node.input[0]],
@ -657,7 +671,7 @@ class OnnxStub:
keep_aspect_ratio_policy,
nearest_mode,
coordinate_transformation_mode,
)
)
elif node.op_type == "Squeeze":
axes = (
_parse_data(data[node.input[1]])
@ -713,6 +727,12 @@ class OnnxStub:
tensors[node.input[5]],
tensors.get(node.output[0]),
)
elif node.op_type == "RoPE":
tensors[node.output[0]] = self.handler.RoPE(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Split":
split = (
_parse_data(data[node.input[1]])
@ -966,6 +986,25 @@ class OnnxStub:
tensors.get(node.output[0]),
)
elif node.op_type == "Where":
## If Y is single -inf, treat Where as Add
## TODO: deal with cases where Y is single inf or 0
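## Rationale: Where(cond, X, -inf) == X + np.where(cond, 0, -inf) for finite X,
## since adding 0 keeps X wherever cond holds and adding -inf masks the rest,
## which is why the Where node can be lowered to an Add on a precomputed mask.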
if node.input[0] in data and node.input[2] in data:
where_condition = to_array(data[node.input[0]])
where_alt = to_array(data[node.input[2]])
if where_alt.size == 1:
if np.isneginf(where_alt) or np.all(where_alt < -3e38):
node.input[0] = node.input[0] + "_alt"
if node.input[0] not in data:
where_value = np.where(where_condition, 0, -np.inf).astype(where_alt.dtype)
data[node.input[0]] = from_array(where_value, node.input[0])
tensors[node.input[0]] = self.handler.tensor(list(where_value.shape), data[node.input[0]].data_type)
tensors[node.input[0]].set_weight()
tensors[node.output[0]] = self.handler.add(
tensors[node.input[1]],
tensors[node.input[0]],
tensors.get(node.output[0]),
)
continue
tensors[node.output[0]] = self.handler.where(
tensors[node.input[1]],
tensors[node.input[2]],
@ -995,10 +1034,12 @@ class OnnxStub:
beta,
bias,
size,
)
)
else:
raise Exception('Unsupported operator "{}"'.format(node.op_type))
for output in model.graph.output:
tensors[output.name].set_output()
################################
# Allocate memory space for data
################################
@ -1280,7 +1321,7 @@ class OnnxStub:
axes,
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpTypeId.Concat:
axis = backend.concat_axis_of(op)
ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
@ -1409,6 +1450,9 @@ class OnnxStub:
def run(self) -> None:
self.handler.run()
def run_with_cudagraph(self) -> None:
self.handler.run_with_cudagraph()
def get_perf_time(self) -> float:
return self.handler.get_perf_time()
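
The two methods above pair up for CUDA-graph replay: the first run_with_cudagraph() call captures the launch sequence into a CUDA graph (kernels are recorded rather than run, per runWithCudaGraph further down), and every later call replays it. A hedged usage sketch, assuming a CUDA build and an OnnxStub instance named stub whose inputs are already filled; the loop is illustrative:

stub.run_with_cudagraph()      # first call captures and instantiates the CUDA graph
for _ in range(100):
    stub.run_with_cudagraph()  # later calls only replay the captured graph
print(stub.get_perf_time())    # returns the profiled time as a float
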

View File

@ -463,13 +463,20 @@ class TestStringMethods(unittest.TestCase):
def test_split(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
split = make_node("Split", ["input"], ["output"], name="split", axis=0)
make_and_import_model(make_graph([split], "split", [input], []))
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
make_and_import_model(make_graph([split], "split", [input], [output]))
def test_split1(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
splitAttr = make_tensor_value_info("split", TensorProto.INT64, [2, 1])
split = make_node("Split", ["input", "split"], ["output"], name="split", axis=1)
make_and_import_model(make_graph([split], "split", [input, splitAttr], []))
splitAttr = make_tensor("split", TensorProto.INT64, [2], [2, 1])
output1 = make_tensor_value_info("output1", TensorProto.FLOAT, [1, 2, 2, 4])
output2 = make_tensor_value_info("output2", TensorProto.FLOAT, [1, 1, 2, 4])
split = make_node(
"Split", ["input", "split"], ["output1", "output2"], name="split", axis=1
)
make_and_import_model(
make_graph([split], "split", [input], [output1, output2], [splitAttr])
)
def test_allBroadcast(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])

View File

@ -18,6 +18,8 @@
#include "operators/reduce.h"
#include "operators/reshape.h"
#include "operators/resize.h"
#include "operators/rms_norm.h"
#include "operators/rope.h"
#include "operators/send.h"
#include "operators/slice.h"
#include "operators/softmax.h"
@ -72,15 +74,17 @@ Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
}
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
bool transB, Tensor bias, ActType act) {
bool transB, Tensor bias, ActType act,
std::string matmul_compute_type) {
if (y) {
g->addOpWithOutputs<MatmulObj>(std::move(a), std::move(b), y, transA,
transB, std::move(bias), act);
transB, std::move(bias), act,
matmul_compute_type);
return y;
} else {
return g
->addOp<MatmulObj>(std::move(a), std::move(b), y, transA, transB,
std::move(bias), act)
std::move(bias), act, matmul_compute_type)
->getOutput();
}
}
@ -121,6 +125,17 @@ Tensor GraphHandlerObj::layerNormalization(Tensor input, Tensor scale,
}
}
Tensor GraphHandlerObj::rmsNorm(Tensor input, Tensor weight, Tensor output) {
if (output) {
g->addOpWithOutputs<RMSNormObj>(std::move(input), std::move(weight),
output);
return output;
} else {
return g->addOp<RMSNormObj>(std::move(input), std::move(weight), output)
->getOutput();
}
}
Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh, int sw,
int ceilMode) {
@ -181,6 +196,7 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
} \
}
DEFINE_UNARY_METHOD(silu, Silu)
DEFINE_UNARY_METHOD(relu, Relu)
DEFINE_UNARY_METHOD(gelu, Gelu)
DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
@ -345,6 +361,16 @@ Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
}
}
Tensor GraphHandlerObj::RoPE(Tensor pos, Tensor input, Tensor output) {
if (output) {
g->addOpWithOutputs<RoPEObj>(std::move(pos), std::move(input), output);
return output;
} else {
return g->addOp<RoPEObj>(std::move(pos), std::move(input), output)
->getOutput();
}
}
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
int axis,
std::variant<int, vector<int>> numOrRatio) {
@ -683,6 +709,8 @@ static CastType inferCastType(Tensor input, int to) {
return CastType::Float162Float;
} else if (iType == DataType::BFloat16 && oType == DataType::Float32) {
return CastType::BFloat162Float;
} else if (iType == DataType::Float32 && oType == DataType::Float32) {
return CastType::Float2Float;
} else {
IT_TODO_HALT_MSG("Unsupported CastType : input_type is " +
iType.toString() + " output_type is " +

View File

@ -66,6 +66,36 @@ void TensorObj::setShape(Shape shape_) {
_size = size;
}
void TensorObj::dumpData(std::ofstream &ofs) const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
#define TRY_DUMP(N) \
if (dtype == DataType(N)) \
ofs << dataToString<DT<N>::t>() << std::endl;
TRY_DUMP(0) // fmt: new line
else TRY_DUMP(1) //
else TRY_DUMP(2) //
else TRY_DUMP(3) //
else TRY_DUMP(4) //
else TRY_DUMP(5) //
else TRY_DUMP(6) //
else TRY_DUMP(7) //
else TRY_DUMP(8) //
else TRY_DUMP(9) //
else TRY_DUMP(10) //
else TRY_DUMP(11) //
else TRY_DUMP(12) //
else TRY_DUMP(13) //
else TRY_DUMP(16) //
else IT_TODO_HALT();
ofs.flush();
#undef TRY_DUMP
}
void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())

View File

@ -19,7 +19,6 @@ void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
}
namespace infini {
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
@ -39,6 +38,27 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
}
}
void CudaRuntimeObj::runWithCudaGraph(const Graph &graph) {
if (!isCudaGraphCreated) {
CUDAStream::createStream();
checkCudnnError(cudnnSetStream(cudnn, CUDAStream::getCurrentStream()));
checkCublasError(
cublasSetStream(cublas, CUDAStream::getCurrentStream()));
checkCudaError(cudaStreamBeginCapture(CUDAStream::getCurrentStream(),
cudaStreamCaptureModeGlobal));
runWithoutSync(graph);
checkCudaError(
cudaStreamEndCapture(CUDAStream::getCurrentStream(), &cudaGraph));
checkCudaError(
cudaGraphInstantiate(&cudaGraphInstance, cudaGraph, NULL, NULL, 0));
isCudaGraphCreated = true;
} else {
checkCudaError(
cudaGraphLaunch(cudaGraphInstance, CUDAStream::getCurrentStream()));
}
checkCudaError(cudaStreamSynchronize(CUDAStream::getCurrentStream()));
}
void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
@ -102,4 +122,5 @@ void CudaRuntimeObj::initComm(const string &name, int worldSize, int rank) {
#endif
}
cudaStream_t CUDAStream::_stream = 0;
} // namespace infini

View File

@ -16,7 +16,8 @@ __global__ void cudaPrintFloatImpl(float *x, int len) {
namespace infini {
void cudaPrintFloat(float *x, int len) {
cudaPrintFloatImpl<<<1, 1>>>(x, len);
cudaPrintFloatImpl
<<<1, 1, 0, CUDAStream::getCurrentStream()>>>(x, len);
cudaDeviceSynchronize();
}

View File

@ -442,7 +442,9 @@ void init_graph_builder(py::module &m) {
#endif
#ifdef USE_KUNLUN
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
m, "KUNLUNRuntime");
m, "KUNLUNRuntime")
.def(py::init<int>(), py::arg("device") = 0)
.def("init_comm", &KUNLUNRuntimeObj::initComm);
#endif
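
The KUNLUN runtime is now constructible from Python with a device index, and init_comm is exposed for multi-device runs. A hedged sketch, assuming the pybind11 module is imported as backend (as the frontend does) and that init_comm takes (name, world_size, rank) like initComm on the CUDA runtime; the group name is arbitrary but must match across ranks:

import backend

rt = backend.KUNLUNRuntime(0)    # device index
rt.init_comm("xccl-demo", 2, 0)  # rank 0 of a 2-rank group (assumed signature)
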
#ifdef USE_ASCEND
@ -525,6 +527,7 @@ void init_graph_builder(py::module &m) {
.def("matmul", &Handler::matmul, policy::move)
.def("batchNormalization", &Handler::batchNormalization, policy::move)
.def("layerNormalization", &Handler::layerNormalization, policy::move)
.def("RMSNorm", &Handler::rmsNorm, policy::move)
.def("maxPool", &Handler::maxPool, policy::move)
.def("avgPool", &Handler::avgPool, policy::move)
.def("add", &Handler::add, policy::move)
@ -536,6 +539,7 @@ void init_graph_builder(py::module &m) {
.def("min", &Handler::min, policy::move)
.def("max", &Handler::max, policy::move)
.def("relu", &Handler::relu, policy::move)
.def("silu", &Handler::silu, policy::move)
.def("gelu", &Handler::gelu, policy::move)
.def("sigmoid", &Handler::sigmoid, policy::move)
.def("tanh", &Handler::tanh, policy::move)
@ -558,6 +562,7 @@ void init_graph_builder(py::module &m) {
.def("unsqueeze", &Handler::unsqueeze, policy::move)
.def("concat", &Handler::concat, policy::move)
.def("attentionKVCache", &Handler::attentionKVCache, policy::move)
.def("RoPE", &Handler::RoPE, policy::move)
.def("split", &Handler::split, policy::move)
.def("gather", &Handler::gather, policy::move)
.def("gatherElements", &Handler::gatherElements, policy::move)
@ -590,6 +595,10 @@ void init_graph_builder(py::module &m) {
.def("get_perf_time", &Handler::get_perf_time, policy::automatic)
.def("tune", &Handler::tune, policy::automatic)
.def("run", &Handler::run, policy::automatic)
#ifdef USE_CUDA
.def("run_with_cudagraph", &Handler::run_with_cudagraph,
policy::automatic)
#endif
.def("shape_infer", &Handler::shape_infer, policy::automatic)
.def("change_shape", &Handler::change_shape, policy::automatic)
.def("getDims", &Handler::getDims, policy::automatic)

View File

@ -21,12 +21,12 @@ namespace infini {
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>()); \
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
\
auto a = op->getInputs(0) -> getDims(); \
auto aS = op->getInputs(0) -> getStride(); \
auto b = op->getInputs(1) -> getDims(); \
auto bS = op->getInputs(1) -> getStride(); \
auto c = op->getOutput() -> getDims(); \
auto cS = op->getOutput() -> getStride(); \
auto a = op->getInputs(0)->getDims(); \
auto aS = op->getInputs(0)->getStride(); \
auto b = op->getInputs(1)->getDims(); \
auto bS = op->getInputs(1)->getStride(); \
auto c = op->getOutput()->getDims(); \
auto cS = op->getOutput()->getStride(); \
\
std::vector<int64_t> aDim = castTo64(a); \
std::vector<int64_t> aStride = castTo64(aS); \

View File

@ -92,22 +92,22 @@ class ReluAclnn : public ASCENDKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
\
auto a = op->getInputs(0) -> getDims(); \
auto a = op->getInputs(0)->getDims(); \
std::vector<int64_t> aDim(a.size(), 1); \
for (size_t i = 0; i < a.size(); ++i) { \
aDim[i] = int64_t(a[i]); \
} \
auto aS = op->getInputs(0) -> getStride(); \
auto aS = op->getInputs(0)->getStride(); \
std::vector<int64_t> aStride(aS.size(), 1); \
for (size_t i = 0; i < aS.size(); ++i) { \
aStride[i] = int64_t(aS[i]); \
} \
auto c = op->getInputs(0) -> getDims(); \
auto c = op->getInputs(0)->getDims(); \
std::vector<int64_t> cDim(c.size(), 1); \
for (size_t i = 0; i < c.size(); ++i) { \
cDim[i] = int64_t(c[i]); \
} \
auto cS = op->getInputs(0) -> getStride(); \
auto cS = op->getInputs(0)->getStride(); \
std::vector<int64_t> cStride(cS.size(), 1); \
for (size_t i = 0; i < cS.size(); ++i) { \
cStride[i] = int64_t(cS[i]); \

View File

@ -2,16 +2,20 @@
#include "bang/bang_runtime.h"
#include "operators/softmax.h"
#include "operators/unary.h"
#include <iostream>
namespace infini {
class UnaryCnnl : public BangKernelWithoutConfig {
virtual cnnlActivationMode_t getOpType() const = 0;
virtual float getCoef() const = 0;
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
virtual float getSlicedDim() const { return 0.0; }
virtual float getGamma() const { return 0.0; }
virtual float getScale() const { return 0.0; }
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -22,18 +26,19 @@ class UnaryCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
checkCnnlError(cnnlSetActivationDescriptor_v2(
checkCnnlError(cnnlSetActivationDescriptor_v5(
opDesc, getOpType(), CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, getCoef()));
CNNL_NOT_PROPAGATE_NAN, getCoef(), getSlicedDim(), getGamma(),
getScale(), true));
auto [alpha, beta] = getAlphBeta();
cnnlStatus_t stat =
@ -51,7 +56,6 @@ class RoundCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -62,13 +66,13 @@ class RoundCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
@ -82,7 +86,6 @@ class PReluCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PReluObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -94,18 +97,22 @@ class PReluCnnl : public BangKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
auto cDim = op->getOutput()->getDims();
if (auto alignSize = aDim.size() - bDim.size(); alignSize) {
bDim.insert(bDim.begin(), alignSize, 1);
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, bDim.size(),
bDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
bDim.size(), bDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
bDesc, bData, cDesc, cData);
@ -122,7 +129,6 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SoftmaxObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -185,13 +191,13 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, inDim.size(),
inDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
inDim.size(), inDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, outDim.size(),
outDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
outDim.size(), outDim.data()));
float alpha = 1.0;
float beta = 0.0;
cnnlStatus_t stat =
@ -219,6 +225,22 @@ class SigmoidCnnl : public UnaryCnnl {
float getCoef() const override { return 0.0; }
};
class HardSwishCnnl : public UnaryCnnl {
cnnlActivationMode_t getOpType() const override {
return CNNL_ACTIVATION_HARDSWISH;
}
float getCoef() const override { return 0.0; }
};
class HardSigmoidCnnl : public UnaryCnnl {
cnnlActivationMode_t getOpType() const override {
return CNNL_ACTIVATION_HARDSIGMOID;
}
float getCoef() const override { return 0.0; }
float getGamma() const override { return 1.f / 6.f; }
float getScale() const override { return 0.5f; }
};
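
The gamma and scale above match the common hard-sigmoid definition, clip(x/6 + 1/2, 0, 1), and HardSwish is x times that. A NumPy reference, illustrative only:

import numpy as np

def hardsigmoid_reference(x: np.ndarray) -> np.ndarray:
    return np.clip(x / 6.0 + 0.5, 0.0, 1.0)  # gamma = 1/6, scale = 0.5

def hardswish_reference(x: np.ndarray) -> np.ndarray:
    return x * hardsigmoid_reference(x)
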
REGISTER_KERNEL(Device::BANG, OpType::Relu, ReluCnnl, "Relu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
@ -226,5 +248,9 @@ REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
"Softmax_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
"HardSigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
"HardSwish_cnnl_BANG");
}; // namespace infini

View File

@ -10,7 +10,6 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ActivationBackwardObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const yData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -25,21 +24,21 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
auto diffxDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, yDim.size(),
yDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
yDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
yDim.size(), yDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
checkCnnlError(cnnlSetTensorDescriptor(
diffYDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffyDim.size(),
diffyDim.data()));
diffYDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
diffyDim.size(), diffyDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, xDim.size(),
xDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
xDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
xDim.size(), xDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
checkCnnlError(cnnlSetTensorDescriptor(
diffXDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffxDim.size(),
diffxDim.data()));
diffXDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
diffxDim.size(), diffxDim.data()));
// get op descriptor
cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));

View File

@ -19,7 +19,6 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
BangPtr output_temp =
context->getWorkspace(op->getInputs(0)->getBytes() * world_size);
// void *output = op->getOutput()->getRawDataPtr<void *>();
// IT_ASSERT(op->getDType() == DataType::Float32);
checkBangError(cnrtMalloc(&output_temp,
op->getInputs(0)->getBytes() * world_size));
size_t bytes = op->getInputs(0)->getBytes();
@ -29,8 +28,9 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
CNCL_CHECK(
cnclAllGather(input, output_temp, count, cnclFloat32, comm, queue));
CNCL_CHECK(cnclAllGather(input, output_temp, count,
cnclDataTypeConvert(op->getDType()), comm,
queue));
checkBangError(cnrtQueueSync(queue));
for (int i = 0; i < world_size; ++i) {
Tensor output = op->getOutput(i);
@ -42,8 +42,8 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::AllGather, DataType::Float32,
AllGatherCNCL, "AllGather_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllGather, AllGatherCNCL,
"AllGather_CNCL_BANG");
} // namespace infini
#endif
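
The AllGather kernel above collects every rank's input into one contiguous workspace and then slices it back into the per-rank output tensors. A NumPy analogue of that gather-then-split, with made-up values:

import numpy as np

world_size = 4
inputs = [np.array([r, r + 0.5], dtype=np.float32) for r in range(world_size)]
gathered = np.concatenate(inputs)         # what cnclAllGather leaves in the workspace
outputs = np.split(gathered, world_size)  # the cnrtMemcpy loop over output tensors
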

View File

@ -13,15 +13,16 @@ class AllReduceCNCL : public BangKernelWithoutConfig {
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
IT_ASSERT(op->getDType() == DataType::Float32);
size_t count = op->getInputs(0)->size();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// checkBangError(cnrtQueueSync(queue));
CNCL_CHECK(cnclAllReduce(input, output, count, cnclFloat, getRedOp(),
comm, queue));
CNCL_CHECK(cnclAllReduce(input, output, count,
cnclDataTypeConvert(op->getDType()),
getRedOp(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
@ -41,13 +42,13 @@ class AllReduceMaxCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclMax; }
};
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, DataType::Float32,
AllReduceSumCNCL, "AllReduce_Sum_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, DataType::Float32,
AllReduceProdCNCL, "AllReduce_Prod_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, DataType::Float32,
AllReduceMinCNCL, "AllReduce_Min_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, DataType::Float32,
AllReduceMaxCNCL, "AllReduce_Max_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, AllReduceSumCNCL,
"AllReduce_Sum_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, AllReduceProdCNCL,
"AllReduce_Prod_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, AllReduceMinCNCL,
"AllReduce_Min_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, AllReduceMaxCNCL,
"AllReduce_Max_CNCL_BANG");
} // namespace infini
#endif

View File

@ -7,7 +7,6 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<BatchNormObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const input = (op->getInputs(0)->getRawDataPtr<void *>());
@ -33,18 +32,18 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&intransDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&outtransDesc));
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, dims.size(),
dims.data()));
checkCnnlError(cnnlSetTensorDescriptor(intransDesc, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, dims.size(),
dimsTrans));
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, outDims.size(),
outDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(outtransDesc, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, outDims.size(),
dimsOutTrans));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dims.size(), dims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
intransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
dims.size(), dimsTrans));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
outDims.size(), outDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
outtransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
outDims.size(), dimsOutTrans));
cnnlTransposeDescriptor_t opDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
@ -53,9 +52,9 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
BangPtr inputTrans = context->getWorkspace(
cnnlGetTensorElementNum(inDesc) * sizeof(float));
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
BangPtr outputTrans = context->getWorkspace(
cnnlGetTensorElementNum(inDesc) * sizeof(float));
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
cnnlStatus_t stat =
cnnlTranspose_v2(context->cnnlHandle(), opDesc, inDesc, input,
intransDesc, inputTrans, wsData, wsSize);
@ -67,7 +66,7 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
cnnlTensorDescriptor_t paraDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&paraDesc));
checkCnnlError(cnnlSetTensorDescriptor(
paraDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
paraDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimsScaleBiasMeanVar.size(), dimsScaleBiasMeanVar.data()));
float alpha = 1.f, beta = 0.f;

View File

@ -13,22 +13,23 @@ class BroadcastCNCL : public BangKernelWithoutConfig {
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
IT_ASSERT(op->getDType() == DataType::Float32);
size_t count = op->getInputs(0)->getBytes() / op->getDType().getSize();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// TODO: Using default stream 0 for now.
CNCL_CHECK(cnclBroadcast(input, output, count, cnclFloat32,
CNCL_CHECK(cnclBroadcast(input, output, count,
cnclDataTypeConvert(op->getDType()),
op->getRoot(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, DataType::Float32,
BroadcastCNCL, "Broadcast_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, BroadcastCNCL,
"Broadcast_CNCL_BANG");
} // namespace infini
#endif

View File

@ -7,7 +7,6 @@ class CeilCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class CeilCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);

View File

@ -7,7 +7,6 @@ class ClipCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ClipObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -19,9 +18,9 @@ class ClipCnnl : public BangKernelWithoutConfig {
auto aDim = op->getInputs(0)->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
cnnlStatus_t stat =
cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
if (stat != CNNL_STATUS_SUCCESS)

View File

@ -7,7 +7,6 @@ class ConcatCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConcatObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
int num = op->numInputs();
int axis = op->getDim();
@ -15,17 +14,18 @@ class ConcatCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
cnnlTensorDescriptor_t desc;
checkCnnlError(cnnlCreateTensorDescriptor(&desc));
checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlTensorDescriptor_t descArray[num];
for (int i = 0; i < num; ++i) {
checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
checkCnnlError(cnnlSetTensorDescriptor(
descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
op->getInputs(i)->getDims().size(),
op->getInputs(i)->getDims().data()));
checkCnnlError(
cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
cnnlDataTypeConvert(op->getDType()),
op->getInputs(i)->getDims().size(),
op->getInputs(i)->getDims().data()));
}
void *argv[num];

View File

@ -7,7 +7,6 @@ class ConvCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
@ -21,8 +20,9 @@ class ConvCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -55,20 +55,24 @@ class ConvCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs0));
checkCnnlError(cnnlSetTensorDescriptor(
aInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs0));
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
aDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs1));
checkCnnlError(cnnlSetTensorDescriptor(
bInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs1));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
bDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
inputs1Array));
int permute[4] = {0, 2, 3, 1};
cnnlTransposeDescriptor_t opDesc;
@ -80,7 +84,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
BangPtr aDataOut = context->getWorkspace(
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
cnnlGetTensorElementNum(aInDesc) * op->getDType().getSize());
cnnlStatus_t stat =
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
aDesc, aDataOut, wsData, wsSize);
@ -91,7 +95,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
&wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr bDataOut = context->getWorkspace(
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
cnnlGetTensorElementNum(bInDesc) * op->getDType().getSize());
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
bDesc, bDataOut, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
@ -100,11 +104,13 @@ class ConvCnnl : public BangKernelWithoutConfig {
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
cInDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
outputArray));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, output));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
output));
cnnlConvolutionForwardAlgo_t algo;
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
@ -116,7 +122,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
algo, &wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr cDataIn = context->getWorkspace(
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
cnnlGetTensorElementNum(cInDesc) * op->getDType().getSize());
stat = cnnlConvolutionForward(
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aDataOut, bDesc,

View File

@ -7,7 +7,6 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvBaseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
@ -21,8 +20,9 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -43,14 +43,17 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs0.data()));
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimInputs0.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs1.data()));
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimInputs1.data()));
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimOutput.data()));
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimOutput.data()));
cnnlConvolutionBwdDataAlgo_t algo;
cnnlGetConvolutionBackwardDataAlgorithm(

View File

@ -7,7 +7,6 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvBackwardFilterObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
@ -21,8 +20,9 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -63,15 +63,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
inputs0ArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(
aDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, inputs0ArrayTrans));
size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
dimInputs0[3] * sizeof(float);
dimInputs0[3] * op->getDType().getSize();
BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);
cnnlStatus_t stat =
@ -82,15 +83,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs1Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
inputs1ArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(
bDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, inputs1ArrayTrans));
size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
dimInputs1[3] * sizeof(float);
dimInputs1[3] * op->getDType().getSize();
BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);
stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
@ -101,15 +103,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
outputArray));
checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
outputArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(
cDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, outputArrayTrans));
size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
dimOutput[3] * sizeof(float);
dimOutput[3] * op->getDType().getSize();
BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);
cnnlConvolutionBwdFilterAlgo_t algo;

View File

@ -7,7 +7,6 @@ class DetCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<DetObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -24,14 +23,14 @@ class DetCnnl : public BangKernelWithoutConfig {
auto dimout = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimin.size(),
dimin.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimin.size(), dimin.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimout.size(),
dimout.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimout.size(), dimout.data()));
cnnlStatus_t stat =
cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);

View File

@ -11,8 +11,8 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
auto [aAlpha, bAlpha, beta] = getAlphBeta();
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -31,32 +31,33 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
cnnlOpTensorDescriptor_t opDesc;
checkCnnlError(cnnlCreateOpTensorDescriptor(&opDesc));
checkCnnlError(cnnlSetOpTensorDescriptor(
opDesc, getOpType(), CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN));
opDesc, getOpType(), cnnlDataTypeConvert(op->getDType()),
CNNL_NOT_PROPAGATE_NAN));
size_t wsSize;
cnnlGetOpTensorWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
&wsSize);
cnnlGetOpTensorWorkspaceSize_v2(context->cnnlHandle(), opDesc, &aAlpha,
aDesc, aData, &bAlpha, bDesc, bData,
&beta, cDesc, cData, &wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
auto [aAlpha, bAlpha, beta] = getAlphBeta();
cnnlStatus_t stat = cnnlOpTensor(context->cnnlHandle(), opDesc, &aAlpha,
aDesc, aData, &bAlpha, bDesc, bData,
wsData, wsSize, &beta, cDesc, cData);
@ -75,7 +76,6 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -95,17 +95,17 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -129,7 +129,6 @@ class BitComputeCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -182,7 +181,6 @@ class DivCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -202,17 +200,17 @@ class DivCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -235,7 +233,6 @@ class MaximumCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -255,17 +252,17 @@ class MaximumCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
@ -287,7 +284,6 @@ class MinimumCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -307,17 +303,17 @@ class MinimumCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
@ -339,7 +335,6 @@ class MSELossCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MSELossObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -359,18 +354,18 @@ class MSELossCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
cnnlStatus_t stat;
if (reduction == MSELossObj::None) {
stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
@ -396,7 +391,6 @@ class PowerCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -417,17 +411,17 @@ class PowerCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -450,7 +444,6 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -470,17 +463,17 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -503,7 +496,6 @@ class FloorModCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -523,17 +515,17 @@ class FloorModCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -556,7 +548,6 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -576,17 +567,17 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,

View File

@ -7,7 +7,6 @@ class ErfCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class ErfCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -7,7 +7,6 @@ class ExpCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class ExpCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -7,7 +7,6 @@ class FillCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<FillObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
@ -17,9 +16,9 @@ class FillCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlFill(context->cnnlHandle(), value, cDesc, cData);

View File

@ -7,7 +7,6 @@ class FloorCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class FloorCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData);

View File

@ -7,7 +7,6 @@ class GatherCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<GatherObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -20,27 +19,56 @@ class GatherCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(
cnnlSetTensorDescriptorPointerMode(bDesc, CNNL_POINTER_MODE_HOST));
if (bDim.size() == 0) {
bDim.push_back(1);
}
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_INT32, bDim.size(),
bDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
BangPtr wsData = context->getWorkspace(aDim.size() * 4);
context->copyBlobFromCPU(wsData, aDim.data(), aDim.size() * 4);
BangPtr indices;
DataType indicesDataType = op->getInputs(1)->getDType();
if (indicesDataType == DataType::Int64) {
// cnnlGatherV2 does not support int64 indices
int indicesSize =
op->getInputs(1)->getBytes() / indicesDataType.getSize();
indices = context->getWorkspace(indicesSize * sizeof(int));
cnnlTensorDescriptor_t bDescInt64;
checkCnnlError(cnnlCreateTensorDescriptor(&bDescInt64));
checkCnnlError(cnnlSetTensorDescriptor(
bDescInt64, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT64, bDim.size(),
bDim.data()));
checkCnnlError(cnnlCastDataType(context->cnnlHandle(), bDescInt64,
bData, CNNL_CAST_INT64_TO_INT32,
bDesc, indices));
cnrtQueueSync(context->getBangQueue());
checkCnnlError(cnnlDestroyTensorDescriptor(bDescInt64));
} else if (indicesDataType == DataType::Int32) {
indices = bData;
} else {
IT_TODO_HALT_MSG("Unsupported data type of indices: " +
indicesDataType.toString());
}
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
BangPtr wsData = context->getWorkspace(aDim.size() * sizeof(int));
context->copyBlobFromCPU(wsData, aDim.data(),
aDim.size() * sizeof(int));
auto axis = op->getAxis();
cnnlStatus_t stat =
cnnlGatherV2(context->cnnlHandle(), axis, aDesc, aData,
(int *)wsData, bDesc, (int *)bData, cDesc, cData);
reinterpret_cast<const int *>(wsData), bDesc,
reinterpret_cast<const int *>(indices), cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
return;
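
cnnlGatherV2 only accepts int32 indices, so the rewritten Gather kernel casts int64 index tensors into a workspace buffer via cnnlCastDataType(CNNL_CAST_INT64_TO_INT32) before gathering, and passes int32 tensors through untouched. The same narrowing expressed host-side; the range check is an illustrative addition, not something the device cast performs:

    #include <cstdint>
    #include <limits>
    #include <stdexcept>
    #include <vector>

    // Sketch: narrow gather indices from int64 to int32, as the BANG
    // kernel does on-device. Indices outside int32 range cannot be
    // represented and are rejected here instead of silently truncated.
    std::vector<int32_t> downcastIndices(const std::vector<int64_t> &idx) {
        std::vector<int32_t> out;
        out.reserve(idx.size());
        for (int64_t v : idx) {
            if (v > std::numeric_limits<int32_t>::max() ||
                v < std::numeric_limits<int32_t>::min())
                throw std::out_of_range("gather index does not fit in int32");
            out.push_back(static_cast<int32_t>(v));
        }
        return out;
    }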

View File

@ -7,7 +7,6 @@ class HardtanhCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<HardtanhObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -20,7 +19,8 @@ class HardtanhCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData,
max, min, aDesc, cData);

View File

@ -7,7 +7,6 @@ class L2LossCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<L2LossObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,7 +17,8 @@ class L2LossCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlStatus_t stat =
cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData);

View File

@ -8,7 +8,6 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<LayerNormObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -29,17 +28,17 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, inDims.size(),
inDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
inDims.size(), inDims.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&fiterDesc));
checkCnnlError(cnnlSetTensorDescriptor(
fiterDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, fiterDims.size(),
fiterDims.data()));
fiterDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
fiterDims.size(), fiterDims.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, outDims.size(),
outDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
outDims.size(), outDims.data()));
size_t wsSize;
cnnlGetLayerNormOpWorkspaceSize(context->cnnlHandle(), axis, inDesc,
&wsSize);

View File

@ -7,7 +7,6 @@ class LogCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<LogObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -33,13 +32,13 @@ class LogCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -7,7 +7,6 @@ class LRNCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<LRNObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -20,13 +19,13 @@ class LRNCnnl : public BangKernelWithoutConfig {
auto size = op->getSize();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
size_t extra_size;
cnnlGetLrnExtraInputSize_v2(context->cnnlHandle(), cDesc,

View File

@ -8,7 +8,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MatmulObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
auto input_num = op->numInputs();
@ -38,25 +37,26 @@ class MatmulCnnl : public BangKernelWithoutConfig {
int32_t transB = op->getTransB();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(
cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
dimInputs0.size(), dimInputs0.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimInputs0.size(), dimInputs0.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(
cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
dimInputs1.size(), dimInputs1.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimInputs1.size(), dimInputs1.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(
cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
dimOutput.size(), dimOutput.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimOutput.size(), dimOutput.data()));
if (input_num > 2) {
checkCnnlError(cnnlCreateTensorDescriptor(&biasDesc));
checkCnnlError(cnnlSetTensorDescriptor(
biasDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dimBias.size(),
dimBias.data()));
checkCnnlError(
cnnlSetTensorDescriptor(biasDesc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(op->getDType()),
dimBias.size(), dimBias.data()));
}
cnnlMatMulDescriptor_t bmm_desc;

View File

@ -7,7 +7,6 @@ class NegTensorCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class NegTensorCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData);

View File

@ -7,7 +7,6 @@ class PadCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PadObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -37,14 +36,14 @@ class PadCnnl : public BangKernelWithoutConfig {
float paddingValue = 0.0;
// input
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimIn.size(),
dimIn.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimIn.size(), dimIn.data()));
// output
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimOut.size(),
dimOut.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimOut.size(), dimOut.data()));
cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
paddings, &paddingValue, cDesc, cData);

View File

@ -8,7 +8,6 @@ class PoolingCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PoolingObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
@ -20,8 +19,9 @@ class PoolingCnnl : public BangKernelWithoutConfig {
int inArray[4] = {n, c, h, w};
cnnlTensorDescriptor_t inDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inArray));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inArray));
bool mode = op->getCeilMode();
// get maxpool descriptor
@ -37,8 +37,9 @@ class PoolingCnnl : public BangKernelWithoutConfig {
int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
cnnlTensorDescriptor_t outDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, outArray));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
outArray));
size_t wsSize;
cnnlGetPoolingWorkspaceSize(context->cnnlHandle(), getPoolingMode(),
outVec[3], outVec[2], &wsSize);

View File

@ -7,7 +7,6 @@ class ReciprocalCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class ReciprocalCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData);

View File

@ -9,7 +9,6 @@ class ReduceCnnlBase : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ReduceBaseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
@ -26,20 +25,20 @@ class ReduceCnnlBase : public BangKernelWithoutConfig {
cnnlTensorDescriptor_t inDesc, outDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, bDim.size(),
bDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
bDim.size(), bDim.data()));
// get reduce descriptor
cnnlReduceDescriptor_t reduceDesc;
checkCnnlError(cnnlCreateReduceDescriptor(&reduceDesc));
checkCnnlError(cnnlSetReduceDescriptor_v2(
reduceDesc, axes.data(), axes.size(), getReduceOp(),
CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES, 0.0));
cnnlDataTypeConvert(op->getDType()), CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0));
// get workspace
size_t workspaceSize = 0;

View File

@ -14,8 +14,8 @@ class CopyBang : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT8,
dim.size() * op->getDType().getSize(), dim.data()));
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlStatus_t stat =
cnnlCopy(context->cnnlHandle(), aDesc, inData, aDesc, outData);
if (stat != CNNL_STATUS_SUCCESS)
@ -28,5 +28,7 @@ class CopyBang : public BangKernelWithoutConfig {
REGISTER_KERNEL(Device::BANG, OpType::Reshape, CopyBang, "Reshape_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Flatten, CopyBang, "Flatten_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Identity, CopyBang, "Identity_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Squeeze, CopyBang, "Squeeze_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Unsqueeze, CopyBang, "Unsqueeze_BANG");
} // namespace infini
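
The old CopyBang descriptor was wrong in an easy-to-miss way: it passed dim.size() * op->getDType().getSize(), the rank scaled by the element size, as the descriptor's dimension count, while still supplying only dim.size() extents. The fix describes the buffer with its real dtype and true rank. An equivalent byte-view formulation, shown here only as a sketch of the alternative:

    // Sketch: describe the tensor as one flat INT8 extent of totalBytes,
    // another correct way to drive cnnlCopy for any element type.
    int64_t numElems = 1;
    for (int d : dim) // `dim` is the tensor shape used above
        numElems *= d;
    int flatShape[1] = {
        static_cast<int>(numElems * op->getDType().getSize())};
    checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
                                           CNNL_DTYPE_INT8, 1, flatShape));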

View File

@ -7,7 +7,6 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -42,11 +42,13 @@ class SliceCnnl : public BangKernelWithoutConfig {
// input
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, aDim_size, aDim_array));
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim_size, aDim_array));
// output
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, cDim_size, cDim_array));
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
cDim_size, cDim_array));
cnnlStatus_t stat =
cnnlStridedSlice(context->cnnlHandle(), aDesc, aData, starts_array,
@ -59,6 +61,6 @@ class SliceCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Slice, DataType::Float32, SliceCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Slice, SliceCnnl,
"Slice_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -7,7 +7,6 @@ class SplitCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SplitObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
int num = op->numOutputs();
int axis = op->getDim();
@ -16,15 +15,17 @@ class SplitCnnl : public BangKernelWithoutConfig {
cnnlTensorDescriptor_t desc;
checkCnnlError(cnnlCreateTensorDescriptor(&desc));
checkCnnlError(cnnlSetTensorDescriptor(
desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlTensorDescriptor_t descArray[num];
for (int i = 0; i < num; ++i) {
checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
checkCnnlError(cnnlSetTensorDescriptor(
descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
op->getOutput(i)->getDims().size(),
op->getOutput(i)->getDims().data()));
checkCnnlError(
cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
cnnlDataTypeConvert(op->getDType()),
op->getOutput(i)->getDims().size(),
op->getOutput(i)->getDims().data()));
}
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());

View File

@ -7,7 +7,6 @@ class SqrtCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class SqrtCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -7,7 +7,6 @@ class TransposeCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<TransposeObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class TransposeCnnl : public BangKernelWithoutConfig {
auto dimout = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimin.size(),
dimin.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimin.size(), dimin.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimout.size(),
dimout.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimout.size(), dimout.data()));
auto permute = op->getPermute();
cnnlTransposeDescriptor_t opDesc;
@ -53,7 +52,6 @@ class DepthToSpaceCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<DepthToSpaceObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -73,13 +71,13 @@ class DepthToSpaceCnnl : public BangKernelWithoutConfig {
auto dimout = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, reshape.size(),
reshape.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
reshape.size(), reshape.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(
cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
transpose.size(), transpose.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
transpose.size(), transpose.data()));
cnnlTransposeDescriptor_t opDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));

View File

@ -9,7 +9,6 @@ class TrigonCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -20,17 +19,18 @@ class TrigonCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlTrigonDescriptor_t opDesc;
checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));
checkCnnlError(
cnnlSetTrigonDescriptor_v2(opDesc, getOpType(), getPrefer()));
cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc,
aDesc, aData, cDesc, cData);

View File

@ -7,7 +7,6 @@ class WhereCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<WhereObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -35,21 +34,21 @@ class WhereCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, bDim.size(),
bDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
bDim.size(), bDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_BOOL, cDim.size(),
cDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&dDesc));
checkCnnlError(cnnlSetTensorDescriptor(dDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dDim.size(),
dDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
dDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dDim.size(), dDim.data()));
size_t wsSize;
cnnlGetSelectV2WorkspaceSize(context->cnnlHandle(), cDesc, aDesc, bDesc,
&wsSize);
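
Note the asymmetry in the Where descriptors: the two value tensors and the output now follow the operator dtype, while the condition descriptor stays CNNL_DTYPE_BOOL. Element-wise (broadcasting aside), SelectV2 computes the usual select; a scalar reference:

    #include <cstddef>

    // Reference semantics of Where/SelectV2, broadcasting omitted:
    // out[i] = cond[i] ? x[i] : y[i].
    template <typename T>
    void whereRef(const bool *cond, const T *x, const T *y, T *out,
                  size_t n) {
        for (size_t i = 0; i < n; ++i)
            out[i] = cond[i] ? x[i] : y[i];
    }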

View File

@ -0,0 +1,29 @@
#include "operators/reshape.h"
#include "core/kernel.h"
#include "operators/squeeze.h"
#include "operators/unsqueeze.h"
namespace infini {
class NaiveIdentity : public CpuKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
auto size = _op->getInputs()[0]->getBytes();
void *inptr = _op->getInputs(0)->getRawDataPtr<void *>();
void *outptr = _op->getOutput()->getRawDataPtr<void *>();
std::memcpy(outptr, inptr, size);
}
};
REGISTER_KERNEL(Device::CPU, OpType::Reshape, NaiveIdentity,
"ReshapeNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Identity, NaiveIdentity,
"IdentityNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Unsqueeze, NaiveIdentity,
"UnsqueezeNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Squeeze, NaiveIdentity,
"SqueezeNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Flatten, NaiveIdentity,
"FlattenNaive_CPU");
} // namespace infini
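
Reshape, Flatten, Identity, Squeeze, and Unsqueeze all preserve the row-major element order, which is why a single memcpy kernel can back all five registrations. A small illustration of why a reshape moves no data:

    #include <cstring>
    #include <vector>

    int main() {
        // A 2x3 tensor and its 3x2 reshape share the same six-element
        // row-major buffer; only the shape metadata differs.
        std::vector<float> src = {1, 2, 3, 4, 5, 6}; // viewed as 2x3
        std::vector<float> dst(src.size());          // viewed as 3x2
        std::memcpy(dst.data(), src.data(), src.size() * sizeof(float));
        return 0;
    }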

View File

@ -47,6 +47,10 @@ class NativeUnary : public CpuKernelWithoutConfig {
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
}
template <typename T> static T siluCompute(T val) {
return val / (1 + pow(E_CONSTANT, -val));
}
template <typename T> static T erfCompute(T val) { return std::erf(val); }
template <typename T> static T aCosCompute(T val) { return std::acos(val); }
@ -84,6 +88,9 @@ class NativeUnary : public CpuKernelWithoutConfig {
case OpType::Gelu:
_doCompute = geluCompute<T>;
break;
case OpType::Silu:
_doCompute = siluCompute<T>;
break;
case OpType::Sigmoid:
_doCompute = sigmoidCompute<T>;
break;
@ -138,6 +145,9 @@ class NativeUnary : public CpuKernelWithoutConfig {
case OpType::Atanh:
_doCompute = aTanhCompute<T>;
break;
case OpType::Acosh:
_doCompute = aCoshCompute<T>;
break;
default:
IT_TODO_HALT();
}
@ -289,6 +299,7 @@ class Log : public CpuKernelWithoutConfig {
REGISTER_KERNEL(Device::CPU, OpType::Relu, NativeUnary, "reluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, NativeUnary, "geluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Silu, NativeUnary, "siluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, NativeUnary, "sigmoidNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::HardSigmoid, NativeUnary,
"hardSigmoidNaive_CPU");

View File

@ -28,9 +28,8 @@ class AllReduceNCCL : public CudaKernelWithoutConfig {
ncclComm_t comm =
dynamic_cast<NcclCommunicatorObj &>(context->getCommunicator())
.getNcclComm();
// TODO: Using default stream 0 for now.
checkNcclError(
ncclAllReduce(input, output, count, ncclType, getRedOp(), comm, 0));
checkNcclError(ncclAllReduce(input, output, count, ncclType, getRedOp(),
comm, CUDAStream::getCurrentStream()));
}
virtual ncclRedOp_t getRedOp() const = 0;
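
Launching ncclAllReduce on stream 0 serialized the collective against the legacy default stream; passing CUDAStream::getCurrentStream() keeps it ordered with the kernels the runtime actually enqueues. CUDAStream is the project's stream holder; its real shape may differ, but the idea is a process-wide current stream, roughly:

    #include <cuda_runtime.h>

    // Sketch of a current-stream holder like the CUDAStream used above;
    // an assumption about its interface, not the project's definition.
    class CUDAStream {
      public:
        static void create() { cudaStreamCreate(&stream_); }
        static cudaStream_t getCurrentStream() { return stream_; }

      private:
        // nullptr falls back to the default stream until create() runs.
        static inline cudaStream_t stream_ = nullptr;
    };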

View File

@ -21,7 +21,7 @@ class AttentionKVCacheCompute {
public:
void do_compute(Tensor input_k_cache, Tensor input_v_cache, Tensor input_q,
Tensor input_k, Tensor input_v, Tensor position_id,
Tensor output_matmul) const {
Tensor output_matmul, CudaPtr p_workspace) const {
AttentionKVCacheMetadata metadata;
initAttentionKVCacheMetadata(metadata, input_v_cache);
@ -32,7 +32,8 @@ class AttentionKVCacheCompute {
input_v->getRawDataPtr<float *>(),
position_id->getRawDataPtr<int *>(),
output_matmul->getRawDataPtr<float *>(),
metadata);
metadata, (float *)p_workspace,
(float *)(p_workspace + (1ll << 30)));
}
};
@ -41,10 +42,14 @@ class AttentionKVCacheCuda : private AttentionKVCacheCompute,
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
IT_ASSERT(_op->getDType() == DataType::Float32);
size_t workspaceSize = 2ll << 30;
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
CudaPtr idxWsData = context->getWorkspace(workspaceSize);
do_compute(_op->getInputs()[0], _op->getInputs()[1],
_op->getInputs()[2], _op->getInputs()[3],
_op->getInputs()[4], _op->getInputs()[5],
_op->getOutputs()[0]);
_op->getOutputs()[0], idxWsData);
}
};
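
The rewritten attention kernel needs scratch space for per-chunk partial results, so compute() now takes a single 2 GiB workspace (2ll << 30 bytes) and do_compute splits it by pointer arithmetic: the first half holds the partial outputs, and the half starting (1ll << 30) bytes in holds the partial softmax sums (assuming CudaPtr is byte-addressed). The same split, spelled out:

    #include <cstddef>

    // Sketch: partition one workspace allocation into the two float
    // regions used by the two-pass attention kernel below.
    struct AttnWorkspace {
        float *partialOut; // per-chunk normalized outputs
        float *partialSum; // per-chunk softmax denominators
    };

    AttnWorkspace splitWorkspace(void *base) {
        const size_t half = 1ull << 30; // 1 GiB each, 2 GiB total
        auto *bytes = static_cast<char *>(base);
        return {reinterpret_cast<float *>(bytes),
                reinterpret_cast<float *>(bytes + half)};
    }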

View File

@ -2,127 +2,170 @@
#include "cuda/cuda_attention_kvcache.h"
#define WARP_SIZE 32
#define BLOCKSIZE WARP_SIZE
#define SEQ_UNIT 64
#define SEQ_UNIT 16
__global__ void _attention_kvcache_kernel(float* input_k_cache,
// ASSUME SEQ_LEN OF Q IS 1
__global__ void _attention_kvcache_kernel_128_1(float* input_k_cache,
float* input_v_cache,
float* input_q,
float* input_k,
float* input_v,
int* position_id,
float* output_matmul,
AttentionKVCacheMetadata compMeta) {
AttentionKVCacheMetadata compMeta,
float* output_O_temp,
float* output_sum_temp) {
int seq_length = position_id[0] + 1;
int stride = (seq_length + SEQ_UNIT - 1) / SEQ_UNIT;
if(blockIdx.y >= stride)
return;
int lane_id = threadIdx.x % WARP_SIZE;
int group_id = threadIdx.x / WARP_SIZE;
int parallel_idx = blockIdx.x * (blockDim.x / WARP_SIZE) + group_id;
int idx_seq = blockIdx.y * SEQ_UNIT;
if(parallel_idx >= compMeta.dimSize[0] * compMeta.dimSize[1])
return;
float ptr_V[SEQ_UNIT*2];
float ptr_K[SEQ_UNIT*2];
float ptr_Q[2];
float ptr_P[SEQ_UNIT];
float ptr_V[SEQ_UNIT*4]; // V
float ptr_K[SEQ_UNIT*4]; // K
float ptr_Q[4]; // Q
float ptr_P[SEQ_UNIT] = {0};
float ptr_O[2];
float ptr_max[1];
float ptr_sum[1];
float ptr_O[4] = {0};
float ptr_sum[1] = {0};
float ptr_max_last[1];
float ptr_sum_last[1];
float ptr_O_last[2];
// readin Q
(float4 &)ptr_Q[0] = (float4 &)input_q[(lane_id * 4) + (parallel_idx * 128)];
int common_idx = (lane_id * 4) + (parallel_idx * compMeta.stride[1]);
(float2 &)ptr_Q[0] = (float2 &)input_q[(lane_id * 2) + (parallel_idx * 64)];
int SEQ_LENGTH = position_id[0] + 1;
int common_idx = (lane_id * 2) + (parallel_idx * compMeta.stride[1]);
for (int idx_seq = 0; idx_seq < SEQ_LENGTH; idx_seq += SEQ_UNIT){
ptr_max_last[0] = ptr_max[0];
ptr_sum_last[0] = ptr_sum[0];
(float2 &)ptr_O_last[0] = (float2 &)ptr_O[0];
// Q*K
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
if(idx_SEQ_UNIT + idx_seq < seq_length - 1){
(float4 &)ptr_K[idx_SEQ_UNIT * 4]
= (float4 &) input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
}
else{
(float4 &)ptr_K[idx_SEQ_UNIT * 4]
= (float4 &) input_k[((lane_id * 4) + parallel_idx * compMeta.stride[2])];
(float4 &)input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
(float4 &)ptr_K[idx_SEQ_UNIT * 4];
}
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
if(idx_SEQ_UNIT + idx_seq < SEQ_LENGTH - 1){
(float2 &)ptr_K[idx_SEQ_UNIT * 2]
= (float2 &) input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
}
else{
(float2 &)ptr_K[idx_SEQ_UNIT * 2]
= (float2 &) input_k[((lane_id * 2) + parallel_idx * compMeta.stride[2])];
(float2 &)input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
(float2 &)ptr_K[idx_SEQ_UNIT * 2];
}
ptr_K[idx_SEQ_UNIT * 2] = ptr_Q[0] * ptr_K[idx_SEQ_UNIT * 2];
ptr_K[idx_SEQ_UNIT * 2 + 1] = ptr_Q[1] * ptr_K[idx_SEQ_UNIT * 2 + 1];
for (int i = 0; i < 4; i ++){
ptr_K[idx_SEQ_UNIT * 4 + i] = ptr_Q[i] * ptr_K[idx_SEQ_UNIT * 4 + i];
#pragma unroll
for (int offset = 16; offset > 0; offset /= 2) {
ptr_K[idx_SEQ_UNIT * 2] += __shfl_down_sync(0xffffffff, ptr_K[idx_SEQ_UNIT * 2], offset);
ptr_K[idx_SEQ_UNIT * 4 + i] += __shfl_down_sync(0xffffffff, ptr_K[idx_SEQ_UNIT * 4 + i], offset);
}
ptr_P[idx_SEQ_UNIT] = ptr_K[idx_SEQ_UNIT * 2];
#pragma unroll
for (int offset = 16; offset > 0; offset /= 2){
ptr_K[((idx_SEQ_UNIT * 2) + 1)] += __shfl_down_sync(0xffffffff, ptr_K[((idx_SEQ_UNIT * 2) + 1)], offset);
}
ptr_P[idx_SEQ_UNIT] += ptr_K[((idx_SEQ_UNIT * 2) + 1)];
ptr_P[idx_SEQ_UNIT] += ptr_K[idx_SEQ_UNIT * 4 + i];
}
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
ptr_P[idx_SEQ_UNIT] = __shfl_sync(0xffffffff, ptr_P[idx_SEQ_UNIT], 0);
ptr_P[idx_SEQ_UNIT] /= 8;
ptr_max[0] = (idx_SEQ_UNIT == 0) ? ptr_P[0] : max(ptr_max[0], ptr_P[idx_SEQ_UNIT]);
}
ptr_max[0] = (idx_seq == 0) ? ptr_max[0] : max(ptr_max[0], ptr_max_last[0]);
ptr_sum[0] = 0;
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
ptr_P[idx_SEQ_UNIT] = expf(ptr_P[idx_SEQ_UNIT] - ptr_max[0]);
ptr_sum[0] += ptr_P[idx_SEQ_UNIT];
}
ptr_sum[0] = (idx_seq == 0) ? ptr_sum[0] : expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] + ptr_sum[0];
ptr_O[0] = 0;
ptr_O[1] = 0;
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
if(idx_SEQ_UNIT + idx_seq < SEQ_LENGTH - 1){
(float2 &)ptr_V[idx_SEQ_UNIT * 2]
= (float2 &) input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
}
else{
(float2 &)ptr_V[idx_SEQ_UNIT * 2]
= (float2 &) input_v[((lane_id * 2) + parallel_idx * compMeta.stride[2])];
(float2 &)input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
(float2 &)ptr_V[idx_SEQ_UNIT * 2];
}
ptr_P[idx_SEQ_UNIT] /= ptr_sum[0];
ptr_O[0] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 2)], ptr_O[0]);
ptr_O[1] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 2) + 1], ptr_O[1]);
}
ptr_O[0] = (idx_seq == 0) ? ptr_O[0] : ptr_O[0] + ptr_O_last[0] * expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] / ptr_sum[0];
ptr_O[1] = (idx_seq == 0) ? ptr_O[1] : ptr_O[1] + ptr_O_last[1] * expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] / ptr_sum[0];
}
(float2 &)output_matmul[(lane_id * 2) + (parallel_idx * compMeta.dimSize[3])] = (float2 &)ptr_O[0];
// div sqrt(d)
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
ptr_P[idx_SEQ_UNIT] = __shfl_sync(0xffffffff, ptr_P[idx_SEQ_UNIT], 0);
ptr_P[idx_SEQ_UNIT] /= sqrt(128.0);
}
// softmax
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
ptr_P[idx_SEQ_UNIT] = expf(ptr_P[idx_SEQ_UNIT]);
ptr_sum[0] += ptr_P[idx_SEQ_UNIT];
}
// * V
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
if(idx_SEQ_UNIT + idx_seq < seq_length - 1){
(float4 &)ptr_V[idx_SEQ_UNIT * 4]
= (float4 &) input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
}
else{
(float4 &)ptr_V[idx_SEQ_UNIT * 4]
= (float4 &) input_v[((lane_id * 4) + parallel_idx * compMeta.stride[2])];
(float4 &)input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])]
= (float4 &)ptr_V[idx_SEQ_UNIT * 4];
}
#pragma unroll
for (int i = 0; i < 4; i ++)
ptr_O[i] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 4 + i)], ptr_O[i]);
}
#pragma unroll
for (int i = 0; i < 4; i ++)
ptr_O[i] /= ptr_sum[0];
(float4 &)output_O_temp[(lane_id * 4) + (blockIdx.y * compMeta.dimSize[3]) + (parallel_idx * compMeta.dimSize[3] * stride)] = (float4 &)ptr_O[0];
if(lane_id == 0){
output_sum_temp[blockIdx.y + parallel_idx * stride] = ptr_sum[0];
}
}
__global__ void _attention_kvcache_kernel_128_2(int* position_id,
float* output_matmul,
AttentionKVCacheMetadata compMeta,
float* output_O_temp,
float* output_sum_temp) {
int lane_id = threadIdx.x % WARP_SIZE;
int group_id = threadIdx.x / WARP_SIZE;
int parallel_idx = blockIdx.x * (blockDim.x / WARP_SIZE) + group_id;
float ptr_O[4] = {0};
float ptr_O_sum[4] = {0};
float ptr_sum = 0;
float ptr_sum_temp;
int size = (position_id[0] + SEQ_UNIT) / SEQ_UNIT;
#pragma unroll
for(int i = 0; i < size; i ++){
(float4 &)ptr_O[0]
= (float4 &)output_O_temp[(lane_id * 4) + (i * compMeta.dimSize[3]) + parallel_idx * compMeta.dimSize[3] * size];
ptr_sum_temp = output_sum_temp[i + parallel_idx * size];
#pragma unroll
for(int k = 0; k < 4; k ++)
ptr_O_sum[k] += ptr_O[k] * ptr_sum_temp;
ptr_sum += ptr_sum_temp;
}
#pragma unroll
for(int k = 0; k < 4; k ++)
ptr_O_sum[k] = ptr_O_sum[k] / ptr_sum;
(float4 &)output_matmul[(lane_id * 4) + (parallel_idx * compMeta.dimSize[3])] = (float4 &)ptr_O_sum[0];
}
namespace infini {
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache, float *input_q, float *input_k,
float *input_v, int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta) {
IT_ASSERT(compMeta.dimSize[3] == 64);
dim3 gridDim(compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), 1);
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
float *input_q, float *input_k,
float *input_v, int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta,
float *output_O_temp, float *output_sum_temp) {
IT_ASSERT(compMeta.dimSize[3] == 128);
int gridsize_y = (compMeta.dimSize[2] - 1 + SEQ_UNIT) / SEQ_UNIT;
dim3 gridDim(compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), gridsize_y);
dim3 blockDim(BLOCKSIZE, 1);
_attention_kvcache_kernel<<<gridDim, blockDim>>>(
input_k_cache, input_v_cache, input_q, input_k, input_v, position_id, output_matmul, compMeta);
_attention_kvcache_kernel_128_1
<<<gridDim, blockDim, 0, CUDAStream::getCurrentStream()>>>
(input_k_cache, input_v_cache, input_q, input_k, input_v, position_id,
compMeta, output_O_temp, output_sum_temp);
_attention_kvcache_kernel_128_2
<<<compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), WARP_SIZE,
0, CUDAStream::getCurrentStream()>>>
(position_id, output_matmul, compMeta, output_O_temp, output_sum_temp);
}
} // namespace infini
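
The split into _128_1 and _128_2 is a two-pass scheme: pass one walks the sequence in SEQ_UNIT-sized chunks, storing each chunk's softmax-normalized partial output O_i and its denominator s_i; pass two recombines them as O = (sum of O_i * s_i) / (sum of s_i), which equals the softmax-weighted value sum over the whole sequence. (Unlike the old single-pass kernel, the exponentials are no longer shifted by a running max.) A host-side check of the recombination identity:

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Verify that chunked softmax recombination matches the monolithic
    // result; scores/values stand in for one Q.K row and the V column.
    int main() {
        std::vector<double> scores = {0.3, -1.2, 2.0, 0.7, -0.5, 1.1};
        std::vector<double> values = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
        const size_t chunk = 2; // SEQ_UNIT stand-in

        double num = 0, den = 0;
        for (size_t c = 0; c < scores.size(); c += chunk) {
            double s_i = 0, o_i = 0;
            for (size_t j = c; j < c + chunk; ++j) {
                double e = std::exp(scores[j]);
                s_i += e;
                o_i += e * values[j];
            }
            o_i /= s_i;       // pass 1 stores normalized O_i and s_i
            num += o_i * s_i; // pass 2 re-weights each chunk by s_i
            den += s_i;
        }

        double refNum = 0, refDen = 0;
        for (size_t j = 0; j < scores.size(); ++j) {
            double e = std::exp(scores[j]);
            refNum += e * values[j];
            refDen += e;
        }
        assert(std::abs(num / den - refNum / refDen) < 1e-12);
        return 0;
    }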

View File

@ -25,8 +25,9 @@ void clip_kernel(float *input, float *output, int num, float minValue,
float maxValue) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
maxValue);
_clip_kernel
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
input, output, num, minValue, maxValue);
}
}; // namespace infini
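
The launch configuration used throughout these files is a ceiling division: gridsize = (num + blocksize - 1) / blocksize guarantees gridsize * blocksize >= num, and the tid < num guard inside each kernel masks off the tail. For example, num = 1000 with blocksize = 512 launches 2 blocks (1024 threads, 24 idle):

    #include <cassert>

    constexpr int ceilDiv(int num, int blocksize) {
        return (num + blocksize - 1) / blocksize;
    }

    int main() {
        static_assert(ceilDiv(1000, 512) == 2, "tail gets its own block");
        static_assert(ceilDiv(1024, 512) == 2, "exact fit adds no block");
        assert(ceilDiv(1, 512) == 1);
        return 0;
    }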

View File

@ -115,6 +115,20 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
auto a_dim = op->getInputs(0)->getDims();
auto b_dim = op->getInputs(1)->getDims();
auto c_dim = op->getOutput()->getDims();
const int dType = _op->getDType().getIndex();
// Fast path: when b is a 0-dim (scalar) tensor, skip broadcast indexing
if (b_dim.size() == 0) {
if (op->getOpType() == OpType::Div) {
div_const_kernel(dType, aData, bData, cData,
op->getOutput()->size());
return;
} else if (op->getOpType() == OpType::Pow) {
pow_const_kernel(dType, aData, bData, cData,
op->getOutput()->size());
return;
}
}
if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4)
IT_TODO_HALT();
@ -127,7 +141,6 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
const int dType = _op->getDType().getIndex();
if (op->getOpType() == OpType::Div) {
div_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
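
The new fast path fires when b has rank 0: a scalar divisor or exponent needs none of the 4-D broadcast index arithmetic, so div_const_kernel and pow_const_kernel (defined in the element-wise CUDA diff that follows) read *y once per thread. The reference semantics, assuming b points at a single element:

    #include <cstddef>

    // Reference for the scalar fast path: z[i] = x[i] / *y. The general
    // kernel instead decodes a 4-D broadcast index for every element.
    template <typename T>
    void divConstRef(const T *x, const T *y, T *z, size_t n) {
        const T scalar = *y; // one load instead of per-element indexing
        for (size_t i = 0; i < n; ++i)
            z[i] = x[i] / scalar;
    }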

View File

@ -131,8 +131,9 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
}
#define CASE(OP, T) \
_##OP##_kernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
_##OP##_kernel<DT_CUDA<T>::t> \
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
#define SWITCH_DTYPE(OP, DTYPE) \
switch (DTYPE) { \
@ -176,7 +177,92 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
IT_TODO_HALT(); \
}
template <class T>
__global__ void _div_const_kernel(void const *__restrict__ x,
void const *__restrict__ y,
void *__restrict__ z, const size_t n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
((T *)z)[tid] = ((T *)x)[tid] / *((T *)y);
}
}
template <class T>
__global__ void _pow_const_kernel(void const *__restrict__ x,
void const *__restrict__ y,
void *__restrict__ z, const size_t n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
((T *)z)[tid] = pow(((T *)x)[tid], *((T *)y));
}
}
template <>
__global__ void _pow_const_kernel<half>(void const *__restrict__ x,
void const *__restrict__ y,
void *__restrict__ z, const size_t n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
((half *)z)[tid] = pow(((float)((half *)x)[tid]), *((half *)y));
}
}
#define CASE_CONST(OP, T) \
_##OP##_const_kernel<DT_CUDA<T>::t> \
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(a, b, c, \
n);
#define SWITCH_DTYPE_CONST(OP, DTYPE) \
switch (DTYPE) { \
case 1: \
CASE_CONST(OP, 1) \
break; \
case 2: \
CASE_CONST(OP, 2) \
break; \
case 3: \
CASE_CONST(OP, 3) \
break; \
case 4: \
CASE_CONST(OP, 4) \
break; \
case 5: \
CASE_CONST(OP, 5) \
break; \
case 6: \
CASE_CONST(OP, 6) \
break; \
case 7: \
CASE_CONST(OP, 7) \
break; \
case 10: \
CASE_CONST(OP, 10) \
break; \
case 11: \
CASE_CONST(OP, 11) \
break; \
case 12: \
CASE_CONST(OP, 12) \
break; \
case 13: \
CASE_CONST(OP, 13) \
break; \
default: \
IT_TODO_HALT(); \
}
namespace infini {
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n) {
size_t blocksize = block_work_size();
size_t gridsize = (n + block_work_size() - 1) / block_work_size();
SWITCH_DTYPE_CONST(div, dType);
}
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n) {
size_t blocksize = block_work_size();
size_t gridsize = (n + block_work_size() - 1) / block_work_size();
SWITCH_DTYPE_CONST(pow, dType);
}
void div_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c3) {
@ -202,11 +288,13 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
int num = c0 * c1 * c2 * c3;
int gridsize = (num + block_work_size() - 1) / block_work_size();
if (dType == 1) {
_pow_kernel<float><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0,
b1, b2, b3, c0, c1, c2, c3);
_pow_kernel<float>
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
} else if (dType == 3) {
_pow_kernel<int8_t><<<gridsize, blocksize>>>(
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
_pow_kernel<int8_t>
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
} else if (dType == 10) {
int a_size = a0 * a1 * a2 * a3;
int b_size = b0 * b1 * b2 * b3;
@ -220,9 +308,10 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
for (int i = 0; i < b_size; ++i) {
b_float[i] = __half2float(((half *)b)[i]);
}
_pow_kernel<float><<<gridsize, blocksize>>>(
a_float.data(), b_float.data(), c_float.data(), a0, a1, a2, a3, b0,
b1, b2, b3, c0, c1, c2, c3);
_pow_kernel<float>
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
a_float.data(), b_float.data(), c_float.data(), a0, a1, a2, a3,
b0, b1, b2, b3, c0, c1, c2, c3);
for (int i = 0; i < c_size; ++i) {
((half *)c)[i] = __float2half(c_float[i]);
}

View File

@ -39,10 +39,19 @@ __global__ void _expandKernel(void *input, void *output, int nDims,
}
}
template <class T>
static __global__ void _expandRowKernel(void *__restrict__ dst,
void const *__restrict__ src) {
auto da = gridDim.x, db = blockDim.y, dx = blockDim.x, n = blockIdx.y,
a = blockIdx.x, b = threadIdx.y, x = threadIdx.x;
auto i = ((n * da + a) * db + b) * dx + x, j = (a * db + b) * dx + x;
reinterpret_cast<T *>(dst)[i] = reinterpret_cast<T const *>(src)[j];
}
namespace infini {
#define CASE(T) \
_expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
_expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize, \
0, CUDAStream::getCurrentStream()>>>( \
input, output, nDims, outputsize, inputShape, outputShape);
#define SWITCH_DTYPE(DTYPE) \
@ -95,4 +104,67 @@ void expandKernel(int dType, void *input, void *output, int nDims,
SWITCH_DTYPE(dType)
}
#define CASE_ROW(T) \
    _expandRowKernel<DT_CUDA<T>::t>                                       \
<<<grid, block, 0, CUDAStream::getCurrentStream()>>>(output, input);
#define SWITCH_DTYPE_ROW(DTYPE) \
switch (DTYPE) { \
case 1: \
CASE_ROW(1) \
break; \
case 2: \
CASE_ROW(2) \
break; \
case 3: \
CASE_ROW(3) \
break; \
case 4: \
CASE_ROW(4) \
break; \
case 5: \
CASE_ROW(5) \
break; \
case 6: \
CASE_ROW(6) \
break; \
case 7: \
CASE_ROW(7) \
break; \
case 10: \
CASE_ROW(10) \
break; \
case 11: \
CASE_ROW(11) \
break; \
case 12: \
CASE_ROW(12) \
break; \
case 13: \
CASE_ROW(13) \
break; \
case 16: \
CASE_ROW(16) \
break; \
default: \
IT_TODO_HALT(); \
}
// Optimization for expanding a row vector. The row length must be a multiple of 32
void expandRowKernel(int dType, void *input, void *output, int n_rows,
int row_len) {
// Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32
// input: 1 x (a x b x 32 x sizeT)
// output: n_rows x (a x b x 32 x sizeT)
// grid: n_rows x a
// block: b x 32
auto c = row_len / 32, b = c;
if (b > 32) {
for (b = 32; c % b != 0; --b);
}
auto a = c / b;
dim3 grid(a, n_rows), block(32, b);
SWITCH_DTYPE_ROW(dType)
}
} // namespace infini
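
expandRowKernel maps the row broadcast 1 x row_len -> n_rows x row_len onto a grid of n_rows x a blocks of b x 32 threads, so the launcher must factor row_len = a * b * 32 with b <= 32: it picks the largest b <= 32 dividing c = row_len / 32, falling back to b = 1 when c is prime. The same factorization, host-side:

    #include <cassert>
    #include <utility>

    // Factor row_len = a * b * 32 with b <= 32, mirroring the launcher
    // above. row_len must be a multiple of the warp size (32).
    std::pair<int, int> factorRow(int row_len) {
        assert(row_len % 32 == 0);
        int c = row_len / 32, b = c;
        if (b > 32)
            for (b = 32; c % b != 0; --b) // largest divisor of c, <= 32
                ;
        return {c / b, b}; // {a, b}
    }

    int main() {
        auto [a, b] = factorRow(4096); // c = 128 -> a = 4, b = 32
        assert(a == 4 && b == 32);
        auto [a2, b2] = factorRow(32 * 37); // prime c = 37 -> b = 1
        assert(a2 == 37 && b2 == 1);
        return 0;
    }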

View File

@ -19,7 +19,8 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
int oSize) {
int blocksize = 32 * 16;
int gridsize = (oSize + blocksize - 1) / blocksize;
_extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
oSize);
_extend_kernel
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
in, out, blockSize, blockSizeOuter, oSize);
}
} // namespace infini

View File

@ -45,9 +45,12 @@ void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num) {
int gridSize = (num + blockSize - 1) / blockSize;
if (metaData.indexType == DataType::Int64) {
_gather_kernel<T, int64_t>
<<<gridSize, blockSize>>>(in, out, metaData, num);
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>
(in, out, metaData, num);
} else {
_gather_kernel<T, int><<<gridSize, blockSize>>>(in, out, metaData, num);
_gather_kernel<T, int>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>
(in, out, metaData, num);
}
}
template void gather_kernel<float>(float *in, float *out,

View File

@ -40,22 +40,26 @@ void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
int gridSize = (num + blockSize - 1) / blockSize;
if (metaData.dataType == DataType::Float32 &&
metaData.indexType == DataType::Int64) {
_gather_elements_kernel<float, int64_t><<<gridSize, blockSize>>>(
_gather_elements_kernel<float, int64_t>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
reinterpret_cast<float *>(in), reinterpret_cast<float *>(out),
metaData, num);
} else if (metaData.dataType == DataType::Int32 &&
metaData.indexType == DataType::Int64) {
_gather_elements_kernel<int, int64_t><<<gridSize, blockSize>>>(
_gather_elements_kernel<int, int64_t>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
reinterpret_cast<int *>(in), reinterpret_cast<int *>(out), metaData,
num);
} else if (metaData.dataType == DataType::Float32 &&
metaData.indexType == DataType::Int32) {
_gather_elements_kernel<float, int><<<gridSize, blockSize>>>(
_gather_elements_kernel<float, int>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
reinterpret_cast<float *>(in), reinterpret_cast<float *>(out),
metaData, num);
} else if (metaData.dataType == DataType::Int32 &&
metaData.indexType == DataType::Int32) {
_gather_elements_kernel<int, int><<<gridSize, blockSize>>>(
_gather_elements_kernel<int, int>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
reinterpret_cast<int *>(in), reinterpret_cast<int *>(out), metaData,
num);
} else {

View File

@ -344,8 +344,8 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
int BLOCK_DIM = 1024;
blockLaynormKernel<float, 1024>
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
eps, scaleSize, bias, biasSize);
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, bias, biasSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
@ -353,9 +353,10 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
warpLaynormKernel<float, 32, 32>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
@ -363,8 +364,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<float, 16, 64>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
@ -373,8 +375,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<float, 8, 128>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else {
int BLOCK_DIM_x = 4;
@ -383,8 +386,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<float, 4, 256>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
}
}
@ -396,8 +400,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockLaynormKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
input, scale, dimsize, stride, output, eps, scaleSize);
blockLaynormKernel<float, 1024>
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
@ -405,8 +410,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<float, 32, 32>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
@ -414,8 +420,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<float, 16, 64>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
@ -423,8 +430,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<float, 8, 128>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
@ -432,8 +440,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<float, 4, 256>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
}
}
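The dispatch strategy is the same in every overload: rows longer than 1024 elements get one 1024-thread block per row (`blockLaynormKernel`), while shorter rows fall through warp-level variants whose x-dimension shrinks (32, 16, 8, 4) as the y-dimension grows (32, 64, 128, 256), so each 1024-thread block covers several rows at once. A hedged sketch of that threshold ladder — the actual `num_block_x` computation sits outside the visible hunks, so the ceiling division below is an assumption:

    // Assumption: num_block rows are packed BLOCK_DIM_y at a time per block.
    static inline int gridBlocksFor(int num_block, int block_dim_y) {
        return (num_block + block_dim_y - 1) / block_dim_y; // ceil(num_block / y)
    }
    // dimsize > 1024 -> blockLaynormKernel<T, 1024>, one block per row
    // dimsize > 31   -> warpLaynormKernel<T, 32, 32>
    // dimsize > 15   -> warpLaynormKernel<T, 16, 64>
    // dimsize > 7    -> warpLaynormKernel<T, 8, 128>
    // else           -> warpLaynormKernel<T, 4, 256>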
//-----------------
@ -445,8 +454,8 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
int BLOCK_DIM = 1024;
blockLaynormKernel<half, 1024>
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
eps, scaleSize, bias, biasSize);
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, bias, biasSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
@ -454,8 +463,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<half, 32, 32>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
@ -464,8 +474,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<half, 16, 64>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
@ -474,8 +485,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<half, 8, 128>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else {
int BLOCK_DIM_x = 4;
@ -484,8 +496,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<half, 4, 256>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
}
}
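The half overloads are structurally identical to the float ones; only the template type argument changes, so the same stream-aware rewrite applies verbatim. A sketch of how one templated wrapper could cover both element types — `demoNorm` and `launchDemoNorm` are stand-ins, not this file's API:

    // Sketch: a templated wrapper over an illustrative kernel, showing how
    // the float and half paths share one launch shape.
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    template <typename T>
    __global__ void demoNorm(const T *in, T *out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = in[i]; // placeholder body; the real math lives in warpLaynormKernel
    }

    template <typename T>
    void launchDemoNorm(const T *in, T *out, int n, cudaStream_t stream) {
        int threads = 256;
        int blocks = (n + threads - 1) / threads;
        demoNorm<T><<<blocks, threads, 0, stream>>>(in, out, n);
    }

    // Explicit instantiations matching the two element types in this file.
    template void launchDemoNorm<float>(const float *, float *, int, cudaStream_t);
    template void launchDemoNorm<half>(const half *, half *, int, cudaStream_t);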
@ -497,8 +510,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockLaynormKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
input, scale, dimsize, stride, output, eps, scaleSize);
blockLaynormKernel<half, 1024>
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
@ -506,8 +520,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<half, 32, 32>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
@ -515,8 +530,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<half, 16, 64>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
@ -524,8 +540,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<half, 8, 128>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
@ -533,8 +550,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<half, 4, 256>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
}
}
} // namespace infini
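For callers, the practical effect is that layer-norm launches now order correctly with other work queued on the same stream. A hypothetical usage sketch — the stream binding step is an InfiniTensor-side operation not shown in this diff:

    #include <cuda_runtime.h>

    int main() {
        cudaStream_t s;
        cudaStreamCreate(&s);
        // The launches above resolve their stream via
        // CUDAStream::getCurrentStream(); binding that current stream to `s`
        // (not shown here) would order the layer-norm launches after any
        // work already enqueued on `s`.
        cudaStreamSynchronize(s); // wait for everything enqueued on s
        cudaStreamDestroy(s);
        return 0;
    }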

Some files were not shown because too many files have changed in this diff