Compare commits


1 commit

| Author | SHA1 | Message | Date |
|---|---|---|---|
| wanghailu0717 | 0a5d273130 | Add: print derivation steps for conv2gemm | 2023-11-10 23:16:44 +08:00 |
293 changed files with 3227 additions and 9672 deletions


@@ -14,10 +14,10 @@ env:
   protobuf-version: "3.21.12"
   python-version: "3.10"
-  resnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/resnet18-v2-7.onnx
-  inception-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/inception-v2-9.onnx
-  densenet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/densenet-12.onnx
-  efficientnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/efficientnet-lite4-11.onnx
+  resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
+  inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
+  densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
+  efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
 jobs:
   build:


@@ -13,7 +13,7 @@ if(USE_CUDA)
   message("CMake 3.18 or higher is required for setting CUDAToolkit")
   cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
 else()
-  cmake_minimum_required(VERSION 3.17)
+  cmake_minimum_required(VERSION 3.12)
 endif()
 include(CMakeDependentOption)
@@ -118,7 +118,7 @@ if(BUILD_TEST)
   include_directories(3rd-party/googletest/googletest/include)
 endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
@@ -261,14 +261,7 @@ if(USE_BANG)
     # BangC Kernels
     ################################################################################
-    if (BUILD_DIST)
-        find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
-        target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-        message(STATUS "Add BUILD_DIST, use CNCL with BANG")
-        add_compile_definitions(INFINI_USE_CNCL=1)
-    else()
-        target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-    endif()
+    target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
 endif()
 if(USE_KUNLUN)
@@ -331,7 +324,6 @@ if(BUILD_TEST)
     endif()
     if (USE_BANG)
         build_test(test/kernels/bang/*.cc)
-        build_test(test/bang/*.cc)
     endif()
     if (USE_KUNLUN)
         build_test(test/kernels/kunlun/*.cc)


@@ -1,14 +1,13 @@
 .PHONY : build clean format install-python test-cpp test-onnx
 TYPE ?= Release
-CUDA ?= OFF
+CUDA ?= ON
 BANG ?= OFF
 KUNLUN ?= OFF
 INTELCPU ?= off
 BACKTRACE ?= ON
 TEST ?= ON
-DIST ?= OFF
-NNET ?= OFF
+NNET ?= ON
 FORMAT_ORIGIN ?=
 # Docker build options
 DOCKER_NAME ?= infinitensor
@@ -30,7 +29,6 @@ CMAKE_OPT += -DUSE_BANG=$(BANG)
 CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
 CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
 CMAKE_OPT += -DBUILD_TEST=$(TEST)
-CMAKE_OPT += -DBUILD_DIST=$(DIST)
 CMAKE_OPT += -DBUILD_NNET=$(NNET)
 ifeq ($(INTELCPU), ON)
@@ -39,7 +37,7 @@ endif
 build:
 	mkdir -p build/$(TYPE)
-	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
+	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j99
 clean:
 	rm -rf build


@@ -33,14 +33,13 @@ There are several configurable CMake options, see the [CMakeLists.txt](/CMakeLis
 ## Roadmap
-- [RefactorGraph](https://github.com/InfiniTensor/RefactorGraph) is a newly designed AI framework that is set to replace the current main branch.
 - [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
 - Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
 - Supported hardware
   - ✔ NVIDIA GPU
   - ✔ Cambricon MLU
-  - ✔ Kunlunxin XPU
   - ⬜ Ascend NPU
+  - ⬜ Kunlunxin XPU
 ## Contributor Guide


@@ -1,76 +0,0 @@
SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
if ($ENV{CNCL_ROOT_DIR})
message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
endif()
list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
find_path(CNCL_INCLUDE_DIRS
NAMES cncl.h
HINTS ${CNCL_INCLUDE_DIR})
if (USE_STATIC_CNCL)
MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
SET(CNCL_LIBNAME "CNCL_static")
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(CNCL_LIBNAME "cncl")
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()
find_library(CNCL_LIBRARIES
NAMES ${CNCL_LIBNAME}
HINTS ${CNCL_LIB_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
if(CNCL_FOUND) # obtaining CNCL version and some sanity checks
set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED)
if (CNCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <cncl.h>
int main()
{
std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
int x;
CNCLGetVersion(&x);
return x == CNCL_VERSION_CODE;
}
")
try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${CNCL_LIBRARIES})
if (NOT CNCL_VERSION_MATCHED)
message(FATAL_ERROR "Found CNCL header version and library version do not match! \
(include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
endif()
message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "CNCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
endif()

@@ -1 +1 @@
-Subproject commit b896cec2dba5b8522b141ac4f89eb43074ee1b98
+Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77


@@ -1,196 +0,0 @@
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=2, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model", type=str, default="/data/onnx_models/llama2/llama_bs1_seq1024.onnx",
help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
default=False,
action="store_true",
help="whether to generate the standard results.",
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model,
args.batch_size,
args.length,
args.gen_std,
)
def run_model(model, runtime, world_size=1, rank=0, n=10):
stub = OnnxStub(model, runtime)
load_inputs(stub, world_size, rank)
# stub.tune()
stub.run()
# get outputs
time.sleep(0.01)
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
begin = time.time()
for _ in range(n):
stub.run()
end = time.time()
avg_time = (end - begin) / n
print(f"average time: {avg_time}")
return outputs
def run_and_compare(name, model, runtime, world_size=1, rank = 0):
results = np.load(f"./data/output.npy")
outputs = run_model(model, runtime, world_size, rank)
print("answer argmax:", np.argmax(results))
print("output argmax:", np.argmax(outputs))
#np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
getDiff(results, outputs)
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
extern_path = f"./{dist_name}_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
onnx.save_model(
model,
f"./{dist_name}_rank{rank}.onnx",
save_as_external_data=True,
location=extern_path,
)
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.BangRuntime(local_rank)
# print("init comm")
runtime.init_comm(
dist_name,
world_size,
rank,
)
run_and_compare(name, model, runtime, world_size, rank)
def start_single(name, model):
runtime = backend.BangRuntime(0)
run_and_compare(name, model, runtime)
def generate_input_output(model):
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
runtime = backend.BangRuntime(0)
stub = OnnxStub(model, runtime)
position_id = 0
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = tensor.copyout_numpy()
if np.issubdtype(input.dtype, np.integer):
if input.size == 1:
# input = np.array([position_id])
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
else:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
elif input.dtype == np.bool_:
input = np.random.randint(0,2,size=input.shape) > 0
else:
if i == 0:
input = np.ones(input.shape).astype(input.dtype)
position_id = input.shape[-1] - 1
else:
input = np.random.rand(*input.shape).astype(input.dtype)
tensor.copyin_numpy(input)
np.save(f"./data/input_{i}", input)
stub.run()
time.sleep(0.01)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
if np.isnan(output).any():
print("Nan in output")
np.save(f"./data/output", output)
def load_inputs(stub, world_size=1, rank=0):
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = np.load(f"./data/input_{i}.npy")
if all(x == y for x,y in zip(input.shape,tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
def getDiff(base, test):
absolute_diff = np.abs(np.subtract(base, test))
max_absolute_diff = np.max(absolute_diff)
baseCopy = base.astype(np.float64).ravel()
testCopy = test.astype(np.float64).ravel()
upValue = np.sum(np.abs(baseCopy - testCopy))
downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
max_relative_diff = upValue / downValue
print(f"Max absolute difference: {max_absolute_diff}\n"
f"Max relative difference: {max_relative_diff}")
return max_absolute_diff, max_relative_diff
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
model = onnx.load(model_path)
# generate standart output
if gen_std:
print("Generate inputs and outputs.")
p = mp.Process(target=generate_input_output, args=[model])
p.start()
p.join()
return
# run single process.
# use standalone process to isolate cuda.
print("run model by single MLU.")
p = mp.Process(target=start_single, args=(name, model))
p.start()
p.join()
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"run model by {world_size} MLUs in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()


@@ -115,7 +115,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
             assert out_dims[s_dim] % tp_world_size == 0, out_dims
             out_dims[s_dim] //= tp_world_size
             # if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
-            node.input[1] = node.output[0] + "_shape"
+            # node.input[1] = node.output[0] + "_shape"
             data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
             place[node.output[0]] = Shard(s_dim)
@@ -137,7 +137,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
             place[node.output[0]] = Shard(list(perm).index(plc.dim))
     def shard_node(node: NodeProto):
-        if node.op_type in ["Relu", "Tanh", "Softmax", "Cast"]:
+        if node.op_type in ["Relu", "Tanh", "Softmax"]:
             place[node.output[0]] = place[node.input[0]]
         elif node.op_type in ["Where"]:
             place[node.output[0]] = place[node.input[1]]
@@ -177,14 +177,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
                 input in data for input in node.input
             ):
                 # FIXME(constroy): the last MatMul should not be sharded as TP.
-                if (
-                    node.output[0] in output
-                    or (
-                        index + 1 < len(model.graph.node)
-                        and model.graph.node[index + 1].output[0]
-                    )
-                    in output
-                ):
+                if node.output[0] in output:
                     continue
                 groups = 1
                 # If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups


@@ -1,145 +0,0 @@
import os
from pyinfinitensor.onnx import OnnxStub, backend
import numpy as np
import onnx
import torch
from transformers import LlamaModel, LlamaForCausalLM
from tqdm import tqdm
import onnx_graphsurgeon as gs
from onnxsim import simplify
import argparse
parser = argparse.ArgumentParser(description='')
parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
parser.add_argument('--layer', dest='n_layers', type=int, default=2)
parser.add_argument('--iter', dest='n_iter', type=int, default=1)
parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
default="/data1/shared/llama")
args = parser.parse_args()
ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
param = torch.zeros(
(args.batchsize, 1024), dtype=torch.long)
logits = model(param, past_key_values=None)
param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
"position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
do_constant_folding=True,)
onnx_model = onnx.load(ONNX_MODEL_PATH)
print("simplifing onnx model")
onnx_model, check = simplify(onnx_model, skipped_optimizers=[
'eliminate_duplicate_initializer'])
assert check
onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
print("simlifing finished.")
@gs.Graph.register()
def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
for inp in inputs:
inp.outputs.clear()
for out in outputs:
out.inputs.clear()
for inp in inputs_added:
inputs.append(inp)
for out in outputs_removed:
out.inputs.clear()
return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
def replace_onnx_with_attention_op():
graph = gs.import_onnx(
onnx.load(ONNX_MODEL_PATH))
tmap = graph.tensors()
for i in range(args.n_layers):
inputs = [
tmap["onnx::Concat_" + str((i+1)*2)],
tmap["onnx::Concat_" + str((i+1)*2+1)],
tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
outputs = [
tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
inputs_added = [graph.inputs[1]]
outputs_removed = []
graph.replace_with_attention(
inputs, outputs, inputs_added, outputs_removed)
graph.outputs = [tmap[graph.outputs[0].name]]
graph.cleanup(True).toposort()
onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
if __name__ == "__main__":
kvcache_torch = None
torch_model = LlamaForCausalLM.from_pretrained(
args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
n_heads = torch_model.config.num_attention_heads
n_dims = torch_model.config.hidden_size // n_heads
if not os.path.exists(ONNX_MODEL_PATH):
print("exporting onnx graph")
export_onnx(torch_model, ONNX_MODEL_PATH)
replace_onnx_with_attention_op()
else:
print("will use exsiting onnx graph")
onnx_model = onnx.load(ONNX_MODEL_PATH)
stub = OnnxStub(onnx_model, backend.cuda_runtime())
count_wrong = 0
for i in tqdm(range(0, args.n_max_length)):
query = np.random.randint(
torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
####################################
# pytorch
####################################
outputs_torch = torch_model(
torch.tensor(query), past_key_values=kvcache_torch)
logit_torch = outputs_torch['logits']
kvcache_torch = outputs_torch['past_key_values']
####################################
# infinitensor
####################################
# copyin input
(list(stub.inputs.items()))[0][1].copyin_int64(
query.reshape(-1).tolist())
(list(stub.inputs.items()))[1][1].copyin_int64(
position_id.reshape(-1).tolist())
stub.run()
####################################
# validation
####################################
# copyout output
logits_it = np.array((list(stub.outputs.items()))
[0][1].copyout_float())
try:
np.testing.assert_allclose(
logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
except Exception as e:
try:
np.testing.assert_allclose(
np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
except:
count_wrong = count_wrong + 1
result = "{}/{} failed.".format(count_wrong, args.n_max_length)
print(result)
del stub


@@ -1,80 +0,0 @@
import paddle
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
import itertools
def run_cifar_train_and_infer():
paddle.device.set_device("gpu")
transform = T.Compose(
[
T.Resize(224),
T.ToTensor(),
T.Normalize(
mean=[0.5, 0.5, 0.5],
std=[0.5, 0.5, 0.5],
to_rgb=True,
),
]
)
# Download the dataset and initialize the DataSet
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
# Build and initialize the network
densenet = paddle.vision.models.DenseNet(num_classes=10)
model = paddle.Model(densenet)
# Prepare the training configuration: loss function, optimizer and metric
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy())
# Train the model
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
# Evaluate the model
model.evaluate(test_dataset, batch_size=64, verbose=1)
# export to ONNX
save_path = 'onnx.save/densenet' # path to save the exported model
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the input shape and dtype; Tensor or InputSpec is accepted, and InputSpec supports dynamic shapes
paddle.onnx.export(densenet, save_path, input_spec=[x_spec], opset_version=11)
# Load the ONNX model into InfiniTensor
model_path = save_path + ".onnx"
onnx_model = onnx.load(model_path)
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
model = gofusion_model
model.init()
# Run inference
cifar10_test = Cifar10(
mode="test",
transform=transform, # apply transform to every image
backend="cv2", # use OpenCV as image transform backend
)
batch_size = 1
total_size = 0
total_acc = 0.0
for data in itertools.islice(iter(cifar10_test), 10000):
images, labels = data
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
model.run()
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
outputs = paddle.to_tensor(outputs)
outputs = paddle.reshape(outputs, (1, 10))
labels = paddle.to_tensor(labels)
labels = paddle.reshape(labels, (1,1))
acc = paddle.metric.accuracy(outputs, labels)
total_acc += acc
total_size += batch_size
print("test acc: {}".format(total_acc.numpy() / total_size))
if __name__ == "__main__":
run_cifar_train_and_infer()


@@ -1,80 +0,0 @@
import paddle
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
import itertools
def run_cifar_train_and_infer():
paddle.device.set_device("gpu")
transform = T.Compose(
[
T.Resize(224),
T.ToTensor(),
T.Normalize(
mean=[0.5, 0.5, 0.5],
std=[0.5, 0.5, 0.5],
to_rgb=True,
),
]
)
# Download the dataset and initialize the DataSet
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
# Build and initialize the network
inception = paddle.vision.models.InceptionV3(num_classes=10)
model = paddle.Model(inception)
# Prepare the training configuration: loss function, optimizer and metric
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy())
# Train the model
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
# Evaluate the model
model.evaluate(test_dataset, batch_size=64, verbose=1)
# export to ONNX
save_path = 'onnx.save/inception' # path to save the exported model
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the input shape and dtype; Tensor or InputSpec is accepted, and InputSpec supports dynamic shapes
paddle.onnx.export(inception, save_path, input_spec=[x_spec], opset_version=11)
# Load the ONNX model into InfiniTensor
model_path = save_path + ".onnx"
onnx_model = onnx.load(model_path)
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
model = gofusion_model
model.init()
# Run inference
cifar10_test = Cifar10(
mode="test",
transform=transform, # apply transform to every image
backend="cv2", # use OpenCV as image transform backend
)
batch_size = 1
total_size = 0
total_acc = 0.0
for data in itertools.islice(iter(cifar10_test), 10000):
images, labels = data
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
model.run()
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
outputs = paddle.to_tensor(outputs)
outputs = paddle.reshape(outputs, (1, 10))
labels = paddle.to_tensor(labels)
labels = paddle.reshape(labels, (1,1))
acc = paddle.metric.accuracy(outputs, labels)
total_acc += acc
total_size += batch_size
print("test acc: {}".format(total_acc.numpy() / total_size))
if __name__ == "__main__":
run_cifar_train_and_infer()


@@ -1,31 +0,0 @@
## Description
This is a doc that tells you how to run the paddle*.py examples on your machine. If your model runs on hardware other than NVIDIA GPUs, you may need to make some changes.
## What do we do in the paddle*.py files?
1. Train and evaluate a model with the Cifar10 dataset
2. Export the paddle model to an ONNX model
3. Load the ONNX model, run inference with InfiniTensor and calculate the inference accuracy (a minimal sketch of this step is given at the end of this section)
## Command
1. Go to `/examples/python` folder
2. Run the following command
```
python paddle_resnet.py
python paddle_densenet.py
python paddle_inception.py
```
## What should I do if I use another device (MLU, XPU, NPU)?
You need to change this code:
```
paddle.device.set_device("gpu") # Change gpu to mlu, xpu or npu
```
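As referenced in step 3 above, here is a minimal, illustrative sketch of the inference part, assembled from the `paddle_*.py` scripts in this diff. The model path `onnx.save/densenet.onnx`, the random placeholder input, and the use of `backend.cuda_runtime()` are assumptions; swap the runtime as shown in the snippet above when targeting MLU, XPU or NPU.
```python
# Minimal sketch (assumptions noted above): load the exported ONNX model and
# run a single inference with InfiniTensor, mirroring the paddle_*.py scripts.
import numpy as np
import onnx
from pyinfinitensor.onnx import OnnxStub, backend

onnx_model = onnx.load("onnx.save/densenet.onnx")   # path written by paddle.onnx.export
stub = OnnxStub(onnx_model, backend.cuda_runtime())  # swap the runtime for other devices
stub.init()

# Placeholder for one preprocessed 3x224x224 image; the real scripts feed Cifar10 data.
image = np.random.rand(3 * 224 * 224).astype(np.float32)
next(iter(stub.inputs.items()))[1].copyin_float(image.tolist())
stub.run()
logits = next(iter(stub.outputs.items()))[1].copyout_float()
print("predicted class:", int(np.argmax(logits)))
```
Accuracy is then obtained by comparing the argmax of the output against the dataset label, exactly as the full scripts do over the Cifar10 test set.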


@@ -1,81 +0,0 @@
import paddle
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
import itertools
from paddle.vision.models.resnet import BasicBlock
def run_cifar_train_and_infer():
paddle.device.set_device("gpu")
transform = T.Compose(
[
T.Resize(224),
T.ToTensor(),
T.Normalize(
mean=[0.5, 0.5, 0.5],
std=[0.5, 0.5, 0.5],
to_rgb=True,
),
]
)
# Download the dataset and initialize the DataSet
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
# Build and initialize the network
resnet = paddle.vision.models.ResNet(BasicBlock, depth=18, num_classes=10)
model = paddle.Model(resnet)
# Prepare the training configuration: loss function, optimizer and metric
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy())
# Train the model
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
# Evaluate the model
model.evaluate(test_dataset, batch_size=64, verbose=1)
# export to ONNX
save_path = 'onnx.save/resnet' # path to save the exported model
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the input shape and dtype; Tensor or InputSpec is accepted, and InputSpec supports dynamic shapes
paddle.onnx.export(resnet, save_path, input_spec=[x_spec], opset_version=11)
# Load the ONNX model into InfiniTensor
model_path = save_path + ".onnx"
onnx_model = onnx.load(model_path)
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
model = gofusion_model
model.init()
# Run inference
cifar10_test = Cifar10(
mode="test",
transform=transform, # apply transform to every image
backend="cv2", # use OpenCV as image transform backend
)
batch_size = 1
total_size = 0
total_acc = 0.0
for data in itertools.islice(iter(cifar10_test), 10000):
images, labels = data
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
model.run()
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
outputs = paddle.to_tensor(outputs)
outputs = paddle.reshape(outputs, (1, 10))
labels = paddle.to_tensor(labels)
labels = paddle.reshape(labels, (1,1))
acc = paddle.metric.accuracy(outputs, labels)
total_acc += acc
total_size += batch_size
print("test acc: {}".format(total_acc.numpy() / total_size))
if __name__ == "__main__":
run_cifar_train_and_infer()


@@ -2,10 +2,6 @@
 #include "cnnl.h"
 #include "cnrt.h"
 #include "core/common.h"
-#include "core/data_type.h"
-#ifdef INFINI_USE_CNCL
-#include "cncl.h"
-#endif
 #define checkBangError(call) \
     { \
@@ -31,70 +27,4 @@ namespace infini {
 using BangPtr = void *;
inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return CNNL_DTYPE_FLOAT;
}
if (dataType == DataType::Float16) {
return CNNL_DTYPE_HALF;
}
if (dataType == DataType::Double) {
return CNNL_DTYPE_DOUBLE;
}
if (dataType == DataType::Int8) {
return CNNL_DTYPE_INT8;
}
if (dataType == DataType::Int32) {
return CNNL_DTYPE_INT32;
}
if (dataType == DataType::UInt8) {
return CNNL_DTYPE_UINT8;
}
if (dataType == DataType::BFloat16) {
return CNNL_DTYPE_BFLOAT16;
}
if (dataType == DataType::Int64) {
return CNNL_DTYPE_INT64;
}
if (dataType == DataType::Bool) {
return CNNL_DTYPE_BOOL;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNNL.");
}
#ifdef INFINI_USE_CNCL
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return cnclFloat32;
}
if (dataType == DataType::Float16) {
return cnclHalf;
}
if (dataType == DataType::Int8) {
return cnclInt8;
}
if (dataType == DataType::Int16) {
return cnclInt16;
}
if (dataType == DataType::Int32) {
return cnclInt32;
}
if (dataType == DataType::UInt8) {
return cnclUint8;
}
if (dataType == DataType::UInt16) {
return cnclUint16;
}
if (dataType == DataType::UInt32) {
return cnclUint32;
}
if (dataType == DataType::BFloat16) {
return cnclBfloat16;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNCL.");
}
#endif
 } // namespace infini


@@ -7,19 +7,16 @@ namespace infini {
 class BangRuntimeObj : public RuntimeObj {
   private:
     cnnlHandle_t cnnl;
-    cnrtQueue_t queue;
-    std::unique_ptr<CommunicatorObj> comm;
     BangPtr workspace;
     size_t workspaceSize;
-    mutable size_t cursor;
   public:
-    explicit BangRuntimeObj(int deviceId = 0)
-        : RuntimeObj(Device::BANG, deviceId) {
+    BangRuntimeObj() : RuntimeObj(Device::BANG) {
         cnInit(0);
         CNdev dev;
-        cnDeviceGet(&dev, deviceId);
+        cnDeviceGet(&dev, 0);
         checkBangError(cnrtSetDevice(dev));
+        cnrtQueue_t queue;
         checkBangError(cnrtQueueCreate(&queue));
         checkCnnlError(cnnlCreate(&cnnl));
@@ -27,12 +24,10 @@ class BangRuntimeObj : public RuntimeObj {
         // 10GB for Longformer
         // size_t longformerNum = 3lu * (1 << 30);
         workspaceSize = 7ll << 30; // 7 GB
-        cursor = 0;
         workspace = alloc(workspaceSize);
     }
     virtual ~BangRuntimeObj() {
         dealloc(workspace);
-        checkBangError(cnrtQueueDestroy(queue));
         checkCnnlError(cnnlDestroy(cnnl));
     }
     string toString() const override;
@@ -50,15 +45,10 @@ class BangRuntimeObj : public RuntimeObj {
     void dealloc(void *ptr) override { checkBangError(cnrtFree(ptr)); }
     cnnlHandle_t cnnlHandle() const { return cnnl; }
     BangPtr getWorkspace(size_t size) const {
-        IT_ASSERT((cursor + size) <= workspaceSize);
-        cursor += size;
-        void *temp = workspace;
-        temp += (cursor - size);
-        return temp;
+        IT_ASSERT(size <= workspaceSize);
+        return workspace;
     }
-    void resetWorkspace() const { cursor = 0; }
     void copyBlobFromCPU(void *dst, const void *src,
                          size_t bytes) const override {
         checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
@@ -76,9 +66,10 @@ class BangRuntimeObj : public RuntimeObj {
         checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
                                   CNRT_MEM_TRANS_DIR_PEER2PEER));
     }
-    void initComm(const string &name, int worldSize, int rank) final;
-    CommunicatorObj &getCommunicator() const override { return *comm; }
-    cnrtQueue_t getBangQueue() const { return queue; }
+    void initComm(const string &, int, int) override { IT_TODO_HALT(); }
+    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
   private:
     void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;


@@ -1,79 +0,0 @@
#pragma once
#include "bang_common.h"
#include "core/communicator.h"
#include <chrono>
#include <cncl.h>
#include <cnrt.h>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <mutex>
#include <thread>
namespace infini {
class CnclCommunicatorObj final : public CommunicatorObj {
private:
cnclComm_t *comms;
public:
CnclCommunicatorObj(const string &name, int worldSize, int rank)
: CommunicatorObj(worldSize, rank) {
const std::string filePath("./" + name + "_cncl_id.bin");
cnclCliqueId clique_id;
if (rank == 0) {
CNCL_CHECK(cnclGetCliqueId(&clique_id));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
}
int num_comms = 1;
int *dev_list = new int[num_comms];
int *rank_list = new int[num_comms];
comms = new cnclComm_t[num_comms];
uint32_t num_dev = 0;
checkBangError(cnrtGetDeviceCount(&num_dev));
for (int i = 0; i < num_comms; i++) {
rank_list[i] = rank;
dev_list[i] = rank_list[i] % num_dev;
}
CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list, rank_list,
worldSize, &clique_id));
if (rank == 0) {
std::filesystem::remove(filePath);
}
delete[] dev_list;
delete[] rank_list;
}
~CnclCommunicatorObj() {
CNCL_CHECK(cnclDestroyComms(comms, 1));
delete[] comms;
}
// Get the actual cnclComm_t
cnclComm_t getCnclComm() { return comms[0]; }
virtual string toString() const final {
std::ostringstream oss;
oss << "CNCL communicator";
return oss.str();
}
};
} // namespace infini


@@ -53,7 +53,6 @@ class GraphObj : public Object {
     const TensorVec &getTensors() const { return tensors; }
     const OpVec &getOperators() const { return ops; }
     OpVec getComputeOps() const;
-    Tensor getTensor(int) const;
     /**
      * Sort the nodes in topological order.
@@ -65,13 +64,7 @@ class GraphObj : public Object {
     void optimize();
-    void shape_infer();
-    void dataMalloc(bool useNaiveAllocator = false, size_t memPoolSize = 0);
-    Tensor cloneKV(Tensor &tensor);
-    void freeHeap();
+    void dataMalloc(bool useNaiveAllocator = false);
     /**
      * @brief Add an operator and create its outputs. Output tensor arguments


@@ -30,8 +30,6 @@ class GraphHandlerObj {
     Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
                               Tensor var, Tensor scale, Tensor bias,
                               float momentum, float eps, bool training);
-    Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
-                              Tensor bias, float eps, int axis, int stash_type);
     Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
                    int ph, int pw, int sh, int sw, int ceilMode);
@@ -47,7 +45,6 @@ class GraphHandlerObj {
     Tensor max(Tensor a, Tensor b, Tensor c);
     Tensor relu(Tensor x, Tensor y);
-    Tensor silu(Tensor x, Tensor y);
     Tensor gelu(Tensor x, Tensor y);
     Tensor sigmoid(Tensor x, Tensor y);
     Tensor hardSigmoid(Tensor x, Tensor y);
@@ -66,27 +63,13 @@ class GraphHandlerObj {
                  std::optional<float> max);
     Tensor transpose(Tensor data, Tensor transposed, Shape perm);
     Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
-    Tensor resize(Tensor input, Tensor output,
-                  const std::optional<vector<int>> &axes, Tensor sizes,
-                  Tensor scales, Tensor roi, vector<uint32_t> sizes_,
-                  vector<float> scales_, vector<float> roi_, string mode,
-                  string ratioPolicy, string nearestMode,
-                  string coordTransMode);
-    Tensor squeeze(Tensor input, Tensor output, Shape axes);
-    Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
     Tensor concat(TensorVec inputs, Tensor output, int dim);
-    Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
-                            Tensor input_q, Tensor input_k, Tensor input_v,
-                            Tensor position_id, Tensor output_matmul);
-    Tensor RoPE(Tensor pos, Tensor input, Tensor output);
     TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
-                    std::variant<int, vector<int>> numOrRatio);
+                    int num_outputs);
     Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
     Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
     Tensor reduceMean(Tensor data, Tensor reduced,
                       const optional<vector<int>> &axes, bool keepdims);
-    Tensor reduceSum(Tensor data, Tensor reduced,
-                     const optional<vector<int>> &axes, bool keepdims);
     Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
                  const vector<int> &ends, const optional<vector<int>> &axes,
                  const optional<vector<int>> &steps);
@@ -95,7 +78,6 @@ class GraphHandlerObj {
     Tensor cast(Tensor input, Tensor output, int to);
     Tensor expand(Tensor input, Tensor output, Shape dims);
     Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
-    std::vector<int> getDims(Tensor x) { return x->getDims(); }
     Tensor allReduceSum(Tensor input, Tensor output);
     Tensor allReduceProd(Tensor input, Tensor output);
@@ -104,13 +86,8 @@ class GraphHandlerObj {
     Tensor allReduceAvg(Tensor input, Tensor output);
     TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
     Tensor broadcast(Tensor input, Tensor output, int root);
-    Tensor send(Tensor input, int source, int destination, Tensor output);
-    Tensor recv(Tensor output, int source, int destination, Shape dims,
-                int outputType, Tensor input);
     Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
                         std::string mode);
-    Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
-               int size);
     //------ modifiers
@@ -118,19 +95,9 @@ class GraphHandlerObj {
     inline void optimize() { g->optimize(); }
-    inline void shape_infer() { g->shape_infer(); }
-    void change_shape(const vector<int> &shape, int tensorId);
     //------ runtime
-    inline void data_malloc(bool useNaiveAllocator = false,
-                            size_t memPoolSize = 0) {
-        g->dataMalloc(useNaiveAllocator, memPoolSize);
-    }
-    inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
-    inline void free_heap() { g->freeHeap(); }
+    inline void data_malloc() { g->dataMalloc(); }
     inline void tune() { g->getRuntime()->run(g, true); }


@@ -2,7 +2,6 @@
 #include "core/common.h"
 #include "core/operator.h"
 #include "core/tensor.h"
-#include "utils/operator_utils.h"
 #include <functional>
 #include <nlohmann/json.hpp>
 using json = nlohmann::json;
@@ -30,6 +29,7 @@ class Kernel {
   public:
     Kernel() {}
     virtual ~Kernel() {}
+
     /**
      * @param op The operator to be executed.
      * @param record The parameters for kernel execution. If extra parameters
@@ -102,9 +102,11 @@ class KernelRegistry {
     }
     Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
         auto it = kernels.find(kernelAttrs);
-        IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
-                                           get_kernel_attrs_str(kernelAttrs) +
-                                           "}");
+        IT_ASSERT(it != kernels.end(),
+                  "Kernel not found for key {" +
+                      to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
+                      ", " + std::to_string(std::get<1>(kernelAttrs)) + ", " +
+                      std::get<2>(kernelAttrs).toString() + "}");
         return std::get<0>(it->second);
     }
     const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@@ -129,16 +131,15 @@ class CpuKernelWithoutConfig : public Kernel {
 } // namespace infini
-#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt) \
+#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt) \
     namespace infini { \
     static const bool _CAT(_register_kernel_, cnt) = \
-        KernelRegistry::getInstance().registerKernel(KernelAttrs{device, \
-                                                                 opType}, \
-                                                     new kernel(), name); \
+        KernelRegistry::getInstance().registerKernel( \
+            KernelAttrs{device, opType, dataType}, new kernel(), name); \
     }
-#define REGISTER_KERNEL(device, opType, kernel, name) \
-    _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
+#define REGISTER_KERNEL(device, opType, dataType, kernel, name) \
+    _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
 #define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt) \
     namespace infini { \


@@ -26,23 +26,14 @@ class LazyAllocator {
     size_t weightPeak = 0;
-    size_t heapPeak = 0;
     size_t alignment;
-    bool hasMemPool = false;
-    size_t memPoolSize = 0;
     // pointer to the memory actually allocated
     void *ptr = nullptr;
     // pointer to the weight memory space
     void *weightPtr = nullptr;
-    // memory pool ptr
-    void *memPoolPtr = nullptr;
     // // a cache designed for a batch size that has already occurred
     // std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
     //     batchsizeToTensorOffset;
@@ -77,10 +68,6 @@ class LazyAllocator {
     void init();
-    void setMemPool(size_t memPoolSize);
-    bool getMemPoolStatus();
     // function: simulate memory allocation
     // arguments
     //     size: size of memory block to be allocated
@@ -89,10 +76,6 @@ class LazyAllocator {
     size_t allocWeight(size_t size);
-    size_t heapAlloc(size_t size);
-    void freeHeap();
     // function: simulate memory free
     // arguments:
     //     addr: head address offset of memory block to be free
@@ -109,8 +92,6 @@ class LazyAllocator {
     void *getWeightPtr();
-    void *getHeapPtr();
     void info();
   private:


@@ -25,7 +25,6 @@ struct OpType {
         Asinh,              // Unary
         Atan,               // Unary
         Atanh,              // Unary
-        AttentionKVCache,   // Fusion
         AveragePool,        // Pool
         BatchNormalization, //
         Bernoulli,          //
@@ -151,12 +150,10 @@ struct OpType {
         ReduceSum,          // Reduce
         ReduceSumSquare,    // Reduce
         Relu,               // Unary
-        Silu,               // Unary
         Reshape,
         Resize,
         ReverseSequence,
         RoiAlign,
-        RoPE,               // Fusion
         Round,              // Unary
         STFT,
         Scan,
@@ -234,8 +231,6 @@ struct OpType {
         AllReduceAvg,
         AllGather,
         Broadcast,
-        Send,
-        Recv,
     } type;
     constexpr OpType(decltype(type) t) : type(t) {}


@@ -4,7 +4,7 @@
 #include "core/tensor.h"
 namespace infini {
-using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
+using KernelAttrs = std::tuple<Device, OpType::underlying_t, DataType>;
 struct OpPerfKey {
     HashType hash;
@@ -55,7 +55,8 @@ class OperatorObj : public Object {
   public:
     OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
-    virtual optional<vector<Shape>> inferShape(const TensorVec &inputs) = 0;
+    virtual optional<vector<Shape>>
+    inferShape(const TensorVec &inputs) const = 0;
     virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
     /**
      * @brief Constructs outputs (if requried) and check whether the operator is
@@ -90,7 +91,6 @@ class OperatorObj : public Object {
     OpType getOpType() const { return type; }
     // HACK: set correct data type
     DataType getDType() const { return getInputs(0)->getDType(); }
-    DataType getOutDType() const { return getOutput()->getDType(); }
     virtual int numInputs() const = 0;
     virtual int numOutputs() const = 0;
@@ -105,7 +105,7 @@ class OperatorObj : public Object {
                               const TensorVec &newOutputs) const = 0;
   protected:
-    optional<vector<Shape>> inferShape();
+    optional<vector<Shape>> inferShape() const;
     vector<DataType> inferDataType() const;
   private:


@@ -8,9 +8,7 @@
 #if USE_CUDA
 #include "cuda/cuda_runtime.h"
 #endif
-#if USE_BANG
-#include "bang/bang_runtime.h"
-#endif
 namespace infini {
 // TODO: how to deal with this
@@ -33,7 +31,6 @@ class TensorObj : public TensorBaseObj {
     size_t getBytes() const { return _size * dtype.getSize(); }
     Shape getDims() const { return shape; }
-    void setShape(Shape shape_);
     size_t getRank() const { return shape.size(); }
     Shape getStride() const;
     size_t getOffset(const vector<int> &ds) const;
@@ -44,16 +41,8 @@ class TensorObj : public TensorBaseObj {
     bool isOutput() const { return tensorType == TensorType::output; }
     bool isOthers() const { return tensorType == TensorType::others; }
     void setWeight() { tensorType = TensorType::weight; }
-    void setInput() {
-        if (!this->isWeight()) {
-            tensorType = TensorType::input;
-        }
-    }
-    void setOutput() {
-        if (!this->isWeight()) {
-            tensorType = TensorType::output;
-        }
-    }
+    void setInput() { tensorType = TensorType::input; }
+    void setOutput() { tensorType = TensorType::output; }
     string tensorTypeToString() const {
         switch (tensorType) {
         case TensorType::weight:


@@ -1,17 +0,0 @@
#pragma once
#include "core/common.h"
#include <cstdio>
struct AttentionKVCacheMetadata {
int dimSize[4];
int stride[4];
};
namespace infini {
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
float *input_q, float *input_k, float *input_v,
int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta,
float *output_O_temp, float *output_sum_temp);
} // namespace infini


@@ -1,16 +1,8 @@
 #pragma once
 namespace infini {
-void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-                int c2, int c3);
-void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-                int c2, int c3);
-void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-                int c2, int c3);
-void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                 int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-                 int c2, int c3);
+void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
+void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
 }; // namespace infini


@@ -3,8 +3,7 @@
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
-void expandKernel(int dType, void *input, void *output, int nDims,
-                  int outputsize, SmallArray inputShape,
-                  SmallArray outputShape);
+void expandKernel(float *input, float *output, int nDims, int outputsize,
+                  SmallArray inputShape, SmallArray outputShape);
 }; // namespace infini


@@ -1,17 +0,0 @@
#pragma once
#include "operators/unary.h"
namespace infini {
void LaynormKernel(const float *input, const float *scale, const float eps,
int size, int scaleSize, const int dimsize, const int stride,
float *output, const float *bias, int biasSize);
void LaynormKernel(const float *input, const float *scale, const float eps,
int size, int scaleSize, const int dimsize, const int stride,
float *output);
void LaynormKernel(const half *input, const half *scale, const half eps,
int size, int scaleSize, const int dimsize, const int stride,
half *output, const half *bias, int biasSize);
void LaynormKernel(const half *input, const half *scale, const half eps,
int size, int scaleSize, const int dimsize, const int stride,
half *output);
}; // namespace infini


@@ -10,11 +10,10 @@ typedef struct {
     int wholeNDim[MAX_DIM];  // dim size after padding or before slicing
     int partNDim[MAX_DIM];   // dim size before padding or after slicing
     int partStride[MAX_DIM]; // stride before padding or after slicing
-    int DType;
 } TransMetaData;
 namespace infini {
-void pad_slice_kernel(void *partData, void *wholeData,
+void pad_slice_kernel(float *partData, float *wholeData,
                       const TransMetaData &metadata, int nDims, int num,
                       bool isPad);
 } // namespace infini


@@ -1,12 +0,0 @@
#pragma once
#include "operators/rope.h"
#include "utils/small_array.h"
namespace infini {
void rope_kernel(int dType, int *pos, void *input, void *output, int size,
int dim_model, int dim_head, int hidden_stride,
int pos_stride);
}; // namespace infini


@@ -3,6 +3,4 @@
 namespace infini {
 void softmax_kernel(int num_blocks, float *input, float *output, int size,
                     int dimsize, int stride);
-void softmax_kernel(int num_blocks, half *input, half *output, int size,
-                    int dimsize, int stride);
-} // namespace infini
+}


@@ -8,8 +8,8 @@ const int DIM_MAX_SIZE = 8;
 // Concat operator acts like element tensors composing to one big tensor,and
 // split operator acts like one big tensor being composed by element
 // tensors.
-template <typename T> struct ElementTensorMetadata {
-    T *data[BATCH_SIZE];
+struct ElementTensorMetadata {
+    float *data[BATCH_SIZE];
     int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
                              // the composed tensor.
     int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
@@ -20,17 +20,16 @@ template <typename T> struct ElementTensorMetadata {
                data[i], dimBgNo[i], dimSize[i], nElements[i]);
     }
 };
-template <typename T> struct ComposedTensorMetadata {
+struct ComposedTensorMetadata {
     int dimSize[DIM_MAX_SIZE];
     int stride[DIM_MAX_SIZE];
-    T *data;
+    float *data;
 };
 namespace infini {
-void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
-                         const ComposedTensorMetadata<float> &compMeta, int dim,
-                         int batchSize, int nDims, bool isSplit);
-void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
-                         const ComposedTensorMetadata<half> &compMeta, int dim,
+void split_concat_kernel(const ElementTensorMetadata &eleMeta,
+                         const ComposedTensorMetadata &compMeta, int dim,
                          int batchSize, int nDims, bool isSplit);
 } // namespace infini


@@ -5,7 +5,7 @@
 namespace infini {
-void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
+void transpose_kernel(float *input, float *output, int nDims, int size,
                       SmallArray strides, SmallArray outputShape);
 }; // namespace infini


@ -3,22 +3,48 @@
#include "operators/unary.h" #include "operators/unary.h"
namespace infini { namespace infini {
template <typename T> void softmax_kernel(T *input, T *output, size_t num); void softmax_kernel(float *input, float *output, size_t num);
template <typename T> void relu_kernel(T *input, T *output, size_t num); void relu_kernel(float *input, float *output, size_t num);
template <typename T> void silu_kernel(T *input, T *output, size_t num); void sigmoid_kernel(float *input, float *output, size_t num);
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num); void tanh_kernel(float *input, float *output, size_t num);
template <typename T> void tanh_kernel(T *input, T *output, size_t num); void abs_kernel(float *input, float *output, size_t num);
template <typename T> void abs_kernel(T *input, T *output, size_t num); void sqrt_kernel(float *input, float *output, size_t num);
template <typename T> void sqrt_kernel(T *input, T *output, size_t num); void neg_kernel(float *input, float *output, size_t num);
template <typename T> void neg_kernel(T *input, T *output, size_t num); void gelu_kernel(float *input, float *output, size_t num);
template <typename T> void gelu_kernel(T *input, T *output, size_t num); void erf_kernel(float *input, float *output, size_t num);
template <typename T> void erf_kernel(T *input, T *output, size_t num); void hard_sigmoid_kernel(float *input, float *output, size_t num);
template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num); void hard_swish_kernel(float *input, float *output, size_t num);
template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
template <typename INPUT, typename OUTPUT> void unary_kernel(const Operator &_op) {
void cast_kernel(INPUT *input, OUTPUT *output, size_t num); auto op = as<UnaryObj>(_op);
float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
void unary_kernel(const Operator &_op); size_t num = op->getOutput()->size();
if (op->getOpType() == OpType::Softmax)
softmax_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Relu)
relu_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Sigmoid)
sigmoid_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::HardSigmoid)
hard_sigmoid_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::HardSwish)
hard_swish_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Tanh)
tanh_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Abs)
abs_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Sqrt)
sqrt_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Gelu)
gelu_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Neg)
neg_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Erf)
erf_kernel(inputData, outputData, num);
else
IT_TODO_HALT();
}
}; // namespace infini }; // namespace infini
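On the left the unary kernels become function templates and the OpType-to-kernel dispatch moves out of the header; on the right an inline unary_kernel hard-codes float pointers. A standalone sketch of the template-plus-dispatch pattern (host-side, invented names, not the CUDA kernels themselves):

#include <cstddef>
#include <cstdio>
#include <string>

// Illustrative template kernels on host memory.
template <typename T> void relu_kernel(const T *in, T *out, size_t n) {
    for (size_t i = 0; i < n; ++i) out[i] = in[i] > T(0) ? in[i] : T(0);
}
template <typename T> void neg_kernel(const T *in, T *out, size_t n) {
    for (size_t i = 0; i < n; ++i) out[i] = -in[i];
}

// Runtime op-name dispatch kept out of the header, so adding a dtype or an op
// only touches this one translation unit.
template <typename T>
void unary_dispatch(const std::string &op, const T *in, T *out, size_t n) {
    if (op == "Relu") relu_kernel(in, out, n);
    else if (op == "Neg") neg_kernel(in, out, n);
}

int main() {
    float in[] = {-1.5f, 2.0f}, out[2];
    unary_dispatch("Relu", in, out, 2);
    std::printf("%g %g\n", out[0], out[1]);  // 0 2
}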
View File
@ -1,29 +1,11 @@
#pragma once
#include "core/tensor.h" #include "core/tensor.h"
#include "cuda/cuda_common.h"
namespace infini { namespace infini {
void cudaPrintFloat(float *x, int len); void cudaPrintFloat(float *x, int len);
void cudaPrintTensor(const Tensor &tensor); void cudaPrintTensor(const Tensor &tensor) {
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
}
cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
cudaDataType cublasDataTypeConvert(DataType);
template <int index> struct DT_CUDA {};
template <> struct DT_CUDA<0> { using t = bool; };
template <> struct DT_CUDA<1> { using t = float; };
template <> struct DT_CUDA<2> { using t = unsigned char; };
template <> struct DT_CUDA<3> { using t = char; };
template <> struct DT_CUDA<4> { using t = unsigned short; };
template <> struct DT_CUDA<5> { using t = short; };
template <> struct DT_CUDA<6> { using t = int; };
template <> struct DT_CUDA<7> { using t = long long; };
template <> struct DT_CUDA<9> { using t = bool; };
template <> struct DT_CUDA<10> { using t = half; };
template <> struct DT_CUDA<11> { using t = double; };
template <> struct DT_CUDA<12> { using t = unsigned int; };
template <> struct DT_CUDA<13> { using t = unsigned long long; };
template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
} // namespace infini } // namespace infini
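The DT_CUDA&lt;index&gt; specializations map a runtime DataType index to a concrete element type at compile time, so a launcher can switch on the index and instantiate the right kernel template. A plain-C++ analogue using a few of the indices shown above (1 -> float, 6 -> int, 11 -> double); this is illustrative, not the project's helper:

#include <cstddef>
#include <cstdio>

// Compile-time index -> type table, in the spirit of the DT_CUDA specializations.
template <int I> struct IndexToType;
template <> struct IndexToType<1> { using t = float; };
template <> struct IndexToType<6> { using t = int; };
template <> struct IndexToType<11> { using t = double; };

template <typename T> void fill_ones(T *p, size_t n) {
    for (size_t i = 0; i < n; ++i) p[i] = T(1);
}

int main() {
    // A runtime switch picks the compile-time instantiation.
    int dtype = 6;
    if (dtype == 6) {
        IndexToType<6>::t buf[3];
        fill_ones(buf, 3);
        std::printf("%d %d %d\n", buf[0], buf[1], buf[2]);  // 1 1 1
    }
}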
View File
@ -3,15 +3,9 @@
#include "utils/small_array.h" #include "utils/small_array.h"
namespace infini { namespace infini {
void whereKernel(const float *inputX, const float *inputY, void whereKernel(const float *inputX, const float *inputY,
const uint8_t *condition, float *output, int nDims, const uint8_t *condition, float *output, int nDims,
int outputsize, SmallArray inputXShape, SmallArray inputYShape, SmallArray inputXShape, SmallArray inputYShape,
SmallArray conditionShape, SmallArray outputShape, int xSize, SmallArray conditionShape, SmallArray outputShape);
int ySize, int cSize);
void whereKernel(const half *inputX, const half *inputY,
const uint8_t *condition, half *output, int nDims,
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
SmallArray conditionShape, SmallArray outputShape, int xSize,
int ySize, int cSize);
}; // namespace infini }; // namespace infini
View File
@ -53,8 +53,7 @@ inline void initGatherMetaData(GatherMetaData &metaData,
metaData.inStride[i] = in->getStride()[i]; metaData.inStride[i] = in->getStride()[i];
} }
} }
template <typename T> void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num);
void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
void gather_elements_kernel(void *in, void *out, GatherMetaData metaData, void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
size_t num); size_t num);
View File
@ -68,8 +68,8 @@ static inline HashType genhash(string s) {
#define nnet_unimplemented_halt() \ #define nnet_unimplemented_halt() \
{ IT_TODO_HALT(); } { IT_TODO_HALT(); }
#define nnet_unimplemented_continue() \ #define nnet_unimplemented_continue()
{ dbg("Unimplemented"); } // { dbg("Unimplemented"); }
#define nnet_assert(expr, msg) assert(((void)(msg), (expr))) #define nnet_assert(expr, msg) assert(((void)(msg), (expr)))
View File
@ -35,7 +35,7 @@ class G2BMMObj : public OperatorObj {
OP_CLONE(G2BMMObj); OP_CLONE(G2BMMObj);
std::string toString() const override; std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return 2; } int numInputs() const override { return 2; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
View File
@ -33,7 +33,7 @@ class GBMMObj : public OperatorObj {
OP_CLONE(GBMMObj); OP_CLONE(GBMMObj);
std::string toString() const override; std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return 2; } int numInputs() const override { return 2; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
View File
@ -7,7 +7,7 @@ class ActivationBackwardObj : public OperatorObj {
ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y, ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
Tensor x, Tensor diff_x); Tensor x, Tensor diff_x);
OP_CLONE(ActivationBackwardObj); OP_CLONE(ActivationBackwardObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 3; } int numInputs() const override { return 3; }
View File
@ -27,7 +27,7 @@ class AllGatherObj : public OperatorObj {
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
int numOutputs() const override { return world_size; } int numOutputs() const override { return world_size; }
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
View File
@ -33,7 +33,7 @@ class AllReduceBaseObj : public OperatorObj {
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
optional<vector<Shape>> inferShape(const TensorVec &inputs) override { optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
return {{inputs[0]->getDims()}}; return {{inputs[0]->getDims()}};
}; };
View File
@ -1,43 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Fused Attention with KVCache input operator. All the input and output
* tensors should have the same rank except for the position_id.
*
*/
class AttentionKVCacheObj : public OperatorObj {
int dim;
public:
/**
* @brief Construct a new AttentionKVCache object.
*
* @param graph The computation graph that this operator belongs to.
* @param input_k_cache The k_cache input tensor.
* @param input_v_cache The v_cache input tensor.
* @param input_q The query input tensor.
* @param input_k The key input tensor.
* @param input_v The value input tensor.
* @param position_id The position id of the query.
* @param output_matmul The query output tensor.
*/
AttentionKVCacheObj(GraphObj *graph, Tensor input_k_cache,
Tensor input_v_cache, Tensor input_q, Tensor input_k,
Tensor input_v, Tensor position_id,
Tensor output_matmul);
OP_CLONE(AttentionKVCacheObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 6; }
int numOutputs() const override { return 1; }
int getDim() const { return dim; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
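The deleted header above declared a fused attention operator that consumes and updates k/v caches. As background, a tiny single-head, CPU-only sketch of attention over a growing KV cache (illustrative only; the shapes, scaling, and names are simplifications, not the operator's actual semantics):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct KVCache {
    std::vector<std::vector<float>> K, V;  // one cached entry per past position
};

std::vector<float> attend(KVCache &cache, const std::vector<float> &q,
                          const std::vector<float> &k, const std::vector<float> &v) {
    cache.K.push_back(k);                  // append the new key/value to the cache
    cache.V.push_back(v);
    size_t n = cache.K.size(), d = q.size();
    std::vector<float> score(n), out(d, 0.0f);
    float maxs = -1e30f, sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {       // scaled dot-product against all cached keys
        float s = 0.0f;
        for (size_t j = 0; j < d; ++j) s += q[j] * cache.K[i][j];
        score[i] = s / std::sqrt(static_cast<float>(d));
        maxs = std::max(maxs, score[i]);
    }
    for (size_t i = 0; i < n; ++i) { score[i] = std::exp(score[i] - maxs); sum += score[i]; }
    for (size_t i = 0; i < n; ++i)         // softmax-weighted sum of cached values
        for (size_t j = 0; j < d; ++j) out[j] += score[i] / sum * cache.V[i][j];
    return out;
}

int main() {
    KVCache cache;
    attend(cache, {1.0f, 0.0f}, {1.0f, 0.0f}, {1.0f, 2.0f});
    auto out = attend(cache, {0.0f, 1.0f}, {0.0f, 1.0f}, {3.0f, 4.0f});
    std::printf("%.3f %.3f\n", out[0], out[1]);
}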
View File
@ -34,7 +34,7 @@ class BatchNormObj : public OperatorObj {
Tensor var, Tensor scale, Tensor bias, float momentum = 0.9, Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
float eps = 1e-5, bool trainingMode = false); float eps = 1e-5, bool trainingMode = false);
OP_CLONE(BatchNormObj); OP_CLONE(BatchNormObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
// output size will be 3 when training // output size will be 3 when training
View File
@ -26,7 +26,7 @@ class BroadcastObj : public OperatorObj {
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
optional<vector<Shape>> inferShape(const TensorVec &inputs) override { optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
return {{inputs[0]->getDims()}}; return {{inputs[0]->getDims()}};
}; };
View File
@ -22,7 +22,7 @@ class ConcatObj : public OperatorObj {
ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim); ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
OP_CLONE(ConcatObj); OP_CLONE(ConcatObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return inputs.size(); } int numInputs() const override { return inputs.size(); }
View File
@ -142,7 +142,7 @@ class ConvObj : public ConvBaseObj {
ActType act = ActType::None); ActType act = ActType::None);
OP_CLONE(ConvObj); OP_CLONE(ConvObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return c / getChannelPerGroup(); } int getNumGroups() const override { return c / getChannelPerGroup(); }
private: private:
@ -164,7 +164,7 @@ class ConvBackwardFilterObj : public ConvBaseObj {
int sh = 1, int sw = 1, int dh = 1, int dw = 1, int sh = 1, int sw = 1, int dh = 1, int dw = 1,
Tensor bias = nullptr, ActType act = ActType::None); Tensor bias = nullptr, ActType act = ActType::None);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
ActType getAct() const { return act; } ActType getAct() const { return act; }
int getNumGroups() const override { return c / getChannelPerGroup(); } int getNumGroups() const override { return c / getChannelPerGroup(); }
@ -191,7 +191,7 @@ class ConvTransposed2dObj : public ConvBaseObj {
Tensor bias = nullptr, ActType act = ActType::None); Tensor bias = nullptr, ActType act = ActType::None);
OP_CLONE(ConvTransposed2dObj); OP_CLONE(ConvTransposed2dObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return group; } int getNumGroups() const override { return group; }
std::pair<int, int> getOutputPadding() const { return {oph, opw}; } std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
@ -218,7 +218,7 @@ class ConvTransposed2dNHWCObj : public ConvBaseObj {
Tensor bias = nullptr, ActType act = ActType::None); Tensor bias = nullptr, ActType act = ActType::None);
OP_CLONE(ConvTransposed2dNHWCObj); OP_CLONE(ConvTransposed2dNHWCObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return group; } int getNumGroups() const override { return group; }
private: private:
View File
@ -7,7 +7,7 @@ class DetObj : public OperatorObj {
enum Mode { NormalDet = 0, LogDet }; enum Mode { NormalDet = 0, LogDet };
DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode); DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
OP_CLONE(DetObj); OP_CLONE(DetObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
View File
@ -37,7 +37,7 @@ class DropoutObj : public OperatorObj {
DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask, DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
float ratio, bool training_mode); float ratio, bool training_mode);
OP_CLONE(DropoutObj); OP_CLONE(DropoutObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
View File
@ -21,7 +21,7 @@ class ElementWiseObj : public OperatorObj {
*/ */
ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1, ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1,
Tensor output); Tensor output);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 2; } int numInputs() const override { return 2; }
@ -38,7 +38,7 @@ class MSELossObj : public OperatorObj {
MSELossObj(GraphObj *graph, Tensor input0, Tensor input1, MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
Reduction reduction, Tensor output); Reduction reduction, Tensor output);
OP_CLONE(MSELossObj); OP_CLONE(MSELossObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
Reduction getReduction() const { return reductionMode; } Reduction getReduction() const { return reductionMode; }
std::string toString() const override; std::string toString() const override;
View File
@ -21,7 +21,7 @@ class ExpandObj : public OperatorObj {
*/ */
ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims); ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
OP_CLONE(ExpandObj); OP_CLONE(ExpandObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
View File
@ -23,7 +23,7 @@ class ExtendObj : public OperatorObj {
ExtendObj(GraphObj *graph, Tensor input, Tensor output, int dim, ExtendObj(GraphObj *graph, Tensor input, Tensor output, int dim,
int num = 1); int num = 1);
OP_CLONE(ExtendObj); OP_CLONE(ExtendObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
View File
@ -39,7 +39,7 @@ class GatherObj : public GatherBaseObj {
int axis); int axis);
OP_CLONE(GatherObj); OP_CLONE(GatherObj);
std::string toString() const override; std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override; vector<DataType> inferDataType(const TensorVec &inputs) const override;
private: private:
@ -69,7 +69,7 @@ class GatherElementsObj : public GatherBaseObj {
Tensor output, int axis); Tensor output, int axis);
OP_CLONE(GatherElementsObj); OP_CLONE(GatherElementsObj);
std::string toString() const override; std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override; vector<DataType> inferDataType(const TensorVec &inputs) const override;
private: private:
View File
@ -1,30 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
class LayerNormObj : public OperatorObj {
float eps;
int axis, stash_type;
public:
LayerNormObj(GraphObj *graph, Tensor input, Tensor scale, Tensor output,
Tensor bias = nullptr, float eps = 1e-5, int axis = -1,
int stash_type = 1);
OP_CLONE(LayerNormObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; }
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return outputs.size(); }
float getEps() const { return eps; }
int getAxis() const { return axis; }
int getStashType() const { return stash_type; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override;
};
} // namespace infini
View File
@ -1,29 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
class LRNObj : public OperatorObj {
public:
LRNObj(GraphObj *graph, Tensor inputX, Tensor inputY, float alpha,
float beta, float bias, int size);
OP_CLONE(LRNObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
auto getAlphaBetaBias() const {
return tuple(alpha_value, beta_value, bias_value);
}
auto getSize() const { return size_value; }
private:
float alpha_value, beta_value, bias_value;
int size_value;
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
View File
@ -45,7 +45,7 @@ class MatmulObj : public OperatorObj {
OP_CLONE(MatmulObj); OP_CLONE(MatmulObj);
std::string toString() const override; std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return inputs.size(); } int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
View File
@ -21,7 +21,7 @@ class MemBoundObj : public OperatorObj {
OP_CLONE(MemBoundObj); OP_CLONE(MemBoundObj);
std::string toString() const override; std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return inputs.size(); } int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return outputs.size(); } int numOutputs() const override { return outputs.size(); }
View File
@ -27,7 +27,7 @@ class PadObj : public OperatorObj {
const vector<int> &pads, const optional<vector<int>> &axes); const vector<int> &pads, const optional<vector<int>> &axes);
OP_CLONE(PadObj); OP_CLONE(PadObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
View File
@ -41,7 +41,7 @@ class PoolingObj : public OperatorObj {
int ceilMode); int ceilMode);
OP_CLONE(PoolingObj); OP_CLONE(PoolingObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
View File
@ -1,46 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
*
* https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2193/user-guide/docs/index.html
*/
class RecvObj : public OperatorObj {
public:
/**
* @brief Construct a new SendRecv object
*
* @param graph The computation graph that this operator belongs to.
* @param input Defaults to nullptr, because recv has no input.
* @param output recv output
* @param source the send rank
* @param destination the recv rank
* @param dims The shape of the output tensor.
*/
RecvObj(GraphObj *graph, Tensor output, int source, int destination,
Shape dims, int outputType, Tensor input = nullptr);
OP_CLONE(RecvObj);
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
DataType getDType() const;
int getSourceRank() const { return source; }
int getDestinationRank() const { return destination; }
inline Shape getShape() const { return dims; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override;
protected:
int source;
int destination;
Shape dims;
int outputType;
};
} // namespace infini
View File
@ -3,30 +3,27 @@
namespace infini { namespace infini {
/** /**
* @brief Compute the reduction of input tensor's elements along certain axes. * @brief Compute the mean of input tensor's elements along certain axes.
* *
*/ */
class ReduceBaseObj : public OperatorObj { class ReduceMeanObj : public OperatorObj {
protected:
set<int> axes; // axis to reduce set<int> axes; // axis to reduce
bool keepDims; bool keepDims;
public: public:
/** /**
* @brief Construct a new Reduce object. * @brief Construct a new ReduceMean object.
* *
* @param graph The computation graph that this operator belongs to. * @param graph The computation graph that this operator belongs to.
* @param opType The operation type. Should be a Reduce operation.
* @param input The input tensor. * @param input The input tensor.
* @param output The output tensor. * @param output The output tensor.
* @param axes Axes to reduce. * @param axes Axes to reduce.
* @param keepDims Keep the reduced dimensions or not. * @param keepDims Keep the reduced dimensions or not.
*/ */
ReduceBaseObj(GraphObj *graph, OpType opType, Tensor input, Tensor output, ReduceMeanObj(GraphObj *graph, Tensor input, Tensor output,
const optional<vector<int>> &axes, bool keepDims); const optional<vector<int>> &axes, bool keepDims = true);
virtual ~ReduceBaseObj() {} OP_CLONE(ReduceMeanObj);
OP_CLONE(ReduceBaseObj); optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
@ -41,15 +38,4 @@ class ReduceBaseObj : public OperatorObj {
vector<int> getOpAttrVector() const override; vector<int> getOpAttrVector() const override;
}; };
class ReduceMeanObj : public ReduceBaseObj {
public:
ReduceMeanObj(GraphObj *graph, Tensor input, Tensor output,
const optional<vector<int>> &axes, bool keepDims = true);
};
class ReduceSumObj : public ReduceBaseObj {
public:
ReduceSumObj(GraphObj *graph, Tensor input, Tensor output,
const optional<vector<int>> &axes, bool keepDims = true);
};
} // namespace infini } // namespace infini
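The left side factors the shared axes/keepDims handling into a ReduceBaseObj and derives ReduceMeanObj and ReduceSumObj from it; the right keeps a standalone ReduceMeanObj. A compact standalone sketch of that base/derived split (hypothetical class names, not the project's operator classes):

#include <cstdio>
#include <set>
#include <utility>

// Illustrative: shared reduction bookkeeping lives in the base class,
// each concrete reduction only fixes the operation kind.
struct ReduceBase {
    std::set<int> axes;
    bool keepDims;
    ReduceBase(std::set<int> a, bool k) : axes(std::move(a)), keepDims(k) {}
    virtual ~ReduceBase() = default;
    virtual const char *name() const = 0;
};
struct ReduceMean : ReduceBase {
    using ReduceBase::ReduceBase;
    const char *name() const override { return "ReduceMean"; }
};
struct ReduceSum : ReduceBase {
    using ReduceBase::ReduceBase;
    const char *name() const override { return "ReduceSum"; }
};

int main() {
    ReduceSum r({1, 2}, true);
    std::printf("%s over %zu axes\n", r.name(), r.axes.size());  // ReduceSum over 2 axes
}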
View File
@ -9,7 +9,6 @@ namespace infini {
*/ */
class ReshapeObj : public OperatorObj { class ReshapeObj : public OperatorObj {
Shape dims; Shape dims;
Shape outputShape;
public: public:
/** /**
@ -18,20 +17,18 @@ class ReshapeObj : public OperatorObj {
* @param graph The computation graph that this operator belongs to. * @param graph The computation graph that this operator belongs to.
* @param input The input tensor. * @param input The input tensor.
* @param output The output tensor. * @param output The output tensor.
* @param dims The shape to infer the output shape. * @param dims The shape of the output tensor.
* @param outputShape The real shape of output tensor.
*/ */
ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims); ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
OP_CLONE(ReshapeObj); OP_CLONE(ReshapeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
inline Shape getShape() const { return outputShape; } inline Shape getShape() const { return dims; }
inline Shape getDims() const { return dims; }
private: private:
vector<int> getWorkloadVector() const override; vector<int> getWorkloadVector() const override;
@ -58,7 +55,7 @@ class FlattenObj : public OperatorObj {
FlattenObj(GraphObj *graph, Tensor input, Tensor output, int axis); FlattenObj(GraphObj *graph, Tensor input, Tensor output, int axis);
OP_CLONE(FlattenObj); OP_CLONE(FlattenObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
@ -88,7 +85,7 @@ class IdentityObj : public OperatorObj {
IdentityObj(GraphObj *graph, Tensor input, Tensor output); IdentityObj(GraphObj *graph, Tensor input, Tensor output);
OP_CLONE(IdentityObj); OP_CLONE(IdentityObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
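In the left column of the ReshapeObj hunk above, the operator keeps both the requested dims (which, as in ONNX, may contain a -1 placeholder) and the resolved outputShape, and getShape() returns the resolved one. A standalone sketch of resolving such a placeholder from the element count (assumed semantics, not the project's inferShape):

#include <cstdio>
#include <vector>

// Resolve a single -1 placeholder in a requested reshape, ONNX-style.
std::vector<int> resolveShape(const std::vector<int> &dims, int numel) {
    std::vector<int> out = dims;
    int known = 1, inferAt = -1;
    for (size_t i = 0; i < out.size(); ++i) {
        if (out[i] == -1) inferAt = static_cast<int>(i);
        else known *= out[i];
    }
    if (inferAt >= 0) out[inferAt] = numel / known;
    return out;
}

int main() {
    auto shape = resolveShape({2, -1, 4}, 24);  // 24 elements -> {2, 3, 4}
    for (int d : shape) std::printf("%d ", d);
    std::printf("\n");
}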
View File
@ -27,60 +27,6 @@ class ResizeObj : public OperatorObj {
enum class EKeepAspectRatioPolicy { stretch, notLarger, notSmaller, none }; enum class EKeepAspectRatioPolicy { stretch, notLarger, notSmaller, none };
enum class ECoeffMode { nearest, linear, cubic }; enum class ECoeffMode { nearest, linear, cubic };
static ECoordinateTransMode fromECoordinateTransModeStr(string mode) {
if (mode == "half_pixel") {
return ECoordinateTransMode::halfPixel;
} else if (mode == "asymmetric") {
return ECoordinateTransMode::asymmetric;
} else if (mode == "align_corners") {
return ECoordinateTransMode::alignCorners;
} else if (mode == "pytorch_half_pixel") {
return ECoordinateTransMode::pytorchHalfPixel;
} else if (mode == "tf_crop_and_resize") {
return ECoordinateTransMode::tfCropAndResize;
} else {
IT_TODO_HALT();
}
}
static ENearestMode fromENearestModeStr(string mode) {
if (mode == "round_prefer_floor") {
return ENearestMode::roundPreferFloor;
} else if (mode == "round_prefer_ceil") {
return ENearestMode::roundPreferCeil;
} else if (mode == "floor") {
return ENearestMode::floor;
} else if (mode == "ceil") {
return ENearestMode::ceil;
} else {
return ENearestMode::none;
}
}
static EKeepAspectRatioPolicy fromRatioPolicyStr(string ratioPolicyStr) {
if (ratioPolicyStr == "stretch") {
return EKeepAspectRatioPolicy::stretch;
} else if (ratioPolicyStr == "not_larger") {
return EKeepAspectRatioPolicy::notLarger;
} else if (ratioPolicyStr == "not_smaller") {
return EKeepAspectRatioPolicy::notSmaller;
} else {
return EKeepAspectRatioPolicy::none;
}
}
static ECoeffMode fromECoeffModeStr(string mode) {
if (mode == "nearest") {
return ECoeffMode::nearest;
} else if (mode == "linear") {
return ECoeffMode::linear;
} else if (mode == "cubic") {
return ECoeffMode::cubic;
} else {
IT_TODO_HALT();
}
}
private: private:
vector<int> axes; vector<int> axes;
vector<float> scales; vector<float> scales;
@ -114,7 +60,7 @@ class ResizeObj : public OperatorObj {
// Operator clone(TensorVec inputs, TensorVec outputs) override; // Operator clone(TensorVec inputs, TensorVec outputs) override;
vector<DataType> inferDataType(const TensorVec &inputs) const override; vector<DataType> inferDataType(const TensorVec &inputs) const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return inputs.size(); } int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
View File
@ -1,29 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
class RoPEObj : public OperatorObj {
public:
/**
* @brief Construct a new RotaryEmbedding object.
*
* @param graph The computation graph that this operator belongs to.
* @param pos The position id of the query.
* @param input The input tensor.
* @param output The output tensor.
*/
RoPEObj(GraphObj *graph, Tensor pos, Tensor input, Tensor output);
OP_CLONE(RoPEObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
DataType getDType() const { return getInputs(1)->getDType(); }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
View File
@ -1,42 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
*
* https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2193/user-guide/docs/index.html
*/
class SendObj : public OperatorObj {
public:
/**
* @brief Construct a new SendRecv object
*
* @param graph The computation graph that this operator belongs to.
* @param input send input
* @param output recv output
* @param source the send rank
* @param destination the recv rank
*/
SendObj(GraphObj *graph, Tensor input, int source, int destination,
Tensor output = nullptr);
OP_CLONE(SendObj);
int numInputs() const override { return 1; }
int numOutputs() const override { return outputs.size(); }
std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
int getSourceRank() const { return source; }
int getDestinationRank() const { return destination; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override;
protected:
int source;
int destination;
};
} // namespace infini
View File
@ -32,7 +32,7 @@ class SliceObj : public OperatorObj {
const optional<vector<int>> &steps); const optional<vector<int>> &steps);
OP_CLONE(SliceObj); OP_CLONE(SliceObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
inline int numInputs() const override { return 1; } inline int numInputs() const override { return 1; }
inline int numOutputs() const override { return 1; } inline int numOutputs() const override { return 1; }
View File
@ -10,7 +10,7 @@ class SoftmaxObj : public OperatorObj {
OP_CLONE(SoftmaxObj); OP_CLONE(SoftmaxObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override { optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
return {{inputs[0]->getDims()}}; return {{inputs[0]->getDims()}};
}; };
View File
@ -37,7 +37,7 @@ class SplitObj : public OperatorObj {
int dim, const vector<int> &ratio); int dim, const vector<int> &ratio);
OP_CLONE(SplitObj); OP_CLONE(SplitObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
View File
@ -1,39 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Remove single-dimensional entries from the shape of a tensor.
*
*/
class SqueezeObj : public OperatorObj {
Shape axes;
public:
/**
* @brief Construct a new Squeeze object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
* @param axes List of integers indicating the dimensions to squeeze.
*/
SqueezeObj(GraphObj *graph, Tensor input, Tensor output, Shape axes);
OP_CLONE(SqueezeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
inline Shape getAxes() const { return axes; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
View File
@ -7,7 +7,7 @@ class TransposeObj : public OperatorObj {
TransposeObj(GraphObj *graph, Tensor input, Tensor output, TransposeObj(GraphObj *graph, Tensor input, Tensor output,
vector<int> permute); vector<int> permute);
OP_CLONE(TransposeObj); OP_CLONE(TransposeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
@ -25,7 +25,7 @@ class DepthToSpaceObj : public OperatorObj {
DepthToSpaceObj(GraphObj *graph, Tensor input, Tensor output, int blocksize, DepthToSpaceObj(GraphObj *graph, Tensor input, Tensor output, int blocksize,
std::string mode); std::string mode);
OP_CLONE(DepthToSpaceObj); OP_CLONE(DepthToSpaceObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
View File
@ -17,7 +17,7 @@ class UnaryObj : public OperatorObj {
* @param output The output tensor. * @param output The output tensor.
*/ */
UnaryObj(OpType type, GraphObj *graph, Tensor input, Tensor output); UnaryObj(OpType type, GraphObj *graph, Tensor input, Tensor output);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
@ -33,7 +33,7 @@ class ClipObj : public OperatorObj {
ClipObj(GraphObj *graph, Tensor input, Tensor output, ClipObj(GraphObj *graph, Tensor input, Tensor output,
std::optional<float> min, std::optional<float> max); std::optional<float> min, std::optional<float> max);
OP_CLONE(ClipObj); OP_CLONE(ClipObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
std::optional<float> getMin() const { return minValue; }; std::optional<float> getMin() const { return minValue; };
@ -52,7 +52,7 @@ class HardtanhObj : public OperatorObj {
HardtanhObj(GraphObj *graph, Tensor input, Tensor output, float min, HardtanhObj(GraphObj *graph, Tensor input, Tensor output, float min,
float max); float max);
OP_CLONE(HardtanhObj); OP_CLONE(HardtanhObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
float getMin() const { return minValue; }; float getMin() const { return minValue; };
@ -70,7 +70,7 @@ class FlipObj : public OperatorObj {
public: public:
FlipObj(GraphObj *graph, Tensor input, Tensor output, vector<int> axis); FlipObj(GraphObj *graph, Tensor input, Tensor output, vector<int> axis);
OP_CLONE(FlipObj); OP_CLONE(FlipObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
vector<int> getAxis() const { return axisValue; }; vector<int> getAxis() const { return axisValue; };
@ -87,7 +87,7 @@ class FillObj : public OperatorObj {
public: public:
FillObj(GraphObj *graph, Tensor input, Tensor output, float value); FillObj(GraphObj *graph, Tensor input, Tensor output, float value);
OP_CLONE(FillObj); OP_CLONE(FillObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
float getValue() const { return setValue; }; float getValue() const { return setValue; };
@ -104,7 +104,7 @@ class L2LossObj : public OperatorObj {
public: public:
L2LossObj(GraphObj *graph, Tensor input, Tensor output); L2LossObj(GraphObj *graph, Tensor input, Tensor output);
OP_CLONE(L2LossObj); OP_CLONE(L2LossObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
@ -120,7 +120,7 @@ class TransformObj : public OperatorObj {
TransformObj(GraphObj *graph, Tensor input, Tensor output, float alpha, TransformObj(GraphObj *graph, Tensor input, Tensor output, float alpha,
float beta); float beta);
OP_CLONE(TransformObj); OP_CLONE(TransformObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
float getAlpha() const { return alphaValue; } float getAlpha() const { return alphaValue; }
@ -165,7 +165,7 @@ class CastObj : public OperatorObj {
public: public:
CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type); CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type);
OP_CLONE(CastObj); OP_CLONE(CastObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override; vector<DataType> inferDataType(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
@ -185,7 +185,7 @@ class CumsumObj : public OperatorObj {
CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis, CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis,
bool exclusive, bool reverse); bool exclusive, bool reverse);
OP_CLONE(CumsumObj); OP_CLONE(CumsumObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int getAxis() const { return axisValue; } int getAxis() const { return axisValue; }
@ -205,7 +205,7 @@ class ShapeObj : public OperatorObj {
public: public:
ShapeObj(GraphObj *graph, Tensor input, Tensor output); ShapeObj(GraphObj *graph, Tensor input, Tensor output);
OP_CLONE(ShapeObj); OP_CLONE(ShapeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 1; } int numInputs() const override { return 1; }
@ -216,7 +216,7 @@ class PReluObj : public OperatorObj {
public: public:
PReluObj(GraphObj *graph, Tensor input, Tensor alpha, Tensor output); PReluObj(GraphObj *graph, Tensor input, Tensor alpha, Tensor output);
OP_CLONE(PReluObj); OP_CLONE(PReluObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 2; } int numInputs() const override { return 2; }
@ -236,7 +236,7 @@ class LogObj : public OperatorObj {
}; };
LogObj(GraphObj *graph, Tensor input, Tensor output, LogType type); LogObj(GraphObj *graph, Tensor input, Tensor output, LogType type);
OP_CLONE(LogObj); OP_CLONE(LogObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
LogType getType() const { return logType; } LogType getType() const { return logType; }
@ -258,7 +258,6 @@ class LogObj : public OperatorObj {
}; };
DEFINE_UNARY_OBJ(Relu, OpType::Relu) DEFINE_UNARY_OBJ(Relu, OpType::Relu)
DEFINE_UNARY_OBJ(Silu, OpType::Silu)
DEFINE_UNARY_OBJ(Gelu, OpType::Gelu) DEFINE_UNARY_OBJ(Gelu, OpType::Gelu)
DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid) DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh) DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
View File
@ -1,38 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Insert single-dimensional entries into the shape of an input tensor.
*
*/
class UnsqueezeObj : public OperatorObj {
Shape axes;
public:
/**
* @brief Construct a new Unsqueeze object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
* @param axes List of integers indicating the dimensions to be inserted.
*/
UnsqueezeObj(GraphObj *graph, Tensor input, Tensor output, Shape axes);
OP_CLONE(UnsqueezeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
inline Shape getAxes() const { return axes; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
View File
@ -22,7 +22,7 @@ class WhereObj : public OperatorObj {
Tensor output); Tensor output);
OP_CLONE(WhereObj); OP_CLONE(WhereObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return inputs.size(); } int numInputs() const override { return inputs.size(); }
View File
@ -3,11 +3,11 @@
namespace infini { namespace infini {
void broadcastShape(const Shape &originShape, SmallArray &modifyShape, void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
int nDims, int size) { int nDims, int size) {
for (int i = nDims - size - 1; i >= 0; --i) { for (int i = nDims - 1; i >= 0; --i) {
modifyShape.data[i] = 1; modifyShape.data[i] = 1;
} }
for (int i = nDims - 1; i >= nDims - size; --i) { for (int i = size - 1; i >= 0; --i) {
modifyShape.data[i] = originShape[i - nDims + size]; modifyShape.data[i + nDims - size] = originShape[i];
} }
} }
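Both versions of broadcastShape compute the same result: the original dimensions are right-aligned inside an nDims-long shape and the leading positions are filled with 1, NumPy-style; the left-hand variant merely avoids writing the trailing slots twice. A standalone restatement of that behavior (plain vectors instead of SmallArray):

#include <cstdio>
#include <vector>

// Right-align originShape inside an nDims-long shape, padding the front with 1s.
std::vector<int> padShape(const std::vector<int> &origin, int nDims) {
    std::vector<int> out(nDims, 1);
    int size = static_cast<int>(origin.size());
    for (int i = 0; i < size; ++i) out[i + nDims - size] = origin[i];
    return out;
}

int main() {
    auto s = padShape({3, 4}, 4);            // {1, 1, 3, 4}
    for (int d : s) std::printf("%d ", d);
    std::printf("\n");
}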
View File
@ -91,12 +91,6 @@ template <int val> class ValGenerator : public DataGenerator {
fill<uint32_t>(data, size); fill<uint32_t>(data, size);
} }
void fill(float *data, size_t size) override { fill<float>(data, size); } void fill(float *data, size_t size) override { fill<float>(data, size); }
void fill_fp16(uint16_t *data, size_t size) {
for (size_t i = 0; i < size; i++) {
float x = 1.0f * val;
data[i] = float_to_fp16(x);
}
}
}; };
typedef ValGenerator<1> OneGenerator; typedef ValGenerator<1> OneGenerator;
typedef ValGenerator<0> ZeroGenerator; typedef ValGenerator<0> ZeroGenerator;
View File
@ -2,7 +2,6 @@
#ifndef OPERATOR_UTIL_H #ifndef OPERATOR_UTIL_H
#define OPERATOR_UTIL_H #define OPERATOR_UTIL_H
#include "core/operator.h"
#include "core/tensor.h" #include "core/tensor.h"
namespace infini { namespace infini {
@ -11,15 +10,6 @@ namespace infini {
Shape infer_broadcast(const Shape &A, const Shape &B); Shape infer_broadcast(const Shape &A, const Shape &B);
// Launch the real axis based on rank and current axis // Launch the real axis based on rank and current axis
int get_real_axis(const int &axis, const int &rank); int get_real_axis(const int &axis, const int &rank);
// Check if tensor B is unidirectional broadcastable to tensor A
bool is_unidirectional_broadcasting(const Shape &A, const Shape &B);
// Locate the index with size from Shape
Shape locate_index(size_t inputN, const Shape &shape);
// Delocate the ShapeIndex from Shape with broadcast
size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
const Shape &stride);
// Convert KernelAttrs to a string representation
std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
} // namespace infini } // namespace infini
#endif #endif
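Among the removed declarations above, locate_index converts a flat element number into per-dimension coordinates and delocate_index maps coordinates back to a flat offset through strides, with broadcasting handled by wrapping size-1 dimensions. A sketch under those assumed semantics (the implementations are not shown in this diff):

#include <cstdio>
#include <vector>

// Convert a flat offset to per-dimension coordinates for the given shape.
std::vector<int> locate(size_t n, const std::vector<int> &shape) {
    std::vector<int> idx(shape.size());
    for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
        idx[i] = static_cast<int>(n % shape[i]);
        n /= shape[i];
    }
    return idx;
}

// Map coordinates back to a flat offset; the modulo wraps broadcast (size-1) dims.
size_t delocate(const std::vector<int> &idx, const std::vector<int> &shape,
                const std::vector<int> &stride) {
    size_t n = 0;
    for (size_t i = 0; i < idx.size(); ++i)
        n += static_cast<size_t>(idx[i] % shape[i]) * stride[i];
    return n;
}

int main() {
    auto idx = locate(7, {2, 3, 4});                       // 7 -> {0, 1, 3}
    size_t back = delocate(idx, {2, 3, 4}, {12, 4, 1});    // -> 7
    std::printf("%d %d %d -> %zu\n", idx[0], idx[1], idx[2], back);
}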
File diff suppressed because it is too large.
View File
@ -209,7 +209,6 @@ class TestStringMethods(unittest.TestCase):
make_and_import_model(make_graph([relu], "relu", [x], [y])) make_and_import_model(make_graph([relu], "relu", [x], [y]))
"""Gelu operator is not supported by onnx 14.1 currently.""" """Gelu operator is not supported by onnx 14.1 currently."""
def test_gelu(self): def test_gelu(self):
pass pass
# x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7]) # x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
@ -295,36 +294,6 @@ class TestStringMethods(unittest.TestCase):
make_graph([reshape], "reshape", [data, shape], [reshaped], [shape_data]) make_graph([reshape], "reshape", [data, shape], [reshaped], [shape_data])
) )
def test_resize(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 128, 40, 40])
roi = make_tensor("roi", TensorProto.FLOAT, [0], [])
scales = make_tensor("scales", TensorProto.FLOAT, [4], [1, 1, 2, 2])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 128, 80, 80])
reshape = make_node("Resize", ["x", "roi", "scales"], ["y"], name="resize")
make_and_import_model(make_graph([reshape], "resize", [x], [y], [roi, scales]))
def test_squeeze(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 1, 5])
axes = make_tensor_value_info("axes", TensorProto.INT64, [2])
axes_data = make_tensor("axes", TensorProto.INT64, [2], [0, 2])
output = make_tensor_value_info("output", TensorProto.FLOAT, [3, 5])
squeeze = make_node("Squeeze", ["input", "axes"], ["output"], name="squeeze")
make_and_import_model(
make_graph([squeeze], "squeeze", [input, axes], [output], [axes_data])
)
def test_unsqueeze(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [2, 3, 4, 5])
axes = make_tensor_value_info("axes", TensorProto.INT64, [2])
axes_data = make_tensor("axes", TensorProto.INT64, [2], [0, 2])
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 2, 1, 3, 4, 5])
unsqueeze = make_node(
"Unsqueeze", ["input", "axes"], ["output"], name="unsqueeze"
)
make_and_import_model(
make_graph([unsqueeze], "unsqueeze", [input, axes], [output], [axes_data])
)
def test_concat(self): def test_concat(self):
input1 = make_tensor_value_info("input1", TensorProto.FLOAT, [1, 3, 2, 4]) input1 = make_tensor_value_info("input1", TensorProto.FLOAT, [1, 3, 2, 4])
input2 = make_tensor_value_info("input2", TensorProto.FLOAT, [1, 3, 2, 5]) input2 = make_tensor_value_info("input2", TensorProto.FLOAT, [1, 3, 2, 5])
@ -368,14 +337,6 @@ class TestStringMethods(unittest.TestCase):
) )
make_and_import_model(make_graph([reduceMean], "reduceMean", [data], [reduced])) make_and_import_model(make_graph([reduceMean], "reduceMean", [data], [reduced]))
def test_reduce_sum(self):
data = make_tensor_value_info("data", TensorProto.FLOAT, [2, 3, 3, 4])
reduced = make_tensor_value_info("reduced", TensorProto.FLOAT, [1, 1, 1, 1])
reduceSum = make_node(
"ReduceSum", ["data"], ["reduced"], keepdims=1, name="reduceSum"
)
make_and_import_model(make_graph([reduceSum], "reduceSum", [data], [reduced]))
def test_slice(self): def test_slice(self):
data = make_tensor_value_info("data", TensorProto.UINT32, [10, 64, 162, 162]) data = make_tensor_value_info("data", TensorProto.UINT32, [10, 64, 162, 162])
output = make_tensor_value_info("output", TensorProto.UINT32, [1, 1, 99, 95]) output = make_tensor_value_info("output", TensorProto.UINT32, [1, 1, 99, 95])
@ -463,20 +424,7 @@ class TestStringMethods(unittest.TestCase):
def test_split(self): def test_split(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4]) input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
split = make_node("Split", ["input"], ["output"], name="split", axis=0) split = make_node("Split", ["input"], ["output"], name="split", axis=0)
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4]) make_and_import_model(make_graph([split], "split", [input], []))
make_and_import_model(make_graph([split], "split", [input], [output]))
def test_split1(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
splitAttr = make_tensor("split", TensorProto.INT64, [2], [2, 1])
output1 = make_tensor_value_info("output1", TensorProto.FLOAT, [1, 2, 2, 4])
output2 = make_tensor_value_info("output2", TensorProto.FLOAT, [1, 1, 2, 4])
split = make_node(
"Split", ["input", "split"], ["output1", "output2"], name="split", axis=1
)
make_and_import_model(
make_graph([split], "split", [input], [output1, output2], [splitAttr])
)
def test_allBroadcast(self): def test_allBroadcast(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4]) input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
@ -551,47 +499,6 @@ class TestStringMethods(unittest.TestCase):
where = make_node("Where", ["x", "y", "con"], ["output"], name="where") where = make_node("Where", ["x", "y", "con"], ["output"], name="where")
make_and_import_model(make_graph([where], "where", [x, y, con], [output])) make_and_import_model(make_graph([where], "where", [x, y, con], [output]))
def test_send(self):
sendInput = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 5, 7])
send = make_node("Send", ["input"], [], name="send", source=0, destination=1)
graph = make_graph([send], "send", [sendInput], [])
model = make_model(graph)
from_onnx(model, backend.cpu_runtime())
def test_recv(self):
recvOutput = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 5, 7])
recv = make_node(
"Recv",
[],
["output"],
name="recv",
source=0,
destination=1,
shape=[1, 3, 5, 7],
dataType=1,
)
graph = make_graph([recv], "recv", [], [recvOutput])
model = make_model(graph)
from_onnx(model, backend.cpu_runtime())
class TestDynamicTensor(unittest.TestCase):
def test_dynamic_tensor(self):
filename = r"resnet18-v2-7.onnx"
current_path = os.getcwd()
model_file = ""
for root, dirs, files in os.walk(current_path):
if filename in files:
model_file = os.path.join(root, filename)
model = OnnxStub(onnx.load(model_file), backend.cpu_runtime())
output_key = list(model.outputs.keys())[0]
old_output_shape = model.getShape(output_key)
self.assertEqual(old_output_shape, ([1, 1000]))
model.set_input([[5, 3, 224, 224]])
new_output_shape = model.getShape(output_key)
self.assertEqual(new_output_shape, ([5, 1000]))
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
run.sh Normal file
View File
@ -0,0 +1,2 @@
./test_mutator --gtest_filter="NMutator.Conv3x3_to_gemm"
./test_conv2gemm --gtest_filter="*.NCHW_FCRS_ruleBased"
View File
@ -1,9 +1,6 @@
#include "bang/bang_runtime.h" #include "bang/bang_runtime.h"
#include "core/kernel.h" #include "core/kernel.h"
#include "core/perf_engine.h" #include "core/perf_engine.h"
#ifdef INFINI_USE_CNCL
#include "bang/cncl_communicator.h"
#endif
namespace infini { namespace infini {
@ -16,20 +13,19 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
std::map<OpType, int> opCnt; std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) { for (auto &op : graph->getOperators()) {
// HACK: set correct data type // HACK: set correct data type
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()}; auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs); Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()}; auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey); auto perfData = perfEngine.getPerfData(perfKey);
if (!perfData && !tune) { if (!perfData && !tune) {
kernel->compute(op, this); kernel->compute(op, this);
this->resetWorkspace();
continue; continue;
} }
PerfRecord record; PerfRecord record;
if (!perfData) { if (!perfData) {
record = kernel->tune(op, this); record = kernel->tune(op, this);
this->resetWorkspace();
perfEngine.setPerfData(perfKey, record); perfEngine.setPerfData(perfKey, record);
} else } else
record = perfData; record = perfData;
@ -40,7 +36,6 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
if (profiling) { if (profiling) {
double t = timeit([&]() { kernel->compute(op, record, this); }, double t = timeit([&]() { kernel->compute(op, record, this); },
[&]() { sync(); }, 1, 1); [&]() { sync(); }, 1, 1);
this->resetWorkspace();
op->print(); op->print();
printf(" op_time on bang %lf\n", t); printf(" op_time on bang %lf\n", t);
totalTime += t; totalTime += t;
@ -61,15 +56,4 @@ void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
string BangRuntimeObj::toString() const { return "BANG Runtime"; } string BangRuntimeObj::toString() const { return "BANG Runtime"; }
void BangRuntimeObj::initComm(const string &name, int worldSize, int rank) {
IT_ASSERT(worldSize > 0);
IT_ASSERT(rank >= 0);
IT_ASSERT(rank < worldSize);
IT_ASSERT(!comm) << "communicator is already initialized.";
#ifdef INFINI_USE_CNCL
comm = std::make_unique<CnclCommunicatorObj>(name, worldSize, rank);
#else
IT_TODO_HALT_MSG("Not compiled with CNCL.");
#endif
}
} // namespace infini } // namespace infini

View File

@ -1,7 +1,5 @@
#include "core/graph.h" #include "core/graph.h"
#include "operators/reshape.h"
#include <algorithm> #include <algorithm>
#include <numeric>
#include <queue> #include <queue>
namespace infini { namespace infini {
@ -11,33 +9,20 @@ GraphObj::GraphObj(Runtime runtime, OpVec ops_in)
map<UidBaseType, Tensor> tensorPool; map<UidBaseType, Tensor> tensorPool;
// Clone tensors // Clone tensors
for (const auto &op : ops_in) { for (const auto &op : ops_in) {
for (const auto &t : op->getInputs()) { for (const auto &t : op->getInputs())
if (t) { if (tensorPool.find(t->getFuid()) == tensorPool.end())
if (tensorPool.find(t->getFuid()) == tensorPool.end()) tensorPool[t->getFuid()] = cloneTensor(t);
tensorPool[t->getFuid()] = cloneTensor(t); for (const auto &t : op->getOutputs())
} if (tensorPool.find(t->getFuid()) == tensorPool.end())
} tensorPool[t->getFuid()] = cloneTensor(t);
for (const auto &t : op->getOutputs()) {
if (t) {
if (tensorPool.find(t->getFuid()) == tensorPool.end())
tensorPool[t->getFuid()] = cloneTensor(t);
}
}
} }
// Clone operators and add connections // Clone operators and add connections
for (const auto &op : ops_in) { for (const auto &op : ops_in) {
TensorVec inputs, outputs; TensorVec inputs, outputs;
for (const auto &t : op->getInputs()) { for (const auto &t : op->getInputs())
if (t) { inputs.emplace_back(tensorPool.at(t->getFuid()));
inputs.emplace_back(tensorPool.at(t->getFuid())); for (const auto &t : op->getOutputs())
} outputs.emplace_back(tensorPool.at(t->getFuid()));
}
for (const auto &t : op->getOutputs()) {
if (t) {
outputs.emplace_back(tensorPool.at(t->getFuid()));
}
}
addOperatorAndConnect(op->clone(inputs, outputs)); addOperatorAndConnect(op->clone(inputs, outputs));
} }
} }
@ -46,21 +31,17 @@ void GraphObj::addOperatorAndConnect(const Operator &op) {
sorted = false; sorted = false;
ops.push_back(op); ops.push_back(op);
for (auto &input : op->getInputs()) { for (auto &input : op->getInputs()) {
if (input) { input->addTarget(op);
input->addTarget(op); if (auto pred = input->getSource()) {
if (auto pred = input->getSource()) { pred->addSuccessors(op);
pred->addSuccessors(op); op->addPredecessors(pred);
op->addPredecessors(pred);
}
} }
} }
for (auto &output : op->getOutputs()) { for (auto &output : op->getOutputs()) {
if (output) { output->setSource(op);
output->setSource(op); for (auto &succ : output->getTargets()) {
for (auto &succ : output->getTargets()) { succ->addPredecessors(op);
succ->addPredecessors(op); op->addSuccessors(succ);
op->addSuccessors(succ);
}
} }
} }
} }
@ -87,33 +68,48 @@ string GraphObj::toString() const {
} }
bool GraphObj::topo_sort() { bool GraphObj::topo_sort() {
if (this->sorted) { if (this->sorted)
return true; return true;
}
// std::unordered_set<Tensor> inputs;
std::unordered_set<Operator> waiting(this->ops.begin(), this->ops.end());
std::vector<Operator> sorted; std::vector<Operator> sorted;
std::unordered_set<OperatorObj *> flags;
sorted.reserve(ops.size()); while (!waiting.empty()) {
flags.reserve(ops.size());
while (sorted.size() < ops.size()) {
// Whether any node is moved to sorted in this loop. // Whether any node is moved to sorted in this loop.
auto modified = false; auto modified = false;
for (auto const &op : ops) { // Find head nodes.
if (auto const &inputs = op->getInputs(); for (auto it = waiting.begin(); it != waiting.end();) {
flags.find(op.get()) == flags.end() && const auto &this_inputs = (*it)->getInputs();
std::all_of(inputs.begin(), inputs.end(), // If none of the input tensors is in waiting list,
[&flags](auto const &input) { // this node is a head node.
auto ptr = input->getSource().get(); const auto is_head = std::all_of(
return !ptr || flags.find(ptr) != flags.end(); this_inputs.begin(), this_inputs.end(), [&](const auto &input) {
})) { auto src = input->getSource();
return src // If the source node is in the waiting list,
// means that this node is not the head node.
? waiting.find(src) == waiting.end()
// This tensor has no source node,
// it must be an input tensor.
: (/*inputs.insert(input),*/ true);
});
// Moves head node to sorted.
if (is_head) {
modified = true; modified = true;
sorted.emplace_back(op); sorted.emplace_back(std::move(*it));
flags.insert(op.get()); it = waiting.erase(it);
} else {
++it;
} }
} }
// The waiting list was not modified during this pass,
// so sorting fails.
if (!modified) { if (!modified) {
return false; return false;
} }
} }
// Done.
this->ops = std::move(sorted); this->ops = std::move(sorted);
return this->sorted = true; return this->sorted = true;
} }
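Note: both versions of topo_sort repeatedly scan for head nodes whose inputs are already resolved, and report failure when a pass makes no progress (a cycle). For reference only, the same ordering can be produced with an explicit in-degree count (Kahn's algorithm); the sketch below works over plain integer node ids and is not the repository's Graph/Operator API.

#include <queue>
#include <vector>

// Hypothetical illustration: nodes are 0..n-1, adj[u] lists successors of u.
std::vector<int> topoSort(const std::vector<std::vector<int>> &adj) {
    const int n = static_cast<int>(adj.size());
    std::vector<int> indegree(n, 0);
    for (const auto &succs : adj)
        for (int v : succs)
            ++indegree[v];
    std::queue<int> ready; // nodes whose predecessors are all sorted
    for (int u = 0; u < n; ++u)
        if (indegree[u] == 0)
            ready.push(u);
    std::vector<int> order;
    order.reserve(n);
    while (!ready.empty()) {
        int u = ready.front();
        ready.pop();
        order.push_back(u);
        for (int v : adj[u])
            if (--indegree[v] == 0)
                ready.push(v);
    }
    // Fewer than n emitted nodes means a cycle (mirrors "return false").
    return static_cast<int>(order.size()) == n ? order : std::vector<int>{};
}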
@ -127,56 +123,19 @@ void GraphObj::optimize() {
} }
} }
Tensor GraphObj::getTensor(int fuid) const { void GraphObj::dataMalloc(bool useNaiveAllocator) {
for (auto tensor : tensors) {
if (tensor->getFuid() == fuid) {
return tensor;
}
}
return nullptr;
}
void GraphObj::shape_infer() {
for (auto &op : ops) {
auto ans = op->inferShape();
IT_ASSERT(ans.has_value());
auto oldOutputs = op->getOutputs();
IT_ASSERT(ans.value().size() == oldOutputs.size());
// replace the old output shape and size with the new one
for (int i = 0; i < (int)ans.value().size(); ++i) {
auto newShape = ans.value()[i];
auto oldShape = oldOutputs[i]->getDims();
auto fuid = oldOutputs[i]->getFuid();
if (newShape != oldShape) {
auto tensor = this->getTensor(fuid);
tensor->setShape(newShape);
}
}
}
}
void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
// topological sorting first // topological sorting first
IT_ASSERT(topo_sort() == true); IT_ASSERT(topo_sort() == true);
if (useNaiveAllocator) { if (useNaiveAllocator) {
// cannot set a memory pool when using the naive allocator
IT_ASSERT(memPoolSize == 0);
// used for debugging memory out-of-bounds access, tensors will not be // used for debugging memory out-of-bounds access, tensors will not be
// released correctly // released correctly
// note: behavior may not match running in non-naive mode, and it may // note: behavior may not match running in non-naive mode, and it may
// not reproduce the bug // not reproduce the bug
for (auto &tensor : tensors) { for (auto &tensor : tensors) {
if (!tensor->isWeight() || tensor->dataMalloc();
(tensor->isWeight() && !weightAllocated)) {
tensor->dataMalloc();
}
} }
return; return;
} }
if (memPoolSize > 0) {
allocator.setMemPool(memPoolSize);
}
// count the number of times all tensors are used // count the number of times all tensors are used
std::unordered_map<TensorObj *, size_t> tensorToRefCount; std::unordered_map<TensorObj *, size_t> tensorToRefCount;
// record the memory address offsets of all tensors to be allocated // record the memory address offsets of all tensors to be allocated
@ -228,28 +187,24 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
// memory should be allocated for the op's output first // memory should be allocated for the op's output first
auto outputs = op->getOutputs(); auto outputs = op->getOutputs();
for (auto &tensor : outputs) { for (auto &tensor : outputs) {
if (tensor) { if (tensor->isOthers()) {
if (tensor->isOthers()) { tensorToOffset[tensor.get()] =
tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes());
allocator.alloc(tensor->getBytes());
}
} }
} }
auto inputs = op->getInputs(); auto inputs = op->getInputs();
for (auto &tensor : inputs) { for (auto &tensor : inputs) {
if (tensor) { if (tensor->isOthers()) {
if (tensor->isOthers()) { auto tensorIter = tensorToRefCount.find(tensor.get());
auto tensorIter = tensorToRefCount.find(tensor.get()); IT_ASSERT(tensorIter != tensorToRefCount.end());
IT_ASSERT(tensorIter != tensorToRefCount.end()); IT_ASSERT(tensorToRefCount[tensor.get()] > 0);
IT_ASSERT(tensorToRefCount[tensor.get()] > 0); tensorToRefCount[tensor.get()] -= 1;
tensorToRefCount[tensor.get()] -= 1; if (tensorToRefCount[tensor.get()] == 0) {
if (tensorToRefCount[tensor.get()] == 0) { // indicate that this tensor will no longer be used and
// indicate that this tensor will no longer be used and // perform memory free
// perform memory free tensorToRefCount.erase(tensor.get());
tensorToRefCount.erase(tensor.get()); allocator.free(tensorToOffset[tensor.get()],
allocator.free(tensorToOffset[tensor.get()], tensor->getBytes());
tensor->getBytes());
}
} }
} }
} }
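Note: the allocation pass above frees an input tensor's block as soon as its last consumer has been visited. A stripped-down sketch of that reference-counting idea follows; the types and names are hypothetical and do not match the project's allocator interface.

#include <unordered_map>

// Hypothetical planner: count remaining uses per tensor id, signal "free" at zero.
struct RefCountPlanner {
    std::unordered_map<int, int> remainingUses; // tensor id -> consumers left

    void addUse(int tensorId) { ++remainingUses[tensorId]; }

    // Returns true when the tensor's memory may be released by the caller.
    bool consume(int tensorId) {
        auto it = remainingUses.find(tensorId);
        if (it == remainingUses.end() || it->second <= 0)
            return false;
        if (--(it->second) == 0) {
            remainingUses.erase(it); // last use reached
            return true;             // caller may call allocator.free(...)
        }
        return false;
    }
};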
@ -267,27 +222,6 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
} }
} }
Tensor GraphObj::cloneKV(Tensor &tensor) {
auto obj = tensor->clone();
if (allocator.getMemPoolStatus()) {
if (tensor->hasData()) {
obj->setDataBlob(make_ref<BlobObj>(
tensor->runtime,
static_cast<uint8_t *>(allocator.getHeapPtr()) +
allocator.heapAlloc(tensor->getBytes())));
obj->copyData(tensor);
}
} else {
if (tensor->hasData()) {
obj->dataMalloc();
obj->copyData(tensor);
}
}
return obj;
}
void GraphObj::freeHeap() { this->allocator.freeHeap(); }
Tensor GraphObj::addTensor(Shape dim, DataType dtype) { Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime)); return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
} }

View File

@ -1,7 +1,6 @@
#include "core/graph_handler.h" #include "core/graph_handler.h"
#include "operators/all_gather.h" #include "operators/all_gather.h"
#include "operators/all_reduce.h" #include "operators/all_reduce.h"
#include "operators/attention_kvcache.h"
#include "operators/batch_norm.h" #include "operators/batch_norm.h"
#include "operators/broadcast.h" #include "operators/broadcast.h"
#include "operators/concat.h" #include "operators/concat.h"
@ -9,27 +8,17 @@
#include "operators/element_wise.h" #include "operators/element_wise.h"
#include "operators/expand.h" #include "operators/expand.h"
#include "operators/gather.h" #include "operators/gather.h"
#include "operators/layer_norm.h"
#include "operators/lrn.h"
#include "operators/matmul.h" #include "operators/matmul.h"
#include "operators/pad.h" #include "operators/pad.h"
#include "operators/pooling.h" #include "operators/pooling.h"
#include "operators/recv.h" #include "operators/reduce_mean.h"
#include "operators/reduce.h"
#include "operators/reshape.h" #include "operators/reshape.h"
#include "operators/resize.h"
#include "operators/rope.h"
#include "operators/send.h"
#include "operators/slice.h" #include "operators/slice.h"
#include "operators/softmax.h" #include "operators/softmax.h"
#include "operators/split.h" #include "operators/split.h"
#include "operators/squeeze.h"
#include "operators/transpose.h" #include "operators/transpose.h"
#include "operators/unary.h" #include "operators/unary.h"
#include "operators/unsqueeze.h"
#include "operators/where.h" #include "operators/where.h"
#include <numeric>
#include <variant>
namespace infini { namespace infini {
@ -105,23 +94,6 @@ Tensor GraphHandlerObj::batchNormalization(Tensor input, Tensor output,
} }
} }
Tensor GraphHandlerObj::layerNormalization(Tensor input, Tensor scale,
Tensor output, Tensor bias,
float eps, int axis,
int stash_type) {
if (output) {
g->addOpWithOutputs<LayerNormObj>(std::move(input), std::move(scale),
output, std::move(bias), eps, axis,
stash_type);
return output;
} else {
return g
->addOp<LayerNormObj>(std::move(input), std::move(scale), output,
std::move(bias), eps, axis, stash_type)
->getOutput();
}
}
Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw, Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh, int sw, int dh, int dw, int ph, int pw, int sh, int sw,
int ceilMode) { int ceilMode) {
@ -182,7 +154,6 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
} \ } \
} }
DEFINE_UNARY_METHOD(silu, Silu)
DEFINE_UNARY_METHOD(relu, Relu) DEFINE_UNARY_METHOD(relu, Relu)
DEFINE_UNARY_METHOD(gelu, Gelu) DEFINE_UNARY_METHOD(gelu, Gelu)
DEFINE_UNARY_METHOD(sigmoid, Sigmoid) DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
@ -259,64 +230,6 @@ Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) {
} }
} }
Tensor GraphHandlerObj::resize(Tensor input, Tensor output,
const std::optional<vector<int>> &axes,
Tensor sizes, Tensor scales, Tensor roi,
vector<uint32_t> sizes_, vector<float> scales_,
vector<float> roi_, string mode,
string ratioPolicy, string nearestMode,
string coordTransMode) {
if (sizes_.size() > 0) {
sizes->dataMalloc();
sizes->copyin<uint32_t>(sizes_);
}
if (scales_.size() > 0) {
scales->dataMalloc();
scales->copyin<float>(scales_);
}
if (roi_.size() > 0) {
roi->dataMalloc();
roi->copyin<float>(roi_);
}
ResizeObj::EKeepAspectRatioPolicy ratioPolicy_ =
ResizeObj::fromRatioPolicyStr(ratioPolicy);
ResizeObj::ENearestMode nearestMode_ =
ResizeObj::fromENearestModeStr(nearestMode);
ResizeObj::ECoordinateTransMode coordTransMode_ =
ResizeObj::fromECoordinateTransModeStr(coordTransMode);
ResizeObj::ECoeffMode mode_ = ResizeObj::fromECoeffModeStr(mode);
if (output) {
if (mode == "nearest") {
g->addOpWithOutputs<ResizeObj>(
std::move(input), output, std::move(axes), std::move(sizes),
std::move(scales), std::move(roi), ratioPolicy_, nearestMode_,
coordTransMode_);
} else {
g->addOpWithOutputs<ResizeObj>(
std::move(input), output, std::move(axes), std::move(sizes),
std::move(scales), std::move(roi), mode_, ratioPolicy_,
coordTransMode_);
}
return output;
} else {
if (mode == "nearest") {
return g
->addOp<ResizeObj>(std::move(input), output, std::move(axes),
std::move(sizes), std::move(scales),
std::move(roi), ratioPolicy_, nearestMode_,
coordTransMode_)
->getOutput();
} else {
return g
->addOp<ResizeObj>(std::move(input), output, std::move(axes),
std::move(sizes), std::move(scales),
std::move(roi), mode_, ratioPolicy_,
coordTransMode_)
->getOutput();
}
}
}
Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) { Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) {
if (output) { if (output) {
g->addOpWithOutputs<ConcatObj>(std::move(inputs), output, dim); g->addOpWithOutputs<ConcatObj>(std::move(inputs), output, dim);
@ -326,61 +239,15 @@ Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) {
} }
} }
Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
Tensor input_v_cache, Tensor input_q,
Tensor input_k, Tensor input_v,
Tensor position_id,
Tensor output_matmul) {
if (output_matmul) {
g->addOpWithOutputs<AttentionKVCacheObj>(
std::move(input_k_cache), std::move(input_v_cache),
std::move(input_q), std::move(input_k), std::move(input_v),
std::move(position_id), output_matmul);
return output_matmul;
} else {
return g
->addOp<AttentionKVCacheObj>(
std::move(input_k_cache), std::move(input_v_cache),
std::move(input_q), std::move(input_k), std::move(input_v),
std::move(position_id), output_matmul)
->getOutput();
}
}
Tensor GraphHandlerObj::RoPE(Tensor pos, Tensor input, Tensor output) {
if (output) {
g->addOpWithOutputs<RoPEObj>(std::move(pos), std::move(input), output);
return output;
} else {
return g->addOp<RoPEObj>(std::move(pos), std::move(input), output)
->getOutput();
}
}
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs, TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
int axis, int axis, int num_outputs) {
std::variant<int, vector<int>> numOrRatio) {
if (outputs) { if (outputs) {
if (std::holds_alternative<int>(numOrRatio)) { g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis, num_outputs);
std::get<int>(numOrRatio));
} else {
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
std::get<vector<int>>(numOrRatio));
}
return *outputs; return *outputs;
} else { } else {
if (std::holds_alternative<int>(numOrRatio)) { return g->addOp<SplitObj>(std::move(input), outputs, axis, num_outputs)
return g ->getOutputs();
->addOp<SplitObj>(std::move(input), outputs, axis,
std::get<int>(numOrRatio))
->getOutputs();
} else {
return g
->addOp<SplitObj>(std::move(input), outputs, axis,
std::get<vector<int>>(numOrRatio))
->getOutputs();
}
} }
} }
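Note: the newer split handler accepts either a number of equal outputs or an explicit ratio via std::variant. A minimal sketch of that dispatch pattern, with illustrative names rather than the handler's real signature:

#include <iostream>
#include <variant>
#include <vector>

using NumOrRatio = std::variant<int, std::vector<int>>;

void describeSplit(const NumOrRatio &numOrRatio) {
    if (std::holds_alternative<int>(numOrRatio)) {
        // Split into N equally sized parts along the axis.
        std::cout << "split into " << std::get<int>(numOrRatio) << " equal parts\n";
    } else {
        // Split according to the given per-segment ratio.
        const auto &ratio = std::get<std::vector<int>>(numOrRatio);
        std::cout << "split by a ratio with " << ratio.size() << " segments\n";
    }
}

// Usage: describeSplit(3); describeSplit(std::vector<int>{1, 2, 1});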
@ -412,23 +279,18 @@ Tensor GraphHandlerObj::gatherElements(Tensor data, Tensor indices,
} }
} }
#define DEFINE_REDUCE_METHOD(name, obj) \ Tensor GraphHandlerObj::reduceMean(Tensor data, Tensor reduced,
Tensor GraphHandlerObj::name(Tensor data, Tensor reduced, \ const optional<vector<int>> &axes,
const optional<vector<int>> &axes, \ bool keepdims) {
bool keepdims) { \ if (reduced) {
if (reduced) { \ g->addOpWithOutputs<ReduceMeanObj>(std::move(data), reduced, axes,
g->addOpWithOutputs<_CAT(obj, Obj)>(std::move(data), reduced, \ keepdims);
axes, keepdims); \ return reduced;
return reduced; \ } else {
} else { \ return g->addOp<ReduceMeanObj>(std::move(data), reduced, axes, keepdims)
return g \ ->getOutput();
->addOp<_CAT(obj, Obj)>(std::move(data), reduced, axes, \
keepdims) \
->getOutput(); \
} \
} }
DEFINE_REDUCE_METHOD(reduceMean, ReduceMean) }
DEFINE_REDUCE_METHOD(reduceSum, ReduceSum)
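Note: DEFINE_REDUCE_METHOD stamps out one handler per reduce variant by pasting the operator name onto the Obj class name. A hedged sketch of the token-pasting trick; the codebase's actual _CAT definition may differ.

// Hypothetical two-level concatenation macro; the extra level forces
// macro arguments to expand before pasting.
#define _CAT_IMPL(a, b) a##b
#define _CAT(a, b) _CAT_IMPL(a, b)

// _CAT(ReduceMean, Obj) -> ReduceMeanObj
// _CAT(ReduceSum, Obj)  -> ReduceSumObj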
Tensor GraphHandlerObj::slice(Tensor input, Tensor output, Tensor GraphHandlerObj::slice(Tensor input, Tensor output,
const vector<int> &starts, const vector<int> &starts,
@ -526,39 +388,6 @@ Tensor GraphHandlerObj::broadcast(Tensor input, Tensor output, int root) {
} }
} }
Tensor GraphHandlerObj::send(Tensor input, int source, int destination,
Tensor output) {
if (output) {
g->addOpWithOutputs<SendObj>(std::move(input), source, destination,
output);
return output;
} else {
return g->addOp<SendObj>(std::move(input), source, destination, output)
->getOutput();
}
}
Tensor GraphHandlerObj::recv(Tensor output, int source, int destination,
Shape dims, int outputType, Tensor input) {
if (output) {
g->addOpWithOutputs<RecvObj>(output, source, destination,
std::move(dims), outputType,
std::move(input));
return output;
} else {
return g
->addOp<RecvObj>(output, source, destination, std::move(dims),
outputType, std::move(input))
->getOutput();
}
}
Tensor GraphHandlerObj::cast(Tensor input, Tensor output, int to) { Tensor GraphHandlerObj::cast(Tensor input, Tensor output, int to) {
if (output) { if (output) {
g->addOpWithOutputs<CastObj>(std::move(input), output, g->addOpWithOutputs<CastObj>(std::move(input), output,
@ -609,41 +438,6 @@ Tensor GraphHandlerObj::depthToSpace(Tensor input, Tensor output, int blocksize,
} }
} }
Tensor GraphHandlerObj::lrn(Tensor input, Tensor output, float alpha,
float beta, float bias, int size) {
if (output) {
g->addOpWithOutputs<LRNObj>(std::move(input), output, alpha, beta, bias,
size);
return output;
} else {
return g
->addOp<LRNObj>(std::move(input), output, alpha, beta, bias, size)
->getOutput();
}
}
Tensor GraphHandlerObj::squeeze(Tensor input, Tensor output, Shape axes) {
if (output) {
g->addOpWithOutputs<SqueezeObj>(std::move(input), output,
std::move(axes));
return output;
} else {
return g->addOp<SqueezeObj>(std::move(input), output, std::move(axes))
->getOutput();
}
}
Tensor GraphHandlerObj::unsqueeze(Tensor input, Tensor output, Shape axes) {
if (output) {
g->addOpWithOutputs<UnsqueezeObj>(std::move(input), output,
std::move(axes));
return output;
} else {
return g->addOp<UnsqueezeObj>(std::move(input), output, std::move(axes))
->getOutput();
}
}
static CastType inferCastType(Tensor input, int to) { static CastType inferCastType(Tensor input, int to) {
auto iType = input->getDType(); auto iType = input->getDType();
auto oType = DataType(to); auto oType = DataType(to);
@ -739,11 +533,4 @@ static DataType dtype_repr_convert(int dtype) {
} }
} }
void GraphHandlerObj::change_shape(const vector<int> &shape, int tensorId) {
auto tensor = g->getTensor(tensorId);
IT_ASSERT(tensor != nullptr);
IT_ASSERT(shape.size() != 0);
tensor->setShape(shape);
}
} // namespace infini } // namespace infini

View File

@ -30,9 +30,6 @@ LazyAllocator::~LazyAllocator() {
if (this->weightPtr != nullptr) { if (this->weightPtr != nullptr) {
runtime->dealloc(this->weightPtr); runtime->dealloc(this->weightPtr);
} }
if (this->memPoolPtr != nullptr) {
runtime->dealloc(this->memPoolPtr);
}
} }
void LazyAllocator::init() { void LazyAllocator::init() {
@ -47,17 +44,6 @@ void LazyAllocator::init() {
this->ptr = nullptr; this->ptr = nullptr;
} }
void LazyAllocator::setMemPool(size_t memPoolSize) {
IT_ASSERT(memPoolSize > 0);
if (!this->hasMemPool) {
this->hasMemPool = true;
this->memPoolSize = memPoolSize;
this->memPoolPtr = runtime->alloc(memPoolSize);
}
}
bool LazyAllocator::getMemPoolStatus() { return this->hasMemPool; }
size_t LazyAllocator::alloc(size_t size) { size_t LazyAllocator::alloc(size_t size) {
// pad the size to the multiple of alignment // pad the size to the multiple of alignment
size = this->getAlignedSize(size); size = this->getAlignedSize(size);
@ -116,17 +102,6 @@ size_t LazyAllocator::allocWeight(size_t size) {
return retAddr; return retAddr;
} }
size_t LazyAllocator::heapAlloc(size_t size) {
size = this->getAlignedSize(size);
this->heapPeak += size;
IT_ASSERT(this->memPoolSize >=
this->weightPeak + this->peak + this->heapPeak);
size_t retAddr = this->memPoolSize - this->heapPeak;
return retAddr;
}
void LazyAllocator::freeHeap() { this->heapPeak = 0; }
void LazyAllocator::free(size_t addr, size_t size) { void LazyAllocator::free(size_t addr, size_t size) {
IT_ASSERT(this->ptr == nullptr); IT_ASSERT(this->ptr == nullptr);
size = getAlignedSize(size); size = getAlignedSize(size);
@ -168,40 +143,25 @@ void LazyAllocator::free(size_t addr, size_t size) {
} }
void *LazyAllocator::getPtr() { void *LazyAllocator::getPtr() {
if (!hasMemPool) { if (this->ptr == nullptr) {
if (this->ptr == nullptr) { this->ptr = runtime->alloc(this->peak);
this->ptr = runtime->alloc(this->peak); // #ifdef DEBUG_MODE
// #ifdef DEBUG_MODE // printf("LazyAllocator really alloc non-weight: %p %lu
// printf("LazyAllocator really alloc non-weight: %p %lu // bytes\n", this->ptr, peak);
// bytes\n", this->ptr, peak); // #endif
// #endif
}
return this->ptr;
} else {
IT_ASSERT(this->memPoolSize >= this->weightPeak + this->peak);
return static_cast<uint8_t *>(this->memPoolPtr) + weightPeak;
} }
return this->ptr;
} }
void *LazyAllocator::getWeightPtr() { void *LazyAllocator::getWeightPtr() {
if (!hasMemPool) { if (this->weightPtr == nullptr) {
if (this->weightPtr == nullptr) { this->weightPtr = runtime->alloc(this->weightPeak);
this->weightPtr = runtime->alloc(this->weightPeak); // #ifdef DEBUG_MODE
// #ifdef DEBUG_MODE // printf("LazyAllocator really alloc weight: %p %lu bytes\n",
// printf("LazyAllocator really alloc weight: %p %lu // this->weightPtr, weightPeak);
// bytes\n", // #endif
// this->weightPtr, weightPeak);
// #endif
}
return this->weightPtr;
} else {
return this->memPoolPtr;
} }
} return this->weightPtr;
void *LazyAllocator::getHeapPtr() {
IT_ASSERT(hasMemPool);
return this->memPoolPtr;
} }
size_t LazyAllocator::getAlignedSize(size_t size) { size_t LazyAllocator::getAlignedSize(size_t size) {
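Note: getAlignedSize pads every request up to the allocator's alignment before any bookkeeping. A typical rounding implementation looks like the sketch below; passing alignment as a parameter is an assumption, the real code reads it from a member field.

#include <cstddef>

// Round size up to the next multiple of alignment (alignment > 0).
size_t getAlignedSize(size_t size, size_t alignment) {
    return ((size + alignment - 1) / alignment) * alignment;
}

// Example: getAlignedSize(100, 64) == 128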

View File

@ -6,10 +6,8 @@ namespace infini {
OperatorObj::OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs) OperatorObj::OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs)
: type(opType), inputs(inputs), outputs(outputs) { : type(opType), inputs(inputs), outputs(outputs) {
if (opType != OpType::Recv) { for (const auto &t : inputs)
for (const auto &t : inputs) IT_ASSERT(t);
IT_ASSERT(t);
}
} }
void OperatorObj::removePredecessors(const Operator &op) { void OperatorObj::removePredecessors(const Operator &op) {
@ -79,7 +77,9 @@ bool OperatorObj::checkValid(GraphObj *graph) {
return true; return true;
} }
optional<vector<Shape>> OperatorObj::inferShape() { return inferShape(inputs); } optional<vector<Shape>> OperatorObj::inferShape() const {
return inferShape(inputs);
}
vector<DataType> OperatorObj::inferDataType(const TensorVec &inputs) const { vector<DataType> OperatorObj::inferDataType(const TensorVec &inputs) const {
auto dataType = inputs[0]->getDType(); auto dataType = inputs[0]->getDType();

View File

@ -17,7 +17,8 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
std::map<OpType, int> opCnt; std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) { for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()}; auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs); Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()}; auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey); auto perfData = perfEngine.getPerfData(perfKey);
@ -65,7 +66,8 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
std::map<OpType, int> opCnt; std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) { for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()}; auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs); Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()}; auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey); auto perfData = perfEngine.getPerfData(perfKey);
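Note: these hunks change kernel lookup to key on the operator's data type in addition to device and op type. A simplified sketch of a registry keyed that way; the types are hypothetical and the project's KernelRegistry and KernelAttrs differ.

#include <functional>
#include <map>
#include <stdexcept>
#include <tuple>

enum class Device { CPU, CUDA, BANG };

// Hypothetical key: (device, op type id, data type id).
using KernelKey = std::tuple<Device, int, int>;
using KernelFn = std::function<void()>;

class SimpleKernelRegistry {
    std::map<KernelKey, KernelFn> table;

  public:
    void registerKernel(KernelKey key, KernelFn fn) { table[key] = std::move(fn); }

    const KernelFn &getKernel(const KernelKey &key) const {
        auto it = table.find(key);
        if (it == table.end())
            throw std::runtime_error("no kernel registered for this key");
        return it->second;
    }
};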

View File

@ -59,13 +59,6 @@ Shape TensorObj::getStride() const {
return stride; return stride;
} }
void TensorObj::setShape(Shape shape_) {
shape = shape_;
size_t size = std::accumulate(shape.begin(), shape.end(), 1,
[](auto acc, auto x) { return acc * x; });
_size = size;
}
void TensorObj::printData() const { void TensorObj::printData() const {
IT_ASSERT(data != nullptr); IT_ASSERT(data != nullptr);
if (!runtime->isCpu()) if (!runtime->isCpu())

View File

@ -25,7 +25,8 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
auto &perfEngine = PerfEngine::getInstance(); auto &perfEngine = PerfEngine::getInstance();
for (auto &op : graph->getOperators()) { for (auto &op : graph->getOperators()) {
// HACK: set correct data type // HACK: set correct data type
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()}; auto kernelAttrs =
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs); Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()}; auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey); auto perfData = perfEngine.getPerfData(perfKey);
@ -47,7 +48,8 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
std::map<OpType, int> opCnt; std::map<OpType, int> opCnt;
for (auto &op : graph->getOperators()) { for (auto &op : graph->getOperators()) {
// HACK: set correct data type // HACK: set correct data type
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()}; auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying(),
DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs); Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()}; auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey); auto perfData = perfEngine.getPerfData(perfKey);

View File

@ -1,6 +1,4 @@
#include "core/data_type.h"
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
#include <cstdio> #include <cstdio>
__global__ void cudaPrintFloatImpl(float *x, int len) { __global__ void cudaPrintFloatImpl(float *x, int len) {
@ -20,55 +18,4 @@ void cudaPrintFloat(float *x, int len) {
cudaDeviceSynchronize(); cudaDeviceSynchronize();
} }
void cudaPrintTensor(const Tensor &tensor) {
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
}
cudnnDataType_t cudnnDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return CUDNN_DATA_FLOAT;
}
if (dataType == DataType::Double) {
return CUDNN_DATA_DOUBLE;
}
if (dataType == DataType::Float16) {
return CUDNN_DATA_HALF;
}
if (dataType == DataType::Int8) {
return CUDNN_DATA_INT8;
}
if (dataType == DataType::Int32) {
return CUDNN_DATA_INT32;
}
if (dataType == DataType::UInt8) {
return CUDNN_DATA_UINT8;
}
if (dataType == DataType::BFloat16) {
return CUDNN_DATA_BFLOAT16;
}
if (dataType == DataType::Int64) {
return CUDNN_DATA_INT64;
}
if (dataType == DataType::Bool) {
return CUDNN_DATA_BOOLEAN;
}
IT_ASSERT(false, "Unsupported data type");
}
cudaDataType cublasDataTypeConvert(DataType dataType) {
switch (dataType.getIndex()) {
case 1:
return CUDA_R_32F;
// case 3:
// return CUDA_R_8I;
case 10:
return CUDA_R_16F;
case 11:
return CUDA_R_64F;
// case 16:
// return CUDA_R_16BF;
default:
IT_ASSERT(false, "MatMul Unsupported data type");
}
}
} // namespace infini } // namespace infini

View File

@ -5,17 +5,14 @@
#include "operators/conv.h" #include "operators/conv.h"
#include "operators/expand.h" #include "operators/expand.h"
#include "operators/gather.h" #include "operators/gather.h"
#include "operators/lrn.h"
#include "operators/matmul.h" #include "operators/matmul.h"
#include "operators/pad.h" #include "operators/pad.h"
#include "operators/pooling.h" #include "operators/pooling.h"
#include "operators/reduce.h" #include "operators/reduce_mean.h"
#include "operators/reshape.h" #include "operators/reshape.h"
#include "operators/split.h" #include "operators/split.h"
#include "operators/squeeze.h"
#include "operators/transpose.h" #include "operators/transpose.h"
#include "operators/unary.h" #include "operators/unary.h"
#include "operators/unsqueeze.h"
#include <algorithm> #include <algorithm>
#include <pybind11/numpy.h> #include <pybind11/numpy.h>
#include <pybind11/pybind11.h> #include <pybind11/pybind11.h>
@ -93,10 +90,7 @@ void export_values(py::module &m) {
.VALUE(OpType, Gather) .VALUE(OpType, Gather)
.VALUE(OpType, GatherElements) .VALUE(OpType, GatherElements)
.VALUE(OpType, ReduceMean) .VALUE(OpType, ReduceMean)
.VALUE(OpType, ReduceSum)
.VALUE(OpType, Reshape) .VALUE(OpType, Reshape)
.VALUE(OpType, Squeeze)
.VALUE(OpType, Unsqueeze)
.VALUE(OpType, Flatten) .VALUE(OpType, Flatten)
.VALUE(OpType, Identity) .VALUE(OpType, Identity)
.VALUE(OpType, BatchNormalization) .VALUE(OpType, BatchNormalization)
@ -118,7 +112,6 @@ void export_values(py::module &m) {
.VALUE(OpType, Erf) .VALUE(OpType, Erf)
.VALUE(OpType, Where) .VALUE(OpType, Where)
.VALUE(OpType, DepthToSpace) .VALUE(OpType, DepthToSpace)
.VALUE(OpType, LRN)
.export_values(); .export_values();
#undef VALUE #undef VALUE
@ -226,13 +219,12 @@ clip_attrs_of(Operator op) {
return std::make_tuple(clip->getMin(), clip->getMax()); return std::make_tuple(clip->getMin(), clip->getMax());
} }
static std::tuple<vector<int>, bool> reduce_attrs_of(Operator op) { static std::tuple<vector<int>, bool> reduce_mean_attrs_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::ReduceMean || IT_ASSERT(op->getOpType() == OpType::ReduceMean);
op->getOpType() == OpType::ReduceSum); auto reduce_mean = dynamic_cast<const ReduceMeanObj *>(op.get());
auto reduce = dynamic_cast<const ReduceBaseObj *>(op.get()); auto &set = reduce_mean->getAxes();
auto &set = reduce->getAxes();
return std::make_tuple(vector(set.begin(), set.end()), return std::make_tuple(vector(set.begin(), set.end()),
reduce->getKeepDims()); reduce_mean->getKeepDims());
} }
static int concat_axis_of(Operator op) { static int concat_axis_of(Operator op) {
@ -260,24 +252,6 @@ static vector<int64_t> reshape_shape_of(Operator op) {
return ans; return ans;
} }
static vector<int64_t> squeeze_axes_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Squeeze);
auto axes = dynamic_cast<const SqueezeObj *>(op.get())->getAxes();
vector<int64_t> ans(axes.size());
std::transform(axes.begin(), axes.end(), ans.begin(),
[](auto x) { return static_cast<int64_t>(x); });
return ans;
}
static vector<int64_t> unsqueeze_axes_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Unsqueeze);
auto axes = dynamic_cast<const UnsqueezeObj *>(op.get())->getAxes();
vector<int64_t> ans(axes.size());
std::transform(axes.begin(), axes.end(), ans.begin(),
[](auto x) { return static_cast<int64_t>(x); });
return ans;
}
static vector<int64_t> expand_shape_of(Operator op) { static vector<int64_t> expand_shape_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Expand); IT_ASSERT(op->getOpType() == OpType::Expand);
auto shape = dynamic_cast<const ExpandObj *>(op.get())->getShape(); auto shape = dynamic_cast<const ExpandObj *>(op.get())->getShape();
@ -320,14 +294,6 @@ static std::tuple<int, std::string> depth_to_space_attrs_of(Operator op) {
depth_to_space->getModeString()); depth_to_space->getModeString());
} }
static std::tuple<float, float, float, int> lrn_attrs_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::LRN);
auto lrn = dynamic_cast<const LRNObj *>(op.get());
auto [alpha, beta, bias] = lrn->getAlphaBetaBias();
auto size = lrn->getSize();
return std::make_tuple(alpha, beta, bias, size);
}
void export_functions(py::module &m) { void export_functions(py::module &m) {
#define FUNCTION(NAME) def(#NAME, &NAME) #define FUNCTION(NAME) def(#NAME, &NAME)
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance) m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
@ -353,7 +319,7 @@ void export_functions(py::module &m) {
.FUNCTION(batch_norm_attrs_of) .FUNCTION(batch_norm_attrs_of)
.FUNCTION(pool_attrs_of) .FUNCTION(pool_attrs_of)
.FUNCTION(clip_attrs_of) .FUNCTION(clip_attrs_of)
.FUNCTION(reduce_attrs_of) .FUNCTION(reduce_mean_attrs_of)
.FUNCTION(tensor_dtype) .FUNCTION(tensor_dtype)
.FUNCTION(reshape_shape_of) .FUNCTION(reshape_shape_of)
.FUNCTION(expand_shape_of) .FUNCTION(expand_shape_of)
@ -364,10 +330,7 @@ void export_functions(py::module &m) {
.FUNCTION(gather_axis_of) .FUNCTION(gather_axis_of)
.FUNCTION(flatten_axis_of) .FUNCTION(flatten_axis_of)
.FUNCTION(cast_to_of) .FUNCTION(cast_to_of)
.FUNCTION(depth_to_space_attrs_of) .FUNCTION(depth_to_space_attrs_of);
.FUNCTION(squeeze_axes_of)
.FUNCTION(unsqueeze_axes_of)
.FUNCTION(lrn_attrs_of);
#undef FUNCTION #undef FUNCTION
} }
@ -423,9 +386,7 @@ void init_graph_builder(py::module &m) {
#endif #endif
#ifdef USE_BANG #ifdef USE_BANG
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>( py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
m, "BangRuntime") m, "BangRuntime");
.def(py::init<int>(), py::arg("device") = 0)
.def("init_comm", &BangRuntimeObj::initComm);
#endif #endif
#ifdef USE_KUNLUN #ifdef USE_KUNLUN
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>( py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
@ -485,10 +446,7 @@ void init_graph_builder(py::module &m) {
}) })
.def("has_target", &TensorObj::hasTarget, policy::automatic) .def("has_target", &TensorObj::hasTarget, policy::automatic)
.def("src", &TensorObj::getSource, policy::move) .def("src", &TensorObj::getSource, policy::move)
.def("printData", &TensorObj::printData, policy::automatic) .def("printData", &TensorObj::printData, policy::automatic);
.def("copy_data",
py::overload_cast<const Tensor &>(&TensorObj::copyData),
policy::move);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator") py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
.def("op_type", &OperatorObj::getOpType, policy::automatic) .def("op_type", &OperatorObj::getOpType, policy::automatic)
.def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_), .def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
@ -503,19 +461,16 @@ void init_graph_builder(py::module &m) {
.def("convTransposed2d", &Handler::convTransposed2d, policy::move) .def("convTransposed2d", &Handler::convTransposed2d, policy::move)
.def("matmul", &Handler::matmul, policy::move) .def("matmul", &Handler::matmul, policy::move)
.def("batchNormalization", &Handler::batchNormalization, policy::move) .def("batchNormalization", &Handler::batchNormalization, policy::move)
.def("layerNormalization", &Handler::layerNormalization, policy::move)
.def("maxPool", &Handler::maxPool, policy::move) .def("maxPool", &Handler::maxPool, policy::move)
.def("avgPool", &Handler::avgPool, policy::move) .def("avgPool", &Handler::avgPool, policy::move)
.def("add", &Handler::add, policy::move) .def("add", &Handler::add, policy::move)
.def("sub", &Handler::sub, policy::move) .def("sub", &Handler::sub, policy::move)
.def("mul", &Handler::mul, policy::move) .def("mul", &Handler::mul, policy::move)
.def("max", &Handler::max, policy::move)
.def("div", &Handler::div, policy::move) .def("div", &Handler::div, policy::move)
.def("pow", &Handler::pow, policy::move) .def("pow", &Handler::pow, policy::move)
.def("min", &Handler::min, policy::move) .def("min", &Handler::min, policy::move)
.def("max", &Handler::max, policy::move) .def("max", &Handler::max, policy::move)
.def("relu", &Handler::relu, policy::move) .def("relu", &Handler::relu, policy::move)
.def("silu", &Handler::silu, policy::move)
.def("gelu", &Handler::gelu, policy::move) .def("gelu", &Handler::gelu, policy::move)
.def("sigmoid", &Handler::sigmoid, policy::move) .def("sigmoid", &Handler::sigmoid, policy::move)
.def("tanh", &Handler::tanh, policy::move) .def("tanh", &Handler::tanh, policy::move)
@ -533,17 +488,11 @@ void init_graph_builder(py::module &m) {
.def("transpose", &Handler::transpose, policy::move) .def("transpose", &Handler::transpose, policy::move)
.def("depthToSpace", &Handler::depthToSpace, policy::move) .def("depthToSpace", &Handler::depthToSpace, policy::move)
.def("reshape", &Handler::reshape, policy::move) .def("reshape", &Handler::reshape, policy::move)
.def("resize", &Handler::resize, policy::move)
.def("squeeze", &Handler::squeeze, policy::move)
.def("unsqueeze", &Handler::unsqueeze, policy::move)
.def("concat", &Handler::concat, policy::move) .def("concat", &Handler::concat, policy::move)
.def("attentionKVCache", &Handler::attentionKVCache, policy::move)
.def("RoPE", &Handler::RoPE, policy::move)
.def("split", &Handler::split, policy::move) .def("split", &Handler::split, policy::move)
.def("gather", &Handler::gather, policy::move) .def("gather", &Handler::gather, policy::move)
.def("gatherElements", &Handler::gatherElements, policy::move) .def("gatherElements", &Handler::gatherElements, policy::move)
.def("reduceMean", &Handler::reduceMean, policy::move) .def("reduce_mean", &Handler::reduceMean, policy::move)
.def("reduceSum", &Handler::reduceSum, policy::move)
.def("slice", &Handler::slice, policy::move) .def("slice", &Handler::slice, policy::move)
.def("pad", &Handler::pad, policy::move) .def("pad", &Handler::pad, policy::move)
.def("allReduceSum", &Handler::allReduceSum, policy::move) .def("allReduceSum", &Handler::allReduceSum, policy::move)
@ -553,27 +502,17 @@ void init_graph_builder(py::module &m) {
.def("allReduceAvg", &Handler::allReduceAvg, policy::move) .def("allReduceAvg", &Handler::allReduceAvg, policy::move)
.def("allGather", &Handler::allGather, policy::move) .def("allGather", &Handler::allGather, policy::move)
.def("broadcast", &Handler::broadcast, policy::move) .def("broadcast", &Handler::broadcast, policy::move)
.def("send", &Handler::send, policy::move)
.def("recv", &Handler::recv, policy::move)
.def("cast", &Handler::cast, policy::move) .def("cast", &Handler::cast, policy::move)
.def("expand", &Handler::expand, policy::move) .def("expand", &Handler::expand, policy::move)
.def("erf", &Handler::erf, policy::move) .def("erf", &Handler::erf, policy::move)
.def("where", &Handler::where, policy::move) .def("where", &Handler::where, policy::move)
.def("lrn", &Handler::lrn, policy::move)
.def("topo_sort", &Handler::topo_sort, policy::automatic) .def("topo_sort", &Handler::topo_sort, policy::automatic)
.def("optimize", &Handler::optimize, policy::automatic) .def("optimize", &Handler::optimize, policy::automatic)
.def("operators", &Handler::operators, policy::move) .def("operators", &Handler::operators, policy::move)
.def("data_malloc", &Handler::data_malloc, .def("data_malloc", &Handler::data_malloc, policy::automatic)
py::arg("useNaiveAllocator") = false, py::arg("memPoolSize") = 0,
policy::automatic)
.def("clone_KV", &Handler::clone_KV, policy::move)
.def("free_heap", &Handler::free_heap, policy::move)
.def("get_perf_time", &Handler::get_perf_time, policy::automatic) .def("get_perf_time", &Handler::get_perf_time, policy::automatic)
.def("tune", &Handler::tune, policy::automatic) .def("tune", &Handler::tune, policy::automatic)
.def("run", &Handler::run, policy::automatic) .def("run", &Handler::run, policy::automatic)
.def("shape_infer", &Handler::shape_infer, policy::automatic)
.def("change_shape", &Handler::change_shape, policy::automatic)
.def("getDims", &Handler::getDims, policy::automatic)
.def("get_perf_time", &Handler::get_perf_time, policy::automatic); .def("get_perf_time", &Handler::get_perf_time, policy::automatic);
} }
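Note: the removed copy_data binding uses py::overload_cast to select one overload of TensorObj::copyData for Python. A minimal, self-contained pybind11 sketch of that pattern with a hypothetical class, not the project's bindings:

#include <pybind11/pybind11.h>
namespace py = pybind11;

// Hypothetical class with two overloads; overload_cast picks which one to bind.
struct Buffer {
    void fill(int /*value*/) {}
    void fill(double /*value*/) {}
};

PYBIND11_MODULE(example, m) {
    py::class_<Buffer>(m, "Buffer")
        .def(py::init<>())
        .def("fill_int", py::overload_cast<int>(&Buffer::fill))
        .def("fill_float", py::overload_cast<double>(&Buffer::fill));
}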

View File

@ -2,17 +2,12 @@
#include "bang/bang_runtime.h" #include "bang/bang_runtime.h"
#include "operators/softmax.h" #include "operators/softmax.h"
#include "operators/unary.h" #include "operators/unary.h"
#include <iostream>
namespace infini { namespace infini {
class UnaryCnnl : public BangKernelWithoutConfig { class UnaryCnnl : public BangKernelWithoutConfig {
virtual cnnlActivationMode_t getOpType() const = 0; virtual cnnlActivationMode_t getOpType() const = 0;
virtual float getCoef() const = 0; virtual float getCoef() const = 0;
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; } virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
virtual float getSlicedDim() const { return 0.0; }
virtual float getGamma() const { return 0.0; }
virtual float getScale() const { return 0.0; }
void compute(const Operator &_op, void compute(const Operator &_op,
const RuntimeObj *_context) const override { const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op); auto op = as<UnaryObj>(_op);
@ -26,19 +21,17 @@ class UnaryCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims(); auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, aDim.size(),
aDim.size(), aDim.data())); aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, cDim.size(),
cDim.size(), cDim.data())); cDim.data()));
cnnlActivationDescriptor_t opDesc; cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc)); checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
checkCnnlError(cnnlSetActivationDescriptor_v5( checkCnnlError(cnnlSetActivationDescriptor(
opDesc, getOpType(), CNNL_ACTIVATION_HIGH_PRECISION, opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));
CNNL_NOT_PROPAGATE_NAN, getCoef(), getSlicedDim(), getGamma(),
getScale(), true));
auto [alpha, beta] = getAlphBeta(); auto [alpha, beta] = getAlphBeta();
cnnlStatus_t stat = cnnlStatus_t stat =
@ -66,13 +59,13 @@ class RoundCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims(); auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, aDim.size(),
aDim.size(), aDim.data())); aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, cDim.size(),
cDim.size(), cDim.data())); cDim.data()));
cnnlStatus_t stat = cnnlStatus_t stat =
cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData); cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS) if (stat != CNNL_STATUS_SUCCESS)
@ -97,22 +90,18 @@ class PReluCnnl : public BangKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims(); auto bDim = op->getInputs(1)->getDims();
auto cDim = op->getOutput()->getDims(); auto cDim = op->getOutput()->getDims();
if (auto alignSize = aDim.size() - bDim.size(); alignSize) {
bDim.insert(bDim.begin(), alignSize, 1);
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, aDim.size(),
aDim.size(), aDim.data())); aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, bDim.size(),
bDim.size(), bDim.data())); bDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, cDim.size(),
cDim.size(), cDim.data())); cDim.data()));
cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData, cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
bDesc, bData, cDesc, cData); bDesc, bData, cDesc, cData);
@ -142,68 +131,48 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
std::vector<int> inDim = {1, 1, 1}; std::vector<int> inDim = {1, 1, 1};
std::vector<int> outDim = inDim; std::vector<int> outDim = inDim;
if (aDim.size() >= 3) { if (axis == 0) {
if (axis == 0) {
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
inDim[0] = aDim[0];
inDim[1] = aDim[1];
for (size_t i = 2; i < aDim.size(); ++i) {
inDim[2] *= aDim[i];
}
outDim = inDim;
} else if (axis == aDim.size() - 1) {
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
inDim[0] = aDim[0];
for (size_t i = 1; i < axis; ++i) {
inDim[1] *= aDim[i];
}
inDim[2] = aDim[axis];
outDim = inDim;
} else {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
for (size_t i = 0; i < axis; ++i) {
inDim[0] *= aDim[i];
}
inDim[1] = aDim[axis];
for (size_t i = axis + 1; i < aDim.size(); ++i) {
inDim[2] *= aDim[i];
}
outDim = inDim;
}
} else if (aDim.size() == 2) {
if (axis == 0) {
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
inDim = aDim;
inDim.push_back(1);
outDim = inDim;
} else {
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
inDim = aDim;
inDim.insert(inDim.begin(), 1);
outDim = inDim;
}
} else {
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION; mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
inDim = aDim; inDim[0] = aDim[0];
inDim.push_back(1); inDim[1] = aDim[1];
inDim.push_back(1); for (size_t i = 2; i < aDim.size(); ++i) {
inDim[2] *= aDim[i];
}
outDim = inDim;
} else if (axis == aDim.size() - 1) {
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
inDim[0] = aDim[0];
for (size_t i = 1; i < axis; ++i) {
inDim[1] *= aDim[i];
}
inDim[2] = aDim[axis];
outDim = inDim;
} else {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
for (size_t i = 0; i < axis; ++i) {
inDim[0] *= aDim[i];
}
inDim[1] = aDim[axis];
for (size_t i = axis + 1; i < aDim.size(); ++i) {
inDim[2] *= aDim[i];
}
outDim = inDim; outDim = inDim;
} }
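Note: both versions collapse the input into a 3-D {outer, axis, inner} view so cnnlSoftmaxForward_v2 can treat the softmax axis uniformly regardless of rank. A standalone sketch of that collapse; the helper is illustrative, not the kernel's code.

#include <array>
#include <cstddef>
#include <vector>

// Collapse dims into {outer, axis, inner} around the softmax axis.
std::array<int, 3> collapseAroundAxis(const std::vector<int> &dims, size_t axis) {
    std::array<int, 3> out = {1, 1, 1};
    for (size_t i = 0; i < axis; ++i)
        out[0] *= dims[i];       // everything before the axis
    out[1] = dims[axis];         // the softmax axis itself
    for (size_t i = axis + 1; i < dims.size(); ++i)
        out[2] *= dims[i];       // everything after the axis
    return out;
}

// Example: collapseAroundAxis({2, 3, 4, 5}, 2) == {6, 4, 5}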
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, inDim.size(),
inDim.size(), inDim.data())); inDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, outDim.size(),
outDim.size(), outDim.data())); outDim.data()));
float alpha = 1.0; float alpha = 1.0;
float beta = 0.0; float beta = 0.0;
cnnlStatus_t stat = cnnlStatus_t stat =
cnnlSoftmaxForward_v2(context->cnnlHandle(), CNNL_SOFTMAX_ACCURATE, cnnlSoftmaxForward_v2(context->cnnlHandle(), CNNL_SOFTMAX_ACCURATE,
mode, CNNL_COMPUTATION_ULTRAHIGH_PRECISION, mode, CNNL_COMPUTATION_HIGH_PRECISION, &alpha,
&alpha, aDesc, aData, &beta, cDesc, cData); aDesc, aData, &beta, cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS) if (stat != CNNL_STATUS_SUCCESS)
return; return;
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
@ -225,32 +194,15 @@ class SigmoidCnnl : public UnaryCnnl {
float getCoef() const override { return 0.0; } float getCoef() const override { return 0.0; }
}; };
class HardSwishCnnl : public UnaryCnnl { REGISTER_KERNEL(Device::BANG, OpType::Relu, DataType::Float32, ReluCnnl,
cnnlActivationMode_t getOpType() const override { "Relu_cnnl_BANG_Float32");
return CNNL_ACTIVATION_HARDSWISH; REGISTER_KERNEL(Device::BANG, OpType::PRelu, DataType::Float32, PReluCnnl,
} "PRelu_cnnl_BANG_Float32");
float getCoef() const override { return 0.0; } REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl,
}; "Sigmoid_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl,
class HardSigmoidCnnl : public UnaryCnnl { "Round_cnnl_BANG_Float32");
cnnlActivationMode_t getOpType() const override { REGISTER_KERNEL(Device::BANG, OpType::Softmax, DataType::Float32, SoftmaxCnnl,
return CNNL_ACTIVATION_HARDSIGMOID; "Softmax_cnnl_BANG_Float32");
}
float getCoef() const override { return 0.0; }
float getGamma() const override { return 1.f / 6.f; }
float getScale() const override { return 0.5f; }
};
REGISTER_KERNEL(Device::BANG, OpType::Relu, ReluCnnl, "Relu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
"Sigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
"Softmax_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
"HardSigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
"HardSwish_cnnl_BANG");
}; // namespace infini }; // namespace infini
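Note: REGISTER_KERNEL now takes (device, op type, kernel class, name), with the data type resolved at lookup time rather than baked into the registration. Macros like this are commonly implemented by defining a file-scope object whose constructor performs the insert at load time; the sketch below shows that idea and is not the project's actual macro.

#include <memory>
#include <string>
#include <unordered_map>

struct Kernel {
    virtual ~Kernel() = default;
};

// Hypothetical global table; returned by reference so it is constructed on first use.
inline std::unordered_map<std::string, std::unique_ptr<Kernel>> &kernelTable() {
    static std::unordered_map<std::string, std::unique_ptr<Kernel>> table;
    return table;
}

// Constructing a static registrar inserts the kernel before main() runs.
struct KernelRegistrar {
    KernelRegistrar(const std::string &name, std::unique_ptr<Kernel> kernel) {
        kernelTable().emplace(name, std::move(kernel));
    }
};

#define MY_REGISTER_KERNEL(KernelClass, name)                                  \
    static KernelRegistrar registrar_##KernelClass{                           \
        name, std::make_unique<KernelClass>()}

// Usage (assuming ReluCnnl derives from Kernel): MY_REGISTER_KERNEL(ReluCnnl, "Relu_cnnl_BANG");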

View File

@ -24,21 +24,21 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
auto diffxDim = op->getOutput()->getDims(); auto diffxDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&yDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
yDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, yDim.size(),
yDim.size(), yDim.data())); yDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(
diffYDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), diffYDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffyDim.size(),
diffyDim.size(), diffyDim.data())); diffyDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&xDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
xDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), CNNL_DTYPE_FLOAT, xDim.size(),
xDim.size(), xDim.data())); xDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc)); checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
checkCnnlError(cnnlSetTensorDescriptor( checkCnnlError(cnnlSetTensorDescriptor(
diffXDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), diffXDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffxDim.size(),
diffxDim.size(), diffxDim.data())); diffxDim.data()));
// get op descriptor // get op descriptor
cnnlActivationDescriptor_t opDesc; cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc)); checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
@ -81,11 +81,11 @@ class TanhBackwardCnnl : public ActivationBackwardCnnl {
float getCoef() const override { return 0.0; } float getCoef() const override { return 0.0; }
}; };
REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, ReluBackwardCnnl, REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, DataType::Float32,
"ReluBackward_cnnl_BANG"); ReluBackwardCnnl, "ReluBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, SigmoidBackwardCnnl, REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, DataType::Float32,
"SigmoidBackward_cnnl_BANG"); SigmoidBackwardCnnl, "SigmoidBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, TanhBackwardCnnl, REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, DataType::Float32,
"TanhBackward_cnnl_BANG"); TanhBackwardCnnl, "TanhBackward_cnnl_BANG_Float32");
}; // namespace infini }; // namespace infini

View File

@ -1,49 +0,0 @@
#ifdef INFINI_USE_CNCL
#include "operators/all_gather.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "bang/cncl_communicator.h"
#include <thread>
namespace infini {
class AllGatherCNCL : public BangKernelWithoutConfig {
public:
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<AllGatherObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
int world_size = op->getWorldSize();
// Check if world size info in operator matches runtime
IT_ASSERT(world_size == context->getCommunicator().getWorldSize());
void *input = op->getInputs(0)->getRawDataPtr<void *>();
BangPtr output_temp =
context->getWorkspace(op->getInputs(0)->getBytes() * world_size);
// void *output = op->getOutput()->getRawDataPtr<void *>();
checkBangError(cnrtMalloc(&output_temp,
op->getInputs(0)->getBytes() * world_size));
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
CNCL_CHECK(cnclAllGather(input, output_temp, count,
cnclDataTypeConvert(op->getDType()), comm,
queue));
checkBangError(cnrtQueueSync(queue));
for (int i = 0; i < world_size; ++i) {
Tensor output = op->getOutput(i);
context->copyBlobInsideRuntime(
output->getRawDataPtr<float *>(),
static_cast<float *>(output_temp) + i * count, bytes);
}
checkBangError(cnrtFree(output_temp));
}
};
REGISTER_KERNEL(Device::BANG, OpType::AllGather, AllGatherCNCL,
"AllGather_CNCL_BANG");
} // namespace infini
#endif

View File

@ -1,54 +0,0 @@
#ifdef INFINI_USE_CNCL
#include "operators/all_reduce.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "bang/cncl_communicator.h"
#include <thread>
namespace infini {
class AllReduceCNCL : public BangKernelWithoutConfig {
public:
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<AllReduceBaseObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// checkBangError(cnrtQueueSync(queue));
CNCL_CHECK(cnclAllReduce(input, output, count,
cnclDataTypeConvert(op->getDType()),
getRedOp(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
virtual cnclReduceOp_t getRedOp() const = 0;
};
class AllReduceSumCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclSum; }
};
class AllReduceProdCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclProd; }
};
class AllReduceMinCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclMin; }
};
class AllReduceMaxCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclMax; }
};
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, AllReduceSumCNCL,
"AllReduce_Sum_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, AllReduceProdCNCL,
"AllReduce_Prod_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, AllReduceMinCNCL,
"AllReduce_Min_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, AllReduceMaxCNCL,
"AllReduce_Max_CNCL_BANG");
} // namespace infini
#endif
View File
@@ -17,91 +17,55 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
         void *const output = (op->getOutput()->getRawDataPtr<void *>());
         auto dims = op->getInputs(0)->getDims();
-        auto outDims = op->getOutput()->getDims();
         if (dims.size() != 4)
             IT_TODO_HALT();
-        int dimsTrans[4] = {dims[0], dims[2], dims[3], dims[1]};
-        int dimsOutTrans[4] = {outDims[0], outDims[2], outDims[3], outDims[1]};
-        int permute[4] = {0, 2, 3, 1};
-        int permuteOut[4] = {0, 3, 1, 2};
+        int dimArray[4], strideArray[4], dimPArray[1], stridePArray[1];
+        for (size_t i = 0; i < dims.size(); ++i) {
+            dimArray[i] = dims[i];
+            strideArray[i] = op->getInputs(0)->getStride()[i];
+        }
+        int w = dimArray[3];
+        dimArray[3] = dimArray[1];
+        int h = dimArray[2];
+        dimArray[1] = h;
+        dimArray[2] = w;
+        dimPArray[0] = op->getInputs(1)->getDims()[0];
+        stridePArray[0] = op->getInputs(1)->getDims()[0];
         // get inputs
-        cnnlTensorDescriptor_t inDesc, intransDesc, outDesc, outtransDesc;
+        cnnlTensorDescriptor_t inDesc;
         checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
-        checkCnnlError(cnnlCreateTensorDescriptor(&intransDesc));
-        checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
-        checkCnnlError(cnnlCreateTensorDescriptor(&outtransDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
-            dims.size(), dims.data()));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            intransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
-            dims.size(), dimsTrans));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
-            outDims.size(), outDims.data()));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            outtransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
-            outDims.size(), dimsOutTrans));
-        cnnlTransposeDescriptor_t opDesc;
-        checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
-        checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
-        size_t wsSize;
-        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), inDesc, opDesc,
-                                      &wsSize);
-        BangPtr wsData = context->getWorkspace(wsSize);
-        BangPtr inputTrans = context->getWorkspace(
-            cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
-        BangPtr outputTrans = context->getWorkspace(
-            cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
-        cnnlStatus_t stat =
-            cnnlTranspose_v2(context->cnnlHandle(), opDesc, inDesc, input,
-                             intransDesc, inputTrans, wsData, wsSize);
-        if (stat != CNNL_STATUS_SUCCESS)
-            return;
+        checkCnnlError(cnnlSetTensorDescriptorEx(inDesc, CNNL_LAYOUT_NHWC,
+                                                 CNNL_DTYPE_FLOAT, dims.size(),
+                                                 dimArray, strideArray));
         // get bnScaleBiasMeanVarDesc
-        auto dimsScaleBiasMeanVar = op->getInputs(1)->getDims();
         cnnlTensorDescriptor_t paraDesc;
         checkCnnlError(cnnlCreateTensorDescriptor(&paraDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            paraDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
-            dimsScaleBiasMeanVar.size(), dimsScaleBiasMeanVar.data()));
+        checkCnnlError(cnnlSetTensorDescriptorEx(paraDesc, CNNL_LAYOUT_ARRAY,
+                                                 CNNL_DTYPE_FLOAT, 1, dimPArray,
+                                                 stridePArray));
         float alpha = 1.f, beta = 0.f;
         // This mode is intended for use after convolutional layers
-        stat = cnnlBatchNormForwardInference(
-            context->cnnlHandle(), &alpha, &beta, intransDesc, inputTrans,
-            paraDesc, scale, bias, mean, var, op->getEps(), outtransDesc,
-            outputTrans);
-        if (stat != CNNL_STATUS_SUCCESS)
-            return;
-        cnnlTransposeDescriptor_t op2Desc;
-        checkCnnlError(cnnlCreateTransposeDescriptor(&op2Desc));
-        checkCnnlError(cnnlSetTransposeDescriptor(op2Desc, 4, permuteOut));
-        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), intransDesc,
-                                      op2Desc, &wsSize);
-        BangPtr ws2Data = context->getWorkspace(wsSize);
-        stat = cnnlTranspose_v2(context->cnnlHandle(), op2Desc, outtransDesc,
-                                outputTrans, outDesc, output, ws2Data, wsSize);
+        cnnlStatus_t stat = cnnlBatchNormForwardInference(
+            context->cnnlHandle(), &alpha, &beta, inDesc, input, paraDesc,
+            scale, bias, mean, var, op->getEps(), inDesc, output);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
         // Destories in BANG does not require sync. But cnnl does not state
         // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
-        checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
-        checkCnnlError(cnnlDestroyTensorDescriptor(intransDesc));
-        checkCnnlError(cnnlDestroyTensorDescriptor(outtransDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(paraDesc));
-        checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
-        checkCnnlError(cnnlDestroyTransposeDescriptor(op2Desc));
     }
 };
-REGISTER_KERNEL(Device::BANG, OpType::BatchNormalization, BatchNormCnnl,
-                "BatchNorm_cnnl_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::BatchNormalization, DataType::Float32,
+                BatchNormCnnl, "BatchNorm_cnnl_BANG_Float32");
 }; // namespace infini
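The left-hand version keeps the tensors in NCHW order and wraps cnnlBatchNormForwardInference between two cnnlTranspose_v2 calls (permutation {0, 2, 3, 1} into NHWC, then {0, 3, 1, 2} back), whereas the right-hand version describes the same NCHW memory through an NHWC descriptor with explicit strides. The small standalone sketch below shows just the dimension permutation both variants rely on; it is plain C++ with made-up shapes and no CNNL calls.

// Illustrative sketch only: permuting NCHW dims to NHWC, as the BANG
// batch-norm kernel does before handing the tensor to CNNL.
#include <array>
#include <cassert>

int main() {
    std::array<int, 4> nchw = {1, 64, 32, 32}; // {N, C, H, W}
    std::array<int, 4> permute = {0, 2, 3, 1}; // NCHW -> NHWC
    std::array<int, 4> nhwc{};
    for (int i = 0; i < 4; ++i)
        nhwc[i] = nchw[permute[i]];
    assert((nhwc == std::array<int, 4>{1, 32, 32, 64})); // {N, H, W, C}
    // The reverse permutation {0, 3, 1, 2} maps NHWC back to NCHW.
}
// End of illustrative sketch.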
View File
@@ -1,35 +0,0 @@
#ifdef INFINI_USE_CNCL
#include "operators/broadcast.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "bang/cncl_communicator.h"
#include <thread>
namespace infini {
class BroadcastCNCL : public BangKernelWithoutConfig {
public:
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<BroadcastObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// TODO: Using default stream 0 for now.
CNCL_CHECK(cnclBroadcast(input, output, count,
cnclDataTypeConvert(op->getDType()),
op->getRoot(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, BroadcastCNCL,
"Broadcast_CNCL_BANG");
} // namespace infini
#endif
View File
@@ -212,6 +212,7 @@ class CastCnnl : public BangKernelWithoutConfig {
     }
 };
-REGISTER_KERNEL(Device::BANG, OpType::Cast, CastCnnl, "Cast_cnnl_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl,
+                "Cast_cnnl_BANG_Float32");
 }; // namespace infini
View File
@@ -17,13 +17,13 @@ class CeilCnnl : public BangKernelWithoutConfig {
         auto cDim = op->getOutput()->getDims();
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
-            aDim.size(), aDim.data()));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
-            cDim.size(), cDim.data()));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
@@ -35,6 +35,7 @@ class CeilCnnl : public BangKernelWithoutConfig {
     }
 };
-REGISTER_KERNEL(Device::BANG, OpType::Ceil, CeilCnnl, "Ceil_cnnl_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::Ceil, DataType::Float32, CeilCnnl,
+                "Ceil_cnnl_BANG_Float32");
 }; // namespace infini
View File
@@ -18,9 +18,9 @@ class ClipCnnl : public BangKernelWithoutConfig {
         auto aDim = op->getInputs(0)->getDims();
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
-            aDim.size(), aDim.data()));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         cnnlStatus_t stat =
             cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
         if (stat != CNNL_STATUS_SUCCESS)
@@ -30,6 +30,7 @@ class ClipCnnl : public BangKernelWithoutConfig {
     }
 };
-REGISTER_KERNEL(Device::BANG, OpType::Clip, ClipCnnl, "Clip_cnnl_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::Clip, DataType::Float32, ClipCnnl,
+                "Clip_cnnl_BANG_Float32");
 }; // namespace infini
View File
@@ -14,18 +14,17 @@ class ConcatCnnl : public BangKernelWithoutConfig {
         auto cDim = op->getOutput()->getDims();
         cnnlTensorDescriptor_t desc;
         checkCnnlError(cnnlCreateTensorDescriptor(&desc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
-            cDim.size(), cDim.data()));
+        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlTensorDescriptor_t descArray[num];
         for (int i = 0; i < num; ++i) {
             checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
-            checkCnnlError(
-                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
-                                        cnnlDataTypeConvert(op->getDType()),
-                                        op->getInputs(i)->getDims().size(),
-                                        op->getInputs(i)->getDims().data()));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
+                op->getInputs(i)->getDims().size(),
+                op->getInputs(i)->getDims().data()));
         }
         void *argv[num];
@@ -51,5 +50,6 @@ class ConcatCnnl : public BangKernelWithoutConfig {
     }
 };
-REGISTER_KERNEL(Device::BANG, OpType::Concat, ConcatCnnl, "Concat_cnnl_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::Concat, DataType::Float32, ConcatCnnl,
+                "Concat_cnnl_BANG_Float32");
 }; // namespace infini
View File
@@ -20,9 +20,8 @@ class ConvCnnl : public BangKernelWithoutConfig {
         cnnlConvolutionDescriptor_t convDesc;
         checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
-        checkCnnlError(
-            cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
-                                         cnnlDataTypeConvert(op->getDType())));
+        checkCnnlError(cnnlSetConvolutionDescriptor(
+            convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@@ -55,24 +54,20 @@ class ConvCnnl : public BangKernelWithoutConfig {
         // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            aInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
-            inputs0));
+        checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, inputs0));
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            aDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
-            inputs0Array));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
         checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            bInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
-            inputs1));
+        checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, inputs1));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            bDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
-            inputs1Array));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
         int permute[4] = {0, 2, 3, 1};
         cnnlTransposeDescriptor_t opDesc;
@@ -84,7 +79,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
                                       &wsSize);
         BangPtr wsData = context->getWorkspace(wsSize);
         BangPtr aDataOut = context->getWorkspace(
-            cnnlGetTensorElementNum(aInDesc) * op->getDType().getSize());
+            cnnlGetTensorElementNum(aInDesc) * sizeof(float));
         cnnlStatus_t stat =
             cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
                              aDesc, aDataOut, wsData, wsSize);
@@ -95,7 +90,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
                                       &wsSize);
         wsData = context->getWorkspace(wsSize);
         BangPtr bDataOut = context->getWorkspace(
-            cnnlGetTensorElementNum(bInDesc) * op->getDType().getSize());
+            cnnlGetTensorElementNum(bInDesc) * sizeof(float));
         stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
                                 bDesc, bDataOut, wsData, wsSize);
         if (stat != CNNL_STATUS_SUCCESS)
@@ -104,13 +99,11 @@ class ConvCnnl : public BangKernelWithoutConfig {
         // get outputs
         checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            cInDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
-            outputArray));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
-            output));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, output));
         cnnlConvolutionForwardAlgo_t algo;
         cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
@@ -122,11 +115,11 @@ class ConvCnnl : public BangKernelWithoutConfig {
                                            algo, &wsSize);
         wsData = context->getWorkspace(wsSize);
         BangPtr cDataIn = context->getWorkspace(
-            cnnlGetTensorElementNum(cInDesc) * op->getDType().getSize());
+            cnnlGetTensorElementNum(cInDesc) * sizeof(float));
         stat = cnnlConvolutionForward(
-            context->cnnlHandle(), convDesc, algo, NULL, aDesc, aDataOut, bDesc,
-            bDataOut, NULL, NULL, wsData, wsSize, NULL, cInDesc, cDataIn);
+            context->cnnlHandle(), convDesc, algo, NULL, aDesc, aData, bDesc,
+            bData, NULL, NULL, wsData, wsSize, NULL, cInDesc, cDataIn);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
@@ -137,10 +130,10 @@ class ConvCnnl : public BangKernelWithoutConfig {
         cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cInDesc, opOutDesc,
                                       &wsSize);
-        BangPtr wsData2 = context->getWorkspace(wsSize);
+        wsData = context->getWorkspace(wsSize);
         stat = cnnlTranspose_v2(context->cnnlHandle(), opOutDesc, cInDesc,
-                                cDataIn, cDesc, cData, wsData2, wsSize);
+                                cDataIn, cDesc, cData, wsData, wsSize);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
@@ -158,5 +151,6 @@ class ConvCnnl : public BangKernelWithoutConfig {
     }
 };
-REGISTER_KERNEL(Device::BANG, OpType::Conv, ConvCnnl, "Conv_cnnl_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl,
+                "Conv_cnnl_BANG_Float32");
 }; // namespace infini
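Besides switching cnnlConvolutionForward to the transposed operands (aDataOut, bDataOut) and giving the final transpose its own workspace, the left-hand version sizes the staging buffers as the element count times op->getDType().getSize() rather than a hard-coded sizeof(float). The sketch below illustrates why that matters once non-float32 tensors appear; it is plain C++ with invented sizes, not InfiniTensor API.

// Illustrative sketch only: sizing a staging buffer from the element count
// and the element size, instead of hard-coding sizeof(float).
#include <cstddef>
#include <cstdio>

static std::size_t workspaceBytes(std::size_t elementCount,
                                  std::size_t elementSize) {
    return elementCount * elementSize;
}

int main() {
    const std::size_t elems = 1 * 3 * 224 * 224;
    std::printf("fp32: %zu bytes\n", workspaceBytes(elems, 4)); // 602112
    std::printf("fp16: %zu bytes\n", workspaceBytes(elems, 2)); // 301056
    // With sizeof(float) the fp16 case would allocate twice what it needs
    // (harmless), but an int64 tensor would get only half of what it needs.
}
// End of illustrative sketch.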
Some files were not shown because too many files have changed in this diff.