forked from jiuyuan/InfiniTensor
Compare commits
5 Commits
master ... cuda-trans

Commits:
- 7146294baa
- 73e3f1fc6f
- 86133c8d0a
- 2761d46737
- aa1c3222ed
@@ -13,6 +13,3 @@
 [submodule "example"]
 	path = examples/NNmodel
 	url = git@github.com:wanghailu0717/NNmodel.git
-[submodule "examples/distributed/onnxsim_large_model"]
-	path = examples/distributed/onnxsim_large_model
-	url = git@github.com:luchangli03/onnxsim_large_model.git
@@ -1,7 +1,5 @@
 # Distributed scripts
 
-## Running on the NVIDIA platform
-
 #### 1. Run the PyTorch model and generate inputs and reference outputs; optionally export ONNX
 
 Use `--export_onnx` to set the directory for the exported ONNX model (default: the current path `./`). Without this flag the script only runs the computation and generates the inputs and outputs.
@@ -17,23 +15,3 @@ python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
 ```bash
 python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
 ```
-
-## Running on the Cambricon platform
-
-**The run scripts `run_pytorch.py` and `cuda_launch.py` above have been adapted for the Cambricon platform; see `run_pytorch_mlu.py` and `bang_launch.py`.**
-
-#### 1. Run the PyTorch model and generate inputs and reference outputs; optionally export ONNX
-
-Use `--export_onnx` to set the directory for the exported ONNX model (default: the current path `./`). Without this flag the script only runs the computation and generates the inputs and outputs.
-
-```bash
-python run_pytorch_mlu.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
-```
-
-This generates the input and output files `test_inputs.npy` and `test_results.npy` in the current directory; only a single input and a single output are supported for now.
-
-#### 2. Run the InfiniTensor distributed script
-
-```bash
-python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
-```
@@ -1,249 +0,0 @@
-import argparse
-import torch
-import torch_mlu
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-from transformers import AlbertModel, AlbertConfig
-from transformers import LlamaModel, LlamaConfig
-import time
-import numpy as np
-import onnx
-import sys
-import os
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.type
-    )
-
-
-def get_model(modelname):
-    match modelname:
-        case "albert":
-            model = AlbertModel.from_pretrained("albert/albert-base-v2")
-            voc_size = AlbertConfig().vocab_size
-        case "bert":
-            model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-            voc_size = BertConfig().vocab_size
-        case "gpt2":
-            model = GPT2Model.from_pretrained("GPT2")
-            voc_size = GPT2Config().vocab_size
-        case "opt":
-            model = OPTModel.from_pretrained("facebook/opt-125m")
-            voc_size = OPTConfig().vocab_size
-        case "llama":
-            model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
-            voc_size = LlamaConfig().vocab_size
-        case _:
-            raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-
-def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
-    np.save("./data/input_0", data)
-    inputs = torch.from_numpy(data).to("mlu")
-    torch_model = torch_model.to("mlu")
-    if dtype == "fp16":
-        torch_model = torch_model.half()
-
-    n_iter = 20
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.mlu.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.mlu.synchronize()
-            outputs = torch_model(inputs)
-            torch.mlu.synchronize()
-    torch.mlu.synchronize()
-    end = time.time()
-
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    # torch.mlu.memory.empty_cache()
-    np.save("./data/output", np.array(outputs))
-    print("Save input & output into ./data.")
-
-
-def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
-    data = data.to("mlu")
-    model = model.to("mlu")
-    if dtype == "fp16":
-        model = model.half()
-    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-    if modelname != "llama":
-        # use onnxsim to simplify
-        onnx_model = onnx.load(path)
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
-        # onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-        assert check
-        add_value_info_for_constants(onnx_model)
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-        if extern:
-            extern_path = path.replace('.onnx', '.pb')
-            if os.path.exists(extern_path):
-                os.remove(extern_path)
-            extern_path = extern_path.split("/")[-1]
-            convert_model_to_external_data(
-                onnx_model,
-                all_tensors_to_one_file=True,
-                location=extern_path,
-                size_threshold=1024,
-                convert_attribute=False,
-            )
-        onnx.save(onnx_model, path)
-    else:
-        # use third party tool to simplify llama
-        # reference: https://github.com/luchangli03/onnxsim_large_model/
-        sys.path.append("onnxsim_large_model")
-        from onnx_utils import set_onnx_input_shape
-        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
-
-        in_model_path = path
-        out_model_path = path
-        if not out_model_path:
-            out_model_path = in_model_path[:-5] + ".sim.onnx"
-        if os.path.isdir(out_model_path):
-            out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
-
-        onnx_model = onnx.load(in_model_path)
-        print(f"load model from {in_model_path} success")
-
-        size_th_bytes = 1024 * 1024
-
-        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
-        print(f"compress model success")
-
-        onnx_model = set_onnx_input_shape(onnx_model, "")
-
-        tensor_size_threshold = f"1024KB"
-        skipped_optimizers = []
-        skipped_optimizers.append("eliminate_duplicate_initializer")
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
-                                     tensor_size_threshold=tensor_size_threshold)
-        if not check:
-            raise ValueError(f"simplify compressed model {in_model_path} failed")
-
-        print(f"simplify model success")
-
-        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
-        print(f"uncompress model success")
-
-        add_value_info_for_constants(onnx_model)
-
-        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
-
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    torch.backends.mlu.matmul.allow_tf32 = False
-    torch.backends.cnnl.allow_tf32 = False
-    modelname, batchsize, seqlen, export_path, dtype = parse_args()
-    if dtype == "tf32":
-        torch.backends.mlu.matmul.allow_tf32 = True
-    else:
-        os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
-
-    model, voc_size = get_model(modelname)
-    if export_path is not None:
-        filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
-        path = os.path.join(export_path, filename)
-        if not os.path.exists(path):
-            param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-            export_onnx(modelname, model, param, path, True, dtype)
-        else:
-            print("Onnx path exists, skipping export.")
-
-    run_pytorch(model, voc_size, batchsize, seqlen, dtype)
-
-
-if __name__ == "__main__":
-    main()
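The removed `run_pytorch_mlu.py` above times the model with an untimed warm-up phase followed by a synchronized, timed loop. A minimal sketch of that pattern, assuming (as the script itself does) that the Cambricon `torch_mlu` extension is installed and provides the `mlu` device and `torch.mlu.synchronize()`; the `benchmark` helper name is ours:

```python
import time

import torch
import torch_mlu  # registers the "mlu" device with PyTorch


def benchmark(model, inputs, n_iter=20, warmup=10):
    model = model.eval().to("mlu")
    inputs = inputs.to("mlu")
    with torch.no_grad():
        for _ in range(warmup):      # warm-up runs, not timed
            model(inputs)
    torch.mlu.synchronize()          # drain queued work before starting the clock
    begin = time.time()
    with torch.no_grad():
        for _ in range(n_iter):
            model(inputs)
    torch.mlu.synchronize()          # wait for the last kernel before stopping the clock
    return (time.time() - begin) / n_iter
```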
@@ -1,39 +1,35 @@
-import sys
-sys.path.append('../')
 
 import argparse
 import os
 import time
 import multiprocessing as mp
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
-from onnx.external_data_helper import convert_model_to_external_data
 from onnx.shape_inference import infer_shapes_path
 import numpy as np
 from parallel_opt import parallel_model
 
 
 def parse_args():
     parser = argparse.ArgumentParser(description="launch distributed infinitensor")
     parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
     parser.add_argument(
-        "--nproc_per_node", type=int, default=1, help="number of processes per node"
+        "--nproc_per_node", type=int, default=2, help="number of processes per node"
     )
     parser.add_argument(
         "--name", type=str, default="test", help="name of this instance."
     )
     parser.add_argument(
-        "--model", type=str, required=True, help="path to the ONNX model file."
+        "--model", type=str, default="/data/onnx_models/llama2/llama_bs1_seq1024.onnx",
+        help="path to the ONNX model file."
     )
     parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
     parser.add_argument("--length", type=int, default=1, help="sequence length.")
     parser.add_argument(
         "--gen_std",
+        default=False,
         action="store_true",
         help="whether to generate the standard results.",
     )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
-    )
     args = parser.parse_args()
     print("arg setting: ", args)
     return (
@@ -44,46 +40,39 @@ def parse_args():
         args.batch_size,
         args.length,
         args.gen_std,
-        args.type,
     )
 
 
-def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
-    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
+def run_model(model, runtime, world_size=1, rank=0, n=10):
+    stub = OnnxStub(model, runtime)
     load_inputs(stub, world_size, rank)
     # stub.tune()
     stub.run()
     # get outputs
+    time.sleep(0.01)
    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
 
     # bench
+    begin = time.time()
     for _ in range(n):
         stub.run()
-    begin = time.time()
-    for _ in range(n * 2):
-        stub.run()
     end = time.time()
-    avg_time = (end - begin) / (n * 2)
+    avg_time = (end - begin) / n
     print(f"average time: {avg_time}")
     return outputs
 
-def load_inputs(stub, world_size=1, rank=0):
-    for i, (name, tensor) in enumerate(stub.inputs.items()):
-        input = np.load(f"./data/input_{i}.npy")
-        if all(x == y for x,y in zip(input.shape,tensor.shape())):
-            tensor.copyin_numpy(input)
-        else:
-            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
 
-def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
+def run_and_compare(name, model, runtime, world_size=1, rank = 0):
     results = np.load(f"./data/output.npy")
-    outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
-    print("outputs abs mean:", abs(outputs).mean())
-    print("max abs diff:", abs(outputs - results).max())
+    outputs = run_model(model, runtime, world_size, rank)
+    print("answer argmax:", np.argmax(results))
+    print("output argmax:", np.argmax(outputs))
+    #np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
+    getDiff(results, outputs)
 
 
 def start_worker(
-    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
 ):
     dist_name = name + "_dist"
     model = parallel_model(model, world_size, rank)
@@ -96,7 +85,7 @@ def start_worker(
         save_as_external_data=True,
         location=extern_path,
     )
-    #infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
+    infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
     runtime = backend.BangRuntime(local_rank)
     # print("init comm")
     runtime.init_comm(
@@ -104,12 +93,13 @@ def start_worker(
         world_size,
         rank,
     )
-    run_and_compare(name, model, runtime, world_size, rank, data_type)
+    run_and_compare(name, model, runtime, world_size, rank)
 
 
-def start_single(name, model, data_type):
+def start_single(name, model):
     runtime = backend.BangRuntime(0)
-    run_and_compare(name, model, runtime, data_type=data_type)
+    run_and_compare(name, model, runtime)
 
 
 def generate_input_output(model):
     os.makedirs(os.path.dirname("./data/"), exist_ok=True)
@@ -142,36 +132,55 @@ def generate_input_output(model):
     np.save(f"./data/output", output)
 
 
+def load_inputs(stub, world_size=1, rank=0):
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = np.load(f"./data/input_{i}.npy")
+        if all(x == y for x,y in zip(input.shape,tensor.shape())):
+            tensor.copyin_numpy(input)
+        else:
+            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
+
+
+def getDiff(base, test):
+    absolute_diff = np.abs(np.subtract(base, test))
+    max_absolute_diff = np.max(absolute_diff)
+
+    baseCopy = base.astype(np.float64).ravel()
+    testCopy = test.astype(np.float64).ravel()
+    upValue = np.sum(np.abs(baseCopy - testCopy))
+    downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
+    max_relative_diff = upValue / downValue
+    print(f"Max absolute difference: {max_absolute_diff}\n"
+          f"Max relative difference: {max_relative_diff}")
+    return max_absolute_diff, max_relative_diff
+
+
 def main():
-    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
-    data_type = "default" if data_type == "fp32" else data_type
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
 
     model = onnx.load(model_path)
 
     # generate standart output
     if gen_std:
-        print(f"generate standard data for {name}.")
-        # a small vocabulary size to fit all LLM.
-        generate_input_output(model)
+        print("Generate inputs and outputs.")
+        p = mp.Process(target=generate_input_output, args=[model])
+        p.start()
+        p.join()
         return
 
-    if nproc_per_node == 1:
-        # run single process.
-        # use standalone process to isolate bang.
-        print("run model by single MLU.")
-        # p = mp.Process(target=start_single, args=(name, model, data_type))
-        # p.start()
-        # p.join()
-        start_single(name, model, data_type)
+    # run single process.
+    # use standalone process to isolate cuda.
+    print("run model by single MLU.")
+    p = mp.Process(target=start_single, args=(name, model))
+    p.start()
+    p.join()
+    return
 
     # run distributed parallel.
     world_size = nnodes * nproc_per_node
-    print(f"run model by {world_size} MLU in parallel.")
+    print(f"run model by {world_size} MLUs in parallel.")
     workers = [
         mp.Process(
             target=start_worker,
-            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
+            args=(name, world_size, rank, rank % nproc_per_node, model),
        )
         for rank in range(world_size)
     ]
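The `getDiff` helper added above reports the maximum absolute difference and a summed relative difference between a reference array and a test array. An illustrative call with made-up arrays; only the metric computation mirrors the script:

```python
import numpy as np


def get_diff(base, test):
    max_abs = np.max(np.abs(base - test))
    base64 = base.astype(np.float64).ravel()
    test64 = test.astype(np.float64).ravel()
    rel = np.sum(np.abs(base64 - test64)) / (np.sum(np.abs(base64)) + 1e-9)
    return max_abs, rel


reference = np.array([1.0, 2.0, 3.0])
candidate = np.array([1.0, 2.1, 2.9])
print(get_diff(reference, candidate))  # -> roughly (0.1, 0.033)
```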
@@ -1,14 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-models=("bert" "gpt2" "llama")
-batch_size=(1 32)
-seq_len=(100 500)
-nproc=(1 2 4)
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only
-        done
-    done
-done
@@ -1,280 +0,0 @@
-import sys
-sys.path.append('../')
-
-import argparse
-import os
-import time
-import multiprocessing as mp
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-from onnx.external_data_helper import convert_model_to_external_data
-from onnx.shape_inference import infer_shapes_path
-import numpy as np
-from parallel_opt import parallel_model
-from functools import wraps
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
-    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
-    parser.add_argument(
-        "--nproc_per_node", type=int, default=2, help="number of processes per node"
-    )
-    parser.add_argument(
-        "--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
-    )
-    parser.add_argument(
-        "--model", type=str, default="", help="path to the ONNX model file."
-    )
-    parser.add_argument(
-        "--gen_std",
-        default=False,
-        action="store_true",
-        help="whether to generate the standard results.",
-    )
-    parser.add_argument(
-        "--run_single",
-        default=False,
-        action="store_true",
-        help="whether run model with single process with standard inputs"
-    )
-    parser.add_argument(
-        "--input_dir",
-        default="./",
-        help="path to save model input data"
-    )
-    parser.add_argument(
-        "--result_dir",
-        default="./",
-        help="path to save model standard output"
-    )
-    parser.add_argument(
-        "--internal_model_dir",
-        default="./",
-        help="path to save internal onnx model for parallel run"
-    )
-    args = parser.parse_args()
-
-    # check path, mkdir if not exist
-    check_exists(args.input_dir)
-    check_exists(args.result_dir)
-    check_exists(args.internal_model_dir)
-
-    print("arg setting: ", args)
-    return (
-        args.num_nodes,
-        args.nproc_per_node,
-        args.name,
-        args.model,
-        args.gen_std,
-        args.run_single,
-        args.input_dir,
-        args.result_dir,
-        args.internal_model_dir
-    )
-
-
-"""
-utils function for this scripts
-"""
-def check_exists(path: str):
-    if not os.path.exists(path):
-        os.makedirs(path)
-
-def np_assert(base, test, rtol=1e-2, atol=1e-1):
-    # np.testing.assert_allclose(test, base, rtol, atol)
-    print("max abs diff:", abs(base - test).max())
-
-
-"""
-Perf wrapper, run function n times
-then average
-"""
-def perf_it(n):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            # warmup
-            for _ in range(n):
-                func(*args, **kwargs)
-
-            t_total = 0
-            for _ in range(n):
-                t0 = time.time()
-                func(*args, **kwargs)
-                t1 = time.time()
-                t_total += t1 - t0
-            avg_time = (t_total) / n
-            print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
-            return avg_time
-        return wrapper
-    return decorator
-
-
-"""
-Run InfiniTensor model with Standard input
-check=True: check with standard output gen by pytorch
-perf=True: run n times to get avg time
-"""
-def run_model(task_name,
-              model,
-              runtime,
-              world_size=1,
-              rank=0,
-              n=10,
-              check=True,
-              perf=True):
-
-    stub = OnnxStub(model, runtime,
-                    use_naive_allocator=True \
-                    if task_name == "llama" else False)
-
-    # load in Onnx model inputs
-    def load_inputs(stub: OnnxStub):
-        # check exists
-        inputs = []
-        for i, (name, tensor) in enumerate(stub.inputs.items()):
-            input_path = os.path.join(input_dir, \
-                    f"{task_name}_input_{i}.npy")
-            print(input_path)
-            if os.path.exists(input_path):
-                input = np.load(input_path)
-            else :
-                raise KeyError(f"{i} th input of model not exists")
-            # check shape
-            if all(x == y for x,y in zip(input.shape, tensor.shape())):
-                tensor.copyin_numpy(input)
-            else:
-                tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
-
-    load_inputs(stub)
-    # stub.tune()
-    stub.run()
-    time.sleep(0.01)
-    output = next(stub.outputs.values().__iter__()).copyout_numpy()
-
-    # check output results with standard output
-    if check:
-        st_output_path = os.path.join(result_dir, \
-                f"{task_name}_output.npy")
-        assert os.path.exists(st_output_path) , \
-                "standard output not exists"
-        st_output = np.load(st_output_path)
-        if np.isnan(output).any():
-            print("Nan in output")
-            exit()
-        np_assert(st_output, output)
-
-    # perf
-    if perf:
-        @perf_it(n)
-        def perf_infinitensor(stub: OnnxStub):
-            stub.run()
-        perf_infinitensor(stub)
-
-    return output
-
-
-"""
-Start a worker in Parallel
-"""
-def start_worker(name: str,
-                 world_size: int,
-                 rank: int,
-                 local_rank: int,
-                 model: onnx.ModelProto):
-
-    dist_name = name + "_dist"
-    # partial a onnx model to world_size part
-    model = parallel_model(model, world_size, rank)
-    onnx.save(model, os.path.join(internal_model_dir, \
-            f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
-    runtime = backend.KUNLUNRuntime(local_rank)
-    # print("init comm")
-    runtime.init_comm(
-        dist_name,
-        world_size,
-        rank,
-    )
-    run_model(name, model, runtime, world_size, rank)
-
-
-"""
-generate standard input/output with
-sigle card run
-"""
-def gen_standard(task_name: str, model: onnx.ModelProto):
-    runtime = backend.KUNLUNRuntime(0)
-    stub = OnnxStub(model, runtime)
-    position_id = 0
-    # generate random input for model
-    for i, (name, tensor) in enumerate(stub.inputs.items()):
-        input = tensor.copyout_numpy()
-        if np.issubdtype(input.dtype, np.integer):
-            if input.size == 1:
-                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
-            else:
-                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
-        elif input.dtype == np.bool_:
-            input = np.random.randint(0,2,size=input.shape) > 0
-        else:
-            if i == 0:
-                input = np.ones(input.shape).astype(input.dtype)
-                position_id = input.shape[-1] - 1
-            else:
-                input = np.random.rand(*input.shape).astype(input.dtype)
-        tensor.copyin_numpy(input)
-        np.save(os.path.join(input_dir, \
-                f"{task_name}_input_{i}.npy"), input)
-    stub.run()
-    # print(stub.outputs)
-    output = next(stub.outputs.values().__iter__()).copyout_numpy()
-    if np.isnan(output).any():
-        print("Nan in output")
-        exit()
-    np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
-
-
-def main():
-
-    global input_dir, result_dir, internal_model_dir
-
-    nnodes, nproc_per_node, task_name, \
-        model_path, gen_std, run_single, \
-        input_dir, result_dir, internal_model_dir = parse_args()
-
-    # load input onnx model
-    model = onnx.load(model_path)
-
-    # generate standart output
-    if gen_std:
-        print("Generate inputs and outputs.")
-        gen_standard(task_name, model)
-        return
-
-    if run_single:
-        print("Run model by one GPU card.")
-        runtime = backend.KUNLUNRuntime(0)
-        run_model(task_name, model, runtime)
-        return
-
-    # run distributed parallel.
-    world_size = nnodes * nproc_per_node
-    print(f"Run model by {world_size} GPU in parallel.")
-    workers = [
-        mp.Process(
-            target=start_worker,
-            args=(task_name, world_size, rank, rank % nproc_per_node, model),
-        )
-        for rank in range(world_size)
-    ]
-
-    for w in workers:
-        w.start()
-
-    for w in workers:
-        w.join()
-
-
-if __name__ == "__main__":
-    main()
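The removed `perf_it` decorator wraps a function so that it is first run `n` times as warm-up and then `n` more times under a timer, printing the average. A self-contained sketch of the same pattern; the `dummy_workload` function is a stand-in, not something from the repository:

```python
import time
from functools import wraps


def perf_it(n):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for _ in range(n):               # warm-up, not timed
                func(*args, **kwargs)
            t_total = 0.0
            for _ in range(n):               # timed runs
                t0 = time.time()
                func(*args, **kwargs)
                t_total += time.time() - t0
            avg = t_total / n
            print(f"Avg runtime over {n} runs: {avg:.6f} s")
            return avg
        return wrapper
    return decorator


@perf_it(5)
def dummy_workload():
    return sum(i * i for i in range(100_000))


dummy_workload()  # prints the averaged runtime
```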
@@ -1,36 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-# models=("bert" "gpt2" "llama")
-models=("bert" "gpt2")
-batch_size=(1 32)
-seq_len=(100 500)
-nproc=(1 2 4)
-
-results_dir="results"
-
-if [ -d "$results_dir" ]; then
-    echo "directory ./$results_dir exists"
-else
-    mkdir -p "$results_dir"
-    echo "mkdir $results_dir, logs saved there"
-fi
-
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            # run pytorch model
-            echo "Run pytorch $model with batch_size=$bs length=$len ."
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
-            for n in "${nproc[@]}"; do
-                # run infinitensor
-                echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
-                python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini
-                # delete internal files
-                find ./ -type f -name "*.onnx" -delete
-                find ./ -type f -name "*.pb" -delete
-            done
-            find ./ -type f -name "*.npy" -delete
-        done
-    done
-done
@@ -1,35 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-# models=("bert" "gpt2" "llama")
-models=("llama")
-batch_size=(1 )
-seq_len=(100 500)
-nproc=(1 2 4)
-
-results_dir="results"
-
-if [ -d "$results_dir" ]; then
-    echo "directory ./$results_dir exists"
-else
-    mkdir -p "$results_dir"
-    echo "mkdir $results_dir, logs saved there"
-fi
-
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            echo "Run pytorch llama with batch_size="$bs" and length="$len""
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
-            for n in "${nproc[@]}"; do
-                # run pytorch model
-                echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
-                python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
-                # delete internal files
-                find ./ -type f -name "*.onnx" -delete
-                find ./ -type f -name "*0c" -delete
-            done
-            find ./ -type f -name "*.npy" -delete
-        done
-    done
-done
@@ -1,245 +0,0 @@
-import argparse
-import torch
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-from transformers import LlamaModel, LlamaConfig
-import time
-import numpy as np
-import onnx
-import os
-import sys
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.backends.cudnn.allow_tf32 = False
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--input_dir",
-        type=str,
-        default="./",
-        help="path to save pytorch model input data"
-    )
-    parser.add_argument(
-        "--result_dir",
-        type=str,
-        default="./",
-        help="path to save pytorch model output data"
-    )
-    parser.add_argument(
-        "--export_only",
-        action="store_true"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.input_dir,
-        args.result_dir,
-        args.export_only
-    )
-
-
-def get_model(modelname):
-    if modelname == "bert":
-        model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-        voc_size = BertConfig().vocab_size
-    elif modelname == "gpt2":
-        model = GPT2Model.from_pretrained("gpt2")
-        voc_size = GPT2Config().vocab_size
-    elif modelname == "opt":
-        model = OPTModel.from_pretrained("./opt-125m")
-        voc_size = OPTConfig().vocab_size
-    elif modelname == "llama":
-        model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
-        voc_size = LlamaConfig().vocab_size
-    else :
-        raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-
-def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
-    inputs = torch.from_numpy(data).to("cuda")
-    torch_model = torch_model.to("cuda")
-
-    n_iter = 10
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.cuda.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.cuda.synchronize()
-            outputs = torch_model(inputs)
-            #
-            torch.cuda.synchronize()
-    torch.cuda.synchronize()
-    end = time.time()
-
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    torch.cuda.memory.empty_cache()
-    np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
-            np.array(outputs))
-    print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
-
-
-def export_onnx(model_name, model, data, path, extern=False):
-    # torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-
-    if model_name != "llama":
-        onnx_model = onnx.load(path)
-        onnx_model, check = simplify(onnx_model,
-                                     skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-                                     # skipped_optimizers=['fuse_qkv'])
-        assert check
-        add_value_info_for_constants(onnx_model)
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-        if extern:
-            extern_path = path.replace('.onnx', '.pb')
-            if os.path.exists(extern_path):
-                os.remove(extern_path)
-            convert_model_to_external_data(
-                onnx_model,
-                all_tensors_to_one_file=True,
-                location=extern_path.split("/")[-1],
-                size_threshold=1024,
-                convert_attribute=False,
-            )
-        onnx.save(onnx_model, path)
-    else:
-        sys.path.append("onnxsim_large_model")
-        from onnx_utils import set_onnx_input_shape
-        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
-
-        in_model_path = path
-        out_model_path = in_model_path[:-5] + ".sim.onnx"
-
-        onnx_model = onnx.load(in_model_path)
-        print(f"load model from {in_model_path} success")
-
-        size_th_bytes = 1024 * 1024
-        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
-        print("compress model success")
-
-        onnx_model = set_onnx_input_shape(onnx_model, "")
-        tensor_size_threshold = f"1024KB"
-        skipped_optimizers = []
-        skipped_optimizers.append("eliminate_duplicate_initializer")
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
-                                     tensor_size_threshold=tensor_size_threshold)
-        if not check:
-            raise ValueError(f"simplify compressed model {in_model_path} failed")
-
-        print(f"simplify model success")
-
-        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
-        print(f"uncompress model success")
-
-        add_value_info_for_constants(onnx_model)
-
-        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
-
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    global input_dir, result_dir
-
-    modelname, batchsize, seqlen, \
-        export_path, input_dir, result_dir, export_only = parse_args()
-
-    model, voc_size = get_model(modelname) # pytorch model
-
-    if export_path is not None:
-        os.makedirs(export_path, exist_ok=True)
-        filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
-        path = os.path.join(export_path, filename)
-        param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-        export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
-        if export_only:
-            return
-
-    run_pytorch(model, voc_size, batchsize, seqlen, modelname)
-
-if __name__ == "__main__":
-    main()
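Both removed `run_pytorch*` scripts externalize large initializers during export: tensors above a size threshold are moved out of the `.onnx` file into a side `.pb` file stored next to it. A hedged sketch of that step with the standard `onnx` API; `"model.onnx"` is a placeholder path, not one from the repository:

```python
import onnx
from onnx.external_data_helper import convert_model_to_external_data

path = "model.onnx"                                        # placeholder path
onnx_model = onnx.load(path)
convert_model_to_external_data(
    onnx_model,
    all_tensors_to_one_file=True,
    location=path.split("/")[-1].replace(".onnx", ".pb"),  # stored relative to the model file
    size_threshold=1024,                                   # only tensors larger than 1 KiB move out
    convert_attribute=False,
)
onnx.save(onnx_model, path)
```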
@@ -0,0 +1,213 @@
+import argparse
+import os
+import time
+import multiprocessing as mp
+from pyinfinitensor.onnx import OnnxStub, backend
+import onnx
+from onnx.external_data_helper import convert_model_to_external_data
+from onnx.shape_inference import infer_shapes_path
+import numpy as np
+from parallel_opt import parallel_model
+
+st_input_dir = "standard/inputs/"
+st_output_dir = "standard/outputs/"
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
+    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
+    parser.add_argument(
+        "--nproc_per_node", type=int, default=2, help="number of processes per node"
+    )
+    parser.add_argument(
+        "--name", type=str, default="test", help="name of this instance."
+    )
+    parser.add_argument(
+        "--model", type=str, default="/data1/shared/panzezhong/llama/fp32/my_llama_fp32.sim.onnx", help="path to the ONNX model file."
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
+    parser.add_argument("--length", type=int, default=1, help="sequence length.")
+    parser.add_argument(
+        "--gen_std",
+        default=False,
+        action="store_true",
+        help="whether to generate the standard results.",
+    )
+    parser.add_argument(
+        "--run_single",
+        default=False,
+        action="store_true",
+        help="whether run model with single process with standard inputs"
+    )
+    args = parser.parse_args()
+    print("arg setting: ", args)
+    return (
+        args.num_nodes,
+        args.nproc_per_node,
+        args.name,
+        args.model,
+        args.batch_size,
+        args.length,
+        args.gen_std,
+        args.run_single
+    )
+
+
+def run_model(model, runtime, world_size=1, rank=0, n=10):
+    stub = OnnxStub(model, runtime)
+    load_inputs(stub, world_size, rank)
+    # stub.tune()
+    stub.run()
+    # get outputs
+    time.sleep(0.01)
+    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
+
+    # bench
+    begin = time.time()
+    for _ in range(n):
+        stub.run()
+    end = time.time()
+    avg_time = (end - begin) / n
+    print(f"average time: {avg_time}")
+    return outputs
+
+
+def run_and_compare(name, model, runtime, world_size=1, rank = 0):
+    results = np.load(os.path.join(st_output_dir,f"output.npy"))
+    outputs = run_model(model, runtime, world_size, rank)
+    print(outputs[:100])
+    if np.isnan(outputs).any():
+        print("Nan in output")
+    print("answer argmax:", np.argmax(results))
+    print("output argmax:", np.argmax(outputs))
+    #np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
+    getDiff(results, outputs)
+
+
+def start_worker(
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
+):
+    dist_name = name + "_dist"
+    model = parallel_model(model, world_size, rank)
+    extern_path = f"./{dist_name}_rank{rank}.pb"
+    if os.path.exists(extern_path):
+        os.remove(extern_path)
+    onnx.save_model(
+        model,
+        f"./{dist_name}_rank{rank}.onnx",
+        save_as_external_data=True,
+        location=extern_path,
+    )
+    infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
+    runtime = backend.KUNLUNRuntime(local_rank)
+    # print("init comm")
+    runtime.init_comm(
+        dist_name,
+        world_size,
+        rank,
+    )
+    run_and_compare(name, model, runtime, world_size, rank)
+
+
+def start_single(name, model):
+    runtime = backend.KUNLUNRuntime(0)
+    run_and_compare(name, model, runtime)
+
+
+def generate_input_output(model):
+    runtime = backend.KUNLUNRuntime(0)
+    stub = OnnxStub(model, runtime)
+    position_id = 0
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = tensor.copyout_numpy()
+        if np.issubdtype(input.dtype, np.integer):
+            if input.size == 1:
+                # input = np.array([position_id])
+                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
+            else:
+                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
+        elif input.dtype == np.bool_:
+            input = np.random.randint(0,2,size=input.shape) > 0
+        else:
+            if i == 0:
+                input = np.ones(input.shape).astype(input.dtype)
+                position_id = input.shape[-1] - 1
+            else:
+                input = np.random.rand(*input.shape).astype(input.dtype)
+        tensor.copyin_numpy(input)
+        np.save(os.path.join(st_input_dir, f"input_{i}"), input)
+    stub.run()
+    # print(stub.outputs)
+    time.sleep(0.01)
+    output = next(stub.outputs.values().__iter__()).copyout_numpy()
+    print(output[:100])
+    if np.isnan(output).any():
+        print("Nan in output")
+    np.save(os.path.join(st_output_dir, f"output"), output)
+
+
+def load_inputs(stub, world_size=1, rank=0):
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = np.load(os.path.join(st_input_dir, f"input_{i}.npy"))
+        if all(x == y for x,y in zip(input.shape,tensor.shape())):
+            tensor.copyin_numpy(input)
+        else:
+            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
+
+
+def getDiff(base, test):
+    absolute_diff = np.abs(np.subtract(base, test))
+    max_absolute_diff = np.max(absolute_diff)
+
+    baseCopy = base.astype(np.float64).ravel()
+    testCopy = test.astype(np.float64).ravel()
+    upValue = np.sum(np.abs(baseCopy - testCopy))
+    downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
+    max_relative_diff = upValue / downValue
+    print(f"Max absolute difference: {max_absolute_diff}\nMax relative difference: {max_relative_diff}")
+
+    return max_absolute_diff, max_relative_diff
+
+
+def main():
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, run_single = parse_args()
+
+    model = onnx.load(model_path)
+
+    # generate standart output
+    if gen_std:
+        print("Generate inputs and outputs.")
+        p = mp.Process(target=generate_input_output, args=[model])
+        p.start()
+        p.join()
+        return
+
+    # # run single process.
+    # # use standalone process to isolate cuda.
+    if run_single:
+        print("run model by single GPU.")
+        p = mp.Process(target=start_single, args=(name, model))
+        p.start()
+        p.join()
+        return
+
+    # run distributed parallel.
+    world_size = nnodes * nproc_per_node
+    print(f"run model by {world_size} GPU in parallel.")
+    workers = [
+        mp.Process(
+            target=start_worker,
+            args=(name, world_size, rank, rank % nproc_per_node, model),
+        )
+        for rank in range(world_size)
+    ]
+
+    for w in workers:
+        w.start()
+
+    for w in workers:
+        w.join()
+
+
+if __name__ == "__main__":
+    main()
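The `load_inputs` helper in the new launch script copies a saved input straight in when its shape matches the per-rank tensor, and otherwise splits it column-wise with `np.hsplit` so that each rank receives its slice. A small illustration of that sharding rule with made-up shapes:

```python
import numpy as np

world_size = 4
full_input = np.arange(2 * 8).reshape(2, 8)   # saved, unsharded input

shards = np.hsplit(full_input, world_size)    # four (2, 2) slices along the last axis
for rank, shard in enumerate(shards):
    print(rank, shard.shape)                  # each rank would copy in its own slice
```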
@@ -1 +0,0 @@
-Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65
@@ -110,6 +110,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
             s_dim = 0
         elif in_plc.dim == 2:
             s_dim = 1
+
         assert s_dim != -1
         assert out_dims[s_dim] % tp_world_size == 0, out_dims
         out_dims[s_dim] //= tp_world_size
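The hunk above keeps the invariant that the dimension chosen for tensor parallelism must divide evenly by the world size before it is shrunk to the per-rank size. A tiny illustration with hypothetical shapes:

```python
tp_world_size = 4
out_dims = [1, 1024, 4096]   # hypothetical full output shape
s_dim = 2                    # dimension to shard

assert out_dims[s_dim] % tp_world_size == 0, out_dims
out_dims[s_dim] //= tp_world_size
print(out_dims)              # -> [1, 1024, 1024], the per-rank shape
```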
@@ -3,10 +3,11 @@
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3);
 void expandKernel(int dType, void *input, void *output, int nDims,
                   int outputsize, SmallArray inputShape,
                   SmallArray outputShape);
 
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                      int row_len);
 }; // namespace infini
@@ -1,16 +1,14 @@
 #pragma once
 #include "operators/unary.h"
 #include "utils/small_array.h"
 
 namespace infini {
 
-void whereKernel(const float *inputX, const float *inputY,
-                 const uint8_t *condition, float *output, int nDims,
-                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
-                 SmallArray conditionShape, SmallArray outputShape, int xSize,
-                 int ySize, int cSize);
-void whereKernel(const half *inputX, const half *inputY,
-                 const uint8_t *condition, half *output, int nDims,
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int a0, int a1, int a2,
+                 int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
+                 int c3, int d0, int d1, int d2, int d3);
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int nDims,
                  int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                  SmallArray conditionShape, SmallArray outputShape, int xSize,
                  int ySize, int cSize);
@@ -21,7 +21,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
         ctx = xdnn::create_context();
         // 10GB for Longformer
         // size_t longformerNum = 3lu * (1 << 30);
-        size_t workspaceSize = 2llu << 30; // 2 GB
+        size_t workspaceSize = 3llu << 30; // 3 GB
         KUNLUNPtr wkspacePtr = alloc(workspaceSize);
        workspace =
            make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);

@@ -42,7 +42,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
    KUNLUNPtr alloc(size_t size) override {
        void *ptr;
        checkKUNLUNError(
-            xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
+            xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
        return ptr;
    }
    void dealloc(void *ptr) override { xpu_free(ptr); }
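Reviewer note (not part of the patch): the workspace literal is a plain byte count, so the bump from `2llu << 30` to `3llu << 30` is exactly one extra GiB. A trivial compile-time check, shown only to make the arithmetic explicit:

```cpp
int main() {
    // 3llu << 30 is 3 * 2^30 bytes, i.e. 3 GiB, the new KUNLUN workspace size.
    static_assert((3llu << 30) == 3ull * 1024 * 1024 * 1024, "expected 3 GiB");
    return 0;
}
```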
@@ -34,8 +34,8 @@ class XcclCommunicatorObj final : public CommunicatorObj {
         auto begin = std::chrono::steady_clock::now();
         while (!std::filesystem::exists(filePath)) {
             auto now = std::chrono::steady_clock::now();
-            _IT_ASSERT_2(now < begin + std::chrono::seconds(100),
-                         "time limit (100s) exceeded.");
+            _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
+                         "time limit (10s) exceeded.");
             std::this_thread::sleep_for(std::chrono::milliseconds(100));
         }
         std::ifstream ifs(filePath, std::ios::binary);
@@ -967,7 +967,7 @@ class OnnxStub:
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
-            elif node.op_type in ["Constant", "ConstantOfShape"]:
+            elif node.op_type == "Constant":
                 output_name = node.output[0]
                 attributes = _parse_attribute(node)
                 tensor = attributes["value"]
@@ -199,24 +199,6 @@ class CastCnnl : public BangKernelWithoutConfig {
                                                    dim.data()));
             NlCastType = CNNL_CAST_UINT32_TO_INT64;
             break;
-        case CastType::Float162Float:
-            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_HALF, dim.size(),
-                                                   dim.data()));
-            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_FLOAT, dim.size(),
-                                                   dim.data()));
-            NlCastType = CNNL_CAST_HALF_TO_FLOAT;
-            break;
-        case CastType::Float2Float16:
-            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_FLOAT, dim.size(),
-                                                   dim.data()));
-            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_HALF, dim.size(),
-                                                   dim.data()));
-            NlCastType = CNNL_CAST_FLOAT_TO_HALF;
-            break;
         default:
             IT_TODO_HALT();
         }
@@ -19,16 +19,14 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());

         auto inDims = op->getInputs(0)->getDims();
-        auto fiterDims = op->getInputs(1)->getDims();
         auto outDims = op->getOutput()->getDims();
+        auto fiterDims = op->getOutput(1)->getDims();

         float eps = op->getEps();
         const int axis = op->getAxis();

-        Shape outMeanDims(outDims);
-        outMeanDims.erase(outMeanDims.begin() + axis);
-
-        cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc, outMeanDesc;
+        cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc;
         checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
         checkCnnlError(cnnlSetTensorDescriptor(
             inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
@@ -41,23 +39,15 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
         checkCnnlError(cnnlSetTensorDescriptor(
             outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
             outDims.size(), outDims.data()));
-        checkCnnlError(cnnlCreateTensorDescriptor(&outMeanDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            outMeanDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
-            outMeanDims.size(), outMeanDims.data()));
         size_t wsSize;
         cnnlGetLayerNormOpWorkspaceSize(context->cnnlHandle(), axis, inDesc,
                                         &wsSize);
         BangPtr wsData = context->getWorkspace(wsSize);
-        size_t meanSize =
-            cnnlGetTensorElementNum(outMeanDesc) * op->getDType().getSize();
-        BangPtr meanData = context->getWorkspace(meanSize);
-        BangPtr rstdData = context->getWorkspace(meanSize);

         cnnlStatus_t stat = cnnlLayerNormForward(
             context->cnnlHandle(), inDesc, inputData, axis, fiterDesc,
             scaleData, biasData, eps, wsData, wsSize, outDesc, outputData,
-            outMeanDesc, meanData, rstdData);
+            inDesc, NULL, NULL);

         if (stat != CNNL_STATUS_SUCCESS)
             return;
@@ -66,13 +66,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
         cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_DESC_TRANSB, &transB,
                               sizeof(int32_t));

-        std::string computeTypeStr = op->getComputeType();
-        if (computeTypeStr == "tf32") {
-            int32_t tf32 = 1;
-            cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_ALLOW_TF32, &tf32,
-                                  sizeof(int32_t));
-        }
-
         cnnlMatMulAlgo_t bmm_algo;
         cnnlMatMulAlgoCreate(&bmm_algo);
@@ -5,34 +5,42 @@
 constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+const int repeat = 1;
 template <class T>
 __global__ void _div_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                             int a3, int b0, int b1, int b2, int b3, int c0,
                             int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;
-
-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
-
-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
-
-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] = ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                               a2_index * a3 + a3_index] /
-                      ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                               b2_index * b3 + b3_index];
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = ((T *)x)[xIdx] / ((T *)y)[yIdx];
     }
 }
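Reviewer note (not part of the patch): the rewritten element-wise kernels replace per-axis modulo bookkeeping with one flat index plus an optional 4-D broadcast remap (`xIdx`/`yIdx`). The host-side C++ sketch below reproduces that index mapping so it can be checked without a GPU; the helper name is illustrative.

```cpp
#include <cassert>

// Map a flat output index i (over shape c0 x c1 x c2 x c3) to the flat index
// into an input of shape a0 x a1 x a2 x a3 that broadcasts to the output,
// mirroring the xIdx computation in the rewritten _div/_add/_pow kernels.
int broadcastIndex(int i, int a0, int a1, int a2, int a3,
                   int c0, int c1, int c2, int c3) {
    int stride1 = c2 * c3;
    int stride0 = c1 * stride1;
    int n = c0 * stride0;
    if (a0 * a1 * a2 * a3 == n) // same total size: identity mapping
        return i;
    if (a0 * a1 * a2 * a3 == 1) // scalar input: always element 0
        return 0;
    int c0_index = i / stride0;
    int c1_index = (i % stride0) / stride1;
    int c2_index = (i % stride1) / c3;
    int c3_index = i % c3;
    return (c0_index % a0) * a1 * a2 * a3 + (c1_index % a1) * a2 * a3 +
           (c2_index % a2) * a3 + c3_index % a3;
}

int main() {
    // Broadcasting a {1,1,3,1} tensor over a {2,2,3,2} output:
    // output element (1,0,2,1) has flat index 17 and should read input
    // element (0,0,2,0), i.e. flat index 2.
    assert(broadcastIndex(17, 1, 1, 3, 1, 2, 2, 3, 2) == 2);
    return 0;
}
```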
@@ -41,28 +49,36 @@ __global__ void _add_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                             int a3, int b0, int b1, int b2, int b3, int c0,
                             int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;
-
-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
-
-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
-
-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] = ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                               a2_index * a3 + a3_index] +
-                      ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                               b2_index * b3 + b3_index];
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = ((T *)x)[xIdx] + ((T *)y)[yIdx];
     }
 }
@@ -71,29 +87,36 @@ __global__ void _pow_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                             int a3, int b0, int b1, int b2, int b3, int c0,
                             int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;
-
-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
-
-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
-
-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] =
-            pow(((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                         a2_index * a3 + a3_index],
-                ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                         b2_index * b3 + b3_index]);
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = pow(((T *)x)[xIdx], ((T *)y)[yIdx]);
     }
 }
@@ -102,31 +125,36 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                              int a3, int b0, int b1, int b2, int b3, int c0,
                              int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;
-
-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
-
-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
-
-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((bool *)z)[i] =
-            ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                     a2_index * a3 + a3_index] <
-                    ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                             b2_index * b3 + b3_index]
-                ? true
-                : false;
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((bool *)z)[i] = ((T *)x)[xIdx] < ((T *)y)[yIdx] ? true : false;
     }
 }
@@ -176,7 +204,6 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
     default: \
         IT_TODO_HALT(); \
     }

 template <class T>
 __global__ void _div_const_kernel(void const *__restrict__ x,
                                   void const *__restrict__ y,
@@ -269,7 +296,8 @@ void div_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,

     int blocksize = block_work_size();
     int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
     SWITCH_DTYPE(div, dType)
 }
 void add_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
@@ -278,7 +306,8 @@ void add_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,

     int blocksize = block_work_size();
     int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
     SWITCH_DTYPE(add, dType)
 }
 void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
@@ -286,7 +315,8 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                 int c3) {
     int blocksize = block_work_size();
     int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
     if (dType == 1) {
         _pow_kernel<float>
             <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
@@ -324,7 +354,8 @@ void less_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                  int c3) {
     int blocksize = block_work_size();
     int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
     SWITCH_DTYPE(less, dType)
 }
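Reviewer note (not part of the patch): every launcher now divides by `repeat * block_work_size()` because each thread processes `repeat` consecutive elements; with `repeat = 1` this reduces to the old formula. A host-side check of the ceiling division, with the constants taken from the file above:

```cpp
#include <cassert>

constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
const int repeat = 1;

// Enough blocks so that gridsize * repeat * block_work_size() >= num.
int gridFor(int num) {
    return (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
}

int main() {
    assert(block_work_size() == 512);
    assert(gridFor(512) == 1); // exactly one block's worth of work
    assert(gridFor(513) == 2); // one extra element spills into a second block
    return 0;
}
```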
@@ -12,22 +12,33 @@ class ExpandCuda : public CudaKernelWithoutConfig {

         void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-        const auto &in_Shape = op->getInputs(0)->getDims(); // input shape
-        const auto &out_Shape = op->getShape();             // output shape
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getOutput()->getDims(); // output shape

-        SmallArray inputShape, outputShape;
-        int nDims = op->getInputs(0)->getDims().size();
-
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int outputsize = 1; // the length of the output vector after flatten
-        for (int i = 0; i < nDims; ++i) {
-            outputShape.data[i] = out_Shape[i];
-            inputShape.data[i] = in_Shape[i];
-            outputsize *= out_Shape[i];
-        }
         const int dType = op->getDType().getIndex();
-        expandKernel(dType, inputData, outputData, nDims, outputsize,
-                     inputShape, outputShape);
+        if (a_dim.size() > 4 || b_dim.size() > 4) {
+            SmallArray inputShape, outputShape;
+            int nDims = op->getInputs(0)->getDims().size();
+
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            int outputsize = 1; // the length of the output vector after flatten
+            for (int i = 0; i < nDims; ++i) {
+                outputShape.data[i] = b_dim[i];
+                inputShape.data[i] = a_dim[i];
+                outputsize *= b_dim[i];
+            }
+            const int dType = op->getDType().getIndex();
+            expandKernel(dType, inputData, outputData, nDims, outputsize,
+                         inputShape, outputShape);
+
+        } else {
+            int a[4] = {1, 1, 1, 1};
+            int b[4] = {1, 1, 1, 1};
+            std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+            std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+            expandKernel(dType, inputData, outputData, a[0], a[1], a[2], a[3],
+                         b[0], b[1], b[2], b[3]);
+        }
     }
 };
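Reviewer note (not part of the patch): the new ≤4-D fast path right-aligns both shapes into `{1, 1, 1, 1}` so the kernel can take eight plain scalars. A standalone sketch of just that padding step, with an illustrative helper name:

```cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <vector>

// Right-align a shape of rank <= 4 into {1,1,1,1}, as ExpandCuda and
// matmulCublas do before calling the scalar-argument expandKernel overload.
std::array<int, 4> padTo4(const std::vector<int> &dim) {
    std::array<int, 4> out{1, 1, 1, 1};
    std::copy(dim.begin(), dim.end(), out.begin() + (4 - dim.size()));
    return out;
}

int main() {
    auto a = padTo4({3, 5}); // becomes {1, 1, 3, 5}
    assert(a[0] == 1 && a[1] == 1 && a[2] == 3 && a[3] == 5);
    return 0;
}
```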
@@ -6,7 +6,31 @@
 constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+const int repeat = 1;
+template <class T>
+__global__ void _expandKernel(void *input, void *output, int a0, int a1, int a2,
+                              int a3, int b0, int b1, int b2, int b3) {
+
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+    int stride1 = b2 * b3;
+    int stride0 = b1 * stride1;
+    int n = b0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        if (aIdx) {
+            int b0_index = i / stride0;
+            int b1_index = (i % stride0) / stride1;
+            int b2_index = (i % stride1) / b3;
+            int b3_index = i % b3;
+            xIdx = (b0_index % a0) * a1 * a2 * a3 + (b1_index % a1) * a2 * a3 +
+                   (b2_index % a2) * a3 + b3_index % a3;
+        }
+        ((T *)output)[i] = ((T *)input)[xIdx];
+    }
+}
 template <class T>
 __global__ void _expandKernel(void *input, void *output, int nDims,
                               int outputsize, infini::SmallArray inputShape,
@@ -38,7 +62,6 @@ __global__ void _expandKernel(void *input, void *output, int nDims,
         ((T *)output)[outputIdx] = ((T *)input)[inputIdx];
     }
 }

 template <class T>
 static __global__ void _expandRowKernel(void *__restrict__ dst,
                                         void const *__restrict__ src) {
@@ -50,9 +73,9 @@ static __global__ void _expandRowKernel(void *__restrict__ dst,
 namespace infini {

 #define CASE(T) \
-    _expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize, \
-                                   0, CUDAStream::getCurrentStream()>>>( \
-        input, output, nDims, outputsize, inputShape, outputShape);
+    _expandKernel<DT_CUDA<T>::t> \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
+            input, output, a0, a1, a2, a3, b0, b1, b2, b3);

 #define SWITCH_DTYPE(DTYPE) \
     switch (DTYPE) { \
@@ -96,14 +119,56 @@ namespace infini {
         IT_TODO_HALT(); \
     }

+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3) {
+    int blocksize = block_work_size();
+    int outputsize = b0 * b1 * b2 * b3;
+    int gridsize = (outputsize + repeat * block_work_size() - 1) /
+                   (repeat * block_work_size());
+    SWITCH_DTYPE(dType)
+}
+#define CASECurrency(T) \
+    _expandKernel<DT_CUDA<T>::t> \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
+            input, output, nDims, outputsize, inputShape, outputShape);
+
+#define SWITCHCurrency_DTYPE(DTYPE) \
+    switch (DTYPE) { \
+    case 1: \
+        CASECurrency(1) break; \
+    case 2: \
+        CASECurrency(2) break; \
+    case 3: \
+        CASECurrency(3) break; \
+    case 4: \
+        CASECurrency(4) break; \
+    case 5: \
+        CASECurrency(5) break; \
+    case 6: \
+        CASECurrency(6) break; \
+    case 7: \
+        CASECurrency(7) break; \
+    case 10: \
+        CASECurrency(10) break; \
+    case 11: \
+        CASECurrency(11) break; \
+    case 12: \
+        CASECurrency(12) break; \
+    case 13: \
+        CASECurrency(13) break; \
+    case 16: \
+        CASECurrency(16) break; \
+    default: \
+        IT_TODO_HALT(); \
+    }
+
 void expandKernel(int dType, void *input, void *output, int nDims,
                   int outputsize, SmallArray inputShape,
                   SmallArray outputShape) {
     int blocksize = block_work_size();
     int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
-    SWITCH_DTYPE(dType)
+    SWITCHCurrency_DTYPE(dType)
 }

 #define CASE_ROW(T) \
     _expandRowKernel<float> \
         <<<grid, block, 0, CUDAStream::getCurrentStream()>>>(output, input);
@@ -150,7 +215,8 @@ void expandKernel(int dType, void *input, void *output, int nDims,
         IT_TODO_HALT(); \
     }

-// Optimization for expanding a row vector. The row length must be a multiple of 32
+// Optimization for expanding a row vector. The row length must be a multiple of
+// 32
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                      int row_len) {
     // Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32
@@ -160,7 +226,8 @@ void expandRowKernel(int dType, void *input, void *output, int n_rows,
     // block: b x 32
     auto c = row_len / 32, b = c;
     if (b > 32) {
-        for (b = 32; c % b != 0; --b);
+        for (b = 32; c % b != 0; --b)
+            ;
     }
     auto a = c / b;
     dim3 grid(a, n_rows), block(32, b);
@@ -87,20 +87,7 @@ class matmulCublas : public Kernel {
             beta_naive = 1.f;
             auto inC = op->getInputs(2);
             auto out = op->getOutput();
-            SmallArray inputShape, outputShape;
-            int nDims = out->getRank();
-            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-            // FIXME(constroy): use size_t for outputsize.
-            int outputsize = 1; // the length of the output vector after flatten
-            int offset = nDims - inC->getRank();
-            for (int i = 0; i < offset; ++i)
-                inputShape.data[i] = 1;
-            for (int i = 0; i < nDims; ++i) {
-                outputShape.data[i] = out->getDims()[i];
-                outputsize *= outputShape.data[i];
-                if (i >= offset)
-                    inputShape.data[i] = inC->getDims()[i - offset];
-            }
             const int dType = dataType.getIndex();

             // Bias in linear layer is row vector of (1,n), n is the number of
@@ -111,9 +98,40 @@ class matmulCublas : public Kernel {
                                 out->size() / inC->getDims()[0],
                                 inC->getDims()[0]);
             } else {
-                expandKernel(dType, inC->getRawDataPtr<void *>(),
-                             out->getRawDataPtr<void *>(), nDims, outputsize,
-                             inputShape, outputShape);
+                auto a_dim = out->getDims();
+                auto b_dim = inC->getDims(); // output shape
+
+                if (a_dim.size() > 4 || b_dim.size() > 4) {
+                    SmallArray inputShape, outputShape;
+                    int nDims = out->getRank();
+                    IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+                    // FIXME(constroy): use size_t for outputsize.
+                    int outputsize =
+                        1; // the length of the output vector after flatten
+                    int offset = nDims - inC->getRank();
+                    for (int i = 0; i < offset; ++i)
+                        inputShape.data[i] = 1;
+                    for (int i = 0; i < nDims; ++i) {
+                        outputShape.data[i] = out->getDims()[i];
+                        outputsize *= outputShape.data[i];
+                        if (i >= offset)
+                            inputShape.data[i] = inC->getDims()[i - offset];
+                    }
+                    expandKernel(dType, inC->getRawDataPtr<void *>(),
+                                 out->getRawDataPtr<void *>(), nDims,
+                                 outputsize, inputShape, outputShape);
+
+                } else {
+                    int a[4] = {1, 1, 1, 1};
+                    int b[4] = {1, 1, 1, 1};
+                    std::copy(a_dim.begin(), a_dim.end(),
+                              a + (4 - a_dim.size()));
+                    std::copy(b_dim.begin(), b_dim.end(),
+                              b + (4 - b_dim.size()));
+                    expandKernel(dType, inC->getRawDataPtr<void *>(),
+                                 out->getRawDataPtr<void *>(), a[0], a[1], a[2],
+                                 a[3], b[0], b[1], b[2], b[3]);
+                }
             }
         }
         // TODO:use compute type
@@ -16,31 +16,57 @@ class TransposeCuda : public CudaKernelWithoutConfig {
         void *const outputData = output->getRawDataPtr<void *>();
         const auto &inputShape = input->getDims();
         const auto &outputShape = output->getDims();
-        const auto &perm = op->getPermute();
+        const int dType = op->getDType().getIndex();
         int size = input->size();
         int nDims = input->getDims().size();

-        // Compute strides
-        SmallArray strides, buffer;
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int curStride = 1;
-        for (int i = nDims - 1; i >= 0; --i) {
-            buffer.data[i] = curStride;
-            curStride *= inputShape[i];
-        }
-        for (int i = 0; i < nDims; ++i) {
-            strides.data[i] = buffer.data[perm[i]];
-        }
-
-        SmallArray outputDims;
-        for (int i = 0; i < nDims; ++i) {
-            outputDims.data[i] = outputShape[i];
-        }
-
-        const int dType = op->getDType().getIndex();
-        transpose_kernel(dType, inputData, outputData, nDims, size, strides,
-                         outputDims);
+        //----------------
+        bool condition = true;
+        int gnum = 0;
+        for (int i = 0; i < nDims; i++) {
+            if (inputShape[i] > 1) {
+                while (gnum < nDims) {
+                    if (outputShape[gnum] > 1) {
+                        gnum += 1;
+                        break;
+                    } else {
+                        gnum += 1;
+                    }
+                }
+                if (inputShape[i] != outputShape[gnum - 1]) {
+                    condition = false;
+                    break;
+                }
+            }
+        }
+        //----------------
+        if (condition) {
+            cudaMemcpyAsync(outputData, inputData, op->getInputs(0)->getBytes(),
+                            cudaMemcpyDeviceToDevice,
+                            CUDAStream::getCurrentStream());
+
+        } else {
+            const auto &perm = op->getPermute();
+
+            // Compute strides
+            SmallArray strides, buffer;
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            int curStride = 1;
+            for (int i = nDims - 1; i >= 0; --i) {
+                buffer.data[i] = curStride;
+                curStride *= inputShape[i];
+            }
+            for (int i = 0; i < nDims; ++i) {
+                strides.data[i] = buffer.data[perm[i]];
+            }
+
+            SmallArray outputDims;
+            for (int i = 0; i < nDims; ++i) {
+                outputDims.data[i] = outputShape[i];
+            }
+
+            transpose_kernel(dType, inputData, outputData, nDims, size, strides,
+                             outputDims);
+        }
     }
 };
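Reviewer note (not part of the patch): the new transpose path copies the buffer directly when the permutation leaves the sequence of non-unit extents unchanged, since the memory layout is then identical. A simplified host-side sketch of that intent, assuming the check is meant to compare non-unit extents in order:

```cpp
#include <cassert>
#include <vector>

// True when input and output shapes have the same non-unit extents in the
// same order, i.e. the permutation only moves axes of size 1, so a flat
// device-to-device copy is equivalent to the transpose.
bool isPureCopy(const std::vector<int> &in, const std::vector<int> &out) {
    std::vector<int> a, b;
    for (int d : in)
        if (d > 1) a.push_back(d);
    for (int d : out)
        if (d > 1) b.push_back(d);
    return a == b;
}

int main() {
    assert(isPureCopy({1, 8, 1, 16}, {8, 1, 16, 1})); // only size-1 axes move
    assert(!isPureCopy({2, 3}, {3, 2}));              // a real transpose
    return 0;
}
```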
@@ -24,8 +24,8 @@ __global__ void _transpose_kernel(void *input, void *output, int nDims,
 }
 #define CASE(T) \
     _transpose_kernel<DT_CUDA<T>::t> \
-        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>> \
-        (input, output, nDims, size, strides, outputShape);
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
+            input, output, nDims, size, strides, outputShape);

 #define SWITCH_DTYPE(DTYPE) \
     switch (DTYPE) { \
@@ -1,8 +1,8 @@
 #include "operators/where.h"
 #include "cuda/cuda_kernel_wihtout_config.h"
 #include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
 #include "cuda/cuda_where.h"
-#include "utils/operator_utils.h"

 namespace infini {

@@ -15,39 +15,50 @@ class WhereCuda : public CudaKernelWithoutConfig {
         void *const inputYData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const conditionData = (op->getInputs(2)->getRawDataPtr<void *>());
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-        const auto &opInputXShape = op->getInputs(0)->getDims();
-        const auto &opInputYShape = op->getInputs(1)->getDims();
-        const auto &opConditionShape = op->getInputs(2)->getDims();
-        const auto &opOutputShape = op->getOutput()->getDims();
-
-        const int xSize = op->getInputs(0)->getRank();
-        const int ySize = op->getInputs(1)->getRank();
-        const int cSize = op->getInputs(2)->getRank();
-        int nDims = op->getOutput()->getDims().size();
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int outputsize = 1;
-        SmallArray inputXShape, inputYShape, conditionShape, outputShape;
-        for (int i = nDims - 1; i >= 0; --i) {
-            outputShape.data[i] = opOutputShape[i];
-            outputsize *= outputShape.data[i];
-        }
-        broadcastShape(opInputXShape, inputXShape, nDims, xSize);
-        broadcastShape(opInputYShape, inputYShape, nDims, ySize);
-        broadcastShape(opConditionShape, conditionShape, nDims, cSize);
-
-        if (op->getDType() == DataType::Float32) {
-            whereKernel((float *)inputXData, (float *)inputYData,
-                        (uint8_t *)conditionData, (float *)outputData, nDims,
-                        outputsize, inputXShape, inputYShape, conditionShape,
-                        outputShape, xSize, ySize, cSize);
-        } else if (op->getDType() == DataType::Float16) {
-            whereKernel((half *)inputXData, (half *)inputYData,
-                        (uint8_t *)conditionData, (half *)outputData, nDims,
-                        outputsize, inputXShape, inputYShape, conditionShape,
-                        outputShape, xSize, ySize, cSize);
-        } else {
-            IT_ASSERT(false);
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getInputs(2)->getDims();
+        auto d_dim = op->getOutput()->getDims();
+        const int dTypeIndex = op->getDType().getIndex();
+        if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4 ||
+            d_dim.size() > 4) {
+            const int xSize = op->getInputs(0)->getRank();
+            const int ySize = op->getInputs(1)->getRank();
+            const int cSize = op->getInputs(2)->getRank();
+
+            int nDims = op->getOutput()->getDims().size();
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            int outputsize = 1;
+            SmallArray inputXShape, inputYShape, conditionShape, outputShape;
+            for (int i = nDims - 1; i >= 0; --i) {
+                outputShape.data[i] = d_dim[i];
+                outputsize *= outputShape.data[i];
+            }
+            broadcastShape(a_dim, inputXShape, nDims, xSize);
+            broadcastShape(b_dim, inputYShape, nDims, ySize);
+            broadcastShape(c_dim, conditionShape, nDims, cSize);
+            whereKernel(dTypeIndex, inputXData, inputYData,
+                        (uint8_t *)conditionData, outputData, nDims, outputsize,
+                        inputXShape, inputYShape, conditionShape, outputShape,
+                        xSize, ySize, cSize);
+        } else {
+            int a[4] = {1, 1, 1, 1};
+            int b[4] = {1, 1, 1, 1};
+            int c[4] = {1, 1, 1, 1};
+            int d[4] = {1, 1, 1, 1};
+
+            std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+            std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+            std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
+            std::copy(d_dim.begin(), d_dim.end(), d + (4 - d_dim.size()));
+
+            whereKernel(dTypeIndex, inputXData, inputYData,
+                        (uint8_t *)conditionData, outputData, a[0], a[1], a[2],
+                        a[3], b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3],
+                        d[0], d[1], d[2], d[3]);
         }
     }
 };
@@ -1,6 +1,109 @@
 #include "cuda/cuda_common.h"
+#include "cuda/cuda_utility.h"
 #include "utils/small_array.h"
+const int repeat = 1;
+
+template <typename T>
+__global__ void
+_whereKernel(void *inputX, void *inputY, const uint8_t *condition, void *output,
+             int a0, int a1, int a2, int a3, int b0, int b1, int b2, int b3,
+             int c0, int c1, int c2, int c3, int d0, int d1, int d2, int d3) {
+
+    int stride1 = d2 * d3;
+    int stride0 = d1 * stride1;
+    int n = d0 * stride0;
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int inputXIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int inputYIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+        int conditionIdx = (c0 * c1 * c2 * c3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        bool cIdx = (c0 * c1 * c2 * c3 < n && c0 * c1 * c2 * c3 > 1);
+        if (aIdx || bIdx || cIdx) {
+            int d0_index = i / stride0;
+            int d1_index = (i % stride0) / stride1;
+            int d2_index = (i % stride1) / d3;
+            int d3_index = i % d3;
+            if (aIdx) {
+                int a0_index = d0_index % a0;
+                int a1_index = d1_index % a1;
+                int a2_index = d2_index % a2;
+                int a3_index = d3_index % a3;
+                inputXIdx = a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
+                            a2_index * a3 + a3_index;
+            }
+            if (bIdx) {
+                int b0_index = d0_index % b0;
+                int b1_index = d1_index % b1;
+                int b2_index = d2_index % b2;
+                int b3_index = d3_index % b3;
+                inputYIdx = b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
+                            b2_index * b3 + b3_index;
+            }
+            if (cIdx) {
+                int c0_index = d0_index % c0;
+                int c1_index = d1_index % c1;
+                int c2_index = d2_index % c2;
+                int c3_index = d3_index % c3;
+                conditionIdx = c0_index * c1 * c2 * c3 + c1_index * c2 * c3 +
+                               c2_index * c3 + c3_index;
+            }
+        }
+
+        ((T *)output)[i] = condition[conditionIdx] ? ((T *)inputX)[inputXIdx]
+                                                   : ((T *)inputY)[inputYIdx];
+    }
+}
+#define CASE(T) \
+    _whereKernel<DT_CUDA<T>::t> \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
+            inputX, inputY, condition, output, a0, a1, a2, a3, b0, b1, b2, b3, \
+            c0, c1, c2, c3, d0, d1, d2, d3);
+
+#define SWITCH_DTYPE(DTYPE) \
+    switch (DTYPE) { \
+    case 1: \
+        CASE(1) \
+        break; \
+    case 2: \
+        CASE(2) \
+        break; \
+    case 3: \
+        CASE(3) \
+        break; \
+    case 4: \
+        CASE(4) \
+        break; \
+    case 5: \
+        CASE(5) \
+        break; \
+    case 6: \
+        CASE(6) \
+        break; \
+    case 7: \
+        CASE(7) \
+        break; \
+    case 10: \
+        CASE(10) \
+        break; \
+    case 11: \
+        CASE(11) \
+        break; \
+    case 12: \
+        CASE(12) \
+        break; \
+    case 13: \
+        CASE(13) \
+        break; \
+    case 16: \
+        CASE(16) \
+        break; \
+    default: \
+        IT_TODO_HALT(); \
+    }
 __device__ int inferIndex(infini::SmallArray inputShape,
                           infini::SmallArray outputShape, int nDims, int size,
                           int outputIdx) {
@@ -19,11 +122,10 @@ __device__ int inferIndex(infini::SmallArray inputShape,
 }
 template <typename T>
 __global__ void
-_whereKernel(const T *inputX, const T *inputY, const uint8_t *condition,
-             T *output, int nDims, int outputsize,
-             infini::SmallArray inputXShape, infini::SmallArray inputYShape,
-             infini::SmallArray conditionShape, infini::SmallArray outputShape,
-             int xSize, int ySize, int cSize) {
+_whereKernel(void *inputX, void *inputY, const uint8_t *condition, void *output,
+             int nDims, int outputsize, infini::SmallArray inputXShape,
+             infini::SmallArray inputYShape, infini::SmallArray conditionShape,
+             infini::SmallArray outputShape, int xSize, int ySize, int cSize) {

     int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
     if (outputIdx < outputsize) {
|
||||||
int inputYIdx =
|
int inputYIdx =
|
||||||
inferIndex(inputYShape, outputShape, nDims, ySize, outputIdx);
|
inferIndex(inputYShape, outputShape, nDims, ySize, outputIdx);
|
||||||
|
|
||||||
output[outputIdx] =
|
((T *)output)[outputIdx] = condition[conditionIdx]
|
||||||
condition[conditionIdx] ? inputX[inputXIdx] : inputY[inputYIdx];
|
? ((T *)inputX)[inputXIdx]
|
||||||
|
: ((T *)inputY)[inputYIdx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#define CASECurrency(T) \
|
||||||
|
_whereKernel<DT_CUDA<T>::t> \
|
||||||
|
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
|
||||||
|
inputX, inputY, condition, output, nDims, outputsize, inputXShape, \
|
||||||
|
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
|
||||||
|
|
||||||
|
#define SWITCHCurrency_DTYPE(DTYPE) \
|
||||||
|
switch (DTYPE) { \
|
||||||
|
case 1: \
|
||||||
|
CASECurrency(1) break; \
|
||||||
|
case 2: \
|
||||||
|
CASECurrency(2) break; \
|
||||||
|
case 3: \
|
||||||
|
CASECurrency(3) break; \
|
||||||
|
case 4: \
|
||||||
|
CASECurrency(4) break; \
|
||||||
|
case 5: \
|
||||||
|
CASECurrency(5) break; \
|
||||||
|
case 6: \
|
||||||
|
CASECurrency(6) break; \
|
||||||
|
case 7: \
|
||||||
|
CASECurrency(7) break; \
|
||||||
|
case 10: \
|
||||||
|
CASECurrency(10) break; \
|
||||||
|
case 11: \
|
||||||
|
CASECurrency(11) break; \
|
||||||
|
case 12: \
|
||||||
|
CASECurrency(12) break; \
|
||||||
|
case 13: \
|
||||||
|
CASECurrency(13) break; \
|
||||||
|
case 16: \
|
||||||
|
CASECurrency(16) break; \
|
||||||
|
default: \
|
||||||
|
IT_TODO_HALT(); \
|
||||||
|
}
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void whereKernel(int dTypeIndex, void *inputX, void *inputY,
|
||||||
|
const uint8_t *condition, void *output, int a0, int a1, int a2,
|
||||||
|
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||||
|
int c3, int d0, int d1, int d2, int d3) {
|
||||||
|
int blocksize;
|
||||||
|
int outputsize = d0 * d1 * d2 * d3;
|
||||||
|
if (outputsize > 511 * repeat) {
|
||||||
|
blocksize = 1024;
|
||||||
|
} else if (outputsize > 255 * repeat) {
|
||||||
|
blocksize = 512;
|
||||||
|
} else if (outputsize > 127 * repeat) {
|
||||||
|
blocksize = 256;
|
||||||
|
} else if (outputsize > 63 * repeat) {
|
||||||
|
blocksize = 128;
|
||||||
|
} else if (outputsize > 31 * repeat) {
|
||||||
|
blocksize = 64;
|
||||||
|
} else {
|
||||||
|
blocksize = 32;
|
||||||
|
}
|
||||||
|
int gridsize = (outputsize + repeat * blocksize - 1) / (repeat * blocksize);
|
||||||
|
|
||||||
|
SWITCH_DTYPE(dTypeIndex)
|
||||||
|
}
|
||||||
|
|
||||||
namespace infini {
|
void whereKernel(int dTypeIndex, void *inputX, void *inputY,
|
||||||
void whereKernel(const float *inputX, const float *inputY,
|
const uint8_t *condition, void *output, int nDims,
|
||||||
const uint8_t *condition, float *output, int nDims,
|
|
||||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||||
int ySize, int cSize) {
|
int ySize, int cSize) {
|
||||||
|
@ -61,34 +223,8 @@ void whereKernel(const float *inputX, const float *inputY,
|
||||||
blocksize = 32;
|
blocksize = 32;
|
||||||
}
|
}
|
||||||
int gridsize = (outputsize + blocksize - 1) / blocksize;
|
int gridsize = (outputsize + blocksize - 1) / blocksize;
|
||||||
_whereKernel<float>
|
|
||||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
SWITCHCurrency_DTYPE(dTypeIndex)
|
||||||
inputX, inputY, condition, output, nDims, outputsize, inputXShape,
|
|
||||||
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
|
|
||||||
}
|
|
||||||
void whereKernel(const half *inputX, const half *inputY,
|
|
||||||
const uint8_t *condition, half *output, int nDims,
|
|
||||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
|
||||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
|
||||||
int ySize, int cSize) {
|
|
||||||
int blocksize;
|
|
||||||
if (outputsize > 511) {
|
|
||||||
blocksize = 1024;
|
|
||||||
} else if (outputsize > 255) {
|
|
||||||
blocksize = 512;
|
|
||||||
} else if (outputsize > 127) {
|
|
||||||
blocksize = 256;
|
|
||||||
} else if (outputsize > 63) {
|
|
||||||
blocksize = 128;
|
|
||||||
} else if (outputsize > 31) {
|
|
||||||
blocksize = 64;
|
|
||||||
} else {
|
|
||||||
blocksize = 32;
|
|
||||||
}
|
|
||||||
int gridsize = (outputsize + blocksize - 1) / blocksize;
|
|
||||||
_whereKernel<half>
|
|
||||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
|
||||||
inputX, inputY, condition, output, nDims, outputsize, inputXShape,
|
|
||||||
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@@ -97,14 +97,11 @@ class DivXdnn : public KUNLUNKernelWithoutConfig {
         auto aDim = op->getInputs(0)->getDims();
         auto bSize = op->getInputs(1)->size();
         auto bDim = op->getInputs(1)->getDims();
+        auto dtype = op->getDType();

-        // op input a, b is scalar while aDim and b Dim is empty
         if (bDim.size() == 0) {
             bDim.push_back(1);
         }
-        if (aDim.size() == 0) {
-            aDim.push_back(1);
-        }

         if (aSize == bSize) {
             // Do ElementWise Sub with no broadcast
@@ -112,9 +109,23 @@ class DivXdnn : public KUNLUNKernelWithoutConfig {
                                               (float *)aData, (float *)bData,
                                               (float *)cData, aSize));
         } else {
-            checkKUNLUNError(xdnn::broadcast_div<float>(
-                context->KUNLUNHandle(), (float *)aData, (float *)bData,
-                (float *)cData, aDim, bDim));
+            // Do broadcast div
+            Shape aligned = infer_broadcast(aDim, bDim);
+            if (aligned == aDim) {
+                // BData need to be broadcasted
+                checkKUNLUNError(xdnn::broadcast_div<float>(
+                    context->KUNLUNHandle(), (float *)aData, (float *)bData,
+                    (float *)cData, aDim, bDim));
+            } else {
+                // Use workspace to broadcast aData
+                KUNLUNPtr wks = context->getWorkspace(bSize * dtype.getSize());
+                checkKUNLUNError(xdnn::broadcast<float>(
+                    context->KUNLUNHandle(), (float *)aData, (float *)wks, aDim,
+                    bDim));
+                checkKUNLUNError(xdnn::div<float>(context->KUNLUNHandle(),
+                                                  (float *)wks, (float *)bData,
+                                                  (float *)cData, bSize));
+            }
         }
         return;
     }
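Reviewer note (not part of the patch): the `DivXdnn` change decides which operand needs broadcasting by comparing the broadcast result shape with `aDim`: if they match, only `b` is broadcast inside `broadcast_div`; otherwise `a` is first expanded into a workspace and a plain element-wise divide follows. A minimal sketch of that shape comparison; `inferBroadcast` below is an illustrative stand-in for the repository's `infer_broadcast` and assumes the two shapes are broadcast-compatible.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

using Shape = std::vector<int>;

// NumPy-style broadcast of two shapes, right-aligned (compatibility assumed).
Shape inferBroadcast(const Shape &a, const Shape &b) {
    Shape out(std::max(a.size(), b.size()), 1);
    for (size_t i = 0; i < out.size(); ++i) {
        int da = i < out.size() - a.size() ? 1 : a[i - (out.size() - a.size())];
        int db = i < out.size() - b.size() ? 1 : b[i - (out.size() - b.size())];
        out[i] = std::max(da, db);
    }
    return out;
}

int main() {
    Shape a{4, 3}, b{1, 3};
    // Result equals aDim: only b needs broadcasting (broadcast_div path).
    assert(inferBroadcast(a, b) == a);
    Shape a2{1, 3}, b2{4, 3};
    // Result differs from aDim: a is expanded into a workspace first.
    assert(inferBroadcast(a2, b2) != a2);
    return 0;
}
```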
@@ -570,7 +570,6 @@ REGISTER_KERNEL(Device::KUNLUN, OpType::Reciprocal, ReciprocalXdnn,
 REGISTER_KERNEL(Device::KUNLUN, OpType::Reshape, CopyXdnn, "Reshape_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Flatten, CopyXdnn, "Flatten_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Identity, CopyXdnn, "Identity_xdnn");
-REGISTER_KERNEL(Device::KUNLUN, OpType::Squeeze, CopyXdnn, "Squeeze_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Abs, AbsXdnn, "Abs_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Atan, ATanXdnn, "Atan_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Log, LogXdnn, "Log_xdnn");
@@ -84,6 +84,17 @@ void test_whereFp16(
 }

 TEST(CUDA_WhereFp32, run) {
+    test_whereFp32(
+        Shape{2, 2, 3, 1, 2},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
+                      8.,  9.,  10., 11., 12., 13., 14., 15.,
+                      16., 17., 18., 19., 20., 21., 22., 23.},
+        Shape{2, 2, 3, 1, 2},
+        vector<float>{0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
+        Shape{2, 3, 1, 2}, vector<uint8_t>{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1},
+        vector<float>{0., 1.,  2.,  0., 0., 0., 6.,  7.,  0., 9.,  10., 11.,
+                      0., 13., 14., 0., 0., 0., 18., 19., 0., 21., 22., 23.});
     test_whereFp32(
         Shape{2, 2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
         Shape{2, 2, 3, 1}, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},