Compare commits

..

5 Commits

Author SHA1 Message Date
kilinchange 3b7b5740af allocate workspace from allocator for kunlun runtime 2024-04-08 15:48:06 +08:00
Haojie Wang e4387904c2
Merge branch 'master' into kunlun_dist_op 2024-04-03 09:58:21 +08:00
wanghailu 14a40a1967 Merge branch 'master' of github.com:InfiniTensor/InfiniTensor into kunlun_dist_op 2024-04-03 01:01:40 +08:00
weijie01 32a13b7760 kunlun distributed 2024-04-02 17:15:08 +08:00
weijie01 a71cd14963 kunlun dist inference fix 2024-04-02 15:30:46 +08:00
25 changed files with 419 additions and 594 deletions

3
.gitmodules vendored
View File

@ -13,6 +13,3 @@
[submodule "example"]
path = examples/NNmodel
url = git@github.com:wanghailu0717/NNmodel.git
[submodule "examples/distributed/onnxsim_large_model"]
path = examples/distributed/onnxsim_large_model
url = git@github.com:luchangli03/onnxsim_large_model.git

View File

@ -40,7 +40,7 @@ endif
build:
mkdir -p build/$(TYPE)
cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j64
clean:
rm -rf build

View File

@ -1,7 +1,5 @@
# 分布式脚本
## 英伟达平台运行方式
#### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`；不使用这个flag则只进行计算和生成输入输出。
@ -17,23 +15,3 @@ python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
```bash
python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
```
## 寒武纪平台运行方式
**将上述运行脚本 `run_pytorch.py` 以及 `cuda_launch.py` 针对寒武纪平台做了相应的适配,具体见 `run_pytorch_mlu.py` 以及 `bang_launch.py`。**
#### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`；不使用这个flag则只进行计算和生成输入输出。
```bash
python run_pytorch_mlu.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
```
会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`,目前只支持单一输入输出。
#### 2. 运行InfiniTensor分布式脚本
```bash
python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
```

View File

@ -1,39 +1,35 @@
import sys
sys.path.append('../')
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=1, help="number of processes per node"
"--nproc_per_node", type=int, default=2, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model", type=str, required=True, help="path to the ONNX model file."
"--model", type=str, default="/data/onnx_models/llama2/llama_bs1_seq1024.onnx",
help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
default=False,
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
@ -44,46 +40,39 @@ def parse_args():
args.batch_size,
args.length,
args.gen_std,
args.type,
)
def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
def run_model(model, runtime, world_size=1, rank=0, n=10):
stub = OnnxStub(model, runtime)
load_inputs(stub, world_size, rank)
# stub.tune()
stub.run()
# get outputs
time.sleep(0.01)
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
begin = time.time()
for _ in range(n):
stub.run()
begin = time.time()
for _ in range(n * 2):
stub.run()
end = time.time()
avg_time = (end - begin) / (n * 2)
avg_time = (end - begin) / n
print(f"average time: {avg_time}")
return outputs
def load_inputs(stub, world_size=1, rank=0):
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = np.load(f"./data/input_{i}.npy")
if all(x == y for x,y in zip(input.shape,tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
def run_and_compare(name, model, runtime, world_size=1, rank = 0):
results = np.load(f"./data/output.npy")
outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
print("outputs abs mean:", abs(outputs).mean())
print("max abs diff:", abs(outputs - results).max())
outputs = run_model(model, runtime, world_size, rank)
print("answer argmax:", np.argmax(results))
print("output argmax:", np.argmax(outputs))
#np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
getDiff(results, outputs)
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
@ -96,7 +85,7 @@ def start_worker(
save_as_external_data=True,
location=extern_path,
)
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.BangRuntime(local_rank)
# print("init comm")
runtime.init_comm(
@ -104,12 +93,13 @@ def start_worker(
world_size,
rank,
)
run_and_compare(name, model, runtime, world_size, rank, data_type)
run_and_compare(name, model, runtime, world_size, rank)
def start_single(name, model, data_type):
def start_single(name, model):
runtime = backend.BangRuntime(0)
run_and_compare(name, model, runtime, data_type=data_type)
run_and_compare(name, model, runtime)
def generate_input_output(model):
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
@ -142,36 +132,55 @@ def generate_input_output(model):
np.save(f"./data/output", output)
def load_inputs(stub, world_size=1, rank=0):
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = np.load(f"./data/input_{i}.npy")
if all(x == y for x,y in zip(input.shape,tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
def getDiff(base, test):
    """Print and return the absolute and relative error between two arrays.

    Both arrays are promoted to float64 before any arithmetic. Subtracting in
    the native dtype wraps around for unsigned integers and can overflow for
    float16, which would corrupt the reported absolute difference.

    Args:
        base: Reference numpy array.
        test: Numpy array to compare against ``base``; must have the same
            number of elements once flattened.

    Returns:
        A tuple ``(max_absolute_diff, max_relative_diff)``. The second value
        is the aggregate relative error ``sum(|base - test|) / sum(|base|)``
        (a small epsilon keeps the denominator non-zero), not a per-element
        maximum, despite the label that is printed.
    """
    baseCopy = base.astype(np.float64).ravel()
    testCopy = test.astype(np.float64).ravel()

    absolute_diff = np.abs(baseCopy - testCopy)
    max_absolute_diff = np.max(absolute_diff)

    upValue = np.sum(absolute_diff)
    downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
    max_relative_diff = upValue / downValue

    print(f"Max absolute difference: {max_absolute_diff}\n"
          f"Max relative difference: {max_relative_diff}")
    return max_absolute_diff, max_relative_diff
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
data_type = "default" if data_type == "fp32" else data_type
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
model = onnx.load(model_path)
# generate standart output
if gen_std:
print(f"generate standard data for {name}.")
# a small vocabulary size to fit all LLM.
generate_input_output(model)
print("Generate inputs and outputs.")
p = mp.Process(target=generate_input_output, args=[model])
p.start()
p.join()
return
if nproc_per_node == 1:
# run single process.
# use standalone process to isolate bang.
print("run model by single MLU.")
# p = mp.Process(target=start_single, args=(name, model, data_type))
# p.start()
# p.join()
start_single(name, model, data_type)
return
# run single process.
# use standalone process to isolate cuda.
print("run model by single MLU.")
p = mp.Process(target=start_single, args=(name, model))
p.start()
p.join()
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"run model by {world_size} MLU in parallel.")
print(f"run model by {world_size} MLUs in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
args=(name, world_size, rank, rank % nproc_per_node, model),
)
for rank in range(world_size)
]

View File

@ -1,249 +0,0 @@
import argparse
import torch
import torch_mlu
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
from transformers import AlbertModel, AlbertConfig
from transformers import LlamaModel, LlamaConfig
import time
import numpy as np
import onnx
import sys
import os
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
def parse_args():
    """Read the command-line options of the PyTorch reference runner.

    Returns:
        A 5-tuple ``(model, batch_size, length, export_onnx, type)``.
        ``export_onnx`` is ``None`` when the flag is absent and ``"./"`` when
        the flag is given without a value.
    """
    cli = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
    cli.add_argument(
        "--model",
        type=str,
        choices=["gpt2", "bert", "opt", "llama", "albert"],
        required=True,
        help="model type",
    )
    cli.add_argument("--batch_size", type=int, default=1, help="batch size.")
    cli.add_argument("--length", type=int, default=1, help="sequence length.")
    cli.add_argument(
        "--export_onnx",
        type=str,
        nargs="?",
        default=None,
        const="./",
        help="whether and where to export onnx file",
    )
    cli.add_argument(
        "--type",
        type=str,
        choices=["fp32", "fp16", "tf32"],
        required=True,
        help="model data type",
    )

    opts = cli.parse_args()
    print("arg setting: ", opts)

    # Callers unpack the result positionally, so the field order is fixed.
    ordered_fields = ("model", "batch_size", "length", "export_onnx", "type")
    return tuple(getattr(opts, field) for field in ordered_fields)
def get_model(modelname):
    """Load a pretrained Hugging Face model in eval mode with its vocabulary size.

    Args:
        modelname: One of ``"albert"``, ``"bert"``, ``"gpt2"``, ``"opt"`` or
            ``"llama"``.

    Returns:
        ``(model, voc_size)`` where ``model`` has been switched to eval mode
        and ``voc_size`` is the ``vocab_size`` of the matching default config.

    Raises:
        KeyError: If ``modelname`` is not one of the supported names.
    """
    # Each entry is evaluated lazily so only the requested model is loaded.
    # Inside every tuple the model is built first, then the config.
    loaders = {
        "albert": lambda: (
            AlbertModel.from_pretrained("albert/albert-base-v2"),
            AlbertConfig().vocab_size,
        ),
        "bert": lambda: (
            # erf is not impl by infini, hence the gelu_new activation
            BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new"),
            BertConfig().vocab_size,
        ),
        "gpt2": lambda: (
            GPT2Model.from_pretrained("GPT2"),
            GPT2Config().vocab_size,
        ),
        "opt": lambda: (
            OPTModel.from_pretrained("facebook/opt-125m"),
            OPTConfig().vocab_size,
        ),
        "llama": lambda: (
            LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf"),
            LlamaConfig().vocab_size,
        ),
    }
    if modelname not in loaders:
        raise KeyError(modelname)

    network, voc_size = loaders[modelname]()
    return network.eval(), voc_size
def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
    """Run the PyTorch model on the "mlu" device, time it, and dump input/output.

    Random token ids in ``[0, voc_size)`` with shape ``(batchsize, len)`` are
    saved to ``./data/input_0``. The model's ``last_hidden_state`` from the
    final timed iteration is saved to ``./data/output``.

    Args:
        torch_model: Model to execute; moved to the "mlu" device here.
        voc_size: Exclusive upper bound for the random token ids.
        batchsize: Number of rows of the generated input.
        len: Number of columns (sequence length) of the generated input.
        dtype: ``"fp16"`` converts the model with ``.half()``; any other value
            leaves the model precision untouched.
    """
    token_ids = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
    np.save("./data/input_0", token_ids)

    device_inputs = torch.from_numpy(token_ids).to("mlu")
    torch_model = torch_model.to("mlu")
    if dtype == "fp16":
        torch_model = torch_model.half()

    n_iter = 20
    # Untimed warm-up passes before the measurement starts.
    with torch.no_grad():
        for _ in range(10):
            outputs = torch_model(device_inputs)
    torch.mlu.synchronize()

    begin = time.time()
    with torch.no_grad():
        for _ in range(n_iter):
            torch.mlu.synchronize()
            outputs = torch_model(device_inputs)
            torch.mlu.synchronize()
    torch.mlu.synchronize()
    end = time.time()
    avg_time = (end - begin) / n_iter

    host_outputs = np.array(outputs.last_hidden_state.to("cpu"))
    print("outputs abs mean:", abs(host_outputs).mean())
    print(f"average time: {avg_time}")
    # torch.mlu.memory.empty_cache()
    np.save("./data/output", host_outputs)
    print("Save input & output into ./data.")
def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
# Export `model` to ONNX at `path`, then simplify the exported graph.
#
# modelname: "llama" selects the large-model simplification path that relies
#            on the onnxsim_large_model helpers; every other name goes through
#            plain onnxsim.
# model/data: module and example input; both are moved to the "mlu" device.
# path:      destination .onnx file; it is written by torch.onnx.export and
#            then overwritten with the simplified model.
# extern:    for non-llama models, store tensors in a sibling .pb file.
# dtype:     "fp16" converts the model with .half() before exporting.
# NOTE(review): the flattened text of this file has lost its indentation, so
# the nesting described in these comments should be confirmed against the
# original script.
data = data.to("mlu")
model = model.to("mlu")
if dtype == "fp16":
model = model.half()
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
if modelname != "llama":
# use onnxsim to simplify
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
# onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
assert check
# Record initializer shapes as value_info before running shape inference.
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
# Remove a stale external-data file left by a previous export.
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
# Only the file name is passed as the external-data location.
extern_path = extern_path.split("/")[-1]
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
else:
# use third party tool to simplify llama
# reference: https://github.com/luchangli03/onnxsim_large_model/
sys.path.append("onnxsim_large_model")
from onnx_utils import set_onnx_input_shape
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
in_model_path = path
out_model_path = path
# Fall back to "<input>.sim.onnx" when no output path is set.
if not out_model_path:
out_model_path = in_model_path[:-5] + ".sim.onnx"
# A directory as output path means "same file name inside that directory".
if os.path.isdir(out_model_path):
out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
onnx_model = onnx.load(in_model_path)
print(f"load model from {in_model_path} success")
# Compress with a 1 MiB threshold before simplifying; the removed
# initializers are handed back to uncompress_onnx_model afterwards.
size_th_bytes = 1024 * 1024
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
print(f"compress model success")
onnx_model = set_onnx_input_shape(onnx_model, "")
tensor_size_threshold = f"1024KB"
skipped_optimizers = []
skipped_optimizers.append("eliminate_duplicate_initializer")
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
tensor_size_threshold=tensor_size_threshold)
if not check:
raise ValueError(f"simplify compressed model {in_model_path} failed")
print(f"simplify model success")
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
print(f"uncompress model success")
add_value_info_for_constants(onnx_model)
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
def add_value_info_for_constants(model : onnx.ModelProto):
    """
    Register every initializer of the model as a ValueInfoProto.

    Currently onnx.shape_inference doesn't use the shape of initializers, so
    their element type and dims are added explicitly as value_info entries.
    Sub-graphs held in node attributes are processed recursively.
    Mutates the model.

    Args:
        model: The ModelProto to update.
    """
    # Before IRv4 all (top-level) constants are also graph inputs and so
    # already carry ValueInfos.
    if model.ir_version < 4:
        return

    def _annotate(graph : onnx.GraphProto):
        input_names = {graph_input.name for graph_input in graph.input}
        known_infos = {info.name: info for info in graph.value_info}

        for initializer in graph.initializer:
            # An initializer that is also an input is not a true constant.
            if initializer.name in input_names:
                continue

            # Reuse the existing value_info entry or append a new one.
            info = known_infos.get(initializer.name)
            if info is None:
                info = graph.value_info.add()
                info.name = initializer.name

            # Existing type/shape information is never overwritten, even if
            # it disagrees with the initializer.
            tensor_type = info.type.tensor_type
            if tensor_type.elem_type == onnx.TensorProto.UNDEFINED:
                tensor_type.elem_type = initializer.data_type
            if not tensor_type.HasField("shape"):
                # Ensure we set an empty list if the const is scalar (zero dims)
                tensor_type.shape.dim.extend([])
                for extent in initializer.dims:
                    tensor_type.shape.dim.add().dim_value = extent

        # Descend into sub-graphs stored in node attributes.
        for node in graph.node:
            for attr in node.attribute:
                # Ref attrs refer to other attrs, so there is nothing to do.
                if attr.ref_attr_name != "":
                    continue
                if attr.type == onnx.AttributeProto.GRAPH:
                    _annotate(attr.g)
                if attr.type == onnx.AttributeProto.GRAPHS:
                    for subgraph in attr.graphs:
                        _annotate(subgraph)

    return _annotate(model.graph)
def main():
    """Entry point: configure TF32, optionally export ONNX, then run PyTorch.

    TF32 is switched off first and only re-enabled for matmul when the
    requested data type is ``"tf32"``; otherwise the
    ``CAMBRICON_TF32_OVERRIDE`` environment variable is set to ``"0"``.
    The ONNX export is skipped when the target file already exists.
    """
    torch.backends.mlu.matmul.allow_tf32 = False
    torch.backends.cnnl.allow_tf32 = False

    modelname, batchsize, seqlen, export_path, dtype = parse_args()
    if dtype != "tf32":
        os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
    else:
        torch.backends.mlu.matmul.allow_tf32 = True

    model, voc_size = get_model(modelname)

    if export_path is not None:
        filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
        path = os.path.join(export_path, filename)
        if os.path.exists(path):
            print("Onnx path exists, skipping export.")
        else:
            param = torch.zeros((batchsize, seqlen), dtype=torch.int)
            export_onnx(modelname, model, param, path, True, dtype)

    run_pytorch(model, voc_size, batchsize, seqlen, dtype)
if __name__ == "__main__":
main()

View File

@ -10,6 +10,9 @@ import numpy as np
from parallel_opt import parallel_model
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
@ -29,9 +32,6 @@ def parse_args():
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
@ -42,12 +42,11 @@ def parse_args():
args.batch_size,
args.length,
args.gen_std,
args.type,
)
def run_model(model, runtime, inputs, n=10, data_type = "default"):
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
def run_model(model, runtime, inputs, n=10):
stub = OnnxStub(model, runtime)
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
tensor.copyin_numpy(input)
# stub.tune()
@ -67,17 +66,17 @@ def run_model(model, runtime, inputs, n=10, data_type = "default"):
return outputs
def run_and_compare(name, model, runtime, data_type):
def run_and_compare(name, model, runtime):
input_ids = np.load(f"{name}_inputs.npy")
position_ids = np.arange(input_ids.shape[-1])
results = np.load(f"{name}_results.npy")
outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
outputs = run_model(model, runtime, (input_ids, position_ids))
print("outputs abs mean:", abs(outputs).mean())
print("max abs diff:", abs(outputs - results).max())
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
@ -98,12 +97,12 @@ def start_worker(
world_size,
rank,
)
run_and_compare(name, model, runtime, data_type)
run_and_compare(name, model, runtime)
def start_single(name, model, data_type):
def start_single(name, model):
runtime = backend.CudaRuntime(0)
run_and_compare(name, model, runtime, data_type)
run_and_compare(name, model, runtime)
def gen_standard(name, model, voc_size, bs, len):
@ -118,10 +117,8 @@ def gen_standard(name, model, voc_size, bs, len):
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
data_type = "default" if data_type == "fp32" else data_type
if data_type != "tf32":
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
model = onnx.load(model_path)
# generate standart output
@ -135,7 +132,7 @@ def main():
# run single process.
# use standalone process to isolate cuda.
print("run model by single GPU.")
p = mp.Process(target=start_single, args=(name, model, data_type))
p = mp.Process(target=start_single, args=(name, model))
p.start()
p.join()
@ -145,7 +142,7 @@ def main():
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
args=(name, world_size, rank, rank % nproc_per_node, model),
)
for rank in range(world_size)
]

View File

@ -1,14 +0,0 @@
# Export ONNX files for every (model, batch size, sequence length) combination.
# Downloads go through the hf-mirror endpoint.
export HF_ENDPOINT=https://hf-mirror.com

models=("bert" "gpt2" "llama")
batch_size=(1 32)
seq_len=(100 500)
nproc=(1 2 4)

for model in "${models[@]}"
do
    for bs in "${batch_size[@]}"
    do
        for len in "${seq_len[@]}"
        do
            # Each model gets its own output directory under ../models/.
            python run_pytorch.py \
                --model "$model" \
                --batch_size "$bs" \
                --length "$len" \
                --export_onnx ../models/"$model" \
                --export_only
        done
    done
done

View File

@ -21,7 +21,7 @@ def parse_args():
"--nproc_per_node", type=int, default=2, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model", type=str, default="", help="path to the ONNX model file."
@ -54,12 +54,12 @@ def parse_args():
help="path to save internal onnx model for parallel run"
)
args = parser.parse_args()
# check path, mkdir if not exist
check_exists(args.input_dir)
check_exists(args.result_dir)
check_exists(args.internal_model_dir)
print("arg setting: ", args)
return (
args.num_nodes,
@ -97,7 +97,7 @@ def perf_it(n):
# warmup
for _ in range(n):
func(*args, **kwargs)
t_total = 0
for _ in range(n):
t0 = time.time()
@ -112,22 +112,20 @@ def perf_it(n):
"""
Run InfiniTensor model with Standard input
Run InfiniTensor model with Standard input
check=True: check with standard output gen by pytorch
perf=True: run n times to get avg time
"""
def run_model(task_name,
model,
runtime,
world_size=1,
rank=0,
model,
runtime,
world_size=1,
rank=0,
n=10,
check=True,
perf=True):
stub = OnnxStub(model, runtime,
use_naive_allocator=True \
if task_name == "llama" else False)
stub = OnnxStub(model, runtime)
# load in Onnx model inputs
def load_inputs(stub: OnnxStub):
@ -163,9 +161,9 @@ def run_model(task_name,
if np.isnan(output).any():
print("Nan in output")
exit()
np_assert(st_output, output)
np_assert(st_output, output)
# perf
# perf
if perf:
@perf_it(n)
def perf_infinitensor(stub: OnnxStub):
@ -178,17 +176,17 @@ def run_model(task_name,
"""
Start a worker in Parallel
"""
def start_worker(name: str,
world_size: int,
rank: int,
local_rank: int,
def start_worker(name: str,
world_size: int,
rank: int,
local_rank: int,
model: onnx.ModelProto):
dist_name = name + "_dist"
# partial a onnx model to world_size part
model = parallel_model(model, world_size, rank)
onnx.save(model, os.path.join(internal_model_dir, \
f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
f"{dist_name}_rank{rank}.onnx"))
runtime = backend.KUNLUNRuntime(local_rank)
# print("init comm")
runtime.init_comm(
@ -200,10 +198,10 @@ def start_worker(name: str,
"""
generate standard input/output with
generate standard input/output with
sigle card run
"""
def gen_standard(task_name: str, model: onnx.ModelProto):
def gen_stardard(task_name: str, model: onnx.ModelProto):
runtime = backend.KUNLUNRuntime(0)
stub = OnnxStub(model, runtime)
position_id = 0
@ -238,25 +236,25 @@ def gen_standard(task_name: str, model: onnx.ModelProto):
def main():
global input_dir, result_dir, internal_model_dir
nnodes, nproc_per_node, task_name, \
model_path, gen_std, run_single, \
input_dir, result_dir, internal_model_dir = parse_args()
# load input onnx model
model = onnx.load(model_path)
# generate standart output
if gen_std:
print("Generate inputs and outputs.")
gen_standard(task_name, model)
gen_stardard(task_name, model)
return
if run_single:
print("Run model by one GPU card.")
runtime = backend.KUNLUNRuntime(0)
run_model(task_name, model, runtime)
return
return
# run distributed parallel.
world_size = nnodes * nproc_per_node

View File

@ -0,0 +1,215 @@
import sys
sys.path.append("../")
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
st_input_dir = ".cache/input/"
st_output_dir = ".cache/output/"
def parse_args():
    """Read the command-line options of the distributed launcher.

    Returns:
        An 8-tuple ``(num_nodes, nproc_per_node, name, model, batch_size,
        length, gen_std, run_single)``, in exactly that order.
    """
    cli = argparse.ArgumentParser(description="launch distributed infinitensor")
    cli.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
    cli.add_argument(
        "--nproc_per_node", type=int, default=2, help="number of processes per node"
    )
    cli.add_argument(
        "--name", type=str, default="test", help="name of this instance."
    )
    cli.add_argument(
        "--model",
        type=str,
        default="/data1/shared/panzezhong/llama/fp32/my_llama_fp32.sim.onnx",
        help="path to the ONNX model file.",
    )
    cli.add_argument("--batch_size", type=int, default=1, help="batch size.")
    cli.add_argument("--length", type=int, default=1, help="sequence length.")

    # The two boolean switches are declared identically apart from name/help.
    switches = (
        ("--gen_std", "whether to generate the standard results."),
        ("--run_single", "whether run model with single process with standard inputs"),
    )
    for flag, description in switches:
        cli.add_argument(flag, default=False, action="store_true", help=description)

    opts = cli.parse_args()
    print("arg setting: ", opts)

    # Callers unpack the result positionally, so the field order is fixed.
    ordered_fields = (
        "num_nodes",
        "nproc_per_node",
        "name",
        "model",
        "batch_size",
        "length",
        "gen_std",
        "run_single",
    )
    return tuple(getattr(opts, field) for field in ordered_fields)
def run_model(model, runtime, world_size=1, rank=0, n=10):
    """Execute the model once for its output, then time ``n`` further runs.

    Args:
        model: ONNX model handed to ``OnnxStub``.
        runtime: Backend runtime handed to ``OnnxStub``.
        world_size: Number of ranks, forwarded to ``load_inputs``.
        rank: Index of this rank, forwarded to ``load_inputs``.
        n: Number of timed iterations.

    Returns:
        The first output tensor of the initial run, copied out as a numpy array.
    """
    stub = OnnxStub(model, runtime)
    load_inputs(stub, world_size, rank)
    # stub.tune()
    stub.run()

    # get outputs
    time.sleep(0.01)
    first_output = next(iter(stub.outputs.values()))
    outputs = first_output.copyout_numpy()

    # bench
    started = time.time()
    for _ in range(n):
        stub.run()
    elapsed = time.time() - started
    avg_time = elapsed / n
    print(f"average time: {avg_time}")
    return outputs
def run_and_compare(name, model, runtime, world_size=1, rank = 0):
    """Run the model and report how its output differs from the stored reference.

    The reference is read from ``test_output.npy`` inside ``st_output_dir``.
    The first 100 entries of the output, a NaN warning (if any), both argmax
    positions and the ``getDiff`` summary are printed. ``name`` is accepted
    but not used here.
    """
    reference = np.load(os.path.join(st_output_dir, "test_output.npy"))
    produced = run_model(model, runtime, world_size, rank)

    print(produced[:100])
    contains_nan = np.isnan(produced).any()
    if contains_nan:
        print("Nan in output")

    for label, array in (("answer argmax:", reference), ("output argmax:", produced)):
        print(label, np.argmax(array))
    #np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
    getDiff(reference, produced)
def start_worker(
    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
):
    """Worker body for one rank of the distributed run.

    Partitions the model for this rank with ``parallel_model``, saves the
    result next to the script (weights as external data), runs ONNX shape
    inference on the saved file, initialises communication on a
    ``KUNLUNRuntime`` bound to ``local_rank``, and finally runs and checks
    the model.
    """
    dist_name = name + "_dist"
    model = parallel_model(model, world_size, rank)

    onnx_path = f"./{dist_name}_rank{rank}.onnx"
    extern_path = f"./{dist_name}_rank{rank}.pb"
    # Remove a stale external-data file from a previous run before saving.
    if os.path.exists(extern_path):
        os.remove(extern_path)
    onnx.save_model(
        model,
        onnx_path,
        save_as_external_data=True,
        location=extern_path,
    )
    infer_shapes_path(onnx_path)

    runtime = backend.KUNLUNRuntime(local_rank)
    # print("init comm")
    runtime.init_comm(dist_name, world_size, rank)
    run_and_compare(name, model, runtime, world_size, rank)
def start_single(name, model):
    """Run and check the unpartitioned model on ``KUNLUNRuntime(0)``."""
    single_card_runtime = backend.KUNLUNRuntime(0)
    run_and_compare(name, model, single_card_runtime)
def generate_input_output(model):
    """Generate inputs, run the model on one card, and save inputs and output.

    Every model input is filled according to its dtype: integer tensors get
    random 0/1 values, boolean tensors get random booleans, the first
    floating-point input is all ones, and any other floating-point input is
    uniform random in [0, 1). Input ``i`` is saved as ``input_{i}`` in
    ``st_input_dir``; the first model output is saved as ``output`` in
    ``st_output_dir``.

    The two cache directories are created if missing, because ``np.save``
    does not create parent directories.

    NOTE(review): ``load_inputs`` and ``run_and_compare`` in this file read
    ``test_input_{name}.npy`` and ``test_output.npy``, which are not the names
    written here. Confirm which naming scheme is intended.

    Args:
        model: ONNX model handed to ``OnnxStub``.
    """
    os.makedirs(st_input_dir, exist_ok=True)
    os.makedirs(st_output_dir, exist_ok=True)

    runtime = backend.KUNLUNRuntime(0)
    stub = OnnxStub(model, runtime)
    for i, (name, tensor) in enumerate(stub.inputs.items()):
        input = tensor.copyout_numpy()
        if np.issubdtype(input.dtype, np.integer):
            # Scalar and non-scalar integer inputs are filled the same way.
            input = np.random.randint(0, 2, size=input.shape, dtype=input.dtype)
        elif input.dtype == np.bool_:
            input = np.random.randint(0, 2, size=input.shape) > 0
        elif i == 0:
            input = np.ones(input.shape).astype(input.dtype)
        else:
            input = np.random.rand(*input.shape).astype(input.dtype)
        tensor.copyin_numpy(input)
        np.save(os.path.join(st_input_dir, f"input_{i}"), input)

    stub.run()
    # print(stub.outputs)
    time.sleep(0.01)
    output = next(stub.outputs.values().__iter__()).copyout_numpy()
    print(output[:100])
    if np.isnan(output).any():
        print("Nan in output")
    np.save(os.path.join(st_output_dir, f"output"), output)
def load_inputs(stub, world_size=1, rank=0):
    """Copy the stored inputs into the stub's input tensors.

    For each input named ``name`` the file ``test_input_{name}.npy`` is read
    from ``st_input_dir``. If every dimension compared pairwise with the
    tensor's shape matches, the array is copied as is. Otherwise it is split
    into ``world_size`` parts with ``np.hsplit`` and part ``rank`` is copied.
    """
    for name, tensor in stub.inputs.items():
        stored = np.load(os.path.join(st_input_dir, f"test_input_{name}.npy"))
        dims_match = all(
            have == want for have, want in zip(stored.shape, tensor.shape())
        )
        payload = stored if dims_match else np.hsplit(stored, world_size)[rank]
        tensor.copyin_numpy(payload)
def getDiff(base, test):
    """Print and return the absolute and relative error between two arrays.

    Both arrays are promoted to float64 before any arithmetic. Subtracting in
    the native dtype wraps around for unsigned integers and can overflow for
    float16, which would corrupt the reported absolute difference.

    Args:
        base: Reference numpy array.
        test: Numpy array to compare against ``base``; must have the same
            number of elements once flattened.

    Returns:
        A tuple ``(max_absolute_diff, max_relative_diff)``. The second value
        is the aggregate relative error ``sum(|base - test|) / sum(|base|)``
        (a small epsilon keeps the denominator non-zero), not a per-element
        maximum, despite the label that is printed.
    """
    baseCopy = base.astype(np.float64).ravel()
    testCopy = test.astype(np.float64).ravel()

    absolute_diff = np.abs(baseCopy - testCopy)
    max_absolute_diff = np.max(absolute_diff)

    upValue = np.sum(absolute_diff)
    downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
    max_relative_diff = upValue / downValue

    print(f"Max absolute difference: {max_absolute_diff}\nMax relative difference: {max_relative_diff}")
    return max_absolute_diff, max_relative_diff
def main():
    """Entry point: generate reference data, run single-card, or run distributed.

    ``--gen_std`` and ``--run_single`` each run their task in one child
    process and return. Otherwise one child process is started per rank
    (``num_nodes * nproc_per_node`` in total) and all are joined.
    """
    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, run_single = parse_args()

    model = onnx.load(model_path)

    # generate standard output
    if gen_std:
        print("Generate inputs and outputs.")
        generator = mp.Process(target=generate_input_output, args=[model])
        generator.start()
        generator.join()
        return

    # run single process.
    # use standalone process to isolate the device runtime.
    if run_single:
        print("run model by single GPU.")
        single = mp.Process(target=start_single, args=(name, model))
        single.start()
        single.join()
        return

    # run distributed parallel.
    world_size = nnodes * nproc_per_node
    print(f"run model by {world_size} GPU in parallel.")

    workers = []
    for rank in range(world_size):
        local_rank = rank % nproc_per_node
        workers.append(
            mp.Process(
                target=start_worker,
                args=(name, world_size, rank, local_rank, model),
            )
        )

    # Start every rank before joining any of them.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
if __name__ == "__main__":
main()

View File

@ -1,36 +0,0 @@
# Benchmark driver: for every (model, batch size, sequence length) combination,
# run the PyTorch reference once and then the InfiniTensor Kunlun launcher for
# each process count, deleting generated files between runs.
# Downloads go through the hf-mirror endpoint.
export HF_ENDPOINT=https://hf-mirror.com
# models=("bert" "gpt2" "llama")
models=("bert" "gpt2")
batch_size=(1 32)
seq_len=(100 500)
nproc=(1 2 4)
# Directory meant for log files; the redirections that would write into it
# are currently commented out on the python command lines below.
results_dir="results"
if [ -d "$results_dir" ]; then
echo "directory ./$results_dir exists"
else
mkdir -p "$results_dir"
echo "mkdir $results_dir, logs saved there"
fi
for model in "${models[@]}"; do
for bs in "${batch_size[@]}"; do
for len in "${seq_len[@]}"; do
# run pytorch model
echo "Run pytorch $model with batch_size=$bs length=$len ."
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
for n in "${nproc[@]}"; do
# run infinitensor
echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini
# delete internal files
# WARNING: these remove every *.onnx / *.pb file anywhere below the
# current directory, not only the files produced by the run above.
find ./ -type f -name "*.onnx" -delete
find ./ -type f -name "*.pb" -delete
done
# Remove all *.npy files below the current directory once every process
# count has been run for this configuration.
find ./ -type f -name "*.npy" -delete
done
done
done

View File

@ -1,35 +0,0 @@
# Llama benchmark driver: for every (batch size, sequence length) combination,
# run the PyTorch reference once and then the InfiniTensor Kunlun launcher for
# each process count, deleting generated files between runs.
# Downloads go through the hf-mirror endpoint.
export HF_ENDPOINT=https://hf-mirror.com
# models=("bert" "gpt2" "llama")
models=("llama")
batch_size=(1 )
seq_len=(100 500)
nproc=(1 2 4)
# Directory meant for log files; nothing below writes into it.
results_dir="results"
if [ -d "$results_dir" ]; then
echo "directory ./$results_dir exists"
else
mkdir -p "$results_dir"
echo "mkdir $results_dir, logs saved there"
fi
for model in "${models[@]}"; do
for bs in "${batch_size[@]}"; do
for len in "${seq_len[@]}"; do
# run pytorch model
echo "Run pytorch llama with batch_size="$bs" and length="$len""
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
for n in "${nproc[@]}"; do
# run infinitensor model
echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
# delete internal files
# WARNING: these remove every matching file anywhere below the current
# directory. NOTE(review): the "*0c" pattern presumably targets
# external-data files written next to the per-rank models - confirm.
find ./ -type f -name "*.onnx" -delete
find ./ -type f -name "*0c" -delete
done
# Remove all *.npy files below the current directory once every process
# count has been run for this configuration.
find ./ -type f -name "*.npy" -delete
done
done
done

View File

@ -8,7 +8,6 @@ import time
import numpy as np
import onnx
import os
import sys
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
@ -41,10 +40,6 @@ def parse_args():
default="./",
help="path to save pytorch model output data"
)
parser.add_argument(
"--export_only",
action="store_true"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
@ -53,8 +48,7 @@ def parse_args():
args.length,
args.export_onnx,
args.input_dir,
args.result_dir,
args.export_only
args.result_dir
)
@ -83,7 +77,7 @@ def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
inputs = torch.from_numpy(data).to("cuda")
torch_model = torch_model.to("cuda")
n_iter = 10
n_iter = 20
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
@ -93,11 +87,11 @@ def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
for _ in range(n_iter):
torch.cuda.synchronize()
outputs = torch_model(inputs)
#
#
torch.cuda.synchronize()
torch.cuda.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
@ -109,61 +103,33 @@ def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
def export_onnx(model_name, model, data, path, extern=False):
# torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
if model_name != "llama":
onnx_model = onnx.load(path)
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
onnx_model = onnx.load(path)
# onnx_model, check = simplify(onnx_model,
# skip_shape_inference=True,
# skipped_optimizers=['eliminate_duplicate_initializer'])
if model_name == "gpt2":
onnx_model, check = simplify(onnx_model,
skip_shape_inference=True,
skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
else :
onnx_model, check = simplify(onnx_model,
skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
# skipped_optimizers=['fuse_qkv'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path.split("/")[-1],
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
else:
sys.path.append("onnxsim_large_model")
from onnx_utils import set_onnx_input_shape
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
in_model_path = path
out_model_path = in_model_path[:-5] + ".sim.onnx"
onnx_model = onnx.load(in_model_path)
print(f"load model from {in_model_path} success")
size_th_bytes = 1024 * 1024
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
print("compress model success")
onnx_model = set_onnx_input_shape(onnx_model, "")
tensor_size_threshold = f"1024KB"
skipped_optimizers = []
skipped_optimizers.append("eliminate_duplicate_initializer")
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
tensor_size_threshold=tensor_size_threshold)
if not check:
raise ValueError(f"simplify compressed model {in_model_path} failed")
print(f"simplify model success")
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
print(f"uncompress model success")
add_value_info_for_constants(onnx_model)
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path.split("/")[-1],
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
@ -226,20 +192,17 @@ def main():
global input_dir, result_dir
modelname, batchsize, seqlen, \
export_path, input_dir, result_dir, export_only = parse_args()
export_path, input_dir, result_dir = parse_args()
model, voc_size = get_model(modelname) # pytorch model
if export_path is not None:
os.makedirs(export_path, exist_ok=True)
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
path = os.path.join(export_path, filename)
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
if export_only:
return
run_pytorch(model, voc_size, batchsize, seqlen, modelname)
if __name__ == "__main__":
main()
main()

View File

@ -0,0 +1,17 @@
# Benchmark sweep: for each model / batch size / sequence length, run
# run_pytorch.py (exporting the ONNX model) and then kunlun_launch.py for each
# process count, collecting all output of one configuration in a single log
# file under results/.

# Endpoint used for Hugging Face downloads.
export HF_ENDPOINT=https://hf-mirror.com

models=("bert" "gpt2")
batch_size=(1 32)
seq_len=(100 500)
nproc=(1 2 4)

# The redirections below fail (and the redirected command is not run) when the
# target directory is missing, so create it up front.
mkdir -p results

for model in "${models[@]}"; do
    for bs in "${batch_size[@]}"; do
        for len in "${seq_len[@]}"; do
            log="results/${model}_${bs}_${len}"
            # '>' starts a fresh log for this configuration ...
            python -m xacc run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx "../models/$model" > "$log"
            for n in "${nproc[@]}"; do
                # ... and every launcher run appends to it.
                python kunlun_launch.py --name "$model" --model "../models/$model/${model}_${bs}_${len}.onnx" --nproc_per_node="$n" >> "$log"
            done
        done
    done
done

@ -1 +0,0 @@
Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65

View File

@ -80,6 +80,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
def shard_reshape(node: NodeProto):
# print("reshape", node.name, node.input[0], place[node.input[0]])
# import pdb; pdb.set_trace()
if not is_sharded(node.input[0]):
return
in_plc = place[node.input[0]]
@ -110,6 +111,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
s_dim = 0
elif in_plc.dim == 2:
s_dim = 1
# import pdb; pdb.set_trace()
assert s_dim != -1
assert out_dims[s_dim] % tp_world_size == 0, out_dims
out_dims[s_dim] //= tp_world_size
@ -245,3 +247,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
model = helper.make_model(graph)
#model = onnx.shape_inference.infer_shapes(model)
return model
# Ad-hoc entry point: loads one ONNX model from a fixed relative path and calls
# parallel_model with tp_world_size=2 and tp_rank=0. The returned model is bound
# to `models` and not used further in the lines shown here.
if __name__ == "__main__":
model = onnx.load("./models/gpt2/gpt2_1_100.onnx")
models = parallel_model(model, 2, 0)

View File

@ -10,6 +10,8 @@ import os
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
@ -25,17 +27,14 @@ def parse_args():
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.type,
args.export_onnx
)
@ -82,7 +81,7 @@ def run_pytorch(torch_model, voc_size, batchsize, len):
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
torch.cuda.memory.empty_cache()
np.save("test_results", np.array(outputs, dtype=np.float32))
np.save("test_results", np.array(outputs))
print("Save input & output as test_inputs.npy and test_results.npy")
@ -165,14 +164,7 @@ def add_value_info_for_constants(model : onnx.ModelProto):
def main():
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
modelname, batchsize, seqlen, export_path, data_type = parse_args()
if data_type == "tf32":
torch.backends.cuda.matmul.allow_tf32 = True
else:
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
modelname, batchsize, seqlen, export_path = parse_args()
model, voc_size = get_model(modelname)
if export_path is not None:
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
@ -180,8 +172,6 @@ def main():
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(model, param, path, True)
if data_type == "fp16":
model = model.half()
run_pytorch(model, voc_size, batchsize, seqlen)
if __name__ == "__main__":

View File

@ -21,10 +21,10 @@ class KUNLUNRuntimeObj : public RuntimeObj {
ctx = xdnn::create_context();
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
size_t workspaceSize = 2llu << 30; // 2 GB
KUNLUNPtr wkspacePtr = alloc(workspaceSize);
workspace =
make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
size_t workspaceSize = 3llu << 30; // 3 GB
// KUNLUNPtr wkspacePtr = alloc(workspaceSize);
// workspace =
// make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
}
virtual ~KUNLUNRuntimeObj() {
KUNLUNPtr wkspacePtr = workspace->getWorkspace();
@ -33,6 +33,10 @@ class KUNLUNRuntimeObj : public RuntimeObj {
}
string toString() const override;
/// Wraps `workspacePtr` in a WorkspaceObj and stores it as this runtime's
/// workspace.
///
/// @param workspacePtr start of the buffer to use as the workspace.
///
/// NOTE(review): `workspaceSize` must be visible here (e.g. as a member); the
/// constructor shown in this diff declares it as a local — confirm.
void init_workspace(KUNLUNPtr workspacePtr) {
    // Use the parameter: the previous body referred to `wkspacePtr`, which is
    // not declared in this scope.
    workspace =
        make_ref<WorkspaceObj<KUNLUNPtr>>(workspacePtr, workspaceSize);
}
void run(const Graph &graph, bool tune = false,
bool profiling = false) const;
// double runEvaluation(const Graph &graph, int nWarmups,
@ -42,7 +46,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
KUNLUNPtr alloc(size_t size) override {
void *ptr;
checkKUNLUNError(
xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
return ptr;
}
/// Releases `ptr` by passing it to xpu_free; the result of that call is not
/// inspected.
void dealloc(void *ptr) override {
    xpu_free(ptr);
}

View File

@ -34,8 +34,8 @@ class XcclCommunicatorObj final : public CommunicatorObj {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(100),
"time limit (100s) exceeded.");
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);

View File

@ -265,6 +265,11 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
tensorToOffset[tensor.get()]));
}
}
if (runtime->isKUNLUN()) {
allocator.heapAlloc(runtime->workspaceSize);
runtime->init_workspace(allocator.getHeapPtr());
}
}
Tensor GraphObj::cloneKV(Tensor &tensor) {

View File

@ -199,24 +199,6 @@ class CastCnnl : public BangKernelWithoutConfig {
dim.data()));
NlCastType = CNNL_CAST_UINT32_TO_INT64;
break;
case CastType::Float162Float:
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_HALF, dim.size(),
dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, dim.size(),
dim.data()));
NlCastType = CNNL_CAST_HALF_TO_FLOAT;
break;
case CastType::Float2Float16:
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, dim.size(),
dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_HALF, dim.size(),
dim.data()));
NlCastType = CNNL_CAST_FLOAT_TO_HALF;
break;
default:
IT_TODO_HALT();
}

View File

@ -19,16 +19,14 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
auto inDims = op->getInputs(0)->getDims();
auto fiterDims = op->getInputs(1)->getDims();
auto outDims = op->getOutput()->getDims();
auto fiterDims = op->getOutput(1)->getDims();
float eps = op->getEps();
const int axis = op->getAxis();
Shape outMeanDims(outDims);
outMeanDims.erase(outMeanDims.begin() + axis);
cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc;
cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc, outMeanDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
@ -41,23 +39,15 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
outDims.size(), outDims.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&outMeanDesc));
checkCnnlError(cnnlSetTensorDescriptor(
outMeanDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
outMeanDims.size(), outMeanDims.data()));
size_t wsSize;
cnnlGetLayerNormOpWorkspaceSize(context->cnnlHandle(), axis, inDesc,
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
size_t meanSize =
cnnlGetTensorElementNum(outMeanDesc) * op->getDType().getSize();
BangPtr meanData = context->getWorkspace(meanSize);
BangPtr rstdData = context->getWorkspace(meanSize);
cnnlStatus_t stat = cnnlLayerNormForward(
context->cnnlHandle(), inDesc, inputData, axis, fiterDesc,
scaleData, biasData, eps, wsData, wsSize, outDesc, outputData,
outMeanDesc, meanData, rstdData);
inDesc, NULL, NULL);
if (stat != CNNL_STATUS_SUCCESS)
return;

View File

@ -66,13 +66,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_DESC_TRANSB, &transB,
sizeof(int32_t));
std::string computeTypeStr = op->getComputeType();
if (computeTypeStr == "tf32") {
int32_t tf32 = 1;
cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_ALLOW_TF32, &tf32,
sizeof(int32_t));
}
cnnlMatMulAlgo_t bmm_algo;
cnnlMatMulAlgoCreate(&bmm_algo);

View File

@ -20,9 +20,15 @@ class AllReduceXCCL : public KUNLUNKernelWithoutConfig {
BKCLContext_t comm =
dynamic_cast<XcclCommunicatorObj &>(context->getCommunicator())
.getXcclComm();
// double t = timeit(
// [&]() {
checkXcclError(bkcl_all_reduce(comm, input, output, count,
BKCLDataType::BKCL_FLOAT, getRedOp(),
0));
// },
// [&]() { context->sync(); });
// std::cout << "Time consuming for " << op->getInputs(0)->size() << "
// size is " << t << std::endl;
}
virtual BKCLOp getRedOp() const = 0;
};

View File

@ -97,14 +97,11 @@ class DivXdnn : public KUNLUNKernelWithoutConfig {
auto aDim = op->getInputs(0)->getDims();
auto bSize = op->getInputs(1)->size();
auto bDim = op->getInputs(1)->getDims();
auto dtype = op->getDType();
// op input a, b is scalar while aDim and b Dim is empty
if (bDim.size() == 0) {
bDim.push_back(1);
}
if (aDim.size() == 0) {
aDim.push_back(1);
}
if (aSize == bSize) {
// Do ElementWise Sub with no broadcast
@ -112,9 +109,23 @@ class DivXdnn : public KUNLUNKernelWithoutConfig {
(float *)aData, (float *)bData,
(float *)cData, aSize));
} else {
checkKUNLUNError(xdnn::broadcast_div<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim));
// Do broadcast div
Shape aligned = infer_broadcast(aDim, bDim);
if (aligned == aDim) {
// BData need to be broadcasted
checkKUNLUNError(xdnn::broadcast_div<float>(
context->KUNLUNHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim));
} else {
// Use workspace to broadcast aData
KUNLUNPtr wks = context->getWorkspace(bSize * dtype.getSize());
checkKUNLUNError(xdnn::broadcast<float>(
context->KUNLUNHandle(), (float *)aData, (float *)wks, aDim,
bDim));
checkKUNLUNError(xdnn::div<float>(context->KUNLUNHandle(),
(float *)wks, (float *)bData,
(float *)cData, bSize));
}
}
return;
}

View File

@ -570,7 +570,6 @@ REGISTER_KERNEL(Device::KUNLUN, OpType::Reciprocal, ReciprocalXdnn,
REGISTER_KERNEL(Device::KUNLUN, OpType::Reshape, CopyXdnn, "Reshape_xdnn");
REGISTER_KERNEL(Device::KUNLUN, OpType::Flatten, CopyXdnn, "Flatten_xdnn");
REGISTER_KERNEL(Device::KUNLUN, OpType::Identity, CopyXdnn, "Identity_xdnn");
REGISTER_KERNEL(Device::KUNLUN, OpType::Squeeze, CopyXdnn, "Squeeze_xdnn");
REGISTER_KERNEL(Device::KUNLUN, OpType::Abs, AbsXdnn, "Abs_xdnn");
REGISTER_KERNEL(Device::KUNLUN, OpType::Atan, ATanXdnn, "Atan_xdnn");
REGISTER_KERNEL(Device::KUNLUN, OpType::Log, LogXdnn, "Log_xdnn");