forked from jiuyuan/InfiniTensor

feat(dist): distributed scripts support mixed precision (#226)

This commit is contained in:
parent eafbff6cf9
commit d1de3ab5c2
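Summary of the change: both distributed example scripts gain a
--type {fp32,fp16,tf32} flag (default fp32). The launch script threads the
chosen type through run_model/run_and_compare/start_single/start_worker
into OnnxStub's matmul_compute_type, and the import-time
NVIDIA_TF32_OVERRIDE=0 setting moves into main() so TF32 is only disabled
when it was not requested. The PyTorch reference script mirrors the flag,
enables torch's TF32 matmul backend only for tf32, halves the model for
fp16, and saves its reference results explicitly as float32.

Example invocations (a sketch; the script file names and the model flags
are assumptions, since the capture omits file paths):

    python launch.py --model gpt2_1_100.onnx --type fp16
    python run_pytorch.py --model gpt2 --type tf32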
First changed file (the distributed launch script; its path is not shown in the capture):

@@ -10,9 +10,6 @@ import numpy as np
 from parallel_opt import parallel_model


-os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
-
-
 def parse_args():
     parser = argparse.ArgumentParser(description="launch distributed infinitensor")
     parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
@@ -32,6 +29,9 @@ def parse_args():
         action="store_true",
         help="whether to generate the standard results.",
     )
+    parser.add_argument(
+        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
+    )
     args = parser.parse_args()
     print("arg setting: ", args)
     return (
@@ -42,11 +42,12 @@ def parse_args():
         args.batch_size,
         args.length,
         args.gen_std,
+        args.type,
     )


-def run_model(model, runtime, inputs, n=10):
-    stub = OnnxStub(model, runtime)
+def run_model(model, runtime, inputs, n=10, data_type="default"):
+    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
     for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
         tensor.copyin_numpy(input)
     # stub.tune()
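For context, a minimal sketch of how the new knob reaches the backend. The
pyinfinitensor import path is an assumption (the diff does not show the
script's import block), and make_stub is a hypothetical helper:

    import onnx
    from pyinfinitensor.onnx import OnnxStub, backend  # import path assumed

    def make_stub(model_path: str, data_type: str):
        # main() maps "fp32" to the stub's "default" compute type; "fp16"
        # and "tf32" are forwarded to the matmul kernels unchanged.
        compute_type = "default" if data_type == "fp32" else data_type
        model = onnx.load(model_path)
        runtime = backend.CudaRuntime(0)  # device 0, as in start_single
        return OnnxStub(model, runtime, matmul_compute_type=compute_type)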
@@ -66,17 +67,17 @@ def run_model(model, runtime, inputs, n=10):
     return outputs


-def run_and_compare(name, model, runtime):
+def run_and_compare(name, model, runtime, data_type):
     input_ids = np.load(f"{name}_inputs.npy")
     position_ids = np.arange(input_ids.shape[-1])
     results = np.load(f"{name}_results.npy")
-    outputs = run_model(model, runtime, (input_ids, position_ids))
+    outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
     print("outputs abs mean:", abs(outputs).mean())
     print("max abs diff:", abs(outputs - results).max())


 def start_worker(
-    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
 ):
     dist_name = name + "_dist"
     model = parallel_model(model, world_size, rank)
@@ -97,12 +98,12 @@ def start_worker(
         world_size,
         rank,
     )
-    run_and_compare(name, model, runtime)
+    run_and_compare(name, model, runtime, data_type)


-def start_single(name, model):
+def start_single(name, model, data_type):
     runtime = backend.CudaRuntime(0)
-    run_and_compare(name, model, runtime)
+    run_and_compare(name, model, runtime, data_type)


 def gen_standard(name, model, voc_size, bs, len):
@@ -117,8 +118,10 @@ def gen_standard(name, model, voc_size, bs, len):


 def main():
-    nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
+    data_type = "default" if data_type == "fp32" else data_type
+    if data_type != "tf32":
+        os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
     model = onnx.load(model_path)

     # generate standart output
@@ -132,7 +135,7 @@ def main():
     # run single process.
     # use standalone process to isolate cuda.
     print("run model by single GPU.")
-    p = mp.Process(target=start_single, args=(name, model))
+    p = mp.Process(target=start_single, args=(name, model, data_type))
     p.start()
     p.join()

@@ -142,7 +145,7 @@ def main():
     workers = [
         mp.Process(
             target=start_worker,
-            args=(name, world_size, rank, rank % nproc_per_node, model),
+            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
         )
         for rank in range(world_size)
     ]
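The TF32 handling in main() above relies on switches with different scope:
NVIDIA_TF32_OVERRIDE is read by cuBLAS/cuDNN when the libraries
initialize, which is why the commit moves it from import time into main(),
after the flag is parsed but before any CUDA work starts. A self-contained
sketch of the resulting policy (configure_precision is a hypothetical
name; the torch flags belong to the PyTorch script in the second file):

    import os
    import torch

    def configure_precision(data_type: str) -> None:
        # TF32 must now be requested explicitly; fp32 and fp16 runs pin
        # the environment so float32 GEMMs cannot silently use TF32 math.
        if data_type == "tf32":
            torch.backends.cuda.matmul.allow_tf32 = True
        else:
            os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
            torch.backends.cuda.matmul.allow_tf32 = False
            torch.backends.cudnn.allow_tf32 = False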
Second changed file (the PyTorch reference script; its path is not shown in the capture):

@@ -10,8 +10,6 @@ import os
 from onnx.external_data_helper import convert_model_to_external_data
 from onnxsim import simplify

-torch.backends.cuda.matmul.allow_tf32 = False
-torch.backends.cudnn.allow_tf32 = False
 def parse_args():
     parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
     parser.add_argument(
@@ -27,14 +25,17 @@ def parse_args():
         const="./",
         help="whether and where to export onnx file",
     )
+    parser.add_argument(
+        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
+    )
     args = parser.parse_args()
     print("arg setting: ", args)
     return (
         args.model,
         args.batch_size,
         args.length,
-        args.export_onnx
+        args.export_onnx,
+        args.type,
     )


@@ -81,7 +82,7 @@ def run_pytorch(torch_model, voc_size, batchsize, len):
     print("outputs abs mean:", abs(np.array(outputs)).mean())
     print(f"average time: {avg_time}")
     torch.cuda.memory.empty_cache()
-    np.save("test_results", np.array(outputs))
+    np.save("test_results", np.array(outputs, dtype=np.float32))
     print("Save input & output as test_inputs.npy and test_results.npy")

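One subtlety in the np.save change above: forcing dtype=np.float32 keeps
the saved baseline stable even when the outputs come from a half-precision
run, so later comparisons always happen in float32. A minimal,
self-contained illustration:

    import numpy as np

    outputs = np.zeros((2, 4), dtype=np.float16)  # stand-in for fp16 model outputs
    np.save("test_results", np.array(outputs, dtype=np.float32))
    assert np.load("test_results.npy").dtype == np.float32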
@@ -164,7 +165,14 @@ def add_value_info_for_constants(model : onnx.ModelProto):


 def main():
-    modelname, batchsize, seqlen, export_path = parse_args()
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    modelname, batchsize, seqlen, export_path, data_type = parse_args()
+    if data_type == "tf32":
+        torch.backends.cuda.matmul.allow_tf32 = True
+    else:
+        os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
+
     model, voc_size = get_model(modelname)
     if export_path is not None:
         filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
@@ -172,6 +180,8 @@ def main():
         param = torch.zeros((batchsize, seqlen), dtype=torch.int)
         export_onnx(model, param, path, True)

+    if data_type == "fp16":
+        model = model.half()
     run_pytorch(model, voc_size, batchsize, seqlen)

 if __name__ == "__main__":
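Note the ordering the last hunk establishes: the ONNX export runs before
model.half(), so the exported graph stays float32 while only the
in-process PyTorch baseline runs in float16. A toy sketch of that ordering
(the Linear module and file name are stand-ins, not the script's real
model):

    import torch

    model = torch.nn.Linear(8, 8)                    # stand-in for gpt2/bert/opt
    example = torch.zeros((1, 8))
    torch.onnx.export(model, example, "model.onnx")  # exported graph is fp32
    model = model.half()                             # fp16 only for the torch run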