forked from jiuyuan/InfiniTensor
246 lines
8.6 KiB
Python
246 lines
8.6 KiB
Python
import argparse
|
|
import torch
|
|
from transformers import BertModel, BertConfig
|
|
from transformers import GPT2Model, GPT2Config
|
|
from transformers import OPTModel, OPTConfig
|
|
from transformers import LlamaModel, LlamaConfig
|
|
import time
|
|
import numpy as np
|
|
import onnx
|
|
import os
|
|
import sys
|
|
from onnx.external_data_helper import convert_model_to_external_data
|
|
from onnxsim import simplify
|
|
|
|
torch.backends.cuda.matmul.allow_tf32 = False
|
|
torch.backends.cudnn.allow_tf32 = False
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
|
|
parser.add_argument(
|
|
"--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
|
|
)
|
|
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
|
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
|
parser.add_argument(
|
|
"--export_onnx",
|
|
type=str,
|
|
nargs="?",
|
|
default=None,
|
|
const="./",
|
|
help="whether and where to export onnx file",
|
|
)
|
|
parser.add_argument(
|
|
"--input_dir",
|
|
type=str,
|
|
default="./",
|
|
help="path to save pytorch model input data"
|
|
)
|
|
parser.add_argument(
|
|
"--result_dir",
|
|
type=str,
|
|
default="./",
|
|
help="path to save pytorch model output data"
|
|
)
|
|
parser.add_argument(
|
|
"--export_only",
|
|
action="store_true"
|
|
)
|
|
args = parser.parse_args()
|
|
print("arg setting: ", args)
|
|
return (
|
|
args.model,
|
|
args.batch_size,
|
|
args.length,
|
|
args.export_onnx,
|
|
args.input_dir,
|
|
args.result_dir,
|
|
args.export_only
|
|
)
|
|
|
|
|
|
def get_model(modelname):
|
|
if modelname == "bert":
|
|
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
|
|
voc_size = BertConfig().vocab_size
|
|
elif modelname == "gpt2":
|
|
model = GPT2Model.from_pretrained("gpt2")
|
|
voc_size = GPT2Config().vocab_size
|
|
elif modelname == "opt":
|
|
model = OPTModel.from_pretrained("./opt-125m")
|
|
voc_size = OPTConfig().vocab_size
|
|
elif modelname == "llama":
|
|
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
|
|
voc_size = LlamaConfig().vocab_size
|
|
else :
|
|
raise KeyError(modelname)
|
|
|
|
model = model.eval()
|
|
return model, voc_size
|
|
|
|
def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
|
|
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
|
|
np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
|
|
inputs = torch.from_numpy(data).to("cuda")
|
|
torch_model = torch_model.to("cuda")
|
|
|
|
n_iter = 10
|
|
with torch.no_grad():
|
|
for _ in range(10):
|
|
outputs = torch_model(inputs)
|
|
torch.cuda.synchronize()
|
|
begin = time.time()
|
|
with torch.no_grad():
|
|
for _ in range(n_iter):
|
|
torch.cuda.synchronize()
|
|
outputs = torch_model(inputs)
|
|
#
|
|
torch.cuda.synchronize()
|
|
torch.cuda.synchronize()
|
|
end = time.time()
|
|
|
|
avg_time = (end - begin) / n_iter
|
|
outputs = outputs.last_hidden_state.to("cpu")
|
|
print("outputs abs mean:", abs(np.array(outputs)).mean())
|
|
print(f"average time: {avg_time}")
|
|
torch.cuda.memory.empty_cache()
|
|
np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
|
|
np.array(outputs))
|
|
print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
|
|
|
|
|
|
def export_onnx(model_name, model, data, path, extern=False):
|
|
# torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
|
|
|
|
if model_name != "llama":
|
|
onnx_model = onnx.load(path)
|
|
onnx_model, check = simplify(onnx_model,
|
|
skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
|
|
# skipped_optimizers=['fuse_qkv'])
|
|
assert check
|
|
add_value_info_for_constants(onnx_model)
|
|
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
|
|
if extern:
|
|
extern_path = path.replace('.onnx', '.pb')
|
|
if os.path.exists(extern_path):
|
|
os.remove(extern_path)
|
|
convert_model_to_external_data(
|
|
onnx_model,
|
|
all_tensors_to_one_file=True,
|
|
location=extern_path.split("/")[-1],
|
|
size_threshold=1024,
|
|
convert_attribute=False,
|
|
)
|
|
onnx.save(onnx_model, path)
|
|
else:
|
|
sys.path.append("onnxsim_large_model")
|
|
from onnx_utils import set_onnx_input_shape
|
|
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
|
|
|
|
in_model_path = path
|
|
out_model_path = in_model_path[:-5] + ".sim.onnx"
|
|
|
|
onnx_model = onnx.load(in_model_path)
|
|
print(f"load model from {in_model_path} success")
|
|
|
|
size_th_bytes = 1024 * 1024
|
|
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
|
|
print("compress model success")
|
|
|
|
onnx_model = set_onnx_input_shape(onnx_model, "")
|
|
tensor_size_threshold = f"1024KB"
|
|
skipped_optimizers = []
|
|
skipped_optimizers.append("eliminate_duplicate_initializer")
|
|
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
|
|
tensor_size_threshold=tensor_size_threshold)
|
|
if not check:
|
|
raise ValueError(f"simplify compressed model {in_model_path} failed")
|
|
|
|
print(f"simplify model success")
|
|
|
|
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
|
|
print(f"uncompress model success")
|
|
|
|
add_value_info_for_constants(onnx_model)
|
|
|
|
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
|
|
|
|
|
|
def add_value_info_for_constants(model : onnx.ModelProto):
|
|
"""
|
|
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
|
that info explicitly as ValueInfoProtos.
|
|
Mutates the model.
|
|
Args:
|
|
model: The ModelProto to update.
|
|
"""
|
|
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
|
if model.ir_version < 4:
|
|
return
|
|
|
|
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
|
|
inputs = {i.name for i in graph.input}
|
|
existing_info = {vi.name: vi for vi in graph.value_info}
|
|
for init in graph.initializer:
|
|
# Check it really is a constant, not an input
|
|
if init.name in inputs:
|
|
continue
|
|
|
|
# The details we want to add
|
|
elem_type = init.data_type
|
|
shape = init.dims
|
|
|
|
# Get existing or create new value info for this constant
|
|
vi = existing_info.get(init.name)
|
|
if vi is None:
|
|
vi = graph.value_info.add()
|
|
vi.name = init.name
|
|
|
|
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
|
tt = vi.type.tensor_type
|
|
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
|
tt.elem_type = elem_type
|
|
if not tt.HasField("shape"):
|
|
# Ensure we set an empty list if the const is scalar (zero dims)
|
|
tt.shape.dim.extend([])
|
|
for dim in shape:
|
|
tt.shape.dim.add().dim_value = dim
|
|
|
|
# Handle subgraphs
|
|
for node in graph.node:
|
|
for attr in node.attribute:
|
|
# Ref attrs refer to other attrs, so we don't need to do anything
|
|
if attr.ref_attr_name != "":
|
|
continue
|
|
|
|
if attr.type == onnx.AttributeProto.GRAPH:
|
|
add_const_value_infos_to_graph(attr.g)
|
|
if attr.type == onnx.AttributeProto.GRAPHS:
|
|
for g in attr.graphs:
|
|
add_const_value_infos_to_graph(g)
|
|
|
|
|
|
return add_const_value_infos_to_graph(model.graph)
|
|
|
|
|
|
def main():
|
|
global input_dir, result_dir
|
|
|
|
modelname, batchsize, seqlen, \
|
|
export_path, input_dir, result_dir, export_only = parse_args()
|
|
|
|
model, voc_size = get_model(modelname) # pytorch model
|
|
|
|
if export_path is not None:
|
|
os.makedirs(export_path, exist_ok=True)
|
|
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
|
|
path = os.path.join(export_path, filename)
|
|
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
|
|
export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
|
|
if export_only:
|
|
return
|
|
|
|
run_pytorch(model, voc_size, batchsize, seqlen, modelname)
|
|
|
|
if __name__ == "__main__":
|
|
main()
|