LLaMA-Factory-310P3/mindie/examples/models/gpt_neox/main.py

126 lines
3.4 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
"""
gpt-neox-20b
@create: 2024/1/24 19:32
@since: 2024/1/24 19:32
"""
import argparse
import os
from atb_speed.common.config import atb_speed_config
from atb_speed.common.launcher import ParallelLauncher, Launcher
from atb_speed.common.performance.base import PerformanceTest
from atb_speed.common.precision import get_precision_test_cls
from transformers import AutoTokenizer, AutoModelForCausalLM
def parse_args():
parser = argparse.ArgumentParser(description="Adapting LLM on Ascend")
parser.add_argument(
"--task",
type=str,
default='inference',
choices=['inference', 'precision', 'performance'],
help="Specify the task in which to run the script"
)
args = parser.parse_args()
return args
class LMParallel(ParallelLauncher):
"""
多卡推理launcher
"""
def init_model(self):
tokenizer = AutoTokenizer.from_pretrained(self.model_path, padding_side='left')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
part_model_path = os.path.join(self.model_path, 'part_model', str(self.local_rank))
model = AutoModelForCausalLM.from_pretrained(part_model_path, trust_remote_code=True)
model = model.half().to(self._device)
model.eval()
model.generation_config = self.remove_part_of_generation_config(model.generation_config)
return model, tokenizer
class LM(Launcher):
"""
单卡推理launcher
"""
def init_model(self):
"""
模型初始化
:return:
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_path, padding_side='left')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True).half().to(self._device)
model.eval()
model.generation_config = self.remove_part_of_generation_config(model.generation_config)
return model, tokenizer
def demo_ceval(launcher: Launcher):
"""
:param launcher:
:return:
"""
c_t = get_precision_test_cls()(launcher)
c_t.run()
def demo_perf(launcher: Launcher):
"""
:param launcher:
:return:
"""
performance_test = PerformanceTest(launcher)
performance_test.warm_up()
performance_test.run_test()
def demo_inference(launcher: Launcher):
"""
:param launcher:
:return:
"""
param_dict = {"max_new_tokens": 64, "do_sample": False, "repetition_penalty": 1.1}
launcher.logger.info("---------------warm-up---------------")
launcher.infer('Hamlet->Shakespeare\nOne Hundred Years of Solitude->', param_dict)
launcher.logger.info("---------------inference---------------")
launcher.infer('How to learn a new language?', param_dict)
launcher.logger.info("---------------batch---------------")
query_list = [
"How to learn a new language?",
'The CEO of Google is',
]
launcher.infer_batch(query_list, param_dict)
TASK_MAP = {
"inference": demo_inference,
"precision": demo_ceval,
"performance": demo_perf
}
def main():
args = parse_args()
atb_speed_config.init_config("config.ini")
if atb_speed_config.model.device_num > 1:
launcher = LMParallel()
else:
launcher = LM()
TASK_MAP.get(args.task)(launcher)
if __name__ == "__main__":
main()