添加运行说明
This commit is contained in:
parent
8bba131e95
commit
1f6ab65bb2
44
README.md
44
README.md
|
@ -4,11 +4,43 @@
|
|||
|
||||
时间:2024.08.31
|
||||
|
||||
# 下载项目
|
||||
```shell
|
||||
git clone https://osredm.com/pia7vn6wp/repo_304.git
|
||||
cd repo_304
|
||||
```
|
||||
|
||||
# 前端
|
||||
```shell
|
||||
1. 启动镜像
|
||||
|
||||
参赛作品评测总分值为100分,评测项包括:
|
||||
●策划书逻辑完整性(20分);
|
||||
●应用构思的创新性(20分);
|
||||
●应用方案可行性设计(20分);
|
||||
●应用产品的商业价值/社会价值(10分);
|
||||
●应用Demo演示/体验(30分)。
|
||||
docker run -d -p 3000:8080 -e OPENAI_API_KEY=EMPTY -v open-webui:/app/backend/data --name open-webui --restart always ghcr.io/open-webui/open-webui:main
|
||||
```
|
||||
|
||||
# 后端
|
||||
## 如果在端侧运行
|
||||
```shell
|
||||
1. 启动镜像,需将"-v /data/zksc/repo_304:/workspace"中的"/data/zksc/repo_304"替换为本项目所在的主机目录
|
||||
|
||||
docker run -d -p 8080:8000 -v /data/zksc/repo_304:/workspace --name 9g --privileged --cap-add ALL --restart always --workdir /workspace 9g_finetuning/lora:v3 sleep infinity
|
||||
|
||||
2. 进入镜像
|
||||
docker exec -it 9g /bin/bash
|
||||
|
||||
3. 启动后端
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli api config/inference/fm9g_lora_sft_repo.yaml
|
||||
```
|
||||
|
||||
现在通过浏览器访问"http://服务器ip:3000"即可使用大模型
|
||||
|
||||
参赛作品评测总分值为100分,评测项包括:
|
||||
|
||||
●策划书逻辑完整性(20分);
|
||||
|
||||
●应用构思的创新性(20分);
|
||||
|
||||
●应用方案可行性设计(20分);
|
||||
|
||||
●应用产品的商业价值/社会价值(10分);
|
||||
|
||||
●应用Demo演示/体验(30分)。
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
model_name_or_path: models/fm9g_lora_sft_repo
|
||||
template: 9g
|
|
@ -0,0 +1,38 @@
|
|||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### dataset
|
||||
dataset: food_hot, menu_names, occupation_recipe, yuansu
|
||||
template: 9g
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/fm9g/lora/sft_repo
|
||||
logging_steps: 10
|
||||
save_steps: 1160 #500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 8
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 40.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 10 #500
|
|
@ -0,0 +1,13 @@
|
|||
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
|
||||
|
||||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
adapter_name_or_path: saves/fm9g/lora/sft_repo/checkpoint-1160
|
||||
template: 9g
|
||||
finetuning_type: lora
|
||||
|
||||
### export
|
||||
export_dir: /workspace/zksc/repo_304/models/fm9g_lora_sft_repo
|
||||
export_size: 2
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
|
@ -0,0 +1,14 @@
|
|||
{
|
||||
"menu_names":{
|
||||
"file_name" :"menu_names.json"
|
||||
},
|
||||
"food_hot":{
|
||||
"file_name" :"food_hot.json"
|
||||
},
|
||||
"yuansu":{
|
||||
"file_name" :"yuansu.json"
|
||||
},
|
||||
"occupation_recipe":{
|
||||
"file_name" :"occupation_recipe.json"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
import argparse
|
||||
import json
|
||||
|
||||
def generateJson(inputPath, outputPath):
    """Convert a JSONL chat-record file into an Alpaca-style JSON array.

    Each source line is a JSON object like
    ``{"input": "<用户>…<AI>", "output": "…"}``. The 4-character
    ``<用户>`` prefix and ``<AI>`` suffix are stripped from the input
    field, and the records are written out as a single JSON list of
    ``{"instruction", "input", "output"}`` objects.

    Args:
        inputPath: Path to the JSONL source file (UTF-8 encoded).
        outputPath: Path of the JSON array file to write.
    """
    items = []
    # Read in text mode with an explicit encoding instead of "rb";
    # json.loads behaves identically on the decoded lines.
    with open(inputPath, "r", encoding="utf8") as data:
        for line in data:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of raising
                # json.JSONDecodeError on them.
                continue
            record = json.loads(line)
            items.append({
                "instruction": "",
                # record["input"][4:-4] drops the leading "<用户>" and
                # trailing "<AI>" markers (both exactly 4 characters) —
                # same result as the original (x[:len(x)-4])[4:].
                "input": record["input"][4:-4],
                "output": record["output"],
            })
    # Open the output only once reading is done, so the two files are
    # not held open simultaneously.
    with open(outputPath, "w", encoding="utf8") as outputs:
        json.dump(items, outputs, ensure_ascii=False)
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: convert the JSONL file given by --path
    # into an Alpaca-style JSON array written to --output.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--path", "-p", required=True, help="Input data file path.")
    arg_parser.add_argument("--output", "-o", required=True, help="Output data file path.")
    cli_args = arg_parser.parse_args()
    generateJson(cli_args.path, cli_args.output)
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,217 @@
|
|||
[
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为教师职业推荐相关食谱",
|
||||
"output": "肺调养食谱,咳喘食谱,益智补脑食谱,清热去火食谱,健忘食谱,咽炎食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为记者职业推荐相关食谱",
|
||||
"output": "健忘食谱,益智补脑食谱,咽炎食谱,失眠食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为演员职业推荐相关食谱",
|
||||
"output": "减肥菜谱,美容菜谱,延缓衰老食谱,气血双补食谱,乌发食谱,丰胸食谱,补虚养身食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为厨师职业推荐相关食谱",
|
||||
"output": "高温环境作业人群食谱,肺调养食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为医生职业推荐相关食谱",
|
||||
"output": "益智补脑食谱,健忘食谱,防癌抗癌食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为护士职业推荐相关食谱",
|
||||
"output": "接触电离辐射人员食谱,益智补脑食谱,防癌抗癌食谱,滋阴食谱,补虚养身食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为司机职业推荐相关食谱",
|
||||
"output": "痔疮食谱,明目食谱,关节炎食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为军人职业推荐相关食谱",
|
||||
"output": "跌打骨折食谱,运动员食谱,健脾开胃食谱,防暑食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为律师职业推荐相关食谱",
|
||||
"output": "益智补脑食谱,健忘食谱,失眠食谱,健脾开胃食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为翻译职业推荐相关食谱",
|
||||
"output": "肾调养食谱,补虚养身食谱,气血双补食谱,益智补脑食谱,失眠食谱,头痛食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为法官职业推荐相关食谱",
|
||||
"output": "益智补脑食谱,高血压食谱,神经衰弱食谱,心悸食谱,乌发食谱,头痛食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为保安职业推荐相关食谱",
|
||||
"output": "冻疮食谱,营养不良食谱,壮腰健肾食谱,防暑食谱,气血双补食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为花匠职业推荐相关食谱",
|
||||
"output": "接触化学毒素人员食谱,清热解毒食谱,气血双补食谱,防暑食谱,骨质增生食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为清洁工职业推荐相关食谱",
|
||||
"output": "脾调养食谱,肝调养食谱,补虚养身食谱,气血双补食谱,理气调理食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为理发师职业推荐相关食谱",
|
||||
"output": "营养不良食谱,肝调养食谱,肾调养食谱,乌发食谱,健脾开胃"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为消防员职业推荐相关食谱",
|
||||
"output": "神经衰弱食谱,高血压食谱,明目食谱,活血化瘀食谱,气血双补食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为推销员职业推荐相关食谱",
|
||||
"output": "脾调养食谱,高血压食谱,气血双补食谱,肝调养食谱,头痛食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为运动员职业推荐相关食谱",
|
||||
"output": "不孕不育食谱,健脾开胃食谱,增肥食谱,关节炎食谱,运动员食谱,跌打骨折食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为机长职业推荐相关食谱",
|
||||
"output": "低温环境作业人群食谱,神经衰弱食谱,明目食谱,失眠食谱,心调养食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为空姐职业推荐相关食谱",
|
||||
"output": "滋阴食谱,美容菜谱,咳喘食谱,气血双补食谱,理气调理食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为画师职业推荐相关食谱",
|
||||
"output": "明目食谱,补虚养身食谱,活血化瘀食谱,益智补脑食谱,日本料理"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为动漫设计职业推荐相关食谱",
|
||||
"output": "日本料理,健脾开胃食谱,神经衰弱食谱,明目食谱,益智补脑食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为白领职业推荐相关食谱",
|
||||
"output": "益智补脑食谱,健忘食谱,失眠食谱,心调养食谱,头痛食谱,解酒食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为前台职业推荐相关食谱",
|
||||
"output": "家常菜谱,壮腰健肾食谱,美容菜谱,咳喘食谱,心调养食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为学生职业推荐相关食谱",
|
||||
"output": "青少年食谱,健脾开胃食谱,明目食谱,健忘食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为商人职业推荐相关食谱",
|
||||
"output": "肾调养食谱,肝调养食谱,肝硬化食谱,解酒食谱,美味粥汤"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为会计职业推荐相关食谱",
|
||||
"output": "明目食谱,益智补脑食谱,精品主食,美味粥汤,关节炎食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为程序员职业推荐相关食谱",
|
||||
"output": "神经衰弱食谱,益智补脑食谱,健忘食谱,乌发食谱,失眠食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为服务员职业推荐相关食谱",
|
||||
"output": "健忘食谱,耳鸣食谱,明目食谱,理气调理食谱,咽炎食谱,"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为作家职业推荐相关食谱",
|
||||
"output": "明目食谱,骨质增生食谱,神经衰弱食谱,益智补脑食谱,健忘食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为模特职业推荐相关食谱",
|
||||
"output": "美容菜谱,减肥菜谱,延缓衰老食谱,丰胸食谱,乌发食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为导游职业推荐相关食谱",
|
||||
"output": "益智补脑食谱,健忘食谱,咽炎食谱,高血压食谱,补气食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为歌手职业推荐相关食谱",
|
||||
"output": "咽炎食谱,健忘食谱,补气食谱,减肥菜谱,清热去火食谱,理气调理食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为裁缝职业推荐相关食谱",
|
||||
"output": "壮腰健肾食谱,明目食谱,脚气食谱,关节炎食谱,健脾开胃食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为快递员职业推荐相关食谱",
|
||||
"output": "皮肤病食谱,壮腰健肾食谱,防暑食谱,关节炎食谱,跌打骨折食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为外卖员职业推荐相关食谱",
|
||||
"output": "皮肤病食谱,防暑食谱,胃调养食谱,跌打骨折食谱,壮腰健肾食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为主持人职业推荐相关食谱",
|
||||
"output": "咽炎食谱,关节炎食谱,壮腰健肾食谱,明目食谱,防癌抗癌食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为漫画家职业推荐相关食谱",
|
||||
"output": "壮腰健肾食谱,皮肤病食谱,明目食谱,减肥菜谱,前列腺疾病食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为面点师职业推荐相关食谱",
|
||||
"output": "关节炎食谱,肺调养食谱,壮腰健肾食谱,支气管炎食谱,皮肤病食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为美甲师职业推荐相关食谱",
|
||||
"output": "关节炎食谱,明目食谱,皮肤病食谱,防癌抗癌食谱,肺调养食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为保姆职业推荐相关食谱",
|
||||
"output": "关节炎食谱,壮腰健肾食谱,明目食谱,感冒食谱,气血双补食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为电力工程师职业推荐相关食谱",
|
||||
"output": "接触电离辐射人员食谱,防癌抗癌食谱,明目食谱,关节炎食谱,壮腰健肾食谱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请为化验员职业推荐相关食谱",
|
||||
"output": "明目食谱,防癌抗癌食谱,感冒食谱,皮肤病食谱,肺调养食谱"
|
||||
}
|
||||
]
|
|
@ -1,43 +1,43 @@
|
|||
{"input": "<用户>请为教师职业推荐相关食谱<AI>", "output": "肺调养食谱,咳喘食谱,益智补脑食谱,清热去火食谱,健忘食谱,咽炎食谱"}
|
||||
{"input": "<用户>请为记者职业推荐相关食谱<AI>", "output": "健忘食谱,益智补脑食谱,咽炎食谱,失眠食谱"}
|
||||
{"input": "<用户>请为演员职业推荐相关食谱<AI>", "output": "减肥菜谱,美容菜谱,延缓衰老食谱,气血双补食谱,乌发食谱,丰胸食谱,补虚养身食谱"}
|
||||
{"input": "<用户>请为厨师职业推荐相关食谱<AI>", "output": "高温环境作业人群食谱,肺调养食谱"}
|
||||
{"input": "<用户>请为医生职业推荐相关食谱<AI>", "output": "益智补脑食谱,健忘食谱,防癌抗癌食谱"}
|
||||
{"input": "<用户>请为护士职业推荐相关食谱<AI>", "output": "接触电离辐射人员食谱,益智补脑食谱,防癌抗癌食谱,滋阴食谱,补虚养身食谱"}
|
||||
{"input": "<用户>请为司机职业推荐相关食谱<AI>", "output": "痔疮食谱,明目食谱,关节炎食谱"}
|
||||
{"input": "<用户>请为军人职业推荐相关食谱<AI>", "output": "跌打骨折食谱,运动员食谱,健脾开胃食谱,防暑食谱"}
|
||||
{"input": "<用户>请为律师职业推荐相关食谱<AI>", "output": "益智补脑食谱,健忘食谱,失眠食谱,健脾开胃食谱"}
|
||||
{"input": "<用户>请为翻译职业推荐相关食谱<AI>", "output": "肾调养食谱,补虚养身食谱,气血双补食谱,益智补脑食谱,失眠食谱,头痛食谱"}
|
||||
{"input": "<用户>请为法官职业推荐相关食谱<AI>", "output": "益智补脑食谱,高血压食谱,神经衰弱食谱,心悸食谱,乌发食谱,头痛食谱"}
|
||||
{"input": "<用户>请为保安职业推荐相关食谱<AI>", "output": "冻疮食谱,营养不良食谱,壮腰健肾食谱,防暑食谱,气血双补食谱"}
|
||||
{"input": "<用户>请为花匠职业推荐相关食谱<AI>", "output": "接触化学毒素人员食谱,清热解毒食谱,气血双补食谱,防暑食谱,骨质增生食谱"}
|
||||
{"input": "<用户>请为清洁工职业推荐相关食谱<AI>", "output": "脾调养食谱,肝调养食谱,补虚养身食谱,气血双补食谱,理气调理食谱"}
|
||||
{"input": "<用户>请为理发师职业推荐相关食谱<AI>", "output": "营养不良食谱,肝调养食谱,肾调养食谱,乌发食谱,健脾开胃"}
|
||||
{"input": "<用户>请为消防员职业推荐相关食谱<AI>", "output": "神经衰弱食谱,高血压食谱,明目食谱,活血化瘀食谱,气血双补食谱"}
|
||||
{"input": "<用户>请为推销员职业推荐相关食谱<AI>", "output": "脾调养食谱,高血压食谱,气血双补食谱,肝调养食谱,头痛食谱"}
|
||||
{"input": "<用户>请为运动员职业推荐相关食谱<AI>", "output": "不孕不育食谱,健脾开胃食谱,增肥食谱,关节炎食谱,运动员食谱,跌打骨折食谱"}
|
||||
{"input": "<用户>请为机长职业推荐相关食谱<AI>", "output": "低温环境作业人群食谱,神经衰弱食谱,明目食谱,失眠食谱,心调养食谱"}
|
||||
{"input": "<用户>请为空姐职业推荐相关食谱<AI>", "output": "滋阴食谱,美容菜谱,咳喘食谱,气血双补食谱,理气调理食谱"}
|
||||
{"input": "<用户>请为画师职业推荐相关食谱<AI>", "output": "明目食谱,补虚养身食谱,活血化瘀食谱,益智补脑食谱,日本料理"}
|
||||
{"input": "<用户>请为动漫设计职业推荐相关食谱<AI>", "output": "日本料理,健脾开胃食谱,神经衰弱食谱,明目食谱,益智补脑食谱"}
|
||||
{"input": "<用户>请为白领职业推荐相关食谱<AI>", "output": "益智补脑食谱,健忘食谱,失眠食谱,心调养食谱,头痛食谱,解酒食谱"}
|
||||
{"input": "<用户>请为前台职业推荐相关食谱<AI>", "output": "家常菜谱,壮腰健肾食谱,美容菜谱,咳喘食谱,心调养食谱"}
|
||||
{"input": "<用户>请为学生职业推荐相关食谱<AI>", "output": "青少年食谱,健脾开胃食谱,明目食谱,健忘食谱"}
|
||||
{"input": "<用户>请为商人职业推荐相关食谱<AI>", "output": "肾调养食谱,肝调养食谱,肝硬化食谱,解酒食谱,美味粥汤"}
|
||||
{"input": "<用户>请为会计职业推荐相关食谱<AI>", "output": "明目食谱,益智补脑食谱,精品主食,美味粥汤,关节炎食谱"}
|
||||
{"input": "<用户>请为程序员职业推荐相关食谱<AI>", "output": "神经衰弱食谱,益智补脑食谱,健忘食谱,乌发食谱,失眠食谱"}
|
||||
{"input": "<用户>请为服务员职业推荐相关食谱<AI>", "output": "健忘食谱,耳鸣食谱,明目食谱,理气调理食谱,咽炎食谱,"}
|
||||
{"input": "<用户>请为作家职业推荐相关食谱<AI>", "output": "明目食谱,骨质增生食谱,神经衰弱食谱,益智补脑食谱,健忘食谱"}
|
||||
{"input": "<用户>请为模特职业推荐相关食谱<AI>", "output": "美容菜谱,减肥菜谱,延缓衰老食谱,丰胸食谱,乌发食谱"}
|
||||
{"input": "<用户>请为导游职业推荐相关食谱<AI>", "output": "益智补脑食谱,健忘食谱,咽炎食谱,高血压食谱,补气食谱"}
|
||||
{"input": "<用户>请为歌手职业推荐相关食谱<AI>", "output": "咽炎食谱,健忘食谱,补气食谱,减肥菜谱,清热去火食谱,理气调理食谱"}
|
||||
{"input": "<用户>请为裁缝职业推荐相关食谱<AI>", "output": "壮腰健肾食谱,明目食谱,脚气食谱,关节炎食谱,健脾开胃食谱"}
|
||||
{"input": "<用户>请为快递员职业推荐相关食谱<AI>", "output": "皮肤病食谱,壮腰健肾食谱,防暑食谱,关节炎食谱,跌打骨折食谱"}
|
||||
{"input": "<用户>请为外卖员职业推荐相关食谱<AI>", "output": "皮肤病食谱,防暑食谱,胃调养食谱,跌打骨折食谱,壮腰健肾食谱"}
|
||||
{"input": "<用户>请为主持人职业推荐相关食谱<AI>", "output": "咽炎食谱,关节炎食谱,壮腰健肾食谱,明目食谱,防癌抗癌食谱"}
|
||||
{"input": "<用户>请为漫画家职业推荐相关食谱<AI>", "output": "壮腰健肾食谱,皮肤病食谱,明目食谱,减肥菜谱,前列腺疾病食谱"}
|
||||
{"input": "<用户>请为面点师职业推荐相关食谱<AI>", "output": "关节炎食谱,肺调养食谱,壮腰健肾食谱,支气管炎食谱,皮肤病食谱"}
|
||||
{"input": "<用户>请为美甲师职业推荐相关食谱<AI>", "output": "关节炎食谱,明目食谱,皮肤病食谱,防癌抗癌食谱,肺调养食谱"}
|
||||
{"input": "<用户>请为保姆职业推荐相关食谱<AI>", "output": "关节炎食谱,壮腰健肾食谱,明目食谱,感冒食谱,气血双补食谱"}
|
||||
{"input": "<用户>请为电力工程师职业推荐相关食谱<AI>", "output": "接触电离辐射人员食谱,防癌抗癌食谱,明目食谱,关节炎食谱,壮腰健肾食谱"}
|
||||
{"input": "<用户>请为化验员职业推荐相关食谱<AI>", "output": "明目食谱,防癌抗癌食谱,感冒食谱,皮肤病食谱,肺调养食谱"}
|
||||
{"input": "<用户>请为教师职业推荐相关食谱<AI>", "output": "肺调养食谱,咳喘食谱,益智补脑食谱,清热去火食谱,健忘食谱,咽炎食谱"}
|
||||
{"input": "<用户>请为记者职业推荐相关食谱<AI>", "output": "健忘食谱,益智补脑食谱,咽炎食谱,失眠食谱"}
|
||||
{"input": "<用户>请为演员职业推荐相关食谱<AI>", "output": "减肥菜谱,美容菜谱,延缓衰老食谱,气血双补食谱,乌发食谱,丰胸食谱,补虚养身食谱"}
|
||||
{"input": "<用户>请为厨师职业推荐相关食谱<AI>", "output": "高温环境作业人群食谱,肺调养食谱"}
|
||||
{"input": "<用户>请为医生职业推荐相关食谱<AI>", "output": "益智补脑食谱,健忘食谱,防癌抗癌食谱"}
|
||||
{"input": "<用户>请为护士职业推荐相关食谱<AI>", "output": "接触电离辐射人员食谱,益智补脑食谱,防癌抗癌食谱,滋阴食谱,补虚养身食谱"}
|
||||
{"input": "<用户>请为司机职业推荐相关食谱<AI>", "output": "痔疮食谱,明目食谱,关节炎食谱"}
|
||||
{"input": "<用户>请为军人职业推荐相关食谱<AI>", "output": "跌打骨折食谱,运动员食谱,健脾开胃食谱,防暑食谱"}
|
||||
{"input": "<用户>请为律师职业推荐相关食谱<AI>", "output": "益智补脑食谱,健忘食谱,失眠食谱,健脾开胃食谱"}
|
||||
{"input": "<用户>请为翻译职业推荐相关食谱<AI>", "output": "肾调养食谱,补虚养身食谱,气血双补食谱,益智补脑食谱,失眠食谱,头痛食谱"}
|
||||
{"input": "<用户>请为法官职业推荐相关食谱<AI>", "output": "益智补脑食谱,高血压食谱,神经衰弱食谱,心悸食谱,乌发食谱,头痛食谱"}
|
||||
{"input": "<用户>请为保安职业推荐相关食谱<AI>", "output": "冻疮食谱,营养不良食谱,壮腰健肾食谱,防暑食谱,气血双补食谱"}
|
||||
{"input": "<用户>请为花匠职业推荐相关食谱<AI>", "output": "接触化学毒素人员食谱,清热解毒食谱,气血双补食谱,防暑食谱,骨质增生食谱"}
|
||||
{"input": "<用户>请为清洁工职业推荐相关食谱<AI>", "output": "脾调养食谱,肝调养食谱,补虚养身食谱,气血双补食谱,理气调理食谱"}
|
||||
{"input": "<用户>请为理发师职业推荐相关食谱<AI>", "output": "营养不良食谱,肝调养食谱,肾调养食谱,乌发食谱,健脾开胃"}
|
||||
{"input": "<用户>请为消防员职业推荐相关食谱<AI>", "output": "神经衰弱食谱,高血压食谱,明目食谱,活血化瘀食谱,气血双补食谱"}
|
||||
{"input": "<用户>请为推销员职业推荐相关食谱<AI>", "output": "脾调养食谱,高血压食谱,气血双补食谱,肝调养食谱,头痛食谱"}
|
||||
{"input": "<用户>请为运动员职业推荐相关食谱<AI>", "output": "不孕不育食谱,健脾开胃食谱,增肥食谱,关节炎食谱,运动员食谱,跌打骨折食谱"}
|
||||
{"input": "<用户>请为机长职业推荐相关食谱<AI>", "output": "低温环境作业人群食谱,神经衰弱食谱,明目食谱,失眠食谱,心调养食谱"}
|
||||
{"input": "<用户>请为空姐职业推荐相关食谱<AI>", "output": "滋阴食谱,美容菜谱,咳喘食谱,气血双补食谱,理气调理食谱"}
|
||||
{"input": "<用户>请为画师职业推荐相关食谱<AI>", "output": "明目食谱,补虚养身食谱,活血化瘀食谱,益智补脑食谱,日本料理"}
|
||||
{"input": "<用户>请为动漫设计职业推荐相关食谱<AI>", "output": "日本料理,健脾开胃食谱,神经衰弱食谱,明目食谱,益智补脑食谱"}
|
||||
{"input": "<用户>请为白领职业推荐相关食谱<AI>", "output": "益智补脑食谱,健忘食谱,失眠食谱,心调养食谱,头痛食谱,解酒食谱"}
|
||||
{"input": "<用户>请为前台职业推荐相关食谱<AI>", "output": "家常菜谱,壮腰健肾食谱,美容菜谱,咳喘食谱,心调养食谱"}
|
||||
{"input": "<用户>请为学生职业推荐相关食谱<AI>", "output": "青少年食谱,健脾开胃食谱,明目食谱,健忘食谱"}
|
||||
{"input": "<用户>请为商人职业推荐相关食谱<AI>", "output": "肾调养食谱,肝调养食谱,肝硬化食谱,解酒食谱,美味粥汤"}
|
||||
{"input": "<用户>请为会计职业推荐相关食谱<AI>", "output": "明目食谱,益智补脑食谱,精品主食,美味粥汤,关节炎食谱"}
|
||||
{"input": "<用户>请为程序员职业推荐相关食谱<AI>", "output": "神经衰弱食谱,益智补脑食谱,健忘食谱,乌发食谱,失眠食谱"}
|
||||
{"input": "<用户>请为服务员职业推荐相关食谱<AI>", "output": "健忘食谱,耳鸣食谱,明目食谱,理气调理食谱,咽炎食谱,"}
|
||||
{"input": "<用户>请为作家职业推荐相关食谱<AI>", "output": "明目食谱,骨质增生食谱,神经衰弱食谱,益智补脑食谱,健忘食谱"}
|
||||
{"input": "<用户>请为模特职业推荐相关食谱<AI>", "output": "美容菜谱,减肥菜谱,延缓衰老食谱,丰胸食谱,乌发食谱"}
|
||||
{"input": "<用户>请为导游职业推荐相关食谱<AI>", "output": "益智补脑食谱,健忘食谱,咽炎食谱,高血压食谱,补气食谱"}
|
||||
{"input": "<用户>请为歌手职业推荐相关食谱<AI>", "output": "咽炎食谱,健忘食谱,补气食谱,减肥菜谱,清热去火食谱,理气调理食谱"}
|
||||
{"input": "<用户>请为裁缝职业推荐相关食谱<AI>", "output": "壮腰健肾食谱,明目食谱,脚气食谱,关节炎食谱,健脾开胃食谱"}
|
||||
{"input": "<用户>请为快递员职业推荐相关食谱<AI>", "output": "皮肤病食谱,壮腰健肾食谱,防暑食谱,关节炎食谱,跌打骨折食谱"}
|
||||
{"input": "<用户>请为外卖员职业推荐相关食谱<AI>", "output": "皮肤病食谱,防暑食谱,胃调养食谱,跌打骨折食谱,壮腰健肾食谱"}
|
||||
{"input": "<用户>请为主持人职业推荐相关食谱<AI>", "output": "咽炎食谱,关节炎食谱,壮腰健肾食谱,明目食谱,防癌抗癌食谱"}
|
||||
{"input": "<用户>请为漫画家职业推荐相关食谱<AI>", "output": "壮腰健肾食谱,皮肤病食谱,明目食谱,减肥菜谱,前列腺疾病食谱"}
|
||||
{"input": "<用户>请为面点师职业推荐相关食谱<AI>", "output": "关节炎食谱,肺调养食谱,壮腰健肾食谱,支气管炎食谱,皮肤病食谱"}
|
||||
{"input": "<用户>请为美甲师职业推荐相关食谱<AI>", "output": "关节炎食谱,明目食谱,皮肤病食谱,防癌抗癌食谱,肺调养食谱"}
|
||||
{"input": "<用户>请为保姆职业推荐相关食谱<AI>", "output": "关节炎食谱,壮腰健肾食谱,明目食谱,感冒食谱,气血双补食谱"}
|
||||
{"input": "<用户>请为电力工程师职业推荐相关食谱<AI>", "output": "接触电离辐射人员食谱,防癌抗癌食谱,明目食谱,关节炎食谱,壮腰健肾食谱"}
|
||||
{"input": "<用户>请为化验员职业推荐相关食谱<AI>", "output": "明目食谱,防癌抗癌食谱,感冒食谱,皮肤病食谱,肺调养食谱"}
|
||||
|
|
|
@ -0,0 +1,977 @@
|
|||
[
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:能量棒,酵母,培根,小麦胚粉,素食汉堡,香肠,葵花子仁"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:小麦胚芽,酵母,亚麻籽,大麻籽,松子,澳洲坚果,燕麦麸皮"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:红花籽仁,面条,四棱豆,面包屑,霹雳果,腊肉,黑豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:黑龟豆,麦片,意大利面,大豆,扁豆,开心果,罂粟籽"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:豇豆,四棱豆叶,粗粒小麦粉,芥末,猪大排,豆类,咸肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:燕麦,鼠尾草,棉籽仁,蔓越莓,豆类,花生仁,豌豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:花豆,澳大利亚坚果,大豆粉,豆粕,黄豆,豌豆,花卷"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:大腊肠,芝麻,叉烧肉,山核桃,大麦,榛子,鸽子豌豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:花生,羽扇豆,孜然,绿豆,榛子,奇亚籽,巴西栗"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:意大利面,芸豆,培根片,面包,马牙大豆,大豆粉,大米"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:番茄,松饼,蚕豆,啤酒香肠,华夫饼,糙米,猪肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:百吉饼,面包,辣椒,猪肉,芸豆,芸豆,芸豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:小麦麸皮,迷迭香,百里香,金华火腿,全麦面粉,豆腐,豆粕"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:豌豆,面包果种子,香肠,汤,鹰嘴豆,猪肉,鸡心"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B1的食物",
|
||||
"output": "好的,以下是富含高维生素B1的食物:德式小香肠,洋葱粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:大红菇,牛奶粉,驼鹿肝,橙汁饮料,酵母,香杏丁蘑,香菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:羊肚菌,猪肝,羊肾,能量棒,香杏片口蘑,杏仁,羊肝"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:德国式熏肝香肠,松蘑,冬菇,咖啡,龙蒿,牛肝,香菇"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:婴儿奶粉,辣椒粉,火鸡肝,罗勒,奶酪,普中红蘑,母乳化奶粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:大豆粉,杏仁,蘑菇,鸡肝,麦片,鸭肝,桂圆肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:紫菜,黄螺,黄蘑,黄鳝,驯鹿肉,巴旦木,大豆粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:辣椒粉,奶酪,鸭心,大豆,牛肾,法兰克福,辣椒粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:驯鹿肉,酵母,全脂速溶牛奶粉,小麦胚粉,鸭胰,大豆,苜蓿"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:全脂牛奶粉,巴旦木,南瓜粉,榛蘑,猪肾,奶豆腐,酸梅晶"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:汤菜,辣木叶,华夫饼,苦豆子,龙舌兰,桑葚,四棱豆叶"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:豆腐丝,鸭蛋黄,鹅蛋黄,小麦麸皮,粗粒小麦粉,可可粉,杏仁"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:食用黄麻,龙牙豆,发菜,牛至,蛋黄,白鲸,金丝小枣"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:腊羊肉,小麦胚芽,麋鹿肉,鹌鹑蛋,培根,猪心,百里香"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:黄油渣,火鸡蛋,雅培,角豆粉,红肖梨,红螺,豆瓣酱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B2的食物",
|
||||
"output": "好的,以下是富含高维生素B2的食物:可可粉,鸡蛋"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:鼠尾草,猴面包树粉,龙蒿,青椒,能量棒,红牛,辣椒"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:辣椒粉,辣椒粉,辣椒粉,香叶,迷迭香,开心果,大蒜粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:洋葱,酵母,葵花子仁,罗勒,麸皮,小麦麸皮,小麦胚粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:小麦胚芽,大蒜,大蒜,辣木叶,马郁兰,红花籽仁,牛肝"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:牛至,胡萝卜,火鸡肝,金枪鱼,华夫饼,藏红花,白玉米"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:黄玉米,金枪鱼,冬菇,蘑菇,香菇,金枪鱼,麋鹿肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:驼鹿肝,羊肝,香菜,麋鹿肉,芹菜籽,鸡肝,金枪鱼"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:香肠,三文鱼,棕榈心,葵花子,苦瓜,芝麻,芝麻"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:麋鹿肉,棉籽仁,西红柿种子,猪肉,猪肉,鸭肝,鹅肝"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:西梅干,野鸡,糙米粉,洋葱粉,麦片,橡子,猪肝"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:橡子粉,面包果种子,山毛榉,猪肉,猪大排,栗子,牛肾"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:栗子,野鸡,大麦麦芽粉,八角,鸸鹋肉,鸸鹋肉,鸭胸脯肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:麋鹿肉,莲子,姜粉,玉米,玉米粒,牛肉,剑鱼"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:香菜,食用黄麻,鹌鹑,胡芦巴籽,大麻籽,火鸡,牛肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素B6的食物",
|
||||
"output": "好的,以下是富含高维生素B6的食物:玉米面,苋菜籽"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:针叶樱桃,针叶樱桃汁,酸枣,青椒,果味饮料,玫瑰果,枣"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:辣椒,沙棘,橙汁饮料,大白菜,辣椒,黑醋栗,黑加仑"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:猴面包树粉,辣椒,百里香,辣椒,欧芹,青椒,辣椒"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:香菜,油菜,墨西哥胡椒,苜蓿,蜜枣,落葵,羽衣甘蓝"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:猕猴桃,能量棒,西兰花,苹果汁,莳萝,抱子甘蓝,菜花"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:菠菜,藏红花,辣椒,大蒜,辣椒,甘蓝菜,香菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:辣椒,酸刺,食用黄麻,芥菜,番茄汁,鱼腥草,芥菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:枣,水芹,番石榴,豌豆苗,柿子,油菜薹,西兰花"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:辣椒,中华猕猴桃,余甘子,抱子甘蓝,球茎甘蓝,迷迭香,菜花"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:柚子,木瓜,豌豆,豌豆,萝卜叶,脐橙,草莓"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:韭菜,枸杞,苹果汁,红菜薹,汤菜,卷心菜,菠萝"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:西兰花,苦瓜,地瓜叶,独行菜,蜜枣,蔬菜汁餐前开胃菜,乐陵枣"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:橙子,红果,柠檬,豆瓣菜,败酱,芋头叶,辣木叶"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:马郁兰,西兰花,芥菜,萝卜缨,橙汁,龙蒿,百里香"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素C的食物",
|
||||
"output": "好的,以下是富含高维生素C的食物:花椰菜,花椰菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:金丝瓜,小蓟,沙棘,猪蹄,鸡蛋白,苹果汁,蘑菇"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:蘑菇,马铃薯,土豆,土豆,咖啡,豆腐,樱桃香草布丁"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:马铃薯,马铃薯,马铃薯,味噌,纳豆,豆腐,豆腐"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:豆腐,苹果汁,珍珠大麦,鸡精,蒲公英叶,红元帅苹果,橘子汁"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:苜蓿籽,蜜瓜,菠萝汁,菠萝,菠萝,紫花苜蓿种子芽,洋葱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:洋葱,洋葱,豌豆,芸豆,豌豆,豆类,百香果"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:珍珠大麦,海蜇,菜瓜,婆罗门参,食用黄麻,西瓜,鸡腿"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:油豆角,黄瓜,豌豆,可可粉,芸豆,黄瓜,萝卜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:豌豆,芸豆,甜菜,甜菜,甜菜根,牛奶,葡萄柚汁"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:橙汁,油豆角,甜菜,土豆,大米,葡萄柚汁,豆腐"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:苹果汁,甜菜,竹笋,酸模,苣荬菜,猪肉,奶片"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:苹果,樱桃,海枣,哈密瓜,西瓜,蚕豆,小米"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:火鸡肉汁,花椰菜,犹太逾越节薄饼,意大利面,麦芽饮料混合物,发菜,祝光苹果"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:烧鹅,火鸡腿,狗母鱼,樱桃,樱桃,花椰菜,花椰菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高维生素E的食物",
|
||||
"output": "好的,以下是富含高维生素E的食物:玉米,土豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:葡萄柚汁,麦芽混合饮料粉,面包,松饼,酱汁,巧克力,雅培"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:火鸡肉汁,松饼,海鲜酱汁,松饼,素食汉堡,海蜇,巧克力"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:海蜇皮,牛奶,汤,辣牛肉酱与豆,海蜇头,樱桃香草布丁,甜甜圈"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:麦芽饮料混合物,人奶,奶酪,奶昔,牛奶,麦芽饮料混合物,鲱鱼"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:鸡精,士力架,圣代,牛奶,酸奶,酸奶,水果冰淇淋"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:蛋黄酱,华夫饼,酸奶,果味奶,甜甜圈,阿拉斯加帝王蟹,辣椒"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:牛奶巧克力,百吉饼,安康鱼,比萨,潜艇三明治,金枪鱼,甜甜圈"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:羊奶,金枪鱼,三明治,甜甜圈,安康鱼,鲒花,鸭掌"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:奶豆腐,奶豆腐,炼乳,石斑鱼,鲷鱼,喜乐,金枪鱼"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:牛肉,金枪鱼,嫩鸡胸肉,阿拉斯加帝王蟹,美式香蕉面包,面包鱼,鸡肉馅饼"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:龙利鱼,芝士汉堡,腊肉,鸭皮,风干肠,石斑鱼,鲷鱼"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:金枪鱼,金枪鱼,兔肉,鳐鱼,多宝鱼,鸭翅,火鸡胸脯肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:金枪鱼,海参,乌龟,蜗牛,青蛙腿,猪血,蒜肠"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:牛蹄筋,奶疙瘩,海参,杏仁露,虱目鱼,羊肉,塘虱鱼"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低胆固醇的食物",
|
||||
"output": "好的,以下是富含低胆固醇的食物:阿拉斯加帝王蟹,牛肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:咖啡,咖啡,口蘑,龙蒿,龙井茶,香菜,罗勒"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:大豆粉,可可粉,榛蘑,豆粕,石榴花茶,大豆粉,辣椒粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:大白菜,大豆浓缩蛋白,猴面包树粉,西红柿种子,姜黄粉,大豆粉,面包果种子"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:黄蘑,辣椒粉,红茶,牛奶粉,黄豆粉,杏,棕榈心"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:大豆,紫菜,大豆,孜然,白笋,羊肚菌,藏红花"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:绿茶,花茶,刺楸,银耳,豆类,可可粉,小麦胚粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:马郁兰,黄豆,黑龟豆,香蕉,芸豆,黑豆,铁观音茶"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:青椒,芸豆,芹菜籽,花豆,鸽子豌豆,豆粕,黑豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:大豆,芸豆,辣椒粉,葛缕子籽,棉籽仁,桂圆,蔓越莓"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:胡椒粉,姜粉,墨鱼,牛至,赤豆,绿豆,榛子"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:腌龙须菜,蘑菇,芸豆,大麻籽,大蒜粉,豆类,小麦麸皮"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:咖喱粉,杏,冬菇,五香粉,鱿鱼,豆蔻,胡萝卜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:豇豆,茴香籽,辣椒,蚕豆,面包果种子,土豆粉,扁豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:鼠尾草,蚕豆,芸豆,西梅干,绿豆面,黄豆,香菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钾的食物",
|
||||
"output": "好的,以下是富含高钾的食物:开心果,丁香粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:苔菜,海参,罗勒,大麻籽,小麦麸皮,松子,榛子"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:可可粉,可可粉,湖盐,西瓜子,芹菜籽,棉籽仁,大豆粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:鼠尾草,榛子,香菜,亚麻籽,胡麻子,麸皮,南瓜子"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:巴西栗,芥末,大豆粉,孜然,墨鱼,红花籽仁,鲍鱼"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:罂粟籽,龙蒿,马郁兰,茴香籽,奇亚籽,桑葚,巧克力"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:咖啡,黄毛籽,芥末,丁香鱼,大豆浓缩蛋白,西红柿种子,咖啡"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:山核桃,豆粕,蛏干,霹雳果,腰果,芝麻,大豆粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:葵花子仁,大豆,地衣,腰果,杏仁,牛至,香菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:巴旦木,葵花子,绿豆,虾皮,葵花子,藏红花,芥菜干"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:丁香粉,荞麦,葛缕子籽,咖喱粉,荞麦粉,松子,蜗牛"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:土盐,能量棒,苋菜籽,黑豆,莲子,巴旦木,小麦胚芽"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:芝麻酱,海米,燕麦麸皮,松子,香海螺,荞麦,紫萼香茶菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:豆蔻,大豆,龙井茶,辣椒粉,黄豆,荞麦片,迷迭香"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:百里香,大白菜,茶砖,姜粉,姜黄粉,龙舌兰,石榴花茶"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高镁的食物",
|
||||
"output": "好的,以下是富含高镁的食物:芝麻,珠茶"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:节瓜,小蓟,旱酥梨,竹笋,白砂糖,白玉米面,晚桃"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:福橘,二锅头,芸豆,葫芦,黄元帅苹果,雪花梨,海棠果"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:吊蛋,胡麻油,白笋,榆钱,珍珠花菜,红富士苹果,红元帅苹果"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:京白梨,高山白桃,杨梅,茶油,南瓜,黄香蕉苹果,香梨"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:酒枣,红粉皮石榴,柿子,橘柑子,菠萝,香蕉,粳米"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:金瓜,苏梅梨,石榴,早橘,扁豆,白瓜,佛手瓜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:韭苔,蒌蒿,巴梨,软梨,鸭广梨,蒲桃,醋栗"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:苹果,大米,富士苹果,盛宴苹果,史密斯奶奶苹果,红星苹果,杏子"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:香蕉,黑莓,蓝莓,蓝莓,樱桃,无花果,醋栗"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:葡萄柚汁,葡萄柚汁,葡萄柚汁,柠檬汁,橙汁,橘汁,芒果"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:脐橙,梨,菠萝,菠萝,柚子,覆盆子,草莓"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:西瓜,油豆角,油豆角,南瓜,杏仁,开心果,荞麦"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:大米,苹果和蓝莓,梨,红安茹梨,茶,花豆,李子干"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:大豆,粗粒小麦粉,意大利面,黑豆,蔓越莓,芸豆,糙米"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含低钠的食物",
|
||||
"output": "好的,以下是富含低钠的食物:蓝莓,覆盆子"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:石螺,罗勒,马郁兰,百里香,牛奶粉,芹菜籽,香菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:鼠尾草,西红柿种子,牛至,芥菜干,罂粟籽,迷迭香,曲拉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:芝麻酱,石榴花茶,香菜,龙蒿,发菜,果味饮料,田螺"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:肉桂,婴儿奶粉,虾皮,鸡蛋粉,孜然,大白菜,奶酪"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:黄毛籽,蕨菜,香叶,奶皮子,榛子,奶酪,芝麻"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:龙舌兰,酸酪蛋,茴香籽,奶疙瘩,螺,豆腐干,苜蓿"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:切达奶酪,葛缕子籽,全脂牛奶粉,虾头酱,全脂速溶牛奶粉,芥末,麦片"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:花椒,丁香粉,奇亚籽,桑葚,芝麻,奶豆腐,黄油渣"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:油菜,苜蓿籽,丁香鱼,海米,湖盐,雅培,红螺"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:咖喱粉,能量棒,刺楸,刺荨麻,龙舌兰,胡萝卜,花茶"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:素大肠,胡椒粉,四棱豆,酸枣,龙舌兰,铁观音茶,菠菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:百里香,白米虾,塘水虾,龙井茶,橙汁饮料,食用黄麻,李广杏脯"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:西瓜子,洋葱粉,豆蔻,多春鱼,红茶,豆腐,豆腐"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:大豆浓缩蛋白,葡萄叶,奶豆腐,羊乳酪,磷虾,紫萼香茶菜,洋葱"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高钙的食物",
|
||||
"output": "好的,以下是富含高钙的食物:胡萝卜缨,豆腐"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:苔菜,普中红蘑,珍珠白蘑,香杏片口蘑,百里香,香杏丁蘑,木耳"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:罗勒,蛏干,松蘑,发菜,马郁兰,白鲸,孜然"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:苜蓿籽,姜黄粉,紫菜,蘑菇,芝麻酱,麦片,芹菜籽"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:香叶,桑葚,青稞,珠茶,芥菜干,牛至,五香粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:蛏子,胡芦巴籽,龙蒿,羊肚菌,鸭血,迷迭香,鼠尾草"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:红茶,百里香,南瓜粉,河蚌,菠菜,白鲸,湖盐"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:榛蘑,鸡血,沙鸡,乌骨鸡,石榴花茶,墨鱼,蕨菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:龙井茶,鸭肝,芝麻,猪肝,鲍鱼,黄蘑,香菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:香菜,秋蛤蜊,辣椒粉,地衣,火鸡肝,辣椒粉,酸酪蛋"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:姜粉,胡麻子,田螺,口蘑,油菜,扁豆,咖喱粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:黑笋,奶疙瘩,咖喱牛肉干,羊血,酵母,藕粉,花茶"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:百里香,巧克力,辣椒粉,芥末,髯海豹肉,髯海豹肉,腐竹"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:豆瓣酱,葛缕子籽,黄毛籽,大豆,牛肉干,可可粉,毛蛤蜊"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:沙棘果汁,豆粕,茶砖,刺蛄,大豆分离蛋白,绿茶,糜子米"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高铁的食物",
|
||||
"output": "好的,以下是富含高铁的食物:芝麻,豆蔻"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:生蚝,小麦胚粉,蕨菜,蛏干,山核桃,小麦胚芽,马肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:羊肚菌,龙舌兰,扇贝,泥蚶,赤贝,鱿鱼,沙鸡"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:乌骨鸡,牛肉,山羊肉,螺蛳,墨鱼,能量棒,腊羊肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:大麻籽,巧克力,糌粑,牡蛎,火鸡腿,口蘑,松子"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:香菇,辣椒,酵母,罂粟籽,香杏片口蘑,兔肉,白鲸"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:香醋,麋鹿肉,香杏丁蘑,羊肉,阿拉斯加帝王蟹,香肠,牛肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:牛肉,黑笋,咖喱牛肉干,豆蔻,小麦麸皮,牛肉干,南瓜子"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:酱牛肉,罗勒,牛肉,奶酪,牛肉,芹菜籽,牛肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:可可粉,榛蘑,西瓜子,贻贝,牛肉,鸡蛋黄粉,牛肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:双孢蘑菇,大豆蛋白粉,松子,山核桃,可可粉,蘑菇,琼脂"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:河蚌,松蘑,羊肉干,百里香,石螺,桑葚,芝麻"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:芥末,驼鹿肝,羊肉,葵花子,棉籽仁,麸皮,野生稻"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:鸡蛋粉,阿拉斯加帝王蟹,葵花子,西瓜子,龙井茶,榛子,麋鹿肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:猪肝,腰果,梭子蟹,葛缕子籽,松子,珍宝蟹,香菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高锌的食物",
|
||||
"output": "好的,以下是富含高锌的食物:腰果,北美野牛肉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:酵母,酵母,鸭肝,鹅肝,火鸡肝,豇豆,豇豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:绿豆,绿豆,赤小豆,赤豆,能量棒,蔓越莓,鸡肝"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:紫菜,鹰嘴豆,花豆,扁豆,鸽子豌豆,黑豆,黑豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:黑龟豆,蚕豆,蚕豆,蚕豆,花豆,芸豆,芸豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:芸豆,芸豆,芸豆,芸豆,花豆,黄豆,大豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:豆类,大豆,豆类,羽扇豆,黄豆粉,大豆粉,大豆浓缩蛋白"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:葛根,罗勒,迷迭香,大豆粉,豆粕,牛肝,大豆蛋白粉"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:榛蘑,小麦胚粉,小麦胚芽,豌豆,香菜,马郁兰,鼠尾草"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:龙蒿,百里香,花生,花生,葵花子,牛至,意大利面"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:棉籽仁,粳米,粳米,粳米,大米,羊肝,面条"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:青椒,葵花子仁,大豆粉,驼鹿肝,绿豆,猪肝,蔓越莓"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:蔓越莓,大豆,黄蘑,花卷,裙带菜,菠菜,菠菜"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:萝卜叶,芦笋,藜麦,小麦粉,小麦粉,粗粒小麦粉,黄豆"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:海带,香菜,香叶,麦片,茼蒿,意大利面,大豆分离蛋白"
|
||||
},
|
||||
{
|
||||
"instruction": "",
|
||||
"input": "请帮我推荐一些富含高叶酸的食物",
|
||||
"output": "好的,以下是富含高叶酸的食物:花豆,黄豆芽"
|
||||
}
|
||||
]
|
|
@ -0,0 +1,37 @@
|
|||
{
|
||||
"_name_or_path": "/workspace/models/fm9g_2b_hf_models",
|
||||
"architectures": [
|
||||
"FM9GForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"auto_map": {
|
||||
"AutoConfig": "configuration_fm9g.FM9GConfig",
|
||||
"AutoModel": "modeling_fm9g.FM9GForCausalLM",
|
||||
"AutoModelForCausalLM": "modeling_fm9g.FM9GForCausalLM",
|
||||
"AutoModelForSeq2SeqLM": "modeling_fm9g.FM9GForCausalLM",
|
||||
"AutoModelForSequenceClassification": "modeling_fm9g.FM9GForSequenceClassification"
|
||||
},
|
||||
"bos_token_id": 1,
|
||||
"dim_model_base": 256,
|
||||
"eos_token_id": 2,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 2304,
|
||||
"initializer_range": 0.1,
|
||||
"intermediate_size": 5760,
|
||||
"max_position_embeddings": 4096,
|
||||
"model_type": "fm9g",
|
||||
"num_attention_heads": 36,
|
||||
"num_hidden_layers": 40,
|
||||
"num_key_value_heads": 36,
|
||||
"pretraining_tp": 1,
|
||||
"rms_norm_eps": 1e-05,
|
||||
"rope_scaling": null,
|
||||
"rope_theta": 10000.0,
|
||||
"scale_depth": 1.4,
|
||||
"scale_emb": 12,
|
||||
"torch_dtype": "float16",
|
||||
"transformers_version": "4.41.1",
|
||||
"use_cache": true,
|
||||
"vocab_size": 122753
|
||||
}
|
|
@ -0,0 +1,187 @@
|
|||
# coding=utf-8
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""FM9G model configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
FM9G_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
|
||||
|
||||
|
||||
class FM9GConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`FM9GModel`]. It is used to instantiate an FM9G
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||
defaults will yield a similar configuration to that of the FM9G-7B.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 32000):
|
||||
Vocabulary size of the FM9G model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`FM9GModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 11008):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
End of stream token id.
|
||||
pretraining_tp (`int`, *optional*, defaults to 1):
|
||||
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
|
||||
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
|
||||
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
|
||||
issue](https://github.com/pytorch/pytorch/issues/76232).
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
|
||||
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
|
||||
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
|
||||
`max_position_embeddings` to the expected new maximum.
|
||||
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
|
||||
"""
|
||||
|
||||
model_type = "fm9g"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=True,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
scale_emb=1,
|
||||
dim_model_base=1,
|
||||
scale_depth=1,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self._rope_scaling_validation()
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.scale_emb = scale_emb
|
||||
self.dim_model_base = dim_model_base
|
||||
self.scale_depth = scale_depth
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
try:
|
||||
import flash_attn
|
||||
self._attn_implementation = "flash_attention_2"
|
||||
except:
|
||||
pass
|
||||
|
||||
def _rope_scaling_validation(self):
|
||||
"""
|
||||
Validate the `rope_scaling` configuration.
|
||||
"""
|
||||
if self.rope_scaling is None:
|
||||
return
|
||||
|
||||
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
|
||||
f"got {self.rope_scaling}"
|
||||
)
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
|
||||
raise ValueError(
|
||||
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
|
||||
)
|
||||
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
|
||||
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
|
|
@ -0,0 +1,8 @@
|
|||
{
|
||||
"bos_token_id": 1,
|
||||
"do_sample": true,
|
||||
"eos_token_id": 2,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.8,
|
||||
"transformers_version": "4.41.1"
|
||||
}
|
|
@ -0,0 +1,369 @@
|
|||
{
|
||||
"metadata": {
|
||||
"total_size": 5449761792
|
||||
},
|
||||
"weight_map": {
|
||||
"model.embed_tokens.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
||||
"model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.36.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.37.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.38.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.39.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
||||
"model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
||||
"model.norm.weight": "model-00003-of-00003.safetensors"
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,24 @@
|
|||
{
|
||||
"bos_token": {
|
||||
"content": "<s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "</s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": "</s>",
|
||||
"unk_token": {
|
||||
"content": "<unk>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"add_bos_token": true,
|
||||
"add_eos_token": false,
|
||||
"added_tokens_decoder": {
|
||||
"0": {
|
||||
"content": "<unk>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"1": {
|
||||
"content": "<s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"2": {
|
||||
"content": "</s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<s>",
|
||||
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '<s>' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<用户>' + content + '<AI>' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
|
||||
"clean_up_tokenization_spaces": false,
|
||||
"eos_token": "</s>",
|
||||
"legacy": true,
|
||||
"model_max_length": 1000000000000000019884624838656,
|
||||
"pad_token": "</s>",
|
||||
"padding_side": "left",
|
||||
"sp_model_kwargs": {},
|
||||
"spaces_between_special_tokens": false,
|
||||
"split_special_tokens": false,
|
||||
"tokenizer_class": "LlamaTokenizer",
|
||||
"unk_token": "<unk>",
|
||||
"use_default_system_prompt": false
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
Requirement already satisfied: accelerate in /usr/local/python3/lib/python3.10/site-packages (0.24.1)
|
||||
Requirement already satisfied: numpy>=1.17 in /usr/local/python3/lib/python3.10/site-packages (from accelerate) (1.24.4)
|
||||
Requirement already satisfied: packaging>=20.0 in /usr/local/python3/lib/python3.10/site-packages (from accelerate) (23.2)
|
||||
Requirement already satisfied: psutil in /usr/local/python3/lib/python3.10/site-packages (from accelerate) (5.9.8)
|
||||
Requirement already satisfied: pyyaml in /usr/local/python3/lib/python3.10/site-packages (from accelerate) (6.0.1)
|
||||
Requirement already satisfied: torch>=1.10.0 in /usr/local/python3/lib/python3.10/site-packages (from accelerate) (2.1.1+corex.4.0.0)
|
||||
Requirement already satisfied: huggingface-hub in /usr/local/python3/lib/python3.10/site-packages (from accelerate) (0.23.1)
|
||||
Requirement already satisfied: filelock in /usr/local/python3/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (3.14.0)
|
||||
Requirement already satisfied: typing-extensions in /usr/local/python3/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (4.11.0)
|
||||
Requirement already satisfied: sympy in /usr/local/python3/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (1.12)
|
||||
Requirement already satisfied: networkx in /usr/local/python3/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (3.3)
|
||||
Requirement already satisfied: jinja2 in /usr/local/python3/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (3.1.4)
|
||||
Requirement already satisfied: fsspec in /usr/local/python3/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (2024.3.1)
|
||||
Requirement already satisfied: requests in /usr/local/python3/lib/python3.10/site-packages (from huggingface-hub->accelerate) (2.31.0)
|
||||
Requirement already satisfied: tqdm>=4.42.1 in /usr/local/python3/lib/python3.10/site-packages (from huggingface-hub->accelerate) (4.66.1)
|
||||
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/python3/lib/python3.10/site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5)
|
||||
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/python3/lib/python3.10/site-packages (from requests->huggingface-hub->accelerate) (3.3.2)
|
||||
Requirement already satisfied: idna<4,>=2.5 in /usr/local/python3/lib/python3.10/site-packages (from requests->huggingface-hub->accelerate) (3.7)
|
||||
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/python3/lib/python3.10/site-packages (from requests->huggingface-hub->accelerate) (2.2.1)
|
||||
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/python3/lib/python3.10/site-packages (from requests->huggingface-hub->accelerate) (2024.2.2)
|
||||
Requirement already satisfied: mpmath>=0.19 in /usr/local/python3/lib/python3.10/site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)
|
|
@ -0,0 +1,37 @@
|
|||
cff-version: 1.2.0
|
||||
date-released: 2024-03
|
||||
message: "If you use this software, please cite it as below."
|
||||
authors:
|
||||
- family-names: "Zheng"
|
||||
given-names: "Yaowei"
|
||||
- family-names: "Zhang"
|
||||
given-names: "Richong"
|
||||
- family-names: "Zhang"
|
||||
given-names: "Junhao"
|
||||
- family-names: "Ye"
|
||||
given-names: "Yanhan"
|
||||
- family-names: "Luo"
|
||||
given-names: "Zheyan"
|
||||
- family-names: "Ma"
|
||||
given-names: "Yongqiang"
|
||||
title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models"
|
||||
url: "https://arxiv.org/abs/2403.13372"
|
||||
preferred-citation:
|
||||
type: article
|
||||
authors:
|
||||
- family-names: "Zheng"
|
||||
given-names: "Yaowei"
|
||||
- family-names: "Zhang"
|
||||
given-names: "Richong"
|
||||
- family-names: "Zhang"
|
||||
given-names: "Junhao"
|
||||
- family-names: "Ye"
|
||||
given-names: "Yanhan"
|
||||
- family-names: "Luo"
|
||||
given-names: "Zheyan"
|
||||
- family-names: "Ma"
|
||||
given-names: "Yongqiang"
|
||||
journal: "arXiv preprint arXiv:2403.13372"
|
||||
title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models"
|
||||
url: "https://arxiv.org/abs/2403.13372"
|
||||
year: 2024
|
|
@ -0,0 +1,14 @@
|
|||
FROM nvcr.io/nvidia/pytorch:24.01-py3
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt /app/
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
COPY . /app/
|
||||
RUN pip install -e .[metrics,bitsandbytes,qwen]
|
||||
|
||||
VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ]
|
||||
EXPOSE 7860
|
||||
|
||||
CMD [ "llamafactory-cli", "webui" ]
|
|
@ -0,0 +1,201 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,11 @@
|
|||
.PHONY: quality style
|
||||
|
||||
check_dirs := scripts src tests
|
||||
|
||||
quality:
|
||||
ruff check $(check_dirs)
|
||||
ruff format --check $(check_dirs)
|
||||
|
||||
style:
|
||||
ruff check $(check_dirs) --fix
|
||||
ruff format $(check_dirs)
|
|
@ -0,0 +1,543 @@
|
|||
![# LLaMA Factory](assets/logo.png)
|
||||
|
||||
[![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/LLaMA-Factory?style=social)](https://github.com/hiyouga/LLaMA-Factory/stargazers)
|
||||
[![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE)
|
||||
[![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
|
||||
[![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/)
|
||||
[![Citation](https://img.shields.io/badge/citation-44-green)](#projects-using-llama-factory)
|
||||
[![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
|
||||
[![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
|
||||
[![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
|
||||
[![Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/spaces/hiyouga/LLaMA-Board)
|
||||
[![Studios](https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue)](https://modelscope.cn/studios/hiyouga/LLaMA-Board)
|
||||
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)
|
||||
|
||||
[![GitHub Trend](https://trendshift.io/api/badge/repositories/4535)](https://trendshift.io/repositories/4535)
|
||||
|
||||
👋 Join our [WeChat](assets/wechat.jpg).
|
||||
|
||||
\[ English | [中文](README_zh.md) \]
|
||||
|
||||
**Fine-tuning a large language model can be as easy as...**
|
||||
|
||||
https://github.com/hiyouga/LLaMA-Factory/assets/16256802/9840a653-7e9c-41c8-ae89-7ace5698baf6
|
||||
|
||||
Choose your path:
|
||||
|
||||
- **Colab**: https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing
|
||||
- **Local machine**: Please refer to [usage](#getting-started)
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Benchmark](#benchmark)
|
||||
- [Changelog](#changelog)
|
||||
- [Supported Models](#supported-models)
|
||||
- [Supported Training Approaches](#supported-training-approaches)
|
||||
- [Provided Datasets](#provided-datasets)
|
||||
- [Requirement](#requirement)
|
||||
- [Getting Started](#getting-started)
|
||||
- [Projects using LLaMA Factory](#projects-using-llama-factory)
|
||||
- [License](#license)
|
||||
- [Citation](#citation)
|
||||
- [Acknowledgement](#acknowledgement)
|
||||
|
||||
## Features
|
||||
|
||||
- **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
|
||||
- **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO and ORPO.
|
||||
- **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
|
||||
- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and Agent tuning.
|
||||
- **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
|
||||
- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
|
||||
- **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
|
||||
|
||||
## Benchmark
|
||||
|
||||
Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning), LLaMA Factory's LoRA tuning offers up to **3.7 times faster** training speed with a better Rouge score on the advertising text generation task. By leveraging 4-bit quantization technique, LLaMA Factory's QLoRA further improves the efficiency regarding the GPU memory.
|
||||
|
||||
![benchmark](assets/benchmark.svg)
|
||||
|
||||
<details><summary>Definitions</summary>
|
||||
|
||||
- **Training Speed**: the number of training samples processed per second during the training. (bs=4, cutoff_len=1024)
|
||||
- **Rouge Score**: Rouge-2 score on the development set of the [advertising text generation](https://aclanthology.org/D19-1321.pdf) task. (bs=4, cutoff_len=1024)
|
||||
- **GPU Memory**: Peak GPU memory usage in 4-bit quantized training. (bs=1, cutoff_len=1024)
|
||||
- We adopt `pre_seq_len=128` for ChatGLM's P-Tuning and `lora_rank=32` for LLaMA Factory's LoRA tuning.
|
||||
|
||||
</details>
|
||||
|
||||
## Changelog
|
||||
|
||||
[24/05/20] We supported fine-tuning the **PaliGemma** series models. Note that the PaliGemma models are pre-trained models, you need to fine-tune them with `gemma` template for chat completion.
|
||||
|
||||
[24/05/18] We supported **[KTO](https://arxiv.org/abs/2402.01306)** algorithm for preference learning. See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/05/14] We supported training and inference on the Ascend NPU devices. Check [installation](#installation) section for details.
|
||||
|
||||
<details><summary>Full Changelog</summary>
|
||||
|
||||
[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details.
|
||||
|
||||
[24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
|
||||
|
||||
[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
|
||||
|
||||
[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `infer_backend: vllm` to enjoy **270%** inference speed.
|
||||
|
||||
[24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `use_dora: true` to activate DoRA training.
|
||||
|
||||
[24/02/15] We supported **block expansion** proposed by [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro). See [examples](examples/README.md) for usage.
|
||||
|
||||
[24/02/05] Qwen1.5 (Qwen2 beta version) series models are supported in LLaMA-Factory. Check this [blog post](https://qwenlm.github.io/blog/qwen1.5/) for details.
|
||||
|
||||
[24/01/18] We supported **agent tuning** for most models, equipping models with tool-using abilities by fine-tuning with `dataset: glaive_toolcall`.
|
||||
|
||||
[23/12/23] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s implementation to boost LoRA tuning for the LLaMA, Mistral and Yi models. Try `use_unsloth: true` argument to activate unsloth patch. It achieves **170%** speed in our benchmark, check [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison) for details.
|
||||
|
||||
[23/12/12] We supported fine-tuning the latest MoE model **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)** in our framework. See hardware requirement [here](#hardware-requirement).
|
||||
|
||||
[23/12/01] We supported downloading pre-trained models and datasets from the **[ModelScope Hub](https://modelscope.cn/models)** for Chinese mainland users. See [this tutorial](#download-from-modelscope-hub) for usage.
|
||||
|
||||
[23/10/21] We supported **[NEFTune](https://arxiv.org/abs/2310.05914)** trick for fine-tuning. Try `neftune_noise_alpha: 5` argument to activate NEFTune.
|
||||
|
||||
[23/09/27] We supported **$S^2$-Attn** proposed by [LongLoRA](https://github.com/dvlab-research/LongLoRA) for the LLaMA models. Try `shift_attn: true` argument to enable shift short attention.
|
||||
|
||||
[23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See [examples](examples/README.md) for usage.
|
||||
|
||||
[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `flash_attn: fa2` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs.
|
||||
|
||||
[23/08/12] We supported **RoPE scaling** to extend the context length of the LLaMA models. Try `rope_scaling: linear` argument in training and `rope_scaling: dynamic` argument at inference to extrapolate the position embeddings.
|
||||
|
||||
[23/08/11] We supported **[DPO training](https://arxiv.org/abs/2305.18290)** for instruction-tuned models. See [examples](examples/README.md) for usage.
|
||||
|
||||
[23/07/31] We supported **dataset streaming**. Try `streaming: true` and `max_steps: 10000` arguments to load your dataset in streaming mode.
|
||||
|
||||
[23/07/29] We released two instruction-tuned 13B models at Hugging Face. See these Hugging Face Repos ([LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft)) for details.
|
||||
|
||||
[23/07/18] We developed an **all-in-one Web UI** for training, evaluation and inference. Try `train_web.py` to fine-tune models in your Web browser. Thank [@KanadeSiina](https://github.com/KanadeSiina) and [@codemayq](https://github.com/codemayq) for their efforts in the development.
|
||||
|
||||
[23/07/09] We released **[FastEdit](https://github.com/hiyouga/FastEdit)** ⚡🩹, an easy-to-use package for editing the factual knowledge of large language models efficiently. Please follow [FastEdit](https://github.com/hiyouga/FastEdit) if you are interested.
|
||||
|
||||
[23/06/29] We provided a **reproducible example** of training a chat model using instruction-following datasets, see [Baichuan-7B-sft](https://huggingface.co/hiyouga/Baichuan-7B-sft) for details.
|
||||
|
||||
[23/06/22] We aligned the [demo API](src/api_demo.py) with the [OpenAI's](https://platform.openai.com/docs/api-reference/chat) format where you can insert the fine-tuned model in **arbitrary ChatGPT-based applications**.
|
||||
|
||||
[23/06/03] We supported quantized training and inference (aka **[QLoRA](https://github.com/artidoro/qlora)**). See [examples](examples/README.md) for usage.
|
||||
|
||||
</details>
|
||||
|
||||
## Supported Models
|
||||
|
||||
| Model | Model size | Default module | Template |
|
||||
| -------------------------------------------------------- | -------------------------------- | ----------------- | --------- |
|
||||
| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | W_pack | baichuan2 |
|
||||
| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
|
||||
| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
|
||||
| [ChatGLM3](https://huggingface.co/THUDM) | 6B | query_key_value | chatglm3 |
|
||||
| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | q_proj,v_proj | cohere |
|
||||
| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | q_proj,v_proj | deepseek |
|
||||
| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | query_key_value | falcon |
|
||||
| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | q_proj,v_proj | gemma |
|
||||
| [InternLM2](https://huggingface.co/internlm) | 7B/20B | wqkv | intern2 |
|
||||
| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | q_proj,v_proj | - |
|
||||
| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | q_proj,v_proj | llama2 |
|
||||
| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | q_proj,v_proj | llama3 |
|
||||
| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | q_proj,v_proj | vicuna |
|
||||
| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | q_proj,v_proj | mistral |
|
||||
| [OLMo](https://huggingface.co/allenai) | 1B/7B | q_proj,v_proj | - |
|
||||
| [PaliGemma](https://huggingface.co/google) | 3B | q_proj,v_proj | gemma |
|
||||
| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | q_proj,v_proj | - |
|
||||
| [Phi-3](https://huggingface.co/microsoft) | 3.8B | qkv_proj | phi |
|
||||
| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | c_attn | qwen |
|
||||
| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | q_proj,v_proj | qwen |
|
||||
| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | q_proj,v_proj | - |
|
||||
| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | q_proj,v_proj | xverse |
|
||||
| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi |
|
||||
| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | q_proj,v_proj | yi_vl |
|
||||
| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | q_proj,v_proj | yuan |
|
||||
|
||||
> [!NOTE]
|
||||
> **Default module** is used for the `--lora_target` argument, you can use `--lora_target all` to specify all the available modules for better convergence.
|
||||
>
|
||||
> For the "base" models, the `--template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
|
||||
>
|
||||
> Remember to use the **SAME** template in training and inference.
|
||||
|
||||
Please refer to [constants.py](src/llamafactory/extras/constants.py) for a full list of models we supported.
|
||||
|
||||
You also can add a custom chat template to [template.py](src/llamafactory/data/template.py).
|
||||
|
||||
## Supported Training Approaches
|
||||
|
||||
| Approach | Full-tuning | Freeze-tuning | LoRA | QLoRA |
|
||||
| ---------------------- | ------------------ | ------------------ | ------------------ | ------------------ |
|
||||
| Pre-Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| Supervised Fine-Tuning | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| Reward Modeling | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| PPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| DPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| KTO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| ORPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
|
||||
## Provided Datasets
|
||||
|
||||
<details><summary>Pre-training datasets</summary>
|
||||
|
||||
- [Wiki Demo (en)](data/wiki_demo.txt)
|
||||
- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
|
||||
- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
|
||||
- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
|
||||
- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
|
||||
- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
|
||||
- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
|
||||
- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
|
||||
- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
|
||||
|
||||
</details>
|
||||
|
||||
<details><summary>Supervised fine-tuning datasets</summary>
|
||||
|
||||
- [Identity (en&zh)](data/identity.json)
|
||||
- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
|
||||
- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3)
|
||||
- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
|
||||
- [Glaive Function Calling V2 (en&zh)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
|
||||
- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
|
||||
- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
|
||||
- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
|
||||
- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
|
||||
- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
|
||||
- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
|
||||
- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
|
||||
- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
|
||||
- [UltraChat (en)](https://github.com/thunlp/UltraChat)
|
||||
- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
|
||||
- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
|
||||
- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
|
||||
- [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
|
||||
- [SlimOrca (en)](https://huggingface.co/datasets/Open-Orca/SlimOrca)
|
||||
- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
|
||||
- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
|
||||
- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
|
||||
- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
|
||||
- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
|
||||
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
|
||||
- [deepctrl (en&zh)](https://www.modelscope.cn/datasets/deepctrl/deepctrl-sft-data)
|
||||
- [Advertise Generating (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
|
||||
- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
|
||||
- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
|
||||
- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
|
||||
- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
|
||||
- [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
|
||||
- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
|
||||
- [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)
|
||||
- [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction)
|
||||
- [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo)
|
||||
- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
|
||||
- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
|
||||
- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
|
||||
- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
|
||||
- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
|
||||
- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
|
||||
- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
|
||||
- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
|
||||
- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
|
||||
- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
|
||||
|
||||
</details>
|
||||
|
||||
<details><summary>Preference datasets</summary>
|
||||
|
||||
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
|
||||
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
|
||||
- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
|
||||
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
|
||||
- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
|
||||
- [KTO mixed (en)](https://huggingface.co/datasets/argilla/kto-mix-15k)
|
||||
|
||||
</details>
|
||||
|
||||
Some datasets require confirmation before using them, so we recommend logging in with your Hugging Face account using these commands.
|
||||
|
||||
```bash
|
||||
pip install --upgrade huggingface_hub
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
## Requirement
|
||||
|
||||
| Mandatory | Minimum | Recommend |
|
||||
| ------------ | ------- | --------- |
|
||||
| python | 3.8 | 3.10 |
|
||||
| torch | 1.13.1 | 2.2.0 |
|
||||
| transformers | 4.37.2 | 4.41.0 |
|
||||
| datasets | 2.14.3 | 2.19.1 |
|
||||
| accelerate | 0.27.2 | 0.30.1 |
|
||||
| peft | 0.9.0 | 0.11.1 |
|
||||
| trl | 0.8.2 | 0.8.6 |
|
||||
|
||||
| Optional | Minimum | Recommend |
|
||||
| ------------ | ------- | --------- |
|
||||
| CUDA | 11.6 | 12.2 |
|
||||
| deepspeed | 0.10.0 | 0.14.0 |
|
||||
| bitsandbytes | 0.39.0 | 0.43.1 |
|
||||
| vllm | 0.4.0 | 0.4.2 |
|
||||
| flash-attn | 2.3.0 | 2.5.8 |
|
||||
|
||||
### Hardware Requirement
|
||||
|
||||
\* *estimated*
|
||||
|
||||
| Method | Bits | 7B | 13B | 30B | 70B | 110B | 8x7B | 8x22B |
|
||||
| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ | ----- | ------ |
|
||||
| Full | AMP | 120GB | 240GB | 600GB | 1200GB | 2000GB | 900GB | 2400GB |
|
||||
| Full | 16 | 60GB | 120GB | 300GB | 600GB | 900GB | 400GB | 1200GB |
|
||||
| Freeze | 16 | 20GB | 40GB | 80GB | 200GB | 360GB | 160GB | 400GB |
|
||||
| LoRA/GaLore/BAdam | 16 | 16GB | 32GB | 64GB | 160GB | 240GB | 120GB | 320GB |
|
||||
| QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | 140GB | 60GB | 160GB |
|
||||
| QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | 72GB | 30GB | 96GB |
|
||||
| QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | 48GB | 18GB | 48GB |
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Installation
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Installation is mandatory.
|
||||
|
||||
```bash
|
||||
git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
|
||||
cd LLaMA-Factory
|
||||
pip install -e .[torch,metrics]
|
||||
```
|
||||
|
||||
Extra dependencies available: torch, metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality
|
||||
|
||||
> [!TIP]
|
||||
> Use `pip install --no-deps -e .` to resolve package conflicts.
|
||||
|
||||
<details><summary>For Windows users</summary>
|
||||
|
||||
If you want to enable the quantized LoRA (QLoRA) on the Windows platform, you need to install a pre-built version of `bitsandbytes` library, which supports CUDA 11.1 to 12.2, please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version.
|
||||
|
||||
```bash
|
||||
pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl
|
||||
```
|
||||
|
||||
To enable FlashAttention-2 on the Windows platform, you need to install the precompiled `flash-attn` library, which supports CUDA 12.1 to 12.2. Please download the corresponding version from [flash-attention](https://github.com/bdashore3/flash-attention/releases) based on your requirements.
|
||||
|
||||
</details>
|
||||
|
||||
<details><summary>For Ascend NPU users</summary>
|
||||
|
||||
Join [NPU user group](assets/wechat_npu.jpg).
|
||||
|
||||
To utilize Ascend NPU devices for (distributed) training and inference, you need to install the **[torch-npu](https://gitee.com/ascend/pytorch)** library and the **[Ascend CANN Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**.
|
||||
|
||||
| Requirement | Minimum | Recommend |
|
||||
| ------------ | ------- | --------- |
|
||||
| CANN | 8.0.RC1 | 8.0.RC1 |
|
||||
| torch | 2.2.0 | 2.2.0 |
|
||||
| torch-npu | 2.2.0 | 2.2.0 |
|
||||
| deepspeed | 0.13.2 | 0.13.2 |
|
||||
|
||||
Docker image:
|
||||
|
||||
- 32GB: [Download page](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html)
|
||||
- 64GB: Coming soon
|
||||
|
||||
Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use.
|
||||
|
||||
If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations.
|
||||
|
||||
</details>
|
||||
|
||||
### Data Preparation
|
||||
|
||||
Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use datasets on HuggingFace / ModelScope hub or load the dataset in local disk.
|
||||
|
||||
> [!NOTE]
|
||||
> Please update `data/dataset_info.json` to use your custom dataset.
|
||||
|
||||
### Quickstart
|
||||
|
||||
Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
See [examples/README.md](examples/README.md) for advanced usage (including distributed training).
|
||||
|
||||
> [!TIP]
|
||||
> Use `llamafactory-cli help` to show help information.
|
||||
|
||||
### Fine-Tuning with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
|
||||
|
||||
> [!IMPORTANT]
|
||||
> LLaMA Board GUI only supports training on a single GPU.
|
||||
|
||||
#### Use local environment
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui
|
||||
```
|
||||
|
||||
<details><summary>For Alibaba Cloud PAI or AutoDL users</summary>
|
||||
|
||||
If you encountered display problems in LLaMA Board on Alibaba Cloud PAI, try using the following command to set environment variables before starting LLaMA Board:
|
||||
|
||||
```bash
|
||||
export GRADIO_SERVER_PORT=7860 GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
|
||||
```
|
||||
|
||||
If you are using AutoDL, please install a specific version of Gradio:
|
||||
|
||||
```bash
|
||||
pip install gradio==4.10.0
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### Use Docker
|
||||
|
||||
```bash
|
||||
docker build -f ./Dockerfile -t llama-factory:latest .
|
||||
docker run --gpus=all \
|
||||
-v ./hf_cache:/root/.cache/huggingface/ \
|
||||
-v ./data:/app/data \
|
||||
-v ./output:/app/output \
|
||||
-e CUDA_VISIBLE_DEVICES=0 \
|
||||
-p 7860:7860 \
|
||||
--shm-size 16G \
|
||||
--name llama_factory \
|
||||
-d llama-factory:latest
|
||||
```
|
||||
|
||||
#### Use Docker Compose
|
||||
|
||||
```bash
|
||||
docker compose -f ./docker-compose.yml up -d
|
||||
```
|
||||
|
||||
<details><summary>Details about volume</summary>
|
||||
|
||||
- hf_cache: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory.
|
||||
- data: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI.
|
||||
- output: Set export dir to this location so that the merged result can be accessed directly on the host machine.
|
||||
|
||||
</details>
|
||||
|
||||
### Deploy with OpenAI-style API and vLLM
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml
|
||||
```
|
||||
|
||||
### Download from ModelScope Hub
|
||||
|
||||
If you have trouble with downloading models and datasets from Hugging Face, you can use ModelScope.
|
||||
|
||||
```bash
|
||||
export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
|
||||
```
|
||||
|
||||
Train the model by specifying a model ID of the ModelScope Hub as the `--model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `LLM-Research/Meta-Llama-3-8B-Instruct`.
|
||||
|
||||
## Projects using LLaMA Factory
|
||||
|
||||
If you have a project that should be incorporated, please contact via email or create a pull request.
|
||||
|
||||
<details><summary>Click to show</summary>
|
||||
|
||||
1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
|
||||
1. Yu et al. Open, Closed, or Small Language Models for Text Classification? 2023. [[arxiv]](https://arxiv.org/abs/2308.10092)
|
||||
1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526)
|
||||
1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816)
|
||||
1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710)
|
||||
1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2401.04319)
|
||||
1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2401.07286)
|
||||
1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904)
|
||||
1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625)
|
||||
1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176)
|
||||
1. Yang et al. LaCo: Large Language Model Pruning via Layer Collapse. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187)
|
||||
1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746)
|
||||
1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11801)
|
||||
1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. 2024. [[arxiv]](https://arxiv.org/abs/2402.11809)
|
||||
1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819)
|
||||
1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204)
|
||||
1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.14714)
|
||||
1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.15043)
|
||||
1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
|
||||
1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
|
||||
1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
|
||||
1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073)
|
||||
1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541)
|
||||
1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
|
||||
1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
|
||||
1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443)
|
||||
1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604)
|
||||
1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827)
|
||||
1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167)
|
||||
1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. 2024. [[arxiv]](https://arxiv.org/abs/2404.04316)
|
||||
1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084)
|
||||
1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836)
|
||||
1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581)
|
||||
1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215)
|
||||
1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621)
|
||||
1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2404.17140)
|
||||
1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585)
|
||||
1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B.
|
||||
1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge.
|
||||
1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
|
||||
1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: A series of large language models for Chinese medical domain, based on LLaMA2-7B and Baichuan-13B.
|
||||
1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods.
|
||||
1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: A large language model specialized in generating metadata for stable diffusion. [[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
|
||||
1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**: A multimodal large language model specialized in Chinese medical domain, based on LLaVA-1.5-7B.
|
||||
|
||||
</details>
|
||||
|
||||
## License
|
||||
|
||||
This repository is licensed under the [Apache-2.0 License](LICENSE).
|
||||
|
||||
Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
|
||||
|
||||
## Citation
|
||||
|
||||
If this work is helpful, please kindly cite as:
|
||||
|
||||
```bibtex
|
||||
@article{zheng2024llamafactory,
|
||||
title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models},
|
||||
author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma},
|
||||
journal={arXiv preprint arXiv:2403.13372},
|
||||
year={2024},
|
||||
url={http://arxiv.org/abs/2403.13372}
|
||||
}
|
||||
```
|
||||
|
||||
## Acknowledgement
|
||||
|
||||
This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https://github.com/huggingface/trl), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful work.
|
||||
|
||||
## Star History
|
||||
|
||||
![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date)
|
|
@ -0,0 +1,543 @@
|
|||
![# LLaMA Factory](assets/logo.png)
|
||||
|
||||
[![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/LLaMA-Factory?style=social)](https://github.com/hiyouga/LLaMA-Factory/stargazers)
|
||||
[![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE)
|
||||
[![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
|
||||
[![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/)
|
||||
[![Citation](https://img.shields.io/badge/citation-44-green)](#使用了-llama-factory-的项目)
|
||||
[![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
|
||||
[![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
|
||||
[![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
|
||||
[![Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/spaces/hiyouga/LLaMA-Board)
|
||||
[![Studios](https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue)](https://modelscope.cn/studios/hiyouga/LLaMA-Board)
|
||||
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)
|
||||
|
||||
[![GitHub Trend](https://trendshift.io/api/badge/repositories/4535)](https://trendshift.io/repositories/4535)
|
||||
|
||||
👋 加入我们的[微信群](assets/wechat.jpg)。
|
||||
|
||||
\[ [English](README.md) | 中文 \]
|
||||
|
||||
**微调大模型可以像这样轻松…**
|
||||
|
||||
https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd-d76c6d0a6594
|
||||
|
||||
选择你的打开方式:
|
||||
|
||||
- **Colab**:https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing
|
||||
- **本地机器**:请见[如何使用](#如何使用)
|
||||
|
||||
## 目录
|
||||
|
||||
- [项目特色](#项目特色)
|
||||
- [性能指标](#性能指标)
|
||||
- [更新日志](#更新日志)
|
||||
- [模型](#模型)
|
||||
- [训练方法](#训练方法)
|
||||
- [数据集](#数据集)
|
||||
- [软硬件依赖](#软硬件依赖)
|
||||
- [如何使用](#如何使用)
|
||||
- [使用了 LLaMA Factory 的项目](#使用了-llama-factory-的项目)
|
||||
- [协议](#协议)
|
||||
- [引用](#引用)
|
||||
- [致谢](#致谢)
|
||||
|
||||
## 项目特色
|
||||
|
||||
- **多种模型**:LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
|
||||
- **集成方法**:(增量)预训练、(多模态)指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练和 ORPO 训练。
|
||||
- **多种精度**:32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
|
||||
- **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ 和 Agent 微调。
|
||||
- **实用技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
|
||||
- **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。
|
||||
- **极速推理**:基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。
|
||||
|
||||
## 性能指标
|
||||
|
||||
与 ChatGLM 官方的 [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning) 微调相比,LLaMA Factory 的 LoRA 微调提供了 **3.7 倍**的加速比,同时在广告文案生成任务上取得了更高的 Rouge 分数。结合 4 比特量化技术,LLaMA Factory 的 QLoRA 微调进一步降低了 GPU 显存消耗。
|
||||
|
||||
![benchmark](assets/benchmark.svg)
|
||||
|
||||
<details><summary>变量定义</summary>
|
||||
|
||||
- **Training Speed**: 训练阶段每秒处理的样本数量。(批处理大小=4,截断长度=1024)
|
||||
- **Rouge Score**: [广告文案生成](https://aclanthology.org/D19-1321.pdf)任务验证集上的 Rouge-2 分数。(批处理大小=4,截断长度=1024)
|
||||
- **GPU Memory**: 4 比特量化训练的 GPU 显存峰值。(批处理大小=1,截断长度=1024)
|
||||
- 我们在 ChatGLM 的 P-Tuning 中采用 `pre_seq_len=128`,在 LLaMA Factory 的 LoRA 微调中采用 `lora_rank=32`。
|
||||
|
||||
</details>
|
||||
|
||||
## 更新日志
|
||||
|
||||
[24/05/20] 我们支持了 **PaliGemma** 系列模型的微调。注意 PaliGemma 是预训练模型,你需要使用 `gemma` 模板进行微调使其获得对话能力。
|
||||
|
||||
[24/05/18] 我们支持了 **[KTO](https://arxiv.org/abs/2402.01306)** 偏好对齐算法。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/05/14] 我们支持了昇腾 NPU 设备的训练和推理。详情请查阅[安装](#安装-llama-factory)部分。
|
||||
|
||||
<details><summary>展开日志</summary>
|
||||
|
||||
[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型,详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。
|
||||
|
||||
[24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练(24GB 可训练 Llama-2-7B-56k)。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
|
||||
|
||||
[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看!
|
||||
|
||||
[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/03/07] 我们支持了梯度低秩投影(**[GaLore](https://arxiv.org/abs/2403.03507)**)算法。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `infer_backend: vllm` 来获得 **270%** 的推理速度。
|
||||
|
||||
[24/02/28] 我们支持了 **[DoRA](https://arxiv.org/abs/2402.09353)** 微调。请使用 `use_dora: true` 参数进行 DoRA 微调。
|
||||
|
||||
[24/02/15] 我们支持了 [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro) 提出的**块扩展**方法。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[24/02/05] Qwen1.5(Qwen2 测试版)系列模型已在 LLaMA-Factory 中实现微调支持。详情请查阅该[博客页面](https://qwenlm.github.io/zh/blog/qwen1.5/)。
|
||||
|
||||
[24/01/18] 我们针对绝大多数模型实现了 **Agent 微调**,微调时指定 `dataset: glaive_toolcall` 即可使模型获得工具调用能力。
|
||||
|
||||
[23/12/23] 我们针对 LLaMA, Mistral 和 Yi 模型支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的 LoRA 训练加速。请使用 `use_unsloth: true` 参数启用 unsloth 优化。该方法可提供 **170%** 的训练速度,详情请查阅[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
|
||||
|
||||
[23/12/12] 我们支持了微调最新的混合专家模型 **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)**。硬件需求请查阅[此处](#硬件依赖)。
|
||||
|
||||
[23/12/01] 我们支持了从 **[魔搭社区](https://modelscope.cn/models)** 下载预训练模型和数据集。详细用法请参照 [此教程](#从魔搭社区下载)。
|
||||
|
||||
[23/10/21] 我们支持了 **[NEFTune](https://arxiv.org/abs/2310.05914)** 训练技巧。请使用 `neftune_noise_alpha: 5` 参数启用 NEFTune。
|
||||
|
||||
[23/09/27] 我们针对 LLaMA 模型支持了 [LongLoRA](https://github.com/dvlab-research/LongLoRA) 提出的 **$S^2$-Attn**。请使用 `shift_attn: true` 参数以启用该功能。
|
||||
|
||||
[23/09/23] 我们在项目中集成了 MMLU、C-Eval 和 CMMLU 评估集。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[23/09/10] 我们支持了 **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**。如果您使用的是 RTX4090、A100 或 H100 GPU,请使用 `flash_attn: fa2` 参数以启用 FlashAttention-2。
|
||||
|
||||
[23/08/12] 我们支持了 **RoPE 插值**来扩展 LLaMA 模型的上下文长度。请使用 `rope_scaling: linear` 参数训练模型或使用 `rope_scaling: dynamic` 参数评估模型。
|
||||
|
||||
[23/08/11] 我们支持了指令模型的 **[DPO 训练](https://arxiv.org/abs/2305.18290)**。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
[23/07/31] 我们支持了**数据流式加载**。请使用 `streaming: true` 和 `max_steps: 10000` 参数来流式加载数据集。
|
||||
|
||||
[23/07/29] 我们在 Hugging Face 发布了两个 13B 指令微调模型。详细内容请查阅我们的 Hugging Face 项目([LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft))。
|
||||
|
||||
[23/07/18] 我们开发了支持训练和测试的**浏览器一体化界面**。请使用 `train_web.py` 在您的浏览器中微调模型。感谢 [@KanadeSiina](https://github.com/KanadeSiina) 和 [@codemayq](https://github.com/codemayq) 在该功能开发中付出的努力。
|
||||
|
||||
[23/07/09] 我们开源了 **[FastEdit](https://github.com/hiyouga/FastEdit)** ⚡🩹,一个简单易用的、能迅速编辑大模型事实记忆的工具包。如果您感兴趣请关注我们的 [FastEdit](https://github.com/hiyouga/FastEdit) 项目。
|
||||
|
||||
[23/06/29] 我们提供了一个**可复现的**指令模型微调示例,详细内容请查阅 [Baichuan-7B-sft](https://huggingface.co/hiyouga/Baichuan-7B-sft)。
|
||||
|
||||
[23/06/22] 我们对齐了[示例 API](src/api_demo.py) 与 [OpenAI API](https://platform.openai.com/docs/api-reference/chat) 的格式,您可以将微调模型接入**任意基于 ChatGPT 的应用**中。
|
||||
|
||||
[23/06/03] 我们实现了 4 比特的 LoRA 训练(也称 **[QLoRA](https://github.com/artidoro/qlora)**)。详细用法请参照 [examples](examples/README_zh.md)。
|
||||
|
||||
</details>
|
||||
|
||||
## 模型
|
||||
|
||||
| 模型名 | 模型大小 | 默认模块 | Template |
|
||||
| -------------------------------------------------------- | -------------------------------- | ----------------- | --------- |
|
||||
| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | W_pack | baichuan2 |
|
||||
| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
|
||||
| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
|
||||
| [ChatGLM3](https://huggingface.co/THUDM) | 6B | query_key_value | chatglm3 |
|
||||
| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | q_proj,v_proj | cohere |
|
||||
| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | q_proj,v_proj | deepseek |
|
||||
| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | query_key_value | falcon |
|
||||
| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | q_proj,v_proj | gemma |
|
||||
| [InternLM2](https://huggingface.co/internlm) | 7B/20B | wqkv | intern2 |
|
||||
| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | q_proj,v_proj | - |
|
||||
| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | q_proj,v_proj | llama2 |
|
||||
| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | q_proj,v_proj | llama3 |
|
||||
| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | q_proj,v_proj | vicuna |
|
||||
| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | q_proj,v_proj | mistral |
|
||||
| [OLMo](https://huggingface.co/allenai) | 1B/7B | q_proj,v_proj | - |
|
||||
| [PaliGemma](https://huggingface.co/google) | 3B | q_proj,v_proj | gemma |
|
||||
| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | q_proj,v_proj | - |
|
||||
| [Phi-3](https://huggingface.co/microsoft) | 3.8B | qkv_proj | phi |
|
||||
| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | c_attn | qwen |
|
||||
| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | q_proj,v_proj | qwen |
|
||||
| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | q_proj,v_proj | - |
|
||||
| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | q_proj,v_proj | xverse |
|
||||
| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi |
|
||||
| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | q_proj,v_proj | yi_vl |
|
||||
| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | q_proj,v_proj | yuan |
|
||||
|
||||
> [!NOTE]
|
||||
> **默认模块**应作为 `--lora_target` 参数的默认值,可使用 `--lora_target all` 参数指定全部模块以取得更好的效果。
|
||||
>
|
||||
> 对于所有“基座”(Base)模型,`--template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。
|
||||
>
|
||||
> 请务必在训练和推理时使用**完全一致**的模板。
|
||||
|
||||
项目所支持模型的完整列表请参阅 [constants.py](src/llamafactory/extras/constants.py)。
|
||||
|
||||
您也可以在 [template.py](src/llamafactory/data/template.py) 中添加自己的对话模板。
|
||||
|
||||
## 训练方法
|
||||
|
||||
| 方法 | 全参数训练 | 部分参数训练 | LoRA | QLoRA |
|
||||
| ---------------------- | ------------------ | ------------------ | ------------------ | ------------------ |
|
||||
| 预训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| 指令监督微调 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| 奖励模型训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| PPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| DPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| KTO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
| ORPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
|
||||
## 数据集
|
||||
|
||||
<details><summary>预训练数据集</summary>
|
||||
|
||||
- [Wiki Demo (en)](data/wiki_demo.txt)
|
||||
- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
|
||||
- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
|
||||
- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
|
||||
- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
|
||||
- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
|
||||
- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
|
||||
- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
|
||||
- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
|
||||
|
||||
</details>
|
||||
|
||||
<details><summary>指令微调数据集</summary>
|
||||
|
||||
- [Identity (en&zh)](data/identity.json)
|
||||
- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
|
||||
- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3)
|
||||
- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
|
||||
- [Glaive Function Calling V2 (en&zh)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
|
||||
- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
|
||||
- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
|
||||
- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
|
||||
- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
|
||||
- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
|
||||
- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
|
||||
- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
|
||||
- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
|
||||
- [UltraChat (en)](https://github.com/thunlp/UltraChat)
|
||||
- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
|
||||
- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
|
||||
- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
|
||||
- [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
|
||||
- [SlimOrca (en)](https://huggingface.co/datasets/Open-Orca/SlimOrca)
|
||||
- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
|
||||
- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
|
||||
- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
|
||||
- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
|
||||
- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
|
||||
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
|
||||
- [deepctrl (en&zh)](https://www.modelscope.cn/datasets/deepctrl/deepctrl-sft-data)
|
||||
- [Advertise Generating (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
|
||||
- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
|
||||
- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
|
||||
- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
|
||||
- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
|
||||
- [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
|
||||
- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
|
||||
- [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)
|
||||
- [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction)
|
||||
- [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo)
|
||||
- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
|
||||
- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
|
||||
- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
|
||||
- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
|
||||
- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
|
||||
- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
|
||||
- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
|
||||
- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
|
||||
- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
|
||||
- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
|
||||
|
||||
</details>
|
||||
|
||||
<details><summary>偏好数据集</summary>
|
||||
|
||||
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
|
||||
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
|
||||
- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
|
||||
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
|
||||
- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
|
||||
- [KTO mixed (en)](https://huggingface.co/datasets/argilla/kto-mix-15k)
|
||||
|
||||
</details>
|
||||
|
||||
部分数据集的使用需要确认,我们推荐使用下述命令登录您的 Hugging Face 账户。
|
||||
|
||||
```bash
|
||||
pip install --upgrade huggingface_hub
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
## 软硬件依赖
|
||||
|
||||
| 必需项 | 至少 | 推荐 |
|
||||
| ------------ | ------- | --------- |
|
||||
| python | 3.8 | 3.10 |
|
||||
| torch | 1.13.1 | 2.2.0 |
|
||||
| transformers | 4.37.2 | 4.41.0 |
|
||||
| datasets | 2.14.3 | 2.19.1 |
|
||||
| accelerate | 0.27.2 | 0.30.1 |
|
||||
| peft | 0.9.0 | 0.11.1 |
|
||||
| trl | 0.8.2 | 0.8.6 |
|
||||
|
||||
| 可选项 | 至少 | 推荐 |
|
||||
| ------------ | ------- | --------- |
|
||||
| CUDA | 11.6 | 12.2 |
|
||||
| deepspeed | 0.10.0 | 0.14.0 |
|
||||
| bitsandbytes | 0.39.0 | 0.43.1 |
|
||||
| vllm | 0.4.0 | 0.4.2 |
|
||||
| flash-attn | 2.3.0 | 2.5.8 |
|
||||
|
||||
### 硬件依赖
|
||||
|
||||
\* *估算值*
|
||||
|
||||
| 方法 | 精度 | 7B | 13B | 30B | 70B | 110B | 8x7B | 8x22B |
|
||||
| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ | ----- | ------ |
|
||||
| Full | AMP | 120GB | 240GB | 600GB | 1200GB | 2000GB | 900GB | 2400GB |
|
||||
| Full | 16 | 60GB | 120GB | 300GB | 600GB | 900GB | 400GB | 1200GB |
|
||||
| Freeze | 16 | 20GB | 40GB | 80GB | 200GB | 360GB | 160GB | 400GB |
|
||||
| LoRA/GaLore/BAdam | 16 | 16GB | 32GB | 64GB | 160GB | 240GB | 120GB | 320GB |
|
||||
| QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | 140GB | 60GB | 160GB |
|
||||
| QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | 72GB | 30GB | 96GB |
|
||||
| QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | 48GB | 18GB | 48GB |
|
||||
|
||||
## 如何使用
|
||||
|
||||
### 安装 LLaMA Factory
|
||||
|
||||
> [!IMPORTANT]
|
||||
> 此步骤为必需。
|
||||
|
||||
```bash
|
||||
git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
|
||||
cd LLaMA-Factory
|
||||
pip install -e .[torch,metrics]
|
||||
```
|
||||
|
||||
可选的额外依赖项:torch、metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality
|
||||
|
||||
> [!TIP]
|
||||
> 遇到包冲突时,可使用 `pip install --no-deps -e .` 解决。
|
||||
|
||||
<details><summary>Windows 用户指南</summary>
|
||||
|
||||
如果要在 Windows 平台上开启量化 LoRA(QLoRA),需要安装预编译的 `bitsandbytes` 库, 支持 CUDA 11.1 到 12.2, 请根据您的 CUDA 版本情况选择适合的[发布版本](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。
|
||||
|
||||
```bash
|
||||
pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl
|
||||
```
|
||||
|
||||
如果要在 Windows 平台上开启 FlashAttention-2,需要安装预编译的 `flash-attn` 库,支持 CUDA 12.1 到 12.2,请根据需求到 [flash-attention](https://github.com/bdashore3/flash-attention/releases) 下载对应版本安装。
|
||||
|
||||
</details>
|
||||
|
||||
<details><summary>昇腾 NPU 用户指南</summary>
|
||||
|
||||
加入 [NPU 用户群](assets/wechat_npu.jpg)。
|
||||
|
||||
如果使用昇腾 NPU 设备进行(分布式)训练或推理,需要安装 **[torch-npu](https://gitee.com/ascend/pytorch)** 库和 **[Ascend CANN Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**。
|
||||
|
||||
| 依赖项 | 至少 | 推荐 |
|
||||
| ------------ | ------- | --------- |
|
||||
| CANN | 8.0.RC1 | 8.0.RC1 |
|
||||
| torch | 2.2.0 | 2.2.0 |
|
||||
| torch-npu | 2.2.0 | 2.2.0 |
|
||||
| deepspeed | 0.13.2 | 0.13.2 |
|
||||
|
||||
Docker 镜像:
|
||||
|
||||
- 32GB:[下载地址](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html)
|
||||
- 64GB:敬请期待
|
||||
|
||||
请记得使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定您使用的设备。
|
||||
|
||||
如果遇到无法正常推理的情况,请尝试设置 `do_sample: false`。
|
||||
|
||||
</details>
|
||||
|
||||
### 数据准备
|
||||
|
||||
关于数据集文件的格式,请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。
|
||||
|
||||
> [!NOTE]
|
||||
> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件。
|
||||
|
||||
### 快速开始
|
||||
|
||||
下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
高级用法请参考 [examples/README_zh.md](examples/README_zh.md)(包括多 GPU 微调)。
|
||||
|
||||
> [!TIP]
|
||||
> 使用 `llamafactory-cli help` 显示帮助信息。
|
||||
|
||||
### LLaMA Board 可视化微调(由 [Gradio](https://github.com/gradio-app/gradio) 驱动)
|
||||
|
||||
> [!IMPORTANT]
|
||||
> LLaMA Board 可视化界面目前仅支持单 GPU 训练。
|
||||
|
||||
#### 使用本地环境
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui
|
||||
```
|
||||
|
||||
<details><summary>阿里云 PAI 和 AutoDL 用户指南</summary>
|
||||
|
||||
如果您在阿里云 PAI 上使用 LLaMA Board 时遇到显示问题,请尝试在启动前使用以下命令设置环境变量:
|
||||
|
||||
```bash
|
||||
export GRADIO_SERVER_PORT=7860 GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
|
||||
```
|
||||
|
||||
如果您正在使用 AutoDL,请安装下述 Gradio 版本:
|
||||
|
||||
```bash
|
||||
pip install gradio==4.10.0
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### 使用 Docker
|
||||
|
||||
```bash
|
||||
docker build -f ./Dockerfile -t llama-factory:latest .
|
||||
docker run --gpus=all \
|
||||
-v ./hf_cache:/root/.cache/huggingface/ \
|
||||
-v ./data:/app/data \
|
||||
-v ./output:/app/output \
|
||||
-e CUDA_VISIBLE_DEVICES=0 \
|
||||
-p 7860:7860 \
|
||||
--shm-size 16G \
|
||||
--name llama_factory \
|
||||
-d llama-factory:latest
|
||||
```
|
||||
|
||||
#### 使用 Docker Compose
|
||||
|
||||
```bash
|
||||
docker compose -f ./docker-compose.yml up -d
|
||||
```
|
||||
|
||||
<details><summary>数据卷详情</summary>
|
||||
|
||||
- hf_cache:使用宿主机的 Hugging Face 缓存文件夹,允许更改为新的目录。
|
||||
- data:宿主机中存放数据集的文件夹路径。
|
||||
- output:将导出目录设置为该路径后,即可在宿主机中访问导出后的模型。
|
||||
|
||||
</details>
|
||||
|
||||
### 利用 vLLM 部署 OpenAI API
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml
|
||||
```
|
||||
|
||||
### 从魔搭社区下载
|
||||
|
||||
如果您在 Hugging Face 模型和数据集的下载中遇到了问题,可以通过下述方法使用魔搭社区。
|
||||
|
||||
```bash
|
||||
export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
|
||||
```
|
||||
|
||||
将 `--model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型,例如 `LLM-Research/Meta-Llama-3-8B-Instruct`。
|
||||
|
||||
## 使用了 LLaMA Factory 的项目
|
||||
|
||||
如果您有项目希望添加至下述列表,请通过邮件联系或者创建一个 PR。
|
||||
|
||||
<details><summary>点击显示</summary>
|
||||
|
||||
1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
|
||||
1. Yu et al. Open, Closed, or Small Language Models for Text Classification? 2023. [[arxiv]](https://arxiv.org/abs/2308.10092)
|
||||
1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526)
|
||||
1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816)
|
||||
1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710)
|
||||
1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2401.04319)
|
||||
1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2401.07286)
|
||||
1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904)
|
||||
1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625)
|
||||
1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176)
|
||||
1. Yang et al. LaCo: Large Language Model Pruning via Layer Collapse. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187)
|
||||
1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746)
|
||||
1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11801)
|
||||
1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. 2024. [[arxiv]](https://arxiv.org/abs/2402.11809)
|
||||
1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819)
|
||||
1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204)
|
||||
1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.14714)
|
||||
1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.15043)
|
||||
1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
|
||||
1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
|
||||
1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
|
||||
1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073)
|
||||
1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541)
|
||||
1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
|
||||
1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
|
||||
1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443)
|
||||
1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604)
|
||||
1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827)
|
||||
1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167)
|
||||
1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. 2024. [[arxiv]](https://arxiv.org/abs/2404.04316)
|
||||
1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084)
|
||||
1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836)
|
||||
1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581)
|
||||
1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215)
|
||||
1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621)
|
||||
1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2404.17140)
|
||||
1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585)
|
||||
1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper,基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。
|
||||
1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM,基于 Baichuan-13B 微调而得,具有法律推理和知识检索能力。
|
||||
1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: 孙思邈中文医疗大模型 Sunsimiao,基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。
|
||||
1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: 医疗大模型项目 CareGPT,基于 LLaMA2-7B 和 Baichuan-13B 在中文医疗数据上微调而得。
|
||||
1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**:MBTI性格大模型项目,根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。
|
||||
1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**:一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
|
||||
1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**:中文多模态医学大模型,基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得。
|
||||
|
||||
</details>
|
||||
|
||||
## 协议
|
||||
|
||||
本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。
|
||||
|
||||
使用模型权重时,请遵循对应的模型协议:[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
|
||||
|
||||
## 引用
|
||||
|
||||
如果您觉得此项目有帮助,请考虑以下列格式引用
|
||||
|
||||
```bibtex
|
||||
@article{zheng2024llamafactory,
|
||||
title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models},
|
||||
author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma},
|
||||
journal={arXiv preprint arXiv:2403.13372},
|
||||
year={2024},
|
||||
url={http://arxiv.org/abs/2403.13372}
|
||||
}
|
||||
```
|
||||
|
||||
## 致谢
|
||||
|
||||
本项目受益于 [PEFT](https://github.com/huggingface/peft)、[TRL](https://github.com/huggingface/trl)、[QLoRA](https://github.com/artidoro/qlora) 和 [FastChat](https://github.com/lm-sys/FastChat),感谢以上诸位作者的付出。
|
||||
|
||||
## Star History
|
||||
|
||||
![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date)
|
File diff suppressed because it is too large
Load Diff
After Width: | Height: | Size: 29 KiB |
Binary file not shown.
After Width: | Height: | Size: 56 KiB |
Binary file not shown.
After Width: | Height: | Size: 145 KiB |
Binary file not shown.
After Width: | Height: | Size: 146 KiB |
|
@ -0,0 +1,25 @@
|
|||
version: '3.8'
|
||||
|
||||
services:
|
||||
llama-factory:
|
||||
build:
|
||||
dockerfile: Dockerfile
|
||||
context: .
|
||||
container_name: llama_factory
|
||||
volumes:
|
||||
- ./hf_cache:/root/.cache/huggingface/
|
||||
- ./data:/app/data
|
||||
- ./output:/app/output
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
ports:
|
||||
- "7860:7860"
|
||||
ipc: host
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: "all"
|
||||
capabilities: [gpu]
|
||||
restart: unless-stopped
|
|
@ -0,0 +1,160 @@
|
|||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
|
||||
import datasets
|
||||
import pandas as pd
|
||||
|
||||
|
||||
_CITATION = """\
|
||||
@article{huang2023ceval,
|
||||
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
|
||||
author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
|
||||
journal={arXiv preprint arXiv:2305.08322},
|
||||
year={2023}
|
||||
}
|
||||
"""
|
||||
|
||||
_DESCRIPTION = """\
|
||||
C-Eval is a comprehensive Chinese evaluation suite for foundation models. It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.
|
||||
"""
|
||||
|
||||
_HOMEPAGE = "https://cevalbenchmark.com"
|
||||
|
||||
_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
|
||||
|
||||
_URL = "ceval.zip"
|
||||
|
||||
task_list = [
|
||||
"computer_network",
|
||||
"operating_system",
|
||||
"computer_architecture",
|
||||
"college_programming",
|
||||
"college_physics",
|
||||
"college_chemistry",
|
||||
"advanced_mathematics",
|
||||
"probability_and_statistics",
|
||||
"discrete_mathematics",
|
||||
"electrical_engineer",
|
||||
"metrology_engineer",
|
||||
"high_school_mathematics",
|
||||
"high_school_physics",
|
||||
"high_school_chemistry",
|
||||
"high_school_biology",
|
||||
"middle_school_mathematics",
|
||||
"middle_school_biology",
|
||||
"middle_school_physics",
|
||||
"middle_school_chemistry",
|
||||
"veterinary_medicine",
|
||||
"college_economics",
|
||||
"business_administration",
|
||||
"marxism",
|
||||
"mao_zedong_thought",
|
||||
"education_science",
|
||||
"teacher_qualification",
|
||||
"high_school_politics",
|
||||
"high_school_geography",
|
||||
"middle_school_politics",
|
||||
"middle_school_geography",
|
||||
"modern_chinese_history",
|
||||
"ideological_and_moral_cultivation",
|
||||
"logic",
|
||||
"law",
|
||||
"chinese_language_and_literature",
|
||||
"art_studies",
|
||||
"professional_tour_guide",
|
||||
"legal_professional",
|
||||
"high_school_chinese",
|
||||
"high_school_history",
|
||||
"middle_school_history",
|
||||
"civil_servant",
|
||||
"sports_science",
|
||||
"plant_protection",
|
||||
"basic_medicine",
|
||||
"clinical_medicine",
|
||||
"urban_and_rural_planner",
|
||||
"accountant",
|
||||
"fire_engineer",
|
||||
"environmental_impact_assessment_engineer",
|
||||
"tax_accountant",
|
||||
"physician",
|
||||
]
|
||||
|
||||
|
||||
class CevalConfig(datasets.BuilderConfig):
    """BuilderConfig for one C-Eval subject task (fixed dataset version 1.0.0)."""

    def __init__(self, **kwargs):
        # Every subject config shares the same pinned dataset version.
        config_version = datasets.Version("1.0.0")
        super().__init__(version=config_version, **kwargs)
|
||||
|
||||
|
||||
class Ceval(datasets.GeneratorBasedBuilder):
    """C-Eval multiple-choice benchmark loader: one builder config per subject in ``task_list``."""

    BUILDER_CONFIGS = [
        CevalConfig(
            name=task_name,
        )
        for task_name in task_list
    ]

    def _info(self):
        """Describe one exam record: id, question, four options, answer letter, explanation."""
        features = datasets.Features(
            {
                "id": datasets.Value("int32"),
                "question": datasets.Value("string"),
                "A": datasets.Value("string"),
                "B": datasets.Value("string"),
                "C": datasets.Value("string"),
                "D": datasets.Value("string"),
                "answer": datasets.Value("string"),
                "explanation": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Map the task's test/val/dev CSV files onto TEST/VALIDATION/TRAIN splits.

        The ``dev`` file doubles as the few-shot TRAIN split, matching the
        official C-Eval layout (``<dir>/<task>_<dir>.csv``).
        """
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        # (split name, subdirectory == filename suffix) pairs.
        split_layout = [
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "val"),
            (datasets.Split.TRAIN, "dev"),
        ]
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, subdir, f"{task_name}_{subdir}.csv"),
                },
            )
            for split_name, subdir in split_layout
        ]

    def _generate_examples(self, filepath):
        """Yield ``(row_index, example)`` pairs from one subject CSV.

        Test CSVs ship without ``answer``/``explanation`` columns, so those
        fields are backfilled with empty strings to satisfy the schema.
        """
        df = pd.read_csv(filepath, encoding="utf-8")
        for i, instance in enumerate(df.to_dict(orient="records")):
            # setdefault replaces the non-idiomatic `"x" not in d.keys()` checks:
            # it only fills the field when the column was absent from the CSV.
            instance.setdefault("answer", "")
            instance.setdefault("explanation", "")
            yield i, instance
|
Binary file not shown.
|
@ -0,0 +1,210 @@
|
|||
{
|
||||
"accountant": {
|
||||
"name": "注册会计师",
|
||||
"category": "Other"
|
||||
},
|
||||
"advanced_mathematics": {
|
||||
"name": "高等数学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"art_studies": {
|
||||
"name": "艺术学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"basic_medicine": {
|
||||
"name": "基础医学",
|
||||
"category": "Other"
|
||||
},
|
||||
"business_administration": {
|
||||
"name": "工商管理",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"chinese_language_and_literature": {
|
||||
"name": "中国语言文学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"civil_servant": {
|
||||
"name": "公务员",
|
||||
"category": "Other"
|
||||
},
|
||||
"clinical_medicine": {
|
||||
"name": "临床医学",
|
||||
"category": "Other"
|
||||
},
|
||||
"college_chemistry": {
|
||||
"name": "大学化学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_economics": {
|
||||
"name": "大学经济学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"college_physics": {
|
||||
"name": "大学物理",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_programming": {
|
||||
"name": "大学编程",
|
||||
"category": "STEM"
|
||||
},
|
||||
"computer_architecture": {
|
||||
"name": "计算机组成",
|
||||
"category": "STEM"
|
||||
},
|
||||
"computer_network": {
|
||||
"name": "计算机网络",
|
||||
"category": "STEM"
|
||||
},
|
||||
"discrete_mathematics": {
|
||||
"name": "离散数学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"education_science": {
|
||||
"name": "教育学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"electrical_engineer": {
|
||||
"name": "注册电气工程师",
|
||||
"category": "STEM"
|
||||
},
|
||||
"environmental_impact_assessment_engineer": {
|
||||
"name": "环境影响评价工程师",
|
||||
"category": "Other"
|
||||
},
|
||||
"fire_engineer": {
|
||||
"name": "注册消防工程师",
|
||||
"category": "Other"
|
||||
},
|
||||
"high_school_biology": {
|
||||
"name": "高中生物",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_chemistry": {
|
||||
"name": "高中化学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_chinese": {
|
||||
"name": "高中语文",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"high_school_geography": {
|
||||
"name": "高中地理",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"high_school_history": {
|
||||
"name": "高中历史",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"high_school_mathematics": {
|
||||
"name": "高中数学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_physics": {
|
||||
"name": "高中物理",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_politics": {
|
||||
"name": "高中政治",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"ideological_and_moral_cultivation": {
|
||||
"name": "思想道德修养与法律基础",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"law": {
|
||||
"name": "法学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"legal_professional": {
|
||||
"name": "法律职业资格",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"logic": {
|
||||
"name": "逻辑学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"mao_zedong_thought": {
|
||||
"name": "毛泽东思想和中国特色社会主义理论体系概论",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"marxism": {
|
||||
"name": "马克思主义基本原理",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"metrology_engineer": {
|
||||
"name": "注册计量师",
|
||||
"category": "STEM"
|
||||
},
|
||||
"middle_school_biology": {
|
||||
"name": "初中生物",
|
||||
"category": "STEM"
|
||||
},
|
||||
"middle_school_chemistry": {
|
||||
"name": "初中化学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"middle_school_geography": {
|
||||
"name": "初中地理",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"middle_school_history": {
|
||||
"name": "初中历史",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"middle_school_mathematics": {
|
||||
"name": "初中数学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"middle_school_physics": {
|
||||
"name": "初中物理",
|
||||
"category": "STEM"
|
||||
},
|
||||
"middle_school_politics": {
|
||||
"name": "初中政治",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"modern_chinese_history": {
|
||||
"name": "近代史纲要",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"operating_system": {
|
||||
"name": "操作系统",
|
||||
"category": "STEM"
|
||||
},
|
||||
"physician": {
|
||||
"name": "医师资格",
|
||||
"category": "Other"
|
||||
},
|
||||
"plant_protection": {
|
||||
"name": "植物保护",
|
||||
"category": "Other"
|
||||
},
|
||||
"probability_and_statistics": {
|
||||
"name": "概率统计",
|
||||
"category": "STEM"
|
||||
},
|
||||
"professional_tour_guide": {
|
||||
"name": "导游资格",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"sports_science": {
|
||||
"name": "体育学",
|
||||
"category": "Other"
|
||||
},
|
||||
"tax_accountant": {
|
||||
"name": "税务师",
|
||||
"category": "Other"
|
||||
},
|
||||
"teacher_qualification": {
|
||||
"name": "教师资格",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"urban_and_rural_planner": {
|
||||
"name": "注册城乡规划师",
|
||||
"category": "Other"
|
||||
},
|
||||
"veterinary_medicine": {
|
||||
"name": "兽医学",
|
||||
"category": "STEM"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,167 @@
|
|||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
|
||||
import datasets
|
||||
import pandas as pd
|
||||
|
||||
|
||||
_CITATION = """\
|
||||
@article{li2023cmmlu,
|
||||
title={CMMLU: Measuring massive multitask language understanding in Chinese},
|
||||
author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
|
||||
journal={arXiv preprint arXiv:2306.09212},
|
||||
year={2023}
|
||||
}
|
||||
"""
|
||||
|
||||
_DESCRIPTION = """\
|
||||
CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge and reasoning abilities of LLMs within the Chinese language and cultural context.
|
||||
"""
|
||||
|
||||
_HOMEPAGE = "https://github.com/haonan-li/CMMLU"
|
||||
|
||||
_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
|
||||
|
||||
_URL = "cmmlu.zip"
|
||||
|
||||
task_list = [
|
||||
"agronomy",
|
||||
"anatomy",
|
||||
"ancient_chinese",
|
||||
"arts",
|
||||
"astronomy",
|
||||
"business_ethics",
|
||||
"chinese_civil_service_exam",
|
||||
"chinese_driving_rule",
|
||||
"chinese_food_culture",
|
||||
"chinese_foreign_policy",
|
||||
"chinese_history",
|
||||
"chinese_literature",
|
||||
"chinese_teacher_qualification",
|
||||
"clinical_knowledge",
|
||||
"college_actuarial_science",
|
||||
"college_education",
|
||||
"college_engineering_hydrology",
|
||||
"college_law",
|
||||
"college_mathematics",
|
||||
"college_medical_statistics",
|
||||
"college_medicine",
|
||||
"computer_science",
|
||||
"computer_security",
|
||||
"conceptual_physics",
|
||||
"construction_project_management",
|
||||
"economics",
|
||||
"education",
|
||||
"electrical_engineering",
|
||||
"elementary_chinese",
|
||||
"elementary_commonsense",
|
||||
"elementary_information_and_technology",
|
||||
"elementary_mathematics",
|
||||
"ethnology",
|
||||
"food_science",
|
||||
"genetics",
|
||||
"global_facts",
|
||||
"high_school_biology",
|
||||
"high_school_chemistry",
|
||||
"high_school_geography",
|
||||
"high_school_mathematics",
|
||||
"high_school_physics",
|
||||
"high_school_politics",
|
||||
"human_sexuality",
|
||||
"international_law",
|
||||
"journalism",
|
||||
"jurisprudence",
|
||||
"legal_and_moral_basis",
|
||||
"logical",
|
||||
"machine_learning",
|
||||
"management",
|
||||
"marketing",
|
||||
"marxist_theory",
|
||||
"modern_chinese",
|
||||
"nutrition",
|
||||
"philosophy",
|
||||
"professional_accounting",
|
||||
"professional_law",
|
||||
"professional_medicine",
|
||||
"professional_psychology",
|
||||
"public_relations",
|
||||
"security_study",
|
||||
"sociology",
|
||||
"sports_science",
|
||||
"traditional_chinese_medicine",
|
||||
"virology",
|
||||
"world_history",
|
||||
"world_religions",
|
||||
]
|
||||
|
||||
|
||||
class CMMLUConfig(datasets.BuilderConfig):
    """BuilderConfig for one CMMLU subject task (fixed dataset version 1.0.1)."""

    def __init__(self, **kwargs):
        # Every subject config shares the same pinned dataset version.
        config_version = datasets.Version("1.0.1")
        super().__init__(version=config_version, **kwargs)
|
||||
|
||||
|
||||
class CMMLU(datasets.GeneratorBasedBuilder):
    """CMMLU multiple-choice benchmark loader: one builder config per subject in ``task_list``."""

    BUILDER_CONFIGS = [CMMLUConfig(name=subject) for subject in task_list]

    def _info(self):
        """Describe one exam record: question, four options, gold answer letter."""
        record_schema = datasets.Features(
            {
                "question": datasets.Value("string"),
                "A": datasets.Value("string"),
                "B": datasets.Value("string"),
                "C": datasets.Value("string"),
                "D": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=record_schema,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """TEST comes from ``test/<task>.csv``; the few-shot dev file backs TRAIN."""
        extracted_dir = dl_manager.download_and_extract(_URL)
        subject = self.config.name
        test_split = datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={
                "filepath": os.path.join(extracted_dir, f"test/{subject}.csv"),
            },
        )
        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={
                "filepath": os.path.join(extracted_dir, f"dev/{subject}.csv"),
            },
        )
        return [test_split, train_split]

    def _generate_examples(self, filepath):
        """Yield ``(row_index, example)`` pairs, renaming capitalized CSV columns.

        The first CSV column is a row id (dropped via ``index_col=0``); the
        source headers ``Question``/``Answer`` are remapped to the lowercase
        keys declared in the features schema.
        """
        frame = pd.read_csv(filepath, header=0, index_col=0, encoding="utf-8")
        for row_index, record in enumerate(frame.to_dict(orient="records")):
            record["question"] = record.pop("Question", "")
            record["answer"] = record.pop("Answer", "")
            yield row_index, record
|
Binary file not shown.
|
@ -0,0 +1,270 @@
|
|||
{
|
||||
"agronomy": {
|
||||
"name": "农学",
|
||||
"category": "Other"
|
||||
},
|
||||
"anatomy": {
|
||||
"name": "解剖学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"ancient_chinese": {
|
||||
"name": "古汉语",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"arts": {
|
||||
"name": "艺术学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"astronomy": {
|
||||
"name": "天文学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"business_ethics": {
|
||||
"name": "商业伦理",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"chinese_civil_service_exam": {
|
||||
"name": "中国公务员考试",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"chinese_driving_rule": {
|
||||
"name": "中国驾驶规则",
|
||||
"category": "Other"
|
||||
},
|
||||
"chinese_food_culture": {
|
||||
"name": "中国饮食文化",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"chinese_foreign_policy": {
|
||||
"name": "中国外交政策",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"chinese_history": {
|
||||
"name": "中国历史",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"chinese_literature": {
|
||||
"name": "中国文学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"chinese_teacher_qualification": {
|
||||
"name": "中国教师资格",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"college_actuarial_science": {
|
||||
"name": "大学精算学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_education": {
|
||||
"name": "大学教育学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"college_engineering_hydrology": {
|
||||
"name": "大学工程水文学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_law": {
|
||||
"name": "大学法律",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"college_mathematics": {
|
||||
"name": "大学数学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_medical_statistics": {
|
||||
"name": "大学医学统计",
|
||||
"category": "STEM"
|
||||
},
|
||||
"clinical_knowledge": {
|
||||
"name": "临床知识",
|
||||
"category": "Other"
|
||||
},
|
||||
"college_medicine": {
|
||||
"name": "大学医学",
|
||||
"category": "Other"
|
||||
},
|
||||
"computer_science": {
|
||||
"name": "计算机科学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"computer_security": {
|
||||
"name": "计算机安全",
|
||||
"category": "Other"
|
||||
},
|
||||
"conceptual_physics": {
|
||||
"name": "概念物理学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"construction_project_management": {
|
||||
"name": "建设工程管理",
|
||||
"category": "Other"
|
||||
},
|
||||
"economics": {
|
||||
"name": "经济学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"education": {
|
||||
"name": "教育学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"elementary_chinese": {
|
||||
"name": "小学语文",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"elementary_commonsense": {
|
||||
"name": "小学常识",
|
||||
"category": "Other"
|
||||
},
|
||||
"elementary_information_and_technology": {
|
||||
"name": "小学信息技术",
|
||||
"category": "Other"
|
||||
},
|
||||
"electrical_engineering": {
|
||||
"name": "电气工程",
|
||||
"category": "STEM"
|
||||
},
|
||||
"elementary_mathematics": {
|
||||
"name": "初等数学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"ethnology": {
|
||||
"name": "民族学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"food_science": {
|
||||
"name": "食品科学",
|
||||
"category": "Other"
|
||||
},
|
||||
"genetics": {
|
||||
"name": "遗传学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"global_facts": {
|
||||
"name": "全球事实",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"high_school_biology": {
|
||||
"name": "高中生物",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_chemistry": {
|
||||
"name": "高中化学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_geography": {
|
||||
"name": "高中地理",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"high_school_mathematics": {
|
||||
"name": "高中数学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_physics": {
|
||||
"name": "高中物理学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_politics": {
|
||||
"name": "高中政治",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"human_sexuality": {
|
||||
"name": "人类性行为",
|
||||
"category": "Other"
|
||||
},
|
||||
"international_law": {
|
||||
"name": "国际法学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"journalism": {
|
||||
"name": "新闻学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"jurisprudence": {
|
||||
"name": "法理学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"legal_and_moral_basis": {
|
||||
"name": "法律与道德基础",
|
||||
"category": "Other"
|
||||
},
|
||||
"logical": {
|
||||
"name": "逻辑学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"machine_learning": {
|
||||
"name": "机器学习",
|
||||
"category": "STEM"
|
||||
},
|
||||
"management": {
|
||||
"name": "管理学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"marketing": {
|
||||
"name": "市场营销",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"marxist_theory": {
|
||||
"name": "马克思主义理论",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"modern_chinese": {
|
||||
"name": "现代汉语",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"nutrition": {
|
||||
"name": "营养学",
|
||||
"category": "Other"
|
||||
},
|
||||
"philosophy": {
|
||||
"name": "哲学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"professional_accounting": {
|
||||
"name": "专业会计",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"professional_law": {
|
||||
"name": "专业法学",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"professional_medicine": {
|
||||
"name": "专业医学",
|
||||
"category": "Other"
|
||||
},
|
||||
"professional_psychology": {
|
||||
"name": "专业心理学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"public_relations": {
|
||||
"name": "公共关系",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"security_study": {
|
||||
"name": "安全研究",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"sociology": {
|
||||
"name": "社会学",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"sports_science": {
|
||||
"name": "体育学",
|
||||
"category": "Other"
|
||||
},
|
||||
"traditional_chinese_medicine": {
|
||||
"name": "中医中药",
|
||||
"category": "Other"
|
||||
},
|
||||
"virology": {
|
||||
"name": "病毒学",
|
||||
"category": "STEM"
|
||||
},
|
||||
"world_history": {
|
||||
"name": "世界历史",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"world_religions": {
|
||||
"name": "世界宗教",
|
||||
"category": "Humanities"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,230 @@
|
|||
{
|
||||
"abstract_algebra": {
|
||||
"name": "abstract algebra",
|
||||
"category": "STEM"
|
||||
},
|
||||
"anatomy": {
|
||||
"name": "anatomy",
|
||||
"category": "Other"
|
||||
},
|
||||
"astronomy": {
|
||||
"name": "astronomy",
|
||||
"category": "STEM"
|
||||
},
|
||||
"business_ethics": {
|
||||
"name": "business ethics",
|
||||
"category": "Other"
|
||||
},
|
||||
"clinical_knowledge": {
|
||||
"name": "clinical knowledge",
|
||||
"category": "Other"
|
||||
},
|
||||
"college_biology": {
|
||||
"name": "college biology",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_chemistry": {
|
||||
"name": "college chemistry",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_computer_science": {
|
||||
"name": "college computer science",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_mathematics": {
|
||||
"name": "college mathematics",
|
||||
"category": "STEM"
|
||||
},
|
||||
"college_medicine": {
|
||||
"name": "college medicine",
|
||||
"category": "Other"
|
||||
},
|
||||
"college_physics": {
|
||||
"name": "college physics",
|
||||
"category": "STEM"
|
||||
},
|
||||
"computer_security": {
|
||||
"name": "computer security",
|
||||
"category": "STEM"
|
||||
},
|
||||
"conceptual_physics": {
|
||||
"name": "conceptual physics",
|
||||
"category": "STEM"
|
||||
},
|
||||
"econometrics": {
|
||||
"name": "econometrics",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"electrical_engineering": {
|
||||
"name": "electrical engineering",
|
||||
"category": "STEM"
|
||||
},
|
||||
"elementary_mathematics": {
|
||||
"name": "elementary mathematics",
|
||||
"category": "STEM"
|
||||
},
|
||||
"formal_logic": {
|
||||
"name": "formal logic",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"global_facts": {
|
||||
"name": "global facts",
|
||||
"category": "Other"
|
||||
},
|
||||
"high_school_biology": {
|
||||
"name": "high school biology",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_chemistry": {
|
||||
"name": "high school chemistry",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_computer_science": {
|
||||
"name": "high school computer science",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_european_history": {
|
||||
"name": "high school european history",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"high_school_geography": {
|
||||
"name": "high school geography",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"high_school_government_and_politics": {
|
||||
"name": "high school government and politics",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"high_school_macroeconomics": {
|
||||
"name": "high school macroeconomics",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"high_school_mathematics": {
|
||||
"name": "high school mathematics",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_microeconomics": {
|
||||
"name": "high school microeconomics",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"high_school_physics": {
|
||||
"name": "high school physics",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_psychology": {
|
||||
"name": "high school psychology",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"high_school_statistics": {
|
||||
"name": "high school statistics",
|
||||
"category": "STEM"
|
||||
},
|
||||
"high_school_us_history": {
|
||||
"name": "high school us history",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"high_school_world_history": {
|
||||
"name": "high school world history",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"human_aging": {
|
||||
"name": "human aging",
|
||||
"category": "Other"
|
||||
},
|
||||
"human_sexuality": {
|
||||
"name": "human sexuality",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"international_law": {
|
||||
"name": "international law",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"jurisprudence": {
|
||||
"name": "jurisprudence",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"logical_fallacies": {
|
||||
"name": "logical fallacies",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"machine_learning": {
|
||||
"name": "machine learning",
|
||||
"category": "STEM"
|
||||
},
|
||||
"management": {
|
||||
"name": "management",
|
||||
"category": "Other"
|
||||
},
|
||||
"marketing": {
|
||||
"name": "marketing",
|
||||
"category": "Other"
|
||||
},
|
||||
"medical_genetics": {
|
||||
"name": "medical genetics",
|
||||
"category": "Other"
|
||||
},
|
||||
"miscellaneous": {
|
||||
"name": "miscellaneous",
|
||||
"category": "Other"
|
||||
},
|
||||
"moral_disputes": {
|
||||
"name": "moral disputes",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"moral_scenarios": {
|
||||
"name": "moral scenarios",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"nutrition": {
|
||||
"name": "nutrition",
|
||||
"category": "Other"
|
||||
},
|
||||
"philosophy": {
|
||||
"name": "philosophy",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"prehistory": {
|
||||
"name": "prehistory",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"professional_accounting": {
|
||||
"name": "professional accounting",
|
||||
"category": "Other"
|
||||
},
|
||||
"professional_law": {
|
||||
"name": "professional law",
|
||||
"category": "Humanities"
|
||||
},
|
||||
"professional_medicine": {
|
||||
"name": "professional medicine",
|
||||
"category": "Other"
|
||||
},
|
||||
"professional_psychology": {
|
||||
"name": "professional psychology",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"public_relations": {
|
||||
"name": "public relations",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"security_studies": {
|
||||
"name": "security studies",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"sociology": {
|
||||
"name": "sociology",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"us_foreign_policy": {
|
||||
"name": "us foreign policy",
|
||||
"category": "Social Sciences"
|
||||
},
|
||||
"virology": {
|
||||
"name": "virology",
|
||||
"category": "Other"
|
||||
},
|
||||
"world_religions": {
|
||||
"name": "world religions",
|
||||
"category": "Humanities"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,161 @@
|
|||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
|
||||
import datasets
|
||||
import pandas as pd
|
||||
|
||||
|
||||
_CITATION = """\
|
||||
@article{hendryckstest2021,
|
||||
title={Measuring Massive Multitask Language Understanding},
|
||||
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
||||
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
|
||||
year={2021}
|
||||
}
|
||||
"""
|
||||
|
||||
_DESCRIPTION = """\
|
||||
Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
|
||||
"""
|
||||
|
||||
_HOMEPAGE = "https://github.com/hendrycks/test"
|
||||
|
||||
_LICENSE = "MIT"
|
||||
|
||||
_URL = "mmlu.zip"
|
||||
|
||||
task_list = [
|
||||
"high_school_european_history",
|
||||
"business_ethics",
|
||||
"clinical_knowledge",
|
||||
"medical_genetics",
|
||||
"high_school_us_history",
|
||||
"high_school_physics",
|
||||
"high_school_world_history",
|
||||
"virology",
|
||||
"high_school_microeconomics",
|
||||
"econometrics",
|
||||
"college_computer_science",
|
||||
"high_school_biology",
|
||||
"abstract_algebra",
|
||||
"professional_accounting",
|
||||
"philosophy",
|
||||
"professional_medicine",
|
||||
"nutrition",
|
||||
"global_facts",
|
||||
"machine_learning",
|
||||
"security_studies",
|
||||
"public_relations",
|
||||
"professional_psychology",
|
||||
"prehistory",
|
||||
"anatomy",
|
||||
"human_sexuality",
|
||||
"college_medicine",
|
||||
"high_school_government_and_politics",
|
||||
"college_chemistry",
|
||||
"logical_fallacies",
|
||||
"high_school_geography",
|
||||
"elementary_mathematics",
|
||||
"human_aging",
|
||||
"college_mathematics",
|
||||
"high_school_psychology",
|
||||
"formal_logic",
|
||||
"high_school_statistics",
|
||||
"international_law",
|
||||
"high_school_mathematics",
|
||||
"high_school_computer_science",
|
||||
"conceptual_physics",
|
||||
"miscellaneous",
|
||||
"high_school_chemistry",
|
||||
"marketing",
|
||||
"professional_law",
|
||||
"management",
|
||||
"college_physics",
|
||||
"jurisprudence",
|
||||
"world_religions",
|
||||
"sociology",
|
||||
"us_foreign_policy",
|
||||
"high_school_macroeconomics",
|
||||
"computer_security",
|
||||
"moral_scenarios",
|
||||
"moral_disputes",
|
||||
"electrical_engineering",
|
||||
"astronomy",
|
||||
"college_biology",
|
||||
]
|
||||
|
||||
|
||||
class MMLUConfig(datasets.BuilderConfig):
    """BuilderConfig for one MMLU subject; pins the dataset version to 1.0.0."""

    def __init__(self, **kwargs):
        # Version is fixed here so per-task configs only need to supply a name.
        pinned_version = datasets.Version("1.0.0")
        super().__init__(version=pinned_version, **kwargs)
|
||||
|
||||
|
||||
class MMLU(datasets.GeneratorBasedBuilder):
    """Dataset builder for MMLU: one config per subject, loaded from mmlu.zip.

    Each subject config exposes three splits (TEST/VALIDATION/TRAIN) backed by
    the corresponding test/val/dev CSV files inside the archive.
    """

    # One config per subject name, in the same order as task_list.
    BUILDER_CONFIGS = [MMLUConfig(name=task) for task in task_list]

    def _info(self):
        """Describe the schema: a question, four options A-D, and the answer letter."""
        column_types = {
            column: datasets.Value("string")
            for column in ("question", "A", "B", "C", "D", "answer")
        }
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(column_types),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Extract the archive and map the task's CSV files onto the three splits."""
        extracted = dl_manager.download_and_extract(_URL)
        task = self.config.name
        # (split name, sub-directory, file suffix) for each split in the archive.
        split_layout = (
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "val"),
            (datasets.Split.TRAIN, "dev"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(
                        extracted, "data", folder, f"{task}_{folder}.csv"
                    ),
                },
            )
            for split_name, folder in split_layout
        ]

    def _generate_examples(self, filepath):
        """Yield (index, record) pairs from one task CSV.

        The raw CSV has no meaningful header, so columns are renamed to match
        the schema declared in _info().
        """
        frame = pd.read_csv(filepath)
        frame.columns = ["question", "A", "B", "C", "D", "answer"]
        yield from enumerate(frame.to_dict(orient="records"))
|
Binary file not shown.
|
@ -0,0 +1,237 @@
|
|||
We provide diverse examples about fine-tuning LLMs.
|
||||
|
||||
Make sure to execute these commands in the `LLaMA-Factory` directory.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [LoRA Fine-Tuning on A Single GPU](#lora-fine-tuning-on-a-single-gpu)
|
||||
- [QLoRA Fine-Tuning on a Single GPU](#qlora-fine-tuning-on-a-single-gpu)
|
||||
- [LoRA Fine-Tuning on Multiple GPUs](#lora-fine-tuning-on-multiple-gpus)
|
||||
- [LoRA Fine-Tuning on Multiple NPUs](#lora-fine-tuning-on-multiple-npus)
|
||||
- [Full-Parameter Fine-Tuning on Multiple GPUs](#full-parameter-fine-tuning-on-multiple-gpus)
|
||||
- [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization)
|
||||
- [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
|
||||
- [Extras](#extras)
|
||||
|
||||
## Examples
|
||||
|
||||
### LoRA Fine-Tuning on A Single GPU
|
||||
|
||||
#### (Continuous) Pre-Training
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Multimodal Supervised Fine-Tuning
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Reward Modeling
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml
|
||||
```
|
||||
|
||||
#### PPO Training
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml
|
||||
```
|
||||
|
||||
#### DPO Training
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
|
||||
```
|
||||
|
||||
#### KTO Training
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_kto.yaml
|
||||
```
|
||||
|
||||
#### ORPO Training
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml
|
||||
```
|
||||
|
||||
#### Preprocess Dataset
|
||||
|
||||
It is useful for large dataset, use `tokenized_path` in config to load the preprocessed dataset.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml
|
||||
```
|
||||
|
||||
#### Evaluating on MMLU/CMMLU/C-Eval Benchmarks
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml
|
||||
```
|
||||
|
||||
#### Batch Predicting and Computing BLEU and ROUGE Scores
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml
|
||||
```
|
||||
|
||||
### QLoRA Fine-Tuning on a Single GPU
|
||||
|
||||
#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended)
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with 4-bit AWQ Quantization
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with 2-bit AQLM Quantization
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
|
||||
```
|
||||
|
||||
### LoRA Fine-Tuning on Multiple GPUs
|
||||
|
||||
#### Supervised Fine-Tuning with Accelerate on Single Node
|
||||
|
||||
```bash
|
||||
bash examples/lora_multi_gpu/single_node.sh
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
|
||||
|
||||
```bash
|
||||
bash examples/lora_multi_gpu/multi_node.sh
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
|
||||
|
||||
```bash
|
||||
bash examples/lora_multi_gpu/ds_zero3.sh
|
||||
```
|
||||
|
||||
### LoRA Fine-Tuning on Multiple NPUs
|
||||
|
||||
#### Supervised Fine-Tuning with DeepSpeed ZeRO-0
|
||||
|
||||
```bash
|
||||
bash examples/lora_multi_npu/ds_zero0.sh
|
||||
```
|
||||
|
||||
### Full-Parameter Fine-Tuning on Multiple GPUs
|
||||
|
||||
#### Supervised Fine-Tuning with Accelerate on Single Node
|
||||
|
||||
```bash
|
||||
bash examples/full_multi_gpu/single_node.sh
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
|
||||
|
||||
```bash
|
||||
bash examples/full_multi_gpu/multi_node.sh
|
||||
```
|
||||
|
||||
#### Batch Predicting and Computing BLEU and ROUGE Scores
|
||||
|
||||
```bash
|
||||
bash examples/full_multi_gpu/predict.sh
|
||||
```
|
||||
|
||||
### Merging LoRA Adapters and Quantization
|
||||
|
||||
#### Merge LoRA Adapters
|
||||
|
||||
Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Quantizing Model using AutoGPTQ
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
|
||||
```
|
||||
|
||||
### Inferring LoRA Fine-Tuned Models
|
||||
|
||||
Use `CUDA_VISIBLE_DEVICES=0,1` to infer models on multiple devices.
|
||||
|
||||
#### Use CLI
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Use Web UI
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Launch OpenAI-style API
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
### Extras
|
||||
|
||||
#### Full-Parameter Fine-Tuning using GaLore
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### Full-Parameter Fine-Tuning using BAdam
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### LoRA+ Fine-Tuning
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Mixture-of-Depths Fine-Tuning
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### LLaMA-Pro Fine-Tuning
|
||||
|
||||
```bash
|
||||
bash examples/extras/llama_pro/expand.sh
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
|
||||
```
|
||||
|
||||
#### FSDP+QLoRA Fine-Tuning
|
||||
|
||||
```bash
|
||||
bash examples/extras/fsdp_qlora/single_node.sh
|
||||
```
|
|
@ -0,0 +1,237 @@
|
|||
我们提供了多样化的大模型微调示例脚本。
|
||||
|
||||
请确保在 `LLaMA-Factory` 目录下执行下述命令。
|
||||
|
||||
## 目录
|
||||
|
||||
- [单 GPU LoRA 微调](#单-gpu-lora-微调)
|
||||
- [单 GPU QLoRA 微调](#单-gpu-qlora-微调)
|
||||
- [多 GPU LoRA 微调](#多-gpu-lora-微调)
|
||||
- [多 NPU LoRA 微调](#多-npu-lora-微调)
|
||||
- [多 GPU 全参数微调](#多-gpu-全参数微调)
|
||||
- [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化)
|
||||
- [推理 LoRA 模型](#推理-lora-模型)
|
||||
- [杂项](#杂项)
|
||||
|
||||
## 示例
|
||||
|
||||
### 单 GPU LoRA 微调
|
||||
|
||||
#### (增量)预训练
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml
|
||||
```
|
||||
|
||||
#### 指令监督微调
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 多模态指令监督微调
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 奖励模型训练
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml
|
||||
```
|
||||
|
||||
#### PPO 训练
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml
|
||||
```
|
||||
|
||||
#### DPO 训练
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
|
||||
```
|
||||
|
||||
#### KTO 训练
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_kto.yaml
|
||||
```
|
||||
|
||||
#### ORPO 训练
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml
|
||||
```
|
||||
|
||||
#### 预处理数据集
|
||||
|
||||
对于大数据集有帮助,在配置中使用 `tokenized_path` 以加载预处理后的数据集。
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml
|
||||
```
|
||||
|
||||
#### 在 MMLU/CMMLU/C-Eval 上评估
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml
|
||||
```
|
||||
|
||||
#### 批量预测并计算 BLEU 和 ROUGE 分数
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml
|
||||
```
|
||||
|
||||
### 单 GPU QLoRA 微调
|
||||
|
||||
#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调(推荐)
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
|
||||
```
|
||||
|
||||
#### 基于 4/8 比特 GPTQ 量化进行指令监督微调
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
|
||||
```
|
||||
|
||||
#### 基于 4 比特 AWQ 量化进行指令监督微调
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
|
||||
```
|
||||
|
||||
#### 基于 2 比特 AQLM 量化进行指令监督微调
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
|
||||
```
|
||||
|
||||
### 多 GPU LoRA 微调
|
||||
|
||||
#### 使用 Accelerate 进行单节点训练
|
||||
|
||||
```bash
|
||||
bash examples/lora_multi_gpu/single_node.sh
|
||||
```
|
||||
|
||||
#### 使用 Accelerate 进行多节点训练
|
||||
|
||||
```bash
|
||||
bash examples/lora_multi_gpu/multi_node.sh
|
||||
```
|
||||
|
||||
#### 使用 DeepSpeed ZeRO-3 平均分配显存
|
||||
|
||||
```bash
|
||||
bash examples/lora_multi_gpu/ds_zero3.sh
|
||||
```
|
||||
|
||||
### 多 NPU LoRA 微调
|
||||
|
||||
#### 使用 DeepSpeed ZeRO-0 训练
|
||||
|
||||
```bash
|
||||
bash examples/lora_multi_npu/ds_zero0.sh
|
||||
```
|
||||
|
||||
### 多 GPU 全参数微调
|
||||
|
||||
#### 使用 DeepSpeed 进行单节点训练
|
||||
|
||||
```bash
|
||||
bash examples/full_multi_gpu/single_node.sh
|
||||
```
|
||||
|
||||
#### 使用 DeepSpeed 进行多节点训练
|
||||
|
||||
```bash
|
||||
bash examples/full_multi_gpu/multi_node.sh
|
||||
```
|
||||
|
||||
#### 批量预测并计算 BLEU 和 ROUGE 分数
|
||||
|
||||
```bash
|
||||
bash examples/full_multi_gpu/predict.sh
|
||||
```
|
||||
|
||||
### 合并 LoRA 适配器与模型量化
|
||||
|
||||
#### 合并 LoRA 适配器
|
||||
|
||||
注:请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用 AutoGPTQ 量化模型
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
|
||||
```
|
||||
|
||||
### 推理 LoRA 模型
|
||||
|
||||
使用 `CUDA_VISIBLE_DEVICES=0,1` 进行多卡推理。
|
||||
|
||||
#### 使用命令行接口
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用浏览器界面
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 启动 OpenAI 风格 API
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
### 杂项
|
||||
|
||||
#### 使用 GaLore 进行全参数训练
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用 BAdam 进行全参数训练
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### LoRA+ 微调
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 深度混合微调
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### LLaMA-Pro 微调
|
||||
|
||||
```bash
|
||||
bash examples/extras/llama_pro/expand.sh
|
||||
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
|
||||
```
|
||||
|
||||
#### FSDP+QLoRA 微调
|
||||
|
||||
```bash
|
||||
bash examples/extras/fsdp_qlora/single_node.sh
|
||||
```
|
|
@ -0,0 +1,25 @@
|
|||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch: BACKWARD_PRE
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_forward_prefetch: false
|
||||
fsdp_offload_params: true
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_use_orig_params: false
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: fp16
|
||||
num_machines: 1 # the number of nodes
|
||||
num_processes: 2 # the number of GPUs in all nodes
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
|
@ -0,0 +1,18 @@
|
|||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: MULTI_GPU
|
||||
downcast_bf16: 'no'
|
||||
gpu_ids: all
|
||||
machine_rank: 0
|
||||
main_process_ip: 192.168.0.1
|
||||
main_process_port: 29555
|
||||
main_training_function: main
|
||||
mixed_precision: fp16
|
||||
num_machines: 2 # the number of nodes
|
||||
num_processes: 8 # the number of GPUs in all nodes
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
|
@ -0,0 +1,16 @@
|
|||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: MULTI_GPU
|
||||
downcast_bf16: 'no'
|
||||
gpu_ids: all
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: fp16
|
||||
num_machines: 1 # the number of nodes
|
||||
num_processes: 4 # the number of GPUs in all nodes
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
|
@ -0,0 +1,18 @@
|
|||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: MULTI_GPU
|
||||
downcast_bf16: 'no'
|
||||
gpu_ids: all
|
||||
machine_rank: 1
|
||||
main_process_ip: 192.168.0.1
|
||||
main_process_port: 29555
|
||||
main_training_function: main
|
||||
mixed_precision: fp16
|
||||
num_machines: 2 # the number of nodes
|
||||
num_processes: 8 # the number of GPUs in all nodes
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 0,
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 5e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 5e8,
|
||||
"contiguous_gradients": true,
|
||||
"round_robin_gradients": true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 5e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 5e8,
|
||||
"contiguous_gradients": true,
|
||||
"round_robin_gradients": true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 5e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 5e8,
|
||||
"contiguous_gradients": true,
|
||||
"round_robin_gradients": true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e9,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"offload_param": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e9,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
use_badam: true
|
||||
badam_switch_mode: ascending
|
||||
badam_switch_interval: 50
|
||||
badam_verbose: 2
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
pure_bf16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,42 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
quantization_bit: 4
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### ddp
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,10 @@
|
|||
#!/bin/bash
|
||||
# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
|
||||
|
||||
pip install "transformers>=4.39.1"
|
||||
pip install "accelerate>=0.28.0"
|
||||
pip install "bitsandbytes>=0.43.0"
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
|
||||
--config_file examples/accelerate/fsdp_config.yaml \
|
||||
src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
|
|
@ -0,0 +1,42 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
use_galore: true
|
||||
galore_layerwise: true
|
||||
galore_target: mlp,self_attn
|
||||
galore_rank: 128
|
||||
galore_scale: 2.0
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 1
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
pure_bf16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/bash
|
||||
|
||||
python scripts/llama_pro.py \
|
||||
--model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--output_dir models/llama3-8b-instruct-pro \
|
||||
--num_expand 8
|
|
@ -0,0 +1,40 @@
|
|||
### model
|
||||
model_name_or_path: models/llama3-8b-instruct-pro
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: freeze
|
||||
freeze_trainable_layers: 8
|
||||
freeze_trainable_modules: all
|
||||
use_llama_pro: true
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b-instruct-pro/freeze/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,39 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
loraplus_lr_ratio: 16.0
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,39 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
mixture_of_depths: convert
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b-mod/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
optim: paged_adamw_8bit
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
pure_bf16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,23 @@
|
|||
### model
|
||||
model_name_or_path: saves/llama3-8b/full/sft
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_predict: true
|
||||
finetuning_type: full
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 50
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/full/predict
|
||||
overwrite_output_dir: true
|
||||
|
||||
### eval
|
||||
per_device_eval_batch_size: 1
|
||||
predict_with_generate: true
|
|
@ -0,0 +1,41 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
|
||||
### ddp
|
||||
ddp_timeout: 180000000
|
||||
deepspeed: examples/deepspeed/ds_z3_config.json
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
|
||||
|
||||
NPROC_PER_NODE=4
|
||||
NNODES=2
|
||||
RANK=0
|
||||
MASTER_ADDR=192.168.0.1
|
||||
MASTER_PORT=29500
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
|
||||
--nproc_per_node $NPROC_PER_NODE \
|
||||
--nnodes $NNODES \
|
||||
--node_rank $RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT \
|
||||
src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
|
|
@ -0,0 +1,5 @@
|
|||
#!/bin/bash
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
|
||||
--config_file examples/accelerate/single_config.yaml \
|
||||
src/train.py examples/full_multi_gpu/llama3_full_predict.yaml
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
|
||||
|
||||
NPROC_PER_NODE=4
|
||||
NNODES=1
|
||||
RANK=0
|
||||
MASTER_ADDR=127.0.0.1
|
||||
MASTER_PORT=29500
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
|
||||
--nproc_per_node $NPROC_PER_NODE \
|
||||
--nnodes $NNODES \
|
||||
--node_rank $RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT \
|
||||
src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
|
|
@ -0,0 +1,2 @@
|
|||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
template: 9g
|
|
@ -0,0 +1,2 @@
|
|||
model_name_or_path: /workspace/zksc/LLaMA-Factory/models/fm9g_lora_sft
|
||||
template: 9g
|
|
@ -0,0 +1,2 @@
|
|||
model_name_or_path: /workspace/zksc/LLaMA-Factory/models/fm9g_lora_sft
|
||||
template: 9g
|
|
@ -0,0 +1,2 @@
|
|||
model_name_or_path: /workspace/zksc/LLaMA-Factory/models/fm9g_lora_sft_codegen
|
||||
template: 9g
|
|
@ -0,0 +1,2 @@
|
|||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
template: llama3
|
|
@ -0,0 +1,4 @@
|
|||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
adapter_name_or_path: saves/llama3-8b/lora/sft
|
||||
template: llama3
|
||||
finetuning_type: lora
|
|
@ -0,0 +1,4 @@
|
|||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
template: llama3
|
||||
infer_backend: vllm
|
||||
vllm_enforce_eager: true
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
|
||||
|
||||
NPROC_PER_NODE=4
|
||||
NNODES=1
|
||||
RANK=0
|
||||
MASTER_ADDR=127.0.0.1
|
||||
MASTER_PORT=29500
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
|
||||
--nproc_per_node $NPROC_PER_NODE \
|
||||
--nnodes $NNODES \
|
||||
--node_rank $RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT \
|
||||
src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
|
|
@ -0,0 +1,41 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### ddp
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,42 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### ddp
|
||||
ddp_timeout: 180000000
|
||||
deepspeed: examples/deepspeed/ds_z3_config.json
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/bash
|
||||
# also launch it on slave machine using slave_config.yaml
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
|
||||
--config_file examples/accelerate/master_config.yaml \
|
||||
src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
|
|
@ -0,0 +1,5 @@
|
|||
#!/bin/bash
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
|
||||
--config_file examples/accelerate/single_config.yaml \
|
||||
src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
|
||||
|
||||
NPROC_PER_NODE=4
|
||||
NNODES=1
|
||||
RANK=0
|
||||
MASTER_ADDR=127.0.0.1
|
||||
MASTER_PORT=29500
|
||||
|
||||
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
|
||||
--nproc_per_node $NPROC_PER_NODE \
|
||||
--nnodes $NNODES \
|
||||
--node_rank $RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT \
|
||||
src/train.py examples/lora_multi_npu/llama3_lora_sft_ds.yaml
|
|
@ -0,0 +1,42 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### ddp
|
||||
ddp_timeout: 180000000
|
||||
deepspeed: examples/deepspeed/ds_z0_config.json
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,19 @@
|
|||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
adapter_name_or_path: # optional: path to the LoRA adapter checkpoint; leave empty to evaluate the base model
|
||||
|
||||
### method
|
||||
finetuning_type: lora
|
||||
|
||||
### dataset
|
||||
task: mmlu
|
||||
split: test
|
||||
template: fewshot
|
||||
lang: en
|
||||
n_shot: 5
|
||||
|
||||
### output
|
||||
save_dir: saves/fm9g/lora/eval
|
||||
|
||||
### eval
|
||||
batch_size: 1
|
|
@ -0,0 +1,38 @@
|
|||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: 9g
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/fm9g/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 10 # lowered from the default example value of 500 to checkpoint frequently on this small dataset
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 10 #500
|
|
@ -0,0 +1,38 @@
|
|||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### dataset
|
||||
dataset: army_uns_question,army_question
|
||||
template: 9g
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/fm9g/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 50 #500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 3
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 10.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 10 #500
|
|
@ -0,0 +1,38 @@
|
|||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### dataset
|
||||
dataset: mbpp
|
||||
template: 9g
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/fm9g/lora/sft_codegen
|
||||
logging_steps: 10
|
||||
save_steps: 108 #500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 4
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 20.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 27 #500
|
|
@ -0,0 +1,13 @@
|
|||
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
|
||||
|
||||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
adapter_name_or_path: saves/fm9g/lora/sft/checkpoint-10
|
||||
template: 9g
|
||||
finetuning_type: lora
|
||||
|
||||
### export
|
||||
export_dir: models/fm9g_lora_sft
|
||||
export_size: 2
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
|
@ -0,0 +1,13 @@
|
|||
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
|
||||
|
||||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
adapter_name_or_path: saves/fm9g/lora/sft/checkpoint-50
|
||||
template: 9g
|
||||
finetuning_type: lora
|
||||
|
||||
### export
|
||||
export_dir: models/fm9g_lora_sft
|
||||
export_size: 2
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
|
@ -0,0 +1,13 @@
|
|||
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
|
||||
|
||||
### model
|
||||
model_name_or_path: /workspace/models/fm9g_2b_hf_models
|
||||
adapter_name_or_path: saves/fm9g/lora/sft_codegen/checkpoint-540
|
||||
template: 9g
|
||||
finetuning_type: lora
|
||||
|
||||
### export
|
||||
export_dir: models/fm9g_lora_sft_codegen
|
||||
export_size: 2
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
|
@ -0,0 +1,11 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
template: llama3
|
||||
|
||||
### export
|
||||
export_dir: models/llama3_gptq
|
||||
export_quantization_bit: 4
|
||||
export_quantization_dataset: data/c4_demo.json
|
||||
export_size: 2
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
|
@ -0,0 +1,13 @@
|
|||
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
|
||||
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
adapter_name_or_path: saves/llama3-8b/lora/sft
|
||||
template: llama3
|
||||
finetuning_type: lora
|
||||
|
||||
### export
|
||||
export_dir: models/llama3_lora_sft
|
||||
export_size: 2
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
|
@ -0,0 +1,38 @@
|
|||
### model
|
||||
model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,38 @@
|
|||
### model
|
||||
model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,39 @@
|
|||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
quantization_bit: 4
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,38 @@
|
|||
### model
|
||||
model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_target: q_proj,v_proj
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 1024
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 0.0001
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_steps: 0.1
|
||||
fp16: true
|
||||
|
||||
### eval
|
||||
val_size: 0.1
|
||||
per_device_eval_batch_size: 1
|
||||
evaluation_strategy: steps
|
||||
eval_steps: 500
|
|
@ -0,0 +1,33 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=61.0"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py38"
|
||||
line-length = 119
|
||||
indent-width = 4
|
||||
|
||||
[tool.ruff.lint]
|
||||
ignore = ["C408", "C901", "E501", "E731", "E741", "W605"]
|
||||
select = ["C", "E", "F", "I", "W"]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
lines-after-imports = 2
|
||||
known-first-party = ["llamafactory"]
|
||||
known-third-party = [
|
||||
"accelerate",
|
||||
"datasets",
|
||||
"gradio",
|
||||
"numpy",
|
||||
"peft",
|
||||
"torch",
|
||||
"transformers",
|
||||
"trl"
|
||||
]
|
||||
|
||||
[tool.ruff.format]
|
||||
quote-style = "double"
|
||||
indent-style = "space"
|
||||
docstring-code-format = true
|
||||
skip-magic-trailing-comma = false
|
||||
line-ending = "auto"
|
|
@ -0,0 +1,18 @@
|
|||
transformers>=4.37.2
|
||||
datasets>=2.14.3
|
||||
accelerate>=0.27.2
|
||||
peft>=0.10.0
|
||||
trl>=0.8.1
|
||||
gradio>=4.0.0
|
||||
scipy
|
||||
einops
|
||||
sentencepiece
|
||||
protobuf
|
||||
uvicorn
|
||||
pydantic
|
||||
fastapi
|
||||
sse-starlette
|
||||
matplotlib>=3.7.0
|
||||
fire
|
||||
packaging
|
||||
pyyaml
|
|
@ -0,0 +1,31 @@
|
|||
# coding=utf-8
|
||||
# Calculates the flops of pre-trained models.
|
||||
# Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
|
||||
# Inspired by: https://www.deepspeed.ai/tutorials/flops-profiler/
|
||||
|
||||
import fire
|
||||
import torch
|
||||
from deepspeed.accelerator import get_accelerator # type: ignore
|
||||
from deepspeed.profiling.flops_profiler import get_model_profile # type: ignore
|
||||
|
||||
from llamafactory.chat import ChatModel
|
||||
|
||||
|
||||
def calculate_flops(
    model_name_or_path: str,
    batch_size: int = 1,
    seq_length: int = 256,
    flash_attn: str = "auto",
):
    """Profile and print FLOPs, MACs and parameter count of a model.

    Loads the model through ``ChatModel`` on accelerator device 0, feeds a
    dummy all-ones batch of shape (batch_size, seq_length) through DeepSpeed's
    flops profiler, and prints the resulting totals.

    Args:
        model_name_or_path: HF hub id or local path of the model to profile.
        batch_size: number of sequences in the dummy batch.
        seq_length: token length of each dummy sequence.
        flash_attn: flash-attention mode forwarded to the model loader.
    """
    with get_accelerator().device(0):
        chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn))
        # All-ones token ids are sufficient for a FLOPs count; labels reuse the
        # same tensor so the loss path is exercised too.
        dummy_ids = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.model.device)
        profile_inputs = {"input_ids": dummy_ids, "labels": dummy_ids.clone()}
        flops, macs, params = get_model_profile(chat_model.model, kwargs=profile_inputs, print_profile=True, detailed=True)
        print("FLOPs:", flops)
        print("MACs:", macs)
        print("Params:", params)


if __name__ == "__main__":
    fire.Fire(calculate_flops)
|
|
@ -0,0 +1,76 @@
|
|||
# coding=utf-8
|
||||
# Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters.
|
||||
# Usage: python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en --cutoff_len 1024 --batch_size 16
|
||||
# Inspired by: https://github.com/imoneoi/openchat/blob/master/ochat/training_deepspeed/train.py
|
||||
|
||||
import math
|
||||
from typing import Literal
|
||||
|
||||
import fire
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
|
||||
|
||||
from llamafactory.data import get_dataset
|
||||
from llamafactory.extras.constants import IGNORE_INDEX
|
||||
from llamafactory.hparams import get_train_args
|
||||
from llamafactory.model import load_tokenizer
|
||||
|
||||
|
||||
BASE_LR = 3e-4 # 1.5e-4 for 30B-70B models
|
||||
BASE_BS = 4_000_000 # from llama paper
|
||||
|
||||
|
||||
def calculate_lr(
    model_name_or_path: str,
    batch_size: int,  # total batch size, namely (batch size * gradient accumulation * world size)
    stage: Literal["pt", "sft"] = "sft",
    dataset: str = "alpaca_en",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 1024,  # i.e. maximum input length during training
    is_mistral: bool = False,  # mistral model uses a smaller learning rate
):
    """Estimate the optimal learning rate for a given effective batch size.

    Tokenizes the dataset once, measures the ratio of non-ignored label
    tokens, and scales LLaMA's base learning rate by
    ``sqrt(effective_batch_tokens / BASE_BS)``.

    Args:
        model_name_or_path: HF hub id or local path of the model/tokenizer.
        batch_size: total batch size across accumulation steps and workers.
        stage: "pt" (language modeling) or "sft" (supervised fine-tuning).
        dataset: dataset name(s) registered in ``dataset_dir``.
        dataset_dir: directory containing the dataset definitions.
        template: prompt template name.
        cutoff_len: maximum tokenized input length.
        is_mistral: apply the Mistral-specific 1/6 learning-rate reduction.
    """
    model_args, data_args, training_args, _, _ = get_train_args(
        dict(
            stage=stage,
            model_name_or_path=model_name_or_path,
            dataset=dataset,
            dataset_dir=dataset_dir,
            template=template,
            cutoff_len=cutoff_len,
            output_dir="dummy_dir",
            overwrite_cache=True,
        )
    )
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)

    # Pick the collator matching the training stage so padding/label masking
    # mirrors what real training would produce.
    if stage == "pt":
        collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif stage == "sft":
        collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
    else:
        raise NotImplementedError

    loader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=collator, pin_memory=True)
    valid_tokens = 0
    total_tokens = 0
    for batch in tqdm(loader):
        labels = batch["labels"]
        valid_tokens += torch.sum(labels != IGNORE_INDEX).item()
        total_tokens += labels.numel()

    valid_ratio = valid_tokens / total_tokens
    # Effective batch length = max tokens per batch scaled by the fraction of
    # tokens that actually contribute to the loss.
    batch_valid_len = (cutoff_len * batch_size) * valid_ratio
    lr = BASE_LR * math.sqrt(batch_valid_len / BASE_BS)  # lr ~ sqrt(batch_size)
    if is_mistral:
        lr = lr / 6.0
    print(
        "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective batch size {:.2f}".format(
            lr, valid_ratio * 100, batch_valid_len
        )
    )


if __name__ == "__main__":
    fire.Fire(calculate_lr)
|
|
@ -0,0 +1,116 @@
|
|||
# coding=utf-8
|
||||
# Calculates the ppl on the dataset of the pre-trained models.
|
||||
# Usage: python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Literal, Optional, Sequence
|
||||
|
||||
import fire
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
|
||||
|
||||
from llamafactory.data import get_dataset
|
||||
from llamafactory.extras.constants import IGNORE_INDEX
|
||||
from llamafactory.hparams import get_train_args
|
||||
from llamafactory.model import load_model, load_tokenizer
|
||||
|
||||
|
||||
@dataclass
class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
    r"""
    Data collator for pairwise (chosen/rejected) preference data.

    Each input feature carries ``prompt_ids``, ``chosen_ids`` (and possibly
    ``rejected_ids``); this collator flattens the chosen side into standard
    seq2seq features and delegates padding to the parent collator.
    """

    # When True, prompt tokens are kept in the labels; otherwise they are
    # masked with IGNORE_INDEX so only the answer contributes to the loss.
    train_on_prompt: bool = False

    def _to_chosen_feature(self, feature: Dict[str, Any]) -> Dict[str, Any]:
        # Concatenate prompt and chosen answer into one sequence.
        prompt_ids = feature["prompt_ids"]
        answer_ids = feature["chosen_ids"]
        input_ids = prompt_ids + answer_ids
        if self.train_on_prompt:
            labels = input_ids
        else:
            labels = [IGNORE_INDEX] * len(prompt_ids) + answer_ids
        return {
            "input_ids": input_ids,
            "attention_mask": [1] * len(input_ids),
            "labels": labels,
        }

    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        r"""
        Pads batched data to the longest sequence in the batch.

        Only the chosen side of each pair is collated here; the parent
        ``DataCollatorForSeq2Seq`` performs the actual padding.
        """
        chosen_features = [self._to_chosen_feature(feature) for feature in features]
        return super().__call__(chosen_features)
|
||||
|
||||
|
||||
def cal_ppl(
    model_name_or_path: str,
    save_name: str,
    batch_size: int = 4,
    stage: Literal["pt", "sft", "rm"] = "sft",
    dataset: str = "alpaca_en",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 1024,
    max_samples: Optional[int] = None,
    train_on_prompt: bool = False,
):
    r"""
    Computes the per-sample perplexity of a pre-trained model on a dataset
    and dumps the list of perplexities to ``save_name`` as JSON.

    Args:
        model_name_or_path: Path or hub id of the model to evaluate.
        save_name: Output JSON file receiving one perplexity per sample.
        batch_size: Evaluation batch size.
        stage: Dataset formatting stage ("pt", "sft" or "rm").
        dataset / dataset_dir / template / cutoff_len / max_samples /
        train_on_prompt: Forwarded to the llamafactory data pipeline.
    """
    # Reuse the training argument parser so dataset/template handling
    # matches what training would see.
    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
        dict(
            stage=stage,
            model_name_or_path=model_name_or_path,
            dataset=dataset,
            dataset_dir=dataset_dir,
            template=template,
            cutoff_len=cutoff_len,
            max_samples=max_samples,
            train_on_prompt=train_on_prompt,
            output_dir="dummy_dir",
            overwrite_cache=True,
        )
    )
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)
    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)

    # Pick the collator matching the requested stage.
    if stage == "pt":
        collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif stage == "sft":
        collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
    elif stage == "rm":
        collator = PairwiseDataCollatorWithPadding(
            tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX, train_on_prompt=train_on_prompt
        )
    else:
        raise NotImplementedError

    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=collator, pin_memory=True)
    # reduction="none" keeps one loss value per token so we can mask and
    # average per sentence ourselves.
    criterion = torch.nn.CrossEntropyLoss(reduction="none")
    ppl_sum = 0
    perplexity_list = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = batch.to(model.device)
            logits: "torch.Tensor" = model(**batch)["logits"]
            # Standard causal-LM shift: token t predicts label t+1.
            shift_logits = logits[..., :-1, :]
            shift_labels: "torch.Tensor" = batch["labels"][..., 1:]
            valid_mask = shift_labels != IGNORE_INDEX
            num_rows = shift_labels.size(0)
            flat_logits = shift_logits.contiguous().view(num_rows * shift_labels.size(1), -1)
            flat_labels = shift_labels.contiguous().view(-1)
            token_nll: "torch.Tensor" = criterion(flat_logits, flat_labels)
            token_nll = token_nll.contiguous().view(num_rows, -1)
            # Mean NLL over non-masked tokens, exponentiated -> perplexity.
            sentence_nll = (token_nll * valid_mask).sum(-1) / valid_mask.sum(-1)
            ppl_sum += sentence_nll.exp().sum().item()
            perplexity_list.extend(sentence_nll.exp().tolist())

    with open(save_name, "w", encoding="utf-8") as f:
        json.dump(perplexity_list, f, indent=2)

    print("Average perplexity is {:.2f}".format(ppl_sum / len(perplexity_list)))
    print("Perplexities have been saved at {}.".format(save_name))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # python-fire exposes cal_ppl's keyword arguments as CLI flags, e.g.:
    #   python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json
    fire.Fire(cal_ppl)
|
|
@ -0,0 +1,51 @@
|
|||
# coding=utf-8
|
||||
# Calculates the distribution of the input lengths in the dataset.
|
||||
# Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en --template default
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import fire
|
||||
from tqdm import tqdm
|
||||
|
||||
from llamafactory.data import get_dataset
|
||||
from llamafactory.hparams import get_train_args
|
||||
from llamafactory.model import load_tokenizer
|
||||
|
||||
|
||||
def length_cdf(
    model_name_or_path: str,
    dataset: str = "alpaca_en",
    dataset_dir: str = "data",
    template: str = "default",
    interval: int = 1000,
):
    r"""
    Prints the cumulative distribution of tokenized sample lengths,
    bucketed into bins of ``interval`` tokens.

    Args:
        model_name_or_path: Model whose tokenizer is used for tokenization.
        dataset / dataset_dir / template: Forwarded to the data pipeline.
        interval: Bucket width (in tokens) for the histogram.
    """
    model_args, data_args, training_args, _, _ = get_train_args(
        dict(
            stage="sft",
            model_name_or_path=model_name_or_path,
            dataset=dataset,
            dataset_dir=dataset_dir,
            template=template,
            cutoff_len=1_000_000,  # effectively disables truncation
            output_dir="dummy_dir",
            overwrite_cache=True,
        )
    )
    tokenizer_module = load_tokenizer(model_args)
    trainset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
    sample_count = len(trainset)

    # Histogram: bucket start -> number of samples with length in [start, start + interval).
    bucket_counts = defaultdict(int)
    for input_ids in tqdm(trainset["input_ids"]):
        bucket_counts[len(input_ids) // interval * interval] += 1

    cumulative_num, cumulative_pct = 0, 0
    for bucket_start, num in sorted(bucket_counts.items()):
        cumulative_num += num
        cumulative_pct += num / sample_count * 100
        print("{:d} ({:.2f}%) samples have length < {}.".format(cumulative_num, cumulative_pct, bucket_start + interval))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # python-fire exposes length_cdf's keyword arguments as CLI flags, e.g.:
    #   python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en
    fire.Fire(length_cdf)
|
|
@ -0,0 +1,114 @@
|
|||
# coding=utf-8
|
||||
# Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models.
|
||||
# Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8
|
||||
# Inspired by: https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py
|
||||
|
||||
import json
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import fire
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
||||
from transformers.modeling_utils import (
|
||||
SAFE_WEIGHTS_INDEX_NAME,
|
||||
SAFE_WEIGHTS_NAME,
|
||||
WEIGHTS_INDEX_NAME,
|
||||
WEIGHTS_NAME,
|
||||
shard_checkpoint,
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import PretrainedConfig, PreTrainedModel
|
||||
|
||||
|
||||
def change_name(name: str, old_index: int, new_index: int) -> str:
    """Rewrite every ``.{old_index}.`` segment in *name* as ``.{new_index}.``.

    Used to renumber layer indices inside state-dict keys such as
    ``model.layers.3.self_attn.q_proj.weight``.
    """
    old_tag = f".{old_index}."
    new_tag = f".{new_index}."
    return name.replace(old_tag, new_tag)
|
||||
|
||||
|
||||
def block_expansion(
    model_name_or_path: str,
    output_dir: str,
    num_expand: int,
    shard_size: str = "2GB",
    save_safetensors: bool = False,
):
    r"""
    Performs LLaMA-Pro style block expansion on a causal LM.

    Interleaves ``num_expand`` freshly initialized decoder layers into the
    model: after every ``num_layers // num_expand`` original layers, a copy of
    the preceding layer is inserted with its ``o_proj``/``down_proj`` weights
    zeroed so the new block initially acts as an identity mapping.

    Args:
        model_name_or_path: Path or hub id of the base model.
        output_dir: Directory receiving the expanded config, tokenizer and weights.
        num_expand: Number of layers to insert; must divide ``num_hidden_layers``.
        shard_size: Maximum shard size passed to ``shard_checkpoint`` (e.g. "2GB").
        save_safetensors: Save in safetensors format instead of PyTorch ``.bin``.

    Raises:
        ValueError: If ``num_hidden_layers`` is not divisible by ``num_expand``.
    """
    config: "PretrainedConfig" = AutoConfig.from_pretrained(model_name_or_path)
    num_layers = getattr(config, "num_hidden_layers")
    # Validate BEFORE loading the full model: the original code only raised
    # after the (potentially very expensive) weight load had completed.
    if num_layers % num_expand != 0:
        raise ValueError("`num_layers` {} should be divisible by `num_expand` {}.".format(num_layers, num_expand))

    setattr(config, "num_hidden_layers", num_layers + num_expand)
    config.save_pretrained(output_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    tokenizer.save_pretrained(output_dir)

    config: "PretrainedConfig" = AutoConfig.from_pretrained(model_name_or_path)  # load the original one
    if save_safetensors:
        setattr(config, "tie_word_embeddings", False)  # safetensors does not allow shared weights

    model: "PreTrainedModel" = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        config=config,
        torch_dtype="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    state_dict = model.state_dict()

    split = num_layers // num_expand
    layer_cnt = 0
    output_state_dict = OrderedDict()
    for i in range(num_layers):
        # Copy the original layer i into the next output slot.
        for key, value in state_dict.items():
            if ".{:d}.".format(i) in key:
                output_state_dict[change_name(key, i, layer_cnt)] = value

        print("Add layer {} copied from layer {}".format(layer_cnt, i))
        layer_cnt += 1
        if (i + 1) % split == 0:
            # Insert one expanded copy of layer i after each group of `split` layers.
            for key, value in state_dict.items():
                if ".{:d}.".format(i) in key:
                    if "down_proj" in key or "o_proj" in key:
                        # Zeroed output projections make the new block an identity
                        # mapping at initialization (the residual path dominates).
                        output_state_dict[change_name(key, i, layer_cnt)] = torch.zeros_like(value)
                    else:
                        output_state_dict[change_name(key, i, layer_cnt)] = torch.clone(value)

            print("Add layer {} expanded from layer {}".format(layer_cnt, i))
            layer_cnt += 1

    # Carry over all non-layer weights (embeddings, final norm, lm_head, ...).
    for key, value in state_dict.items():
        if key not in output_state_dict:
            output_state_dict[key] = value

    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
    shards, index = shard_checkpoint(output_state_dict, max_shard_size=shard_size, weights_name=weights_name)

    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
        if save_safetensors:
            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
        else:
            torch.save(shard, os.path.join(output_dir, shard_file))

    if index is None:
        print("Model weights saved in {}".format(os.path.join(output_dir, weights_name)))
    else:
        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, sort_keys=True)
        print("Model weights saved in {}".format(output_dir))

    print("Fine-tune this model with:")
    print(" --model_name_or_path {} \\".format(output_dir))
    print(" --finetuning_type freeze \\")
    print(" --freeze_trainable_layers {} \\".format(num_expand))
    print(" --use_llama_pro")
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue