diff --git a/quick_start_clean/merge_lora.py b/quick_start_clean/merge_lora.py
new file mode 100644
index 0000000..f8eab09
--- /dev/null
+++ b/quick_start_clean/merge_lora.py
@@ -0,0 +1,74 @@
+import argparse
+import os
+
+import torch
+
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base_path", type=str, help="path to the base model checkpoint", required=True)
+    parser.add_argument("--delta_path", type=str, help="path to the LoRA delta checkpoint", required=True)
+    parser.add_argument("--merge_path", type=str, help="output path for the merged checkpoint", required=True)
+    return parser.parse_args()
+
+
+def merge_lora_models(args):
+    # --lora-alpha used at training time; the effective LoRA scaling is alpha / r.
+    scale = 64
+
+    delta = torch.load(args.delta_path)
+
+    # Pair up lora_A / lora_B tensors by layer and pre-compute B @ A for each layer.
+    merged_delta = {}
+    alpha = None
+    for key, value in delta.items():
+        layer_list = key.split('.')
+        layer = ".".join(layer_list[:-1])
+        if layer in merged_delta:
+            # Second tensor of the pair: multiply so the product has the base weight's shape.
+            other = merged_delta[layer].cuda()
+            value = value.cuda()
+            if layer_list[-1] == "lora_B":
+                other = torch.mm(value, other).cpu()
+                alpha = scale / value.shape[1]  # lora_B has shape (out_features, r)
+            else:
+                other = torch.mm(other, value).cpu()
+                alpha = scale / value.shape[0]  # lora_A has shape (r, in_features)
+            merged_delta[layer] = other
+        else:
+            merged_delta[layer] = value
+    print(f"lora_alpha: {scale} | scaling: {alpha}")
+
+    torch.cuda.empty_cache()
+    base_model = torch.load(args.base_path, map_location=torch.device('cpu'))
+
+    # Add the scaled low-rank update to each matching base weight: W' = W + (alpha / r) * B @ A.
+    for key, value in base_model.items():
+        layer_list = key.split('.')
+        layer = ".".join(layer_list[:-1]) + ".lora"
+        if layer in merged_delta:
+            print(layer)
+            value = value.cuda()
+            other = merged_delta[layer].cuda()
+            value = torch.add(value, alpha * other.half()).detach().cpu()
+        base_model[key] = value
+
+    torch.save(base_model, args.merge_path)
+
+
+if __name__ == '__main__':
+    args = get_args()
+    merge_lora_models(args)
diff --git a/quick_start_clean/readmes/README_ALL.md b/quick_start_clean/readmes/README_ALL.md
index a30f0d6..f45b7f7 100644
--- a/quick_start_clean/readmes/README_ALL.md
+++ b/quick_start_clean/readmes/README_ALL.md
@@ -221,6 +221,9 @@
 $CMD
 ```
 
+### LoRA Training
+[LoRA training](https://www.osredm.com/jiuyuan/CPM-9G-8B/tree/master/quick_start_clean/readmes/README_LORA.md)
+
 ## 模型推理
 ```python
 import os
@@ -263,6 +266,9 @@ if __name__ == "__main__":
     main()
 ```
 
+## Distributed Multi-Node Training
+[Distributed multi-node training](https://www.osredm.com/jiuyuan/CPM-9G-8B/tree/master/quick_start_clean/readmes/README_DISTRIBUTED.md)
+
 ## FAQs
 
 常见问题汇总,持续补充ing
diff --git a/quick_start_clean/readmes/README_DISTRIBUTED.md b/quick_start_clean/readmes/README_DISTRIBUTED.md
new file mode 100644
index 0000000..e5eef76
--- /dev/null
+++ b/quick_start_clean/readmes/README_DISTRIBUTED.md
@@ -0,0 +1,119 @@
+# Distributed Multi-Node Training
+
+- First make sure the machines can communicate with each other.
+- The training environment, code, and data must be identical on every machine.
+
+## Simple Mode
+This approach is only practical for a small number of machines, for example when debugging on two nodes. Taking sft_cpm9g_8b.sh as an example:
+```shell
+# Specify the IP (or hostname) of the master node here
+export MASTER_ADDR=g3002
+
+# ... various parameter settings omitted ...
+
+# --nnodes: how many machines to use; after submission the master node waits until this many nodes have joined, or until it times out
+# --nproc_per_node: number of GPUs per machine
+CMD="torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} ${CPM_PATH}/apps/cpm9g/sft_cpm9g.py ${OPTS}"
+```
+Then run `bash sft_cpm9g_8b.sh` on both machines; this completes the simplest form of multi-node training.
+This approach is not recommended once more machines are involved.
+
+## Submitting Multi-Node Jobs on a Slurm Cluster
+
+The compute platform uses Slurm for scheduling. Commonly used Slurm commands include:
+
+| Command | Purpose |
+| --- | --- |
+| sinfo | show the status of the cluster partitions |
+| squeue | show the job queue |
+| srun, salloc | run jobs interactively |
+| sbatch | submit a batch job |
+| scancel | cancel a job |
+| scontrol | inspect and modify job parameters |
+| sacct | show completed jobs |
+
+### Single-Node Jobs
+A reference script is shown below. The leading "#SBATCH" lines are Slurm configuration parameters:
+
+- --partition: the queue (partition) to use
+- --nodes: number of nodes (machines) to use
+- --ntasks-per-node: number of tasks per node; keep it equal to the number of GPUs per node
+- --gres=gpu:8: number of GPUs allocated per node
+- --cpus-per-task: number of CPUs per task (not recommended to change); the total number of CPUs used on the node is tasks × CPUs per task, i.e. 8 × 8 = 64 in this example
+
+#### Example
+
+train.sh:
+```
+#!/bin/bash
+#SBATCH --partition=gpu1
+#SBATCH --nodelist=g1001
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gres=gpu:8
+#SBATCH --cpus-per-task=8
+
+python main.py
+```
+
+Submit the job:
+```
+sbatch train.sh
+```
+
+### Multi-Node Jobs
+Multi-node training with torchrun has been tested. It requires the two environment variables "MASTER_ADDR" and "MASTER_PORT": first submit the master-node job to obtain "MASTER_ADDR", then submit the worker-node jobs. The steps for a 4-node job are shown below; a minimal train.py skeleton is sketched after the two steps.
+
+Note: the #SBATCH --nodes parameter is set to 1 because Slurm's multi-node communication conflicts with bmtrain's environment variables and srun is unstable. The recommended approach is therefore to submit several single-node Slurm jobs and let torchrun handle the multi-node communication.
+
+#### Step 1: Start the Master Node
+train_master.sh:
+```
+#!/bin/bash
+#SBATCH --partition=gpu1
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gres=gpu:8
+#SBATCH --cpus-per-task=8
+MASTER_ADDR=`hostname`
+MASTER_PORT=12345
+echo $MASTER_ADDR
+torchrun --nnodes=4 --nproc_per_node=8 --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} train.py
+```
+
+Submit the master node:
+```
+sbatch train_master.sh
+```
+
+Look up the master node's hostname in the output log (slurm-xxx.log); in this example the master node is "g1001".
+
+#### Step 2: Start the Worker Nodes
+train_slave.sh:
+```
+#!/bin/bash
+#SBATCH --partition=gpu1
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gres=gpu:8
+#SBATCH --cpus-per-task=8
+MASTER_ADDR=g1001
+MASTER_PORT=12345
+echo $MASTER_ADDR
+torchrun --nnodes=4 --nproc_per_node=8 --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} train.py
+```
+
+Submit the worker nodes; since this example is a 4-node job, submit 3 more worker jobs:
+```
+for i in {1..3}; do
+  sbatch train_slave.sh
+done
+```
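+
+For reference, a minimal train.py skeleton that the torchrun commands above could launch is sketched below. It is only illustrative (the actual CPM-9G entry point is apps/cpm9g/sft_cpm9g.py); the model and data are placeholders:
+
+```python
+import os
+
+import torch
+import torch.distributed as dist
+
+
+def main():
+    # torchrun sets RANK / WORLD_SIZE / LOCAL_RANK plus the rendezvous variables for every process.
+    local_rank = int(os.environ["LOCAL_RANK"])
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group(backend="nccl")
+
+    # Placeholder model wrapped in DistributedDataParallel.
+    model = torch.nn.Linear(1024, 1024).cuda()
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
+    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+
+    for step in range(10):
+        x = torch.randn(8, 1024, device="cuda")
+        loss = model(x).pow(2).mean()
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+        if dist.get_rank() == 0:
+            print(f"step {step} | loss {loss.item():.4f}")
+
+    dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
+```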
+
+## TODOs
+1. Add distributed multi-node training support for Docker and K8s clusters.
diff --git a/quick_start_clean/readmes/README_LORA.md b/quick_start_clean/readmes/README_LORA.md
new file mode 100644
index 0000000..0465c13
--- /dev/null
+++ b/quick_start_clean/readmes/README_LORA.md
@@ -0,0 +1,82 @@
+# LoRA Training
+
+## LoRA Training Script
+
+```shell
+#!/bin/bash
+#SBATCH --partition=gpu3
+#SBATCH --nodes=1
+#SBATCH --nodelist=g3005
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=512GB
+
+export MASTER_ADDR="localhost"
+export MASTER_PORT=12347
+
+CPM_PATH="/home/wangxvjia/CPM-onlyllama"
+EXP_PATH=/home/wangxvjia/9g_models/cpm_fin_new_1e4
+MODEL_NAME="9g-finance-sft"
+
+OPTS=""
+OPTS+=" --vocab /home/wangxvjia/9g_models/vocab.txt"
+OPTS+=" --model-config /home/wangxvjia/9g_models/config.json"
+
+OPTS+=" --train-iters 695"
+OPTS+=" --inspect-iters 2000"
+OPTS+=" --warmup-iters 20"
+
+OPTS+=" --lr-decay-style cosine"
+OPTS+=" --weight-decay 0.01"
+OPTS+=" --clip-grad 1.0"
+OPTS+=" --loss-scale 1048576"
+OPTS+=" --max-loss-scale 33554432"
+OPTS+=" --min-loss-scale 1"
+OPTS+=" --loss-scale-steps 32"
+
+OPTS+=" --offload"
+OPTS+=" --batch-size 2"
+OPTS+=" --max-length 4096"
+OPTS+=" --lr 3e-4"
+OPTS+=" --start-step 0"
+OPTS+=" --epoch 4"
+OPTS+=" --load /data/groups/QY_LLM_Other/anrongqiao/UltraEval/caterpillar_8b_checkpoint-22000-float16.pt"
+OPTS+=" --dataset /home/wangxvjia/molora/data_process/fin_9g/train_data_30000"
+# TODO: on the Qiyuan machines, change these /data paths to the corresponding /home paths
+OPTS+=" --save ${EXP_PATH}/checkpoints"
+OPTS+=" --save-name ${MODEL_NAME}"
+
+OPTS+=" --delta-tuning"
+OPTS+=" --delta-type lora"
+OPTS+=" --lora-r 64"       # commonly used LoRA rank
+OPTS+=" --lora-dropout 0.05"
+OPTS+=" --lora-alpha 64"   # commonly used LoRA alpha
+OPTS+=" --lora-layer project_q project_v project_k w_0 w_1 w_out"
+OPTS+=" --save-origin-model"
+
+OPTS+=" $@"
+
+
+CMD="torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} ${CPM_PATH}/apps/cpm9g/sft_cpm9g.py ${OPTS}"
+
+echo "${CMD}"
+$CMD
+```
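+
+The `--lora-r`, `--lora-alpha`, `--lora-dropout` and `--lora-layer` options control the low-rank adapters injected into the listed projection matrices (`project_q`, `project_v`, `project_k`, `w_0`, `w_1`, `w_out`). Conceptually, each adapted linear layer computes `W x + (alpha / r) * B (A x)`, with only `A` and `B` trained. The sketch below only illustrates this parametrization; it is not the CPM-9G delta-tuning implementation:
+
+```python
+import torch
+import torch.nn as nn
+
+
+class LoRALinear(nn.Module):
+    """A frozen linear layer plus a trainable low-rank update: W x + (alpha / r) * B A x."""
+
+    def __init__(self, base: nn.Linear, r: int = 64, alpha: int = 64, dropout: float = 0.05):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)  # the original weights stay frozen
+        self.lora_A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+        self.lora_B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no change at step 0
+        self.scaling = alpha / r
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        return self.base(x) + (self.dropout(x) @ self.lora_A.T @ self.lora_B.T) * self.scaling
+
+
+layer = LoRALinear(nn.Linear(4096, 4096), r=64, alpha=64)
+trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+print(f"trainable parameters: {trainable}")  # only lora_A and lora_B
+```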
+
+## Merging the Model
+A trained LoRA delta model can generally be used in one of two ways:
+- run inference directly with inference code that has built-in LoRA support, or
+- merge the LoRA delta parameters into the original model to obtain a new checkpoint; the total number of model parameters does not increase.
+
+To merge the two checkpoints, use the merge script:
+
+`python merge_lora.py --base_path cpm9g-8b-sft.pt --delta_path cpm9g-lora.pt --merge_path cpm9g-8b-sft_with_lora.pt`
+
+## LoRA Inference
+
+The merged LoRA model can be used directly with the base-model inference code;
+see the [quick start](https://www.osredm.com/jiuyuan/CPM-9G-8B/tree/master/quick_start_clean/readmes/README_ALL.md).
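+
+Before running inference, the merged checkpoint can be sanity-checked against the base checkpoint. A minimal sketch, assuming both checkpoints are plain state dicts saved with torch.save (as merge_lora.py expects) and using the file names from the merge command above:
+
+```python
+import torch
+
+base = torch.load("cpm9g-8b-sft.pt", map_location="cpu")
+merged = torch.load("cpm9g-8b-sft_with_lora.pt", map_location="cpu")
+
+# The merged model must keep exactly the same keys and shapes as the base model.
+assert base.keys() == merged.keys()
+assert all(base[k].shape == merged[k].shape for k in base)
+
+# Only the weights targeted by --lora-layer should differ after the merge.
+changed = [k for k in base if not torch.equal(base[k], merged[k])]
+print(f"{len(changed)} tensors updated by the LoRA merge")
+```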