#!/bin/bash
# Launch single-node LoRA (delta-tuning) SFT for CPM-9G 8B via torchrun.
# Usage: ./this_script.sh [extra sft_cpm9g_delta.py options...]
# Extra CLI arguments are appended verbatim to the training options.
set -euo pipefail

# Rendezvous endpoint for torchrun's c10d backend (single node).
export MASTER_ADDR="localhost"
export MASTER_PORT=12390

CPM_PATH="/workspace/repo/CPM-9G-8B/9G-Train"
NO=0                 # experiment run index
GPU_NUM=multi        # tag used in output paths (multi-GPU run)
MAX_STEP=500
EXP_PATH="/workspace/repo/CPM-9G-8B/results/lora/${GPU_NUM}/${MAX_STEP}/${NO}/models"
MODEL_NAME="9g-sft"
TB_PATH="/workspace/repo/CPM-9G-8B/results/lora/${GPU_NUM}/${MAX_STEP}/${NO}/logs/"

# Build training options as an array so values with spaces survive intact
# and "$@" passthrough keeps each caller argument as a separate word.
OPTS=()
OPTS+=(--vocab /v2/sft_8b_v2/vocab.txt)
OPTS+=(--model-config /v2/sft_8b_v2/config.json)
OPTS+=(--train-iters 500)
OPTS+=(--inspect-iters 500)
OPTS+=(--warmup-iters 20)
OPTS+=(--lr-decay-style cosine)
OPTS+=(--weight-decay 0.01)
OPTS+=(--clip-grad 1.0)
OPTS+=(--loss-scale 1048576)
OPTS+=(--max-loss-scale 33554432)
OPTS+=(--min-loss-scale 1)
OPTS+=(--loss-scale-steps 32)
OPTS+=(--offload)
OPTS+=(--batch-size 2)
OPTS+=(--max-length 4096)
OPTS+=(--lr 3e-4)
OPTS+=(--start-step 0)
OPTS+=(--epoch 2)
OPTS+=(--load /v2/sft_8b_v2/cpm_live_8b-1500-float16.pt)
# TODO: these /data paths must be changed to paths under /home on the QiYuan machines.
OPTS+=(--dataset /workspace/repo/CPM-9G-8B/dataset_bin)
OPTS+=(--save "${EXP_PATH}/checkpoints")
OPTS+=(--save-name "${MODEL_NAME}")
OPTS+=(--tensorboard "${TB_PATH}")
OPTS+=(--delta-tuning)
OPTS+=(--delta-type lora)
OPTS+=(--lora-r 64)          # commonly used LoRA rank
OPTS+=(--lora-dropout 0.05)
OPTS+=(--lora-alpha 64)      # commonly used LoRA alpha
OPTS+=(--lora-layer project_q project_v project_k w_0 w_1 w_out)
OPTS+=("$@")                 # forward any extra caller-supplied options

# Assemble the full command as an array; executing "${CMD[@]}" avoids the
# word-splitting/globbing bugs of running an unquoted $CMD string.
CMD=(
  torchrun
  --nnodes=1
  --nproc_per_node=7
  --rdzv_id=1
  --rdzv_backend=c10d
  --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}"
  "${CPM_PATH}/apps/cpm9g/sft_cpm9g_delta.py"
  "${OPTS[@]}"
)

# Log the exact (shell-quoted) command before running it.
printf '%q ' "${CMD[@]}"
printf '\n'
"${CMD[@]}"