CPM-9G-8B/9g_lora_1_step500.sh

57 lines
1.6 KiB
Bash
Raw Normal View History

2024-08-25 11:59:14 +08:00
#!/bin/bash
#
# Run configuration for a LoRA fine-tuning job launched with torchrun.
# Everything in this section is a plain assignment; the values are consumed
# further down when the trainer command line is assembled.

# Exported for the launched processes; also used below to form the torchrun
# rendezvous endpoint (${MASTER_ADDR}:${MASTER_PORT}).
export MASTER_ADDR="localhost"
export MASTER_PORT=12370

# Checkout that contains apps/cpm9g/sft_cpm9g_delta.py.
CPM_PATH="/workspace/repo/CPM-9G-8B/9G-Train"

# Run identifiers. In this script they are interpolated into the result
# directory layout; MAX_STEP is additionally passed as --train-iters.
NO=1
MAX_STEP=500
GPU_NUM=multi

# Shared root of this run's outputs: results/lora/<GPU_NUM>/<MAX_STEP>/<NO>.
RUN_DIR="/workspace/repo/CPM-9G-8B/results/lora/${GPU_NUM}/${MAX_STEP}/${NO}"

# Model output directory and TensorBoard log directory. The trailing slashes
# are part of the original values and are preserved so derived paths do not
# change.
EXP_PATH="${RUN_DIR}/models/"
MODEL_NAME="9g-sft"
TB_PATH="${RUN_DIR}/logs/"
# Arguments for sft_cpm9g_delta.py.
#
# They are collected in an array rather than a space-joined string so that
# each argument reaches the trainer as exactly one word: nothing is re-split
# on whitespace or glob-expanded when the command is finally executed.
OPTS=(
  # Model definition.
  --vocab /v2/sft_8b_v2/vocab.txt
  --model-config /v2/sft_8b_v2/config.json

  # Iteration counts and learning-rate schedule.
  --train-iters "${MAX_STEP}"
  --inspect-iters 500
  --warmup-iters 20
  --lr-decay-style cosine
  --weight-decay 0.01
  --clip-grad 1.0

  # Loss scaling.
  --loss-scale 1048576
  --max-loss-scale 33554432
  --min-loss-scale 1
  --loss-scale-steps 32

  --offload
  --batch-size 2
  --max-length 4096
  --lr 3e-4
  --start-step 0
  --epoch 4

  # Initial weights and training data.
  --load /v2/sft_8b_v2/cpm_live_8b-1500-float16.pt
  --dataset /workspace/repo/CPM-9G-8B/dataset_bin

  # Outputs.
  # TODO: these /data paths need to be changed to paths under /home on the
  # Qiyuan machines.
  --save "${EXP_PATH}/checkpoints"
  --save-name "${MODEL_NAME}"
  --tensorboard "${TB_PATH}"

  # LoRA delta tuning.
  --delta-tuning
  --delta-type lora
  --lora-r 64          # commonly used LoRA rank setting
  --lora-dropout 0.05
  --lora-alpha 64      # commonly used LoRA alpha setting
  # One flag followed by several values, each passed as its own word.
  --lora-layer project_q project_v project_k w_0 w_1 w_out
  --save-origin-model
)

# Extra arguments given to this script are appended last, one array element
# per argument, so values containing spaces or glob characters stay intact.
OPTS+=("$@")

# Single-node torchrun launch with 6 worker processes; the rendezvous
# endpoint is built from the exported MASTER_ADDR / MASTER_PORT.
CMD=(
  torchrun
  --nnodes=1
  --nproc_per_node=6
  --rdzv_id=1
  --rdzv_backend=c10d
  "--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT}"
  "${CPM_PATH}/apps/cpm9g/sft_cpm9g_delta.py"
  "${OPTS[@]}"
)

# Log the command (space-joined for readability), then run it. It is the
# last command, so its exit status becomes the exit status of the script.
printf '%s\n' "${CMD[*]}"
"${CMD[@]}"