#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=8
#SBATCH --mem=512GB
#
# Slurm batch script that launches pretrain_cpm9g.py through torchrun.
# Forked from jiuyuan/CPM-9G-8B.
#
# The shebang must stay on the first line and the #SBATCH directives must
# stay ahead of the first executable command.
#
# Environment read by this script:
#   USERNAME    used to build the /home/<user>/... output paths
#   CHECKPOINT  value passed to the trainer's --load option
# Any command-line arguments are appended to the trainer options.

# Rendezvous endpoint handed to torchrun further down.
# This example uses 8 GPUs; real pretraining may need 32 GPUs.
#
# Assign first and export afterwards so that a failing `hostname` is not
# hidden behind the (always successful) exit status of `export`.
MASTER_ADDR="$(hostname)" || {
  echo "error: could not determine the hostname for MASTER_ADDR" >&2
  exit 1
}
export MASTER_ADDR
export MASTER_PORT=12345

# bash does not set USERNAME on its own. Fall back to USER, then to
# `id -un`, so the paths below never collapse to "/home//...".
USERNAME="${USERNAME:-${USER:-$(id -un 2>/dev/null)}}"
if [[ -z "${USERNAME}" ]]; then
  echo "error: USERNAME is not set and could not be derived" >&2
  exit 1
fi

# Create the log directories up front. A failure is reported on stderr but
# is not fatal, matching the original best-effort behaviour.
# NOTE(review): the --tensorboard option later in this script points at
# /home/${USERNAME}/tensorboard/cpm9g/, not logs/tensorboard/cpm9g/ —
# confirm which location is intended.
for log_dir in \
  "/home/${USERNAME}/logs/debug" \
  "/home/${USERNAME}/logs/tensorboard/cpm9g/"; do
  mkdir -p -- "${log_dir}" || echo "warning: cannot create ${log_dir}" >&2
done

# The relative paths used below (config/..., pretrain_cpm9g.py) resolve from
# this directory, so stop here instead of launching from the wrong place.
cd apps/cpm9g || {
  echo "error: cannot cd to apps/cpm9g from $(pwd)" >&2
  exit 1
}

# Directory holding config.json and vocab.txt, relative to the current
# working directory.
CONFIG_NAME="config/11b"

# --------------- Run parameters ---------------
# OPTS is a single space-separated string; every entry starts with a space
# so later sections can keep appending to it.
OPTS=""
OPTS+=" --model-config ${CONFIG_NAME}/config.json"
OPTS+=" --vocab ${CONFIG_NAME}/vocab.txt"
OPTS+=" --batch-size 4"
OPTS+=" --train-iters 400000"
OPTS+=" --save-iters 250"
OPTS+=" --save-name cpm9g_checkpoint"
OPTS+=" --max-length 4096"
OPTS+=" --lr 1.5e-5"
OPTS+=" --inspect-iters 100"
OPTS+=" --warmup-iters 2000"
OPTS+=" --lr-decay-style noam"
OPTS+=" --weight-decay 0.1"
OPTS+=" --clip-grad 1.0"
OPTS+=" --loss-scale 1048576"
OPTS+=" --loss-scale-steps 32"
OPTS+=" --offload"
OPTS+=" --flash cuda"
# OPTS+=" --load-grad"

# --------------- Output paths ---------------
# bash does not set USERNAME on its own. Fall back to USER, then to
# `id -un`, so the paths below never collapse to "/home//...".
USERNAME="${USERNAME:-${USER:-$(id -un 2>/dev/null)}}"
if [[ -z "${USERNAME}" ]]; then
  echo "error: USERNAME is not set and could not be derived" >&2
  exit 1
fi

## checkpoint
OPTS+=" --save /home/${USERNAME}/checkpoints/cpm9g/"
OPTS+=" --save-model /home/${USERNAME}/models/cpm9g/"

## logs: /local/logs is equivalent to /data/logs (symlink)
OPTS+=" --log-dir /home/${USERNAME}/logs/train/"
# Each run gets its own tensorboard directory, named by a
# YYYYmmddHHMMSS timestamp taken when this line runs.
OPTS+=" --tensorboard /home/${USERNAME}/tensorboard/cpm9g/$(date +"%Y%m%d%H%M%S")"

# --------------- Input paths ---------------
OPTS+=" --dataset config/datasets.json"

# CHECKPOINT is not defined in this script; it has to come from the
# environment. With an empty value the option string would contain a bare
# "--load" immediately followed by "--start-step", so --load is only added
# when there is a value to go with it.
# NOTE(review): assumes pretrain_cpm9g.py can run without --load (or that
# the caller supplies it through the pass-through arguments) — confirm.
if [[ -n "${CHECKPOINT:-}" ]]; then
  OPTS+=" --load ${CHECKPOINT}"
else
  echo "warning: CHECKPOINT is not set; omitting --load" >&2
fi
OPTS+=" --start-step 1"

# --------------- Pass-through arguments ---------------
# Everything given on the command line is appended to OPTS. OPTS is a flat
# string, so the arguments are joined with spaces; "$*" says so explicitly
# and, with the default IFS, yields the same string "$@" did here.
OPTS+=" $*"

# --------------- Final command ---------------
CMD="torchrun --nnodes=1 --nproc_per_node=8 --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} pretrain_cpm9g.py ${OPTS}"
echo "${CMD}"

# CMD is one flat string, so it has to be word-split to be executed.
# Pathname expansion is switched off while splitting so that characters
# such as '*', '?' or '[' inside an argument are passed through literally
# instead of being matched against files in the current directory.
set -f
# shellcheck disable=SC2206 # word splitting is intended here
cmd_words=(${CMD})
set +f

# Last command of the script: its exit status becomes the script's status.
"${cmd_words[@]}"