#!/bin/bash
# Forked from jiuyuan/CPM-9G-8B.
#
# Default settings for the training launcher. Every entry of `args` can be
# overridden on the command line with a "--name value" pair.

#export OMP_NUM_THREADS=16

# Associative arrays need bash 4 or newer; fail early with a clear message.
if (( BASH_VERSINFO[0] < 4 )); then
  echo "error: bash >= 4 is required (associative arrays)" >&2
  exit 1
fi

# Associative array that stores argument names and their values.
declare -A args=(
  ["model_unique"]="8b_0702"
  ["resume_ckpt"]=""
  ["config"]="8b"
  ["flash"]="cuda"
  ["batch_size"]="1"
  ["max_length"]="4096"
  ["save_iters"]="500"
  ["train_iters"]="10"
  ["dataset_config"]="fm9g_sft"
  ["local"]="False"
  ["dataloader"]="indexed"
  ["save"]="True"
  ["dataloader_num_threads"]="1"
  ["dataloader_prefetch"]="2"
  ["dataloader_prefetch_factor"]="32"
  ["dataloader_num_workers"]="2"
  ["lr"]="1e-5"
  ["warmup_iters"]="20"
  ["drop_iters"]="0.1"
  ["tokenizer_path"]="./tokenizer/tokenizer.model" # /user/tc_agi/klara/baichuan2/baichuan2.tokenizer.model
  ["load_grad"]="False"
  ["grad_ckpt_num"]="160"
  ["exp_group"]=""
  ["ignore_cuda_oom"]="1"
  ["tensorboard_all_tasks"]="0"
  ["stop_when_end"]="0"
  ["only_run_dataloader"]="0"
  ["eps"]="1e-6"
  ["inspect_iters"]="100"
  ["strict_state_dict"]="1"
  ["only_load_model"]="1"
  ["lr_scheduler"]="cosine"
  ["resume_no_optimze"]="0" # key spelling is part of the interface; do not "fix" it
  ["tp_size"]="1"
  ["parallel_load_datastate"]="16"
  ["async_save"]="False"
  ["load_dataloader_ckpt"]="0"
  ["drop_begin"]="-1"
  ["drop_rate"]="0.5"
  ["use_checkpoint"]="1"
)

#######################################
# Copy "--name value" pairs from the given words into the global `args` map.
# A flag that is the last word, or that is directly followed by another
# "--flag", is stored with an empty value (the following flag is NOT consumed
# as its value). Words that are neither a flag nor a flag's value are ignored.
# Globals:   args (written)
# Arguments: the command-line words to scan
#######################################
parse_cli_args() {
  local name
  while (( $# > 0 )); do
    # Check if the argument starts with "--"
    if [[ "$1" == --* ]]; then
      name="${1:2}" # Remove leading "--"
      if [[ -z "$name" ]]; then
        # A bare "--" has no name; an empty subscript is not a valid key.
        shift
        continue
      fi
      if (( $# >= 2 )) && [[ "$2" != --* ]]; then
        args["$name"]="$2"
        shift # Skip the next argument (its value)
      else
        args["$name"]="" # Set empty value if no value provided
      fi
    fi
    shift
  done
}

#######################################
# Use Python to read the "pretrain" object of train_configs/<config>.json and
# copy each of its key/value pairs into the global `args` map.
# The file path is handed to Python as an argument instead of being spliced
# into the Python source. If the file cannot be read, a warning is written to
# stderr and `args` is left untouched.
# Globals:   args (read: "config"; written: every key found in the file)
# Returns:   0 always
#######################################
load_pretrain_config() {
  local config_file="train_configs/${args[config]}.json"
  local py="python"
  local listing key value

  # Fall back to python3 on systems that do not ship a "python" executable.
  command -v python >/dev/null 2>&1 || py="python3"

  if ! listing=$("$py" -c '
import json, sys
with open(sys.argv[1]) as fh:
    section = json.load(fh)["pretrain"]
print("\n".join("{} {}".format(k, v) for k, v in section.items()))
' "$config_file"); then
    printf 'warning: could not read "pretrain" settings from %s; keeping current values\n' \
      "$config_file" >&2
    return 0
  fi

  # Each line is "<key> <value>"; the value is everything after the first blank.
  while read -r key value; do
    [[ -n "$key" ]] || continue # empty output / blank line
    args["$key"]="$value"
  done <<< "$listing"
}

# First pass over the command line: needed so that --config selects the file.
parse_cli_args "$@"

# Overlay the values from the JSON config file.
load_pretrain_config

# Update once more from the command line so that it wins over the JSON file.
parse_cli_args "$@"

# Print the values of the arguments, one "name: value" line per entry.
# Associative arrays are unordered, so the line order is not stable.
echo "----------- CMD args ----------"
for key in "${!args[@]}"; do
  printf '%s: %s\n' "$key" "${args[$key]}"
done
echo "--------- END CMD args --------"

# When --flash is "triton": copy the CUDA 11.6 compat libcuda into the system
# library directory and hard-link it there as libcuda.so. Both steps use sudo.
if [[ "${args[flash]}" == "triton" ]]; then
  compat_lib="/usr/local/cuda-11.6/compat/libcuda.so.510.108.03"
  system_lib="/usr/lib/x86_64-linux-gnu/libcuda.so.510.108.03"
  sudo cp "$compat_lib" "$system_lib"
  sudo ln "$system_lib" /usr/lib/x86_64-linux-gnu/libcuda.so
  echo "triton flash"
fi

# Count the GPUs on this machine: nvidia-smi prints one name per line.
# The arithmetic wrapper strips any padding that `wc -l` may emit.
# NOTE(review): the launch section further down assigns GPUS_PER_NODE=8, so
# this count currently only feeds the message below -- confirm which is wanted.
GPUS_PER_NODE=$(( $(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) ))
# GPUS_PER_NODE=1
echo "Using ${GPUS_PER_NODE} GPU each machine"

# MODEL_UNIQUE names the location this run writes to.
# If --model_unique is empty, fall back to ${JEEVES_JOB_ID}:
#   JOBID+CreateTime, the unique identifier of this run. On the platform the
#   checkpoint can be fetched from
#   /projects/${PROJECTID}-${PROJECTNAME}/checkpoints/${MODEL_UNIQUE}
#   and the tensorboard data from
#   /projects/${PROJECTID}-${PROJECTNAME}/tensorboard/${MODEL_UNIQUE}
# Otherwise the write location was given explicitly and is used as-is.
MODEL_UNIQUE="${args[model_unique]:-${JEEVES_JOB_ID}}"
if [[ -z "${MODEL_UNIQUE}" ]]; then
  echo "warning: model_unique is empty and JEEVES_JOB_ID is not set" >&2
fi
echo "model_unique: ${MODEL_UNIQUE}"

# --------------- Run parameters ---------------
# OPTS is a single whitespace-separated string that is appended to the
# training command line. Option spellings (mixed "-" and "_") are kept as-is.
# NOTE(review): OPTS is only ever appended to in this script, so a value
# inherited from the environment would be kept as a prefix -- confirm intent.

OPTS+=" --model-config model_configs/${args[config]}.json" # [CHANGE]
OPTS+=" --batch-size ${args[batch_size]}"
OPTS+=" --train-iters ${args[train_iters]}"
OPTS+=" --save-iters ${args[save_iters]}"
OPTS+=" --save-name fm9g_live_checkpoint"
OPTS+=" --max-length ${args[max_length]}"
OPTS+=" --lr ${args[lr]}"
OPTS+=" --inspect-iters ${args[inspect_iters]}"
OPTS+=" --warmup-iters ${args[warmup_iters]}"
OPTS+=" --drop-iters ${args[drop_iters]}"
OPTS+=" --lr_scheduler ${args[lr_scheduler]}"
OPTS+=" --offload"
OPTS+=" --vocab ./tokenizer/vocab.txt"
OPTS+=" --flash ${args[flash]}"
OPTS+=" --tensorboard_all_tasks ${args[tensorboard_all_tasks]}"
OPTS+=" --ignore_cuda_oom ${args[ignore_cuda_oom]}"
OPTS+=" --stop_when_end ${args[stop_when_end]}"
OPTS+=" --only_run_dataloader ${args[only_run_dataloader]}"
OPTS+=" --eps ${args[eps]}"
OPTS+=" --strict_state_dict ${args[strict_state_dict]}"
OPTS+=" --only_load_model ${args[only_load_model]}"
OPTS+=" --resume_no_optimze ${args[resume_no_optimze]}"
OPTS+=" --tokenizer_path ${args[tokenizer_path]}"
OPTS+=" --weight-decay 0.1"
OPTS+=" --tp-size ${args[tp_size]}"
OPTS+=" --parallel_load_datastate ${args[parallel_load_datastate]}"
OPTS+=" --load_dataloader_ckpt ${args[load_dataloader_ckpt]}"
OPTS+=" --drop_begin ${args[drop_begin]}"
OPTS+=" --drop_rate ${args[drop_rate]}"
OPTS+=" --use_checkpoint ${args[use_checkpoint]}"

# Options that are only added when the matching switch is turned on.

# --load_grad True: also pass the gradient-checkpoint count.
if [[ "${args[load_grad]}" == "True" ]]; then
  OPTS+=" --load-grad"
  OPTS+=" --grad-ckpt-num ${args[grad_ckpt_num]}"
fi

# --async_save True
if [[ "${args[async_save]}" == "True" ]]; then
  OPTS+=" --async_save"
fi

# The dataloader tuning knobs are only forwarded for the "indexed" dataloader.
if [[ "${args[dataloader]}" == "indexed" ]]; then
  OPTS+=" --dataloader_num_threads ${args[dataloader_num_threads]}"
  OPTS+=" --dataloader_prefetch ${args[dataloader_prefetch]}"
  OPTS+=" --dataloader_num_workers ${args[dataloader_num_workers]}"
  OPTS+=" --dataloader_prefetch_factor ${args[dataloader_prefetch_factor]}"
fi

# --------------- Output file paths ---------------
## checkpoint
if [[ "${args[save]}" == "True" ]]; then
  OPTS+=" --save ./data/checkpoints/${MODEL_UNIQUE}/"
  OPTS+=" --save-model ./not_exist/${MODEL_UNIQUE}/"
else
  echo "won't save model"
fi

## logs: /local/logs is equivalent to ./datalogs (symlink)
mkdir -p -- "./data/checkpoints/logs/${MODEL_UNIQUE}"
OPTS+=" --log-dir ./data/checkpoints/logs/${MODEL_UNIQUE}"
OPTS+=" --tensorboard ./data/tensorboard/${args[exp_group]}${MODEL_UNIQUE}/"

# The dataset config is addressed by absolute path under the current directory.
# Local and platform runs use the same path; platform runs additionally print
# the platform config location.
current_dir=$(pwd)
OPTS+=" --dataset ${current_dir}/dataset_configs/${args[dataset_config]}.json"
if [[ "${args[local]}" != "True" ]]; then
  echo "Platform config:${PLATFORM_CONFIG_PATH}"
fi

## checkpoint: compatible with CHECKPOINT and LATEST_CHECKPOINT. When
## debugging it is advisable not to load a checkpoint; start-up is faster.
if [[ -n "${args[resume_ckpt]}" ]]; then
  OPTS+=" --load ./data/checkpoints/${MODEL_UNIQUE}/${args[resume_ckpt]}"
else
  echo "No checkpoint to load"
fi

# Training entry script. Local and platform runs use the same file, so no
# branch on --local is needed here.
filename="pretrain_dragonfly"
PRETRAIN_ENTRY="${filename}.py"

# Launch topology (hard-coded for a single node).
GPUS_PER_NODE=8
NNODES=1
RANK=0
MASTER_ENDPOINT=g3006
MASTER_PORT=12345

#CMD="torchrun --nnodes=${NNODES} --nproc_per_node=${GPUS_PER_NODE} --node_rank=${RANK} --master_addr=${MASTER_ENDPOINT} --master_port=${MASTER_PORT} ${PRETRAIN_ENTRY} ${OPTS}"
CMD="torchrun --nnodes=${NNODES} --nproc_per_node=${GPUS_PER_NODE} --node_rank=${RANK} --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=${MASTER_ENDPOINT}:${MASTER_PORT} ${PRETRAIN_ENTRY} ${OPTS}"

echo "-------final CMD is------"
echo "${CMD}"
echo "-------final CMD end------"

# CMD is a whitespace-separated string. Split it into words with `read -a`
# rather than an unquoted expansion, so that characters such as "*" or "?" in
# an option value are passed through instead of being glob-expanded.
read -r -a cmd_words <<< "${CMD}"

# Must stay the last command: its exit status becomes the script's status.
"${cmd_words[@]}"