change: new train yaml

wql 2024-08-22 09:35:51 +08:00
parent 429c1cd574
commit bfa2e166d7
25 changed files with 1008 additions and 24 deletions

@@ -1,9 +1,9 @@
FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Llama2-7B/llama2_lora_sft_1.yaml | tee results/lora_sft/Llama2-7B/llama2_lora_sft_1.txt
FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Llama2-7B/llama2_lora_sft_2.yaml | tee results/lora_sft/Llama2-7B/llama2_lora_sft_2.txt
FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Llama2-7B/llama2_lora_sft_3.yaml | tee results/lora_sft/Llama2-7B/llama2_lora_sft_3.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Llama2-7B/llama2_lora_sft_1_single.yaml | tee results/lora_sft/Llama2-7B/llama2_lora_sft_1_single.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Llama2-7B/llama2_lora_sft_2_single.yaml | tee results/lora_sft/Llama2-7B/llama2_lora_sft_2_single.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Llama2-7B/llama2_lora_sft_3_single.yaml | tee results/lora_sft/Llama2-7B/llama2_lora_sft_3_single.txt
FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Llama2-7B/llama2_lora_sft_1.yaml | tee results/lora_sft_2/Llama2-7B/llama2_lora_sft_1.txt
FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Llama2-7B/llama2_lora_sft_2.yaml | tee results/lora_sft_2/Llama2-7B/llama2_lora_sft_2.txt
FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Llama2-7B/llama2_lora_sft_3.yaml | tee results/lora_sft_2/Llama2-7B/llama2_lora_sft_3.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Llama2-7B/llama2_lora_sft_1_single.yaml | tee results/lora_sft_2/Llama2-7B/llama2_lora_sft_1_single.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Llama2-7B/llama2_lora_sft_2_single.yaml | tee results/lora_sft_2/Llama2-7B/llama2_lora_sft_2_single.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Llama2-7B/llama2_lora_sft_3_single.yaml | tee results/lora_sft_2/Llama2-7B/llama2_lora_sft_3_single.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/inference/Llama2-7B/llama2_predict_1.yaml | tee results/inference/Llama2-7B/llama2_predict_1.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/inference/Llama2-7B/llama2_predict_2.yaml | tee results/inference/Llama2-7B/llama2_predict_2.txt
@@ -12,26 +12,26 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Llama2-7B/llama2_
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/inference/Llama2-7B/llama2_predict_2_single.yaml | tee results/inference/Llama2-7B/llama2_predict_2_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/inference/Llama2-7B/llama2_predict_3_single.yaml | tee results/inference/Llama2-7B/llama2_predict_3_single.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1.yaml | tee results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_2.yaml | tee results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_2.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_3.yaml | tee results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_3.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single.yaml | tee results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_2_single.yaml | tee results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_2_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_3_single.yaml | tee results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_3_single.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_1.yaml | tee results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_1.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_2.yaml | tee results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_2.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_3.yaml | tee results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_3.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_1_single.yaml | tee results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_1_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_2_single.yaml | tee results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_2_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_3_single.yaml | tee results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_3_single.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1.yaml | tee results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1.txt
FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_2.yaml | tee results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_2.txt
FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_3.yaml | tee results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_3.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1_single.yaml | tee results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1_single.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_2_single.yaml | tee results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_2_single.txt
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_3_single.yaml | tee results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_3_single.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_1.yaml | tee results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_1.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_2.yaml | tee results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_2.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_3.yaml | tee results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_3.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_1_single.yaml | tee results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_1_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_2_single.yaml | tee results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_2_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_3_single.yaml | tee results/lora_sft_2/ChatGLM2-6B/ChatGLM2_lora_sft_3_single.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Qwen-7B/Qwen_lora_sft_1.yaml | tee results/lora_sft/Qwen-7B/Qwen_lora_sft_1.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Qwen-7B/Qwen_lora_sft_2.yaml | tee results/lora_sft/Qwen-7B/Qwen_lora_sft_2.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft/Qwen-7B/Qwen_lora_sft_3.yaml | tee results/lora_sft/Qwen-7B/Qwen_lora_sft_3.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Qwen-7B/Qwen_lora_sft_1_single.yaml | tee results/lora_sft/Qwen-7B/Qwen_lora_sft_1_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Qwen-7B/Qwen_lora_sft_2_single.yaml | tee results/lora_sft/Qwen-7B/Qwen_lora_sft_2_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft/Qwen-7B/Qwen_lora_sft_3_single.yaml | tee results/lora_sft/Qwen-7B/Qwen_lora_sft_3_single.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Qwen-7B/Qwen_lora_sft_1.yaml | tee results/lora_sft_2/Qwen-7B/Qwen_lora_sft_1.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Qwen-7B/Qwen_lora_sft_2.yaml | tee results/lora_sft_2/Qwen-7B/Qwen_lora_sft_2.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/lora_sft_2/Qwen-7B/Qwen_lora_sft_3.yaml | tee results/lora_sft_2/Qwen-7B/Qwen_lora_sft_3.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Qwen-7B/Qwen_lora_sft_1_single.yaml | tee results/lora_sft_2/Qwen-7B/Qwen_lora_sft_1_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Qwen-7B/Qwen_lora_sft_2_single.yaml | tee results/lora_sft_2/Qwen-7B/Qwen_lora_sft_2_single.txt
# CUDA_VISIBLE_DEVICES=0 llamafactory-cli train results/lora_sft_2/Qwen-7B/Qwen_lora_sft_3_single.yaml | tee results/lora_sft_2/Qwen-7B/Qwen_lora_sft_3_single.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/inference/Qwen-7B/Qwen_predict_1.yaml | tee results/inference/Qwen-7B/Qwen_predict_1.txt
# FORCE_TORCHRUN=1 llamafactory-cli train results/inference/Qwen-7B/Qwen_predict_2.yaml | tee results/inference/Qwen-7B/Qwen_predict_2.txt
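These invocations all follow one pattern: a YAML config in, a matching log file out via tee, with FORCE_TORCHRUN=1 for the multi-GPU runs and CUDA_VISIBLE_DEVICES=0 for the single-GPU runs. As a sketch (not part of this commit), the same pattern could be driven by a loop; only llamafactory-cli and the results/ paths are taken from the script above, the loop itself is hypothetical:

# hypothetical helper, not in this commit: launch the multi-GPU Llama2 runs and log each one
for cfg in results/lora_sft/Llama2-7B/llama2_lora_sft_{1,2,3}.yaml; do
    FORCE_TORCHRUN=1 llamafactory-cli train "$cfg" | tee "${cfg%.yaml}.txt"
done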

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/baichuan
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: baichuan
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500
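As a quick sanity check on the hyperparameters above (a sketch, not part of the commit; the two-GPU figure for the FORCE_TORCHRUN runs is an assumption, while the single-GPU runs pin one device):

# effective batch = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
echo $((2 * 8 * 2))   # 32 per optimizer step with 2 GPUs; 16 for the single-GPU runs
# max_samples 10000 with val_size 0.1 leaves ~9000 training samples, i.e. ~282 steps
# per epoch at batch 32, so max_steps 1000 ends training well before the 10 epochs
# requested by num_train_epochs (max_steps takes precedence when both are set)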

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/baichuan
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: baichuan
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/baichuan
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: baichuan
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_2
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/baichuan
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: baichuan
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_2_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/baichuan
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: baichuan
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_3
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/baichuan
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: baichuan
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_3_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/chatglm/data
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: chatglm2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/chatglm/data
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: chatglm2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/chatglm/data
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: chatglm2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_2
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/chatglm/data
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: chatglm2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_2_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/chatglm/data
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: chatglm2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_3
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/chatglm/data
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: chatglm2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_3_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: modelscope/Llama-2-7b-ms
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: llama2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Llama2-7B/llama2_lora_sft_1
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
fp16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500
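One difference from the Baichuan2, ChatGLM2, and Qwen configs: the Llama2 runs use fp16 rather than bf16. If that reflects a hardware limitation rather than a deliberate choice, bf16 availability can be checked first (a sketch, assuming PyTorch is installed in the training environment):

# prints True on GPUs with native bfloat16 support (roughly Ampere and newer)
python -c "import torch; print(torch.cuda.is_bf16_supported())"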

@@ -0,0 +1,41 @@
### model
model_name_or_path: modelscope/Llama-2-7b-ms
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: llama2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Llama2-7B/llama2_lora_sft_1_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
fp16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: modelscope/Llama-2-7b-ms
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: llama2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Llama2-7B/llama2_lora_sft_2
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
fp16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: modelscope/Llama-2-7b-ms
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: llama2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Llama2-7B/llama2_lora_sft_2_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
fp16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: modelscope/Llama-2-7b-ms
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: llama2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Llama2-7B/llama2_lora_sft_3
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
fp16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: modelscope/Llama-2-7b-ms
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: llama2
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Llama2-7B/llama2_lora_sft_3_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
fp16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/qwen
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: qwen
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Qwen-7B/Qwen_lora_sft_1
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/qwen
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: qwen
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Qwen-7B/Qwen_lora_sft_1_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/qwen
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: qwen
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Qwen-7B/Qwen_lora_sft_2
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/qwen
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: qwen
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Qwen-7B/Qwen_lora_sft_2_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/qwen
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: qwen
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Qwen-7B/Qwen_lora_sft_3
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500

@@ -0,0 +1,41 @@
### model
model_name_or_path: ../../llm/qwen
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: belle_1m
template: qwen
cutoff_len: 1024
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: ./results/lora_sft/Qwen-7B/Qwen_lora_sft_3_single
logging_steps: 3
save_steps: 100
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
max_steps: 1000
include_num_input_tokens_seen: true
### eval
val_size: 0.1
per_device_eval_batch_size: 2
eval_strategy: steps
eval_steps: 500