diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
index cc773991..6c80ef58 100644
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -30,7 +30,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
index f92d6945..5e7e90bb 100644
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -31,7 +31,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml
index 57383ae0..062a312b 100644
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -30,7 +30,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/extras/pissa/llama3_lora_sft.yaml b/examples/extras/pissa/llama3_lora_sft.yaml
index fd4b9f1d..05077b6c 100644
--- a/examples/extras/pissa/llama3_lora_sft.yaml
+++ b/examples/extras/pissa/llama3_lora_sft.yaml
@@ -32,7 +32,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_full/llama3_full_sft_ds3.yaml b/examples/train_full/llama3_full_sft_ds3.yaml
index 40afd2ee..c983ad5c 100644
--- a/examples/train_full/llama3_full_sft_ds3.yaml
+++ b/examples/train_full/llama3_full_sft_ds3.yaml
@@ -29,7 +29,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_lora/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml
index 188e5078..d87c0669 100644
--- a/examples/train_lora/llama3_lora_dpo.yaml
+++ b/examples/train_lora/llama3_lora_dpo.yaml
@@ -31,7 +31,7 @@ learning_rate: 5.0e-6
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_lora/llama3_lora_kto.yaml b/examples/train_lora/llama3_lora_kto.yaml
index f730c82e..08208c25 100644
--- a/examples/train_lora/llama3_lora_kto.yaml
+++ b/examples/train_lora/llama3_lora_kto.yaml
@@ -30,7 +30,7 @@ learning_rate: 5.0e-6
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_lora/llama3_lora_ppo.yaml b/examples/train_lora/llama3_lora_ppo.yaml
index e574014e..512e90ea 100644
--- a/examples/train_lora/llama3_lora_ppo.yaml
+++ b/examples/train_lora/llama3_lora_ppo.yaml
@@ -30,7 +30,7 @@ learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### generate
diff --git a/examples/train_lora/llama3_lora_pretrain.yaml b/examples/train_lora/llama3_lora_pretrain.yaml
index 839b3e51..5e8aaaef 100644
--- a/examples/train_lora/llama3_lora_pretrain.yaml
+++ b/examples/train_lora/llama3_lora_pretrain.yaml
@@ -28,7 +28,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_lora/llama3_lora_reward.yaml b/examples/train_lora/llama3_lora_reward.yaml
index 79559d19..96c32238 100644
--- a/examples/train_lora/llama3_lora_reward.yaml
+++ b/examples/train_lora/llama3_lora_reward.yaml
@@ -25,11 +25,11 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_lora/llama3_lora_sft.yaml b/examples/train_lora/llama3_lora_sft.yaml
index fe30c575..55a8077e 100644
--- a/examples/train_lora/llama3_lora_sft.yaml
+++ b/examples/train_lora/llama3_lora_sft.yaml
@@ -29,7 +29,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_lora/llama3_lora_sft_ds0.yaml b/examples/train_lora/llama3_lora_sft_ds0.yaml
index 08b638e6..f1442faa 100644
--- a/examples/train_lora/llama3_lora_sft_ds0.yaml
+++ b/examples/train_lora/llama3_lora_sft_ds0.yaml
@@ -30,7 +30,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_lora/llama3_lora_sft_ds3.yaml b/examples/train_lora/llama3_lora_sft_ds3.yaml
index b7266d61..66e7007e 100644
--- a/examples/train_lora/llama3_lora_sft_ds3.yaml
+++ b/examples/train_lora/llama3_lora_sft_ds3.yaml
@@ -30,7 +30,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_lora/llava1_5_lora_sft.yaml b/examples/train_lora/llava1_5_lora_sft.yaml
index 55ac31fa..ec03f82c 100644
--- a/examples/train_lora/llava1_5_lora_sft.yaml
+++ b/examples/train_lora/llava1_5_lora_sft.yaml
@@ -30,7 +30,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_qlora/llama3_lora_sft_aqlm.yaml b/examples/train_qlora/llama3_lora_sft_aqlm.yaml
index 7b6767d5..3519d46b 100644
--- a/examples/train_qlora/llama3_lora_sft_aqlm.yaml
+++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml
@@ -29,7 +29,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_qlora/llama3_lora_sft_awq.yaml b/examples/train_qlora/llama3_lora_sft_awq.yaml
index a2a26e4b..df48669b 100644
--- a/examples/train_qlora/llama3_lora_sft_awq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_awq.yaml
@@ -29,7 +29,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_qlora/llama3_lora_sft_gptq.yaml b/examples/train_qlora/llama3_lora_sft_gptq.yaml
index ad3d854c..61fa9bb4 100644
--- a/examples/train_qlora/llama3_lora_sft_gptq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml
@@ -29,7 +29,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
diff --git a/examples/train_qlora/llama3_lora_sft_otfq.yaml b/examples/train_qlora/llama3_lora_sft_otfq.yaml
index 9c73b439..80a05768 100644
--- a/examples/train_qlora/llama3_lora_sft_otfq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml
@@ -31,7 +31,7 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-fp16: true
+bf16: true
 ddp_timeout: 180000000
 
 ### eval
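
Not part of the patch above, only a minimal sketch of the idea behind switching these configs from fp16 to bf16: bf16 keeps fp32's exponent range, so mixed-precision training is less prone to the overflow and loss-scaling issues fp16 can hit, but it requires hardware support (e.g. NVIDIA Ampere or newer). Assuming PyTorch is available, one way to decide which flag to put in such a YAML; the helper name below is hypothetical.

import torch

def pick_precision_flag() -> str:
    # Prefer bf16 when the current GPU supports it; otherwise fall back to fp16.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return "bf16"
    return "fp16"

# Prints e.g. "bf16: true", matching the line these example configs now use.
print(f"{pick_precision_flag()}: true")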