update examples

parent 694a05fd04
commit ddec9e1b84

@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
@@ -10,7 +10,7 @@ badam_switch_mode: descending
 badam_switch_interval: 50
 badam_verbose: 2

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

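These hunks appear to be the BAdam full-parameter SFT example (per the badam_* keys). The change itself is cosmetic: a "# section" comment and a "### section" comment are both YAML comments, so the parsed configuration is identical. As a hedged sketch of how such a config is consumed, assuming the repository's llamafactory-cli entry point and an illustrative config path:

    # launch a training run from a YAML config (path illustrative)
    CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml

With per_device_train_batch_size: 1 and gradient_accumulation_steps: 8 on a single GPU, the effective batch size is 1 x 8 x 1 = 8 samples per optimizer step.
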
@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# ddp
+### ddp
 ddp_timeout: 180000000

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

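This config pairs 4-bit quantization (quantization_bit: 4) with LoRA, i.e. QLoRA, and raises ddp_timeout for multi-process startup; the FSDP sharding itself comes from the launcher rather than from this YAML. One plausible launch through Hugging Face accelerate, assuming an FSDP accelerate config exists at the illustrative path:

    # accelerate provides the FSDP wrapping; both paths are illustrative
    accelerate launch --config_file examples/accelerate/fsdp_config.yaml \
        src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
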
@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
@@ -11,7 +11,7 @@ galore_target: mlp,self_attn
 galore_rank: 128
 galore_scale: 2.0

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: models/llama3-8b-instruct-pro

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: freeze
@@ -9,7 +9,7 @@ freeze_trainable_layers: 8
 freeze_trainable_modules: all
 use_llama_pro: true

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -17,14 +17,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b-instruct-pro/freeze/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -33,7 +33,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 loraplus_lr_ratio: 16.0

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
 mixture_of_depths: convert

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b-mod/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,12 +1,12 @@
-# model
+### model
 model_name_or_path: saves/llama3-8b/full/sft

-# method
+### method
 stage: sft
 do_predict: true
 finetuning_type: full

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -14,10 +14,10 @@ max_samples: 50
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/full/predict
 overwrite_output_dir: true

-# eval
+### eval
 per_device_eval_batch_size: 1
 predict_with_generate: true

@@ -1,16 +1,16 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full

-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

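In this full-parameter example the deepspeed key hands the Hugging Face Trainer a ZeRO stage-3 JSON, which shards parameters, gradients, and optimizer states across ranks; gradient_accumulation_steps drops to 2 because several GPUs now contribute to every step. A hedged multi-GPU launch sketch (the entry script and GPU count are illustrative):

    # 4-GPU run; DeepSpeed is wired in by the YAML's deepspeed key
    torchrun --nproc_per_node 4 src/train.py examples/full_multi_gpu/llama3_full_sft.yaml

At 4 GPUs, batch 1 per device, and accumulation 2, the effective batch is 1 x 2 x 4 = 8, matching the single-GPU examples above.
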
@@ -1,16 +1,16 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# ddp
+### ddp
 ddp_timeout: 180000000

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z0_config.json

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: dpo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 dpo_ftx: 1.0

-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/dpo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

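The DPO example switches to the pairwise preference dataset orca_rlhf, lowers the learning rate to 1e-5 (common for preference tuning), and mixes a supervised term into the loss via dpo_ftx: 1.0. A minimal usage sketch under the same CLI assumption as above, path illustrative:

    # preference optimization on chosen/rejected pairs
    llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
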
@@ -1,19 +1,19 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft

-# method
+### method
 finetuning_type: lora

-# dataset
+### dataset
 task: mmlu
 split: test
 template: fewshot
 lang: en
 n_shot: 5

-# output
+### output
 save_dir: saves/llama3-8b/lora/eval

-# eval
+### eval
 batch_size: 4

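Unlike the training configs, this one drives benchmark evaluation: it loads the SFT LoRA adapter and scores 5-shot MMLU (task: mmlu, n_shot: 5) on the test split, writing results to save_dir. A hedged sketch, assuming the CLI's eval subcommand and an illustrative path:

    # 5-shot MMLU with the trained adapter
    llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml
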
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: orpo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/orpo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 reward_model: saves/llama3-8b/lora/reward

-# method
+### method
 stage: ppo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/ppo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# generate
+### generate
 max_new_tokens: 512
 top_k: 0
 top_p: 0.9

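PPO needs a scalar reward signal, so this config points reward_model at the adapter produced by the reward-model example further down, and adds a generate section for the rollouts (top_k: 0 disables top-k filtering, leaving nucleus sampling at top_p: 0.9). The implied two-stage order, paths illustrative:

    # 1) fit the reward model (stage: rm) to produce saves/llama3-8b/lora/reward
    llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml
    # 2) run PPO against that reward model
    llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml
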
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft

-# method
+### method
 stage: sft
 do_predict: true
 finetuning_type: lora

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,10 +15,10 @@ max_samples: 50
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/predict
 overwrite_output_dir: true

-# eval
+### eval
 per_device_eval_batch_size: 1
 predict_with_generate: true

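This is batch inference rather than training: do_predict with predict_with_generate decodes completions for up to 50 samples and writes them under output_dir. Since it loads the SFT adapter via adapter_name_or_path, it is naturally run after the LoRA SFT example; the predict config still goes through the train subcommand because do_predict is a Trainer flag (path illustrative):

    # generate predictions with the trained adapter
    llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml
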
@@ -1,27 +1,27 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: pt
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: c4_demo
 cutoff_len: 1024
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -30,7 +30,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: rm
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/reward
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,6 +16,6 @@ overwrite_cache: true
 preprocessing_num_workers: 16
 tokenized_path: saves/llama3-8b/dataset/sft

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 overwrite_output_dir: true

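The tokenized_path key turns this into a two-phase workflow: the first run tokenizes the dataset once and caches it at saves/llama3-8b/dataset/sft, and later runs that set the same tokenized_path load the cache instead of re-tokenizing. Sketch, path illustrative:

    # one-off preprocessing pass; later runs reuse the cached dataset
    llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml
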
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: llava-hf/llava-1.5-7b-hf
 visual_inputs: true

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: mllm_demo
 template: vicuna
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llava1_5-7b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

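The multimodal variant swaps in llava-hf/llava-1.5-7b-hf with visual_inputs: true and the vicuna template that LLaVA-1.5 conversations follow, with mllm_demo supplying image-text pairs; the launch is the same as the text-only LoRA examples (path illustrative):

    # multimodal LoRA SFT
    llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
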
@@ -1,8 +1,8 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 template: llama3

-# export
+### export
 export_dir: models/llama3_gptq
 export_quantization_bit: 4
 export_quantization_dataset: data/c4_demo.json

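This export config post-quantizes the model to 4-bit GPTQ, calibrating on export_quantization_dataset, and writes the result to models/llama3_gptq. A hedged sketch, assuming the CLI's export subcommand and an illustrative path:

    # quantize and save
    llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
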
@@ -1,12 +1,12 @@
-# Note: DO NOT use quantized model or quantization_bit when merging lora adapters
+### Note: DO NOT use quantized model or quantization_bit when merging lora adapters

-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 template: llama3
 finetuning_type: lora

-# export
+### export
 export_dir: models/llama3_lora_sft
 export_size: 2
 export_device: cpu

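The merge config folds the LoRA adapter back into the base weights on CPU and shards the merged checkpoint into roughly 2 GB files (export_size: 2); as its own Note line warns, the base model must be unquantized for the merge. Usage sketch, path illustrative:

    # merge adapter into the base model and write a standalone checkpoint
    llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
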
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ

-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj

-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16

-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true

-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true

-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps