From 34d33e22570338da709b8499830adb06b202095c Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 6 May 2024 21:47:00 +0800
Subject: [PATCH 1/3] update docs

---
 README.md                                     | 69 +++++++++---------
 README_zh.md                                  | 71 ++++++++++---------
 examples/README.md                            |  9 ++-
 examples/extras/badam/sft.sh                  |  2 +-
 examples/inference/api_demo.sh                |  7 --
 examples/inference/cli_demo.sh                |  7 --
 examples/inference/evaluate.sh                | 12 ----
 examples/inference/llama3.yaml                |  2 +
 examples/inference/llama3_lora_sft.yaml       |  4 ++
 examples/inference/llama3_vllm.yaml           |  4 ++
 examples/inference/web_demo.sh                |  8 ---
 examples/lora_single_gpu/dpo.sh               | 35 ---------
 examples/lora_single_gpu/llama3_lora_dpo.yaml | 39 ++++++++++
 .../lora_single_gpu/llama3_lora_eval.yaml     | 19 +++++
 .../lora_single_gpu/llama3_lora_orpo.yaml     | 38 ++++++++++
 examples/lora_single_gpu/llama3_lora_ppo.yaml | 38 ++++++++++
 .../lora_single_gpu/llama3_lora_predict.yaml  | 24 +++++++
 .../lora_single_gpu/llama3_lora_pretrain.yaml | 37 ++++++++++
 .../lora_single_gpu/llama3_lora_reward.yaml   | 38 ++++++++++
 examples/lora_single_gpu/llama3_lora_sft.yaml | 38 ++++++++++
 .../lora_single_gpu/llama3_preprocess.yaml    | 22 ++++++
 .../lora_single_gpu/llava1_5_lora_sft.yaml    | 39 ++++++++++
 examples/lora_single_gpu/orpo.sh              | 32 ---------
 examples/lora_single_gpu/ppo.sh               | 32 ---------
 examples/lora_single_gpu/predict.sh           | 19 -----
 examples/lora_single_gpu/prepare.sh           | 19 -----
 examples/lora_single_gpu/pretrain.sh          | 31 --------
 examples/lora_single_gpu/reward.sh            | 33 ---------
 examples/lora_single_gpu/sft.sh               | 32 ---------
 examples/lora_single_gpu/sft_mllm.sh          | 33 ---------
 examples/merge_lora/llama3_gptq.yaml          | 11 +++
 examples/merge_lora/llama3_lora_sft.yaml      | 13 ++++
 examples/merge_lora/merge.sh                  | 12 ----
 examples/merge_lora/quantize.sh               | 11 ---
 examples/qlora_single_gpu/aqlm.sh             | 30 --------
 examples/qlora_single_gpu/awq.sh              | 30 --------
 examples/qlora_single_gpu/bitsandbytes.sh     | 31 --------
 examples/qlora_single_gpu/gptq.sh             | 30 --------
 .../llama3_lora_sft_aqlm.yaml                 | 27 +++++++
 .../qlora_single_gpu/llama3_lora_sft_awq.yaml |  0
 .../llama3_lora_sft_bitsandbytes.yaml         |  0
 .../llama3_lora_sft_gptq.yaml                 |  0
 setup.py                                      |  6 +-
 src/webui.py                                  |  9 +++
 44 files changed, 487 insertions(+), 516 deletions(-)
 delete mode 100644 examples/inference/api_demo.sh
 delete mode 100644 examples/inference/cli_demo.sh
 delete mode 100644 examples/inference/evaluate.sh
 create mode 100644 examples/inference/llama3.yaml
 create mode 100644 examples/inference/llama3_lora_sft.yaml
 create mode 100644 examples/inference/llama3_vllm.yaml
 delete mode 100644 examples/inference/web_demo.sh
 delete mode 100644 examples/lora_single_gpu/dpo.sh
 create mode 100644 examples/lora_single_gpu/llama3_lora_dpo.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_eval.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_orpo.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_ppo.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_predict.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_pretrain.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_reward.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_sft.yaml
 create mode 100644 examples/lora_single_gpu/llama3_preprocess.yaml
 create mode 100644 examples/lora_single_gpu/llava1_5_lora_sft.yaml
 delete mode 100644 examples/lora_single_gpu/orpo.sh
 delete mode 100644 examples/lora_single_gpu/ppo.sh
 delete mode 100644 examples/lora_single_gpu/predict.sh
 delete mode 100644 examples/lora_single_gpu/prepare.sh
 delete mode 100644 examples/lora_single_gpu/pretrain.sh
 delete mode 100644 examples/lora_single_gpu/reward.sh
 delete mode 100644 examples/lora_single_gpu/sft.sh
 delete mode 100644 examples/lora_single_gpu/sft_mllm.sh
 create mode 100644 examples/merge_lora/llama3_gptq.yaml
 create mode 100644 examples/merge_lora/llama3_lora_sft.yaml
 delete mode 100644 examples/merge_lora/merge.sh
 delete mode 100644 examples/merge_lora/quantize.sh
 delete mode 100644 examples/qlora_single_gpu/aqlm.sh
 delete mode 100644 examples/qlora_single_gpu/awq.sh
 delete mode 100644 examples/qlora_single_gpu/bitsandbytes.sh
 delete mode 100644 examples/qlora_single_gpu/gptq.sh
 create mode 100644 examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
 create mode 100644 examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
 create mode 100644 examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
 create mode 100644 examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
 create mode 100644 src/webui.py

diff --git a/README.md b/README.md
index 347ebe7e..d10ef982 100644
--- a/README.md
+++ b/README.md
@@ -276,18 +276,19 @@ huggingface-cli login
 | ------------ | ------- | --------- |
 | python       | 3.8     | 3.10      |
 | torch        | 1.13.1  | 2.2.0     |
-| transformers | 4.37.2  | 4.39.3    |
-| datasets     | 2.14.3  | 2.18.0    |
-| accelerate   | 0.27.2  | 0.28.0    |
+| transformers | 4.37.2  | 4.40.1    |
+| datasets     | 2.14.3  | 2.19.1    |
+| accelerate   | 0.27.2  | 0.30.0    |
 | peft         | 0.9.0   | 0.10.0    |
-| trl          | 0.8.1   | 0.8.1     |
+| trl          | 0.8.1   | 0.8.6     |
 
 | Optional     | Minimum | Recommend |
 | ------------ | ------- | --------- |
 | CUDA         | 11.6    | 12.2      |
 | deepspeed    | 0.10.0  | 0.14.0    |
-| bitsandbytes | 0.39.0  | 0.43.0    |
-| flash-attn   | 2.3.0   | 2.5.6     |
+| bitsandbytes | 0.39.0  | 0.43.1    |
+| vllm         | 0.4.0   | 0.4.2     |
+| flash-attn   | 2.3.0   | 2.5.8     |
 
 ### Hardware Requirement
 
@@ -305,24 +306,15 @@ huggingface-cli login
 
 ## Getting Started
 
-### Data Preparation
-
-Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use datasets on HuggingFace / ModelScope hub or load the dataset in local disk.
-
-> [!NOTE]
-> Please update `data/dataset_info.json` to use your custom dataset.
-
-### Dependence Installation
+### Installation
 
 ```bash
 git clone https://github.com/hiyouga/LLaMA-Factory.git
-conda create -n llama_factory python=3.10
-conda activate llama_factory
 cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-Extra dependencies available: deepspeed, metrics, galore, badam, vllm, bitsandbytes, gptq, awq, aqlm, qwen, modelscope, quality
+Extra dependencies available: metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality
 
 <details><summary>For Windows users</summary>
 
@@ -336,19 +328,41 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 
 </details>
 
-### Train with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
+### Data Preparation
+
+Please refer to [data/README.md](data/README.md) for details about the format of dataset files. You can either use datasets from the HuggingFace / ModelScope hub or load a dataset from local disk.
+
+> [!NOTE]
+> Please update `data/dataset_info.json` to use your custom dataset.
+
+### Quickstart
+
+The following 3 commands run LoRA fine-tuning, inference and merging for the Llama3-8B-Instruct model, respectively.
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+See [examples/README.md](examples/README.md) for advanced usage.
+
+> [!TIP]
+> Use `llamafactory-cli help` to show help information.
+
+### Use LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
 
 > [!IMPORTANT]
-> LLaMA Board GUI only supports training on a single GPU, please use [CLI](#train-with-command-line-interface) for distributed training.
+> LLaMA Board GUI only supports training on a single GPU.
 
 #### Use local environment
 
 ```bash
-llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webui
 ```
 
 > [!TIP]
-> To modify the default setting in the LLaMA Board GUI, you can use environment variables, e.g., `export CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False` (use `set` command on Windows OS).
+> To modify the default setting in the LLaMA Board GUI, you can use environment variables, e.g., `export GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False` (use `set` command on Windows OS).
 
 <details><summary>For Alibaba Cloud users</summary>
 
@@ -389,21 +403,10 @@ docker compose -f ./docker-compose.yml up -d
 
 </details>
 
-### Train with Command Line Interface
-
-See [examples/README.md](examples/README.md) for usage.
-
-> [!TIP]
-> Use `llamafactory-cli train -h` to display arguments description.
-
 ### Deploy with OpenAI-style API and vLLM
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api \
-    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
-    --template llama3 \
-    --infer_backend vllm \
-    --vllm_enforce_eager
+CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml
 ```
 
 ### Download from ModelScope Hub
diff --git a/README_zh.md b/README_zh.md
index 8a2fb79b..9c639f2c 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -163,7 +163,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
-> **默认模块**应作为 `--lora_target` 参数的默认值，可使用 `--lora_target all` 参数指定全部模块以得到更好的效果。
+> **默认模块**应作为 `--lora_target` 参数的默认值，可使用 `--lora_target all` 参数指定全部模块以取得更好的效果。
 >
 > 对于所有“基座”（Base）模型，`--template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”（Instruct/Chat）模型请务必使用**对应的模板**。
 >
@@ -276,18 +276,19 @@ huggingface-cli login
 | ------------ | ------- | --------- |
 | python       | 3.8     | 3.10      |
 | torch        | 1.13.1  | 2.2.0     |
-| transformers | 4.37.2  | 4.39.3    |
-| datasets     | 2.14.3  | 2.18.0    |
-| accelerate   | 0.27.2  | 0.28.0    |
+| transformers | 4.37.2  | 4.40.1    |
+| datasets     | 2.14.3  | 2.19.1    |
+| accelerate   | 0.27.2  | 0.30.0    |
 | peft         | 0.9.0   | 0.10.0    |
-| trl          | 0.8.1   | 0.8.1     |
+| trl          | 0.8.1   | 0.8.6     |
 
 | 可选项       | 至少     | 推荐      |
 | ------------ | ------- | --------- |
 | CUDA         | 11.6    | 12.2      |
 | deepspeed    | 0.10.0  | 0.14.0    |
-| bitsandbytes | 0.39.0  | 0.43.0    |
-| flash-attn   | 2.3.0   | 2.5.6     |
+| bitsandbytes | 0.39.0  | 0.43.1    |
+| vllm         | 0.4.0   | 0.4.2     |
+| flash-attn   | 2.3.0   | 2.5.8     |
 
 ### 硬件依赖
 
@@ -305,24 +306,15 @@ huggingface-cli login
 
 ## 如何使用
 
-### 数据准备
-
-关于数据集文件的格式，请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。
-
-> [!NOTE]
-> 使用自定义数据集时，请更新 `data/dataset_info.json` 文件。
-
-### 安装依赖
+### 安装 LLaMA Factory
 
 ```bash
 git clone https://github.com/hiyouga/LLaMA-Factory.git
-conda create -n llama_factory python=3.10
-conda activate llama_factory
 cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-可选的额外依赖项：deepspeed、metrics、galore、badam、vllm、bitsandbytes、gptq、awq、aqlm、qwen、modelscope、quality
+可选的额外依赖项：metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality
 
 <details><summary>Windows 用户指南</summary>
 
@@ -336,19 +328,41 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 </details>
 
-### 利用 LLaMA Board 可视化界面训练（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
+### 数据准备
+
+关于数据集文件的格式，请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。
+
+> [!NOTE]
+> 使用自定义数据集时，请更新 `data/dataset_info.json` 文件。
+
+### 快速开始
+
+下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA 微调、推理和合并。
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+高级用法请参考 [examples/README_zh.md](examples/README_zh.md)。
+
+> [!TIP]
+> 使用 `llamafactory-cli help` 显示使用帮助。
+
+### 使用 LLaMA Board 可视化界面（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
 
 > [!IMPORTANT]
-> LLaMA Board 可视化界面目前仅支持单 GPU 训练，请使用[命令行接口](#利用命令行接口训练)来进行多 GPU 分布式训练。
+> LLaMA Board 可视化界面目前仅支持单 GPU 训练。
 
 #### 使用本地环境
 
 ```bash
-llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webui
 ```
 
 > [!TIP]
-> 您可以使用环境变量来修改 LLaMA Board 可视化界面的默认设置，例如 `export CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False`（Windows 系统可使用 `set` 指令）。
+> 您可以使用环境变量来修改 LLaMA Board 可视化界面的默认设置，例如 `export GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False`（Windows 系统可使用 `set` 指令）。
 
 <details><summary>阿里云用户指南</summary>
 
@@ -389,21 +403,10 @@ docker compose -f ./docker-compose.yml up -d
 
 </details>
 
-### 利用命令行接口训练
-
-使用方法请参考 [examples/README_zh.md](examples/README_zh.md)。
-
-> [!TIP]
-> 您可以执行 `llamafactory-cli train -h` 来查看参数文档。
-
 ### 利用 vLLM 部署 OpenAI API
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api \
-    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
-    --template llama3 \
-    --infer_backend vllm \
-    --vllm_enforce_eager
+CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml
 ```
 
 ### 从魔搭社区下载
diff --git a/examples/README.md b/examples/README.md
index 895e9c72..0a14c5bd 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,9 +1,16 @@
 We provide diverse examples about fine-tuning LLMs.
 
+```bash
+export CUDA_VISIBLE_DEVICES=0
+cd examples/lora_single_gpu
+llamafactory-cli train llama3_lora_pretrain.yaml # Do continuous pre-training using LoRA
+
+```
+
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: Do continuous pre-training using LoRA
+│   ├── llama3_lora_pretrain.yaml: Do continuous pre-training using LoRA
 │   ├── sft.sh: Do supervised fine-tuning using LoRA
 │   ├── reward.sh: Do reward modeling using LoRA
 │   ├── ppo.sh: Do PPO training using LoRA
diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
index 4bcfe9d2..61167dad 100644
--- a/examples/extras/badam/sft.sh
+++ b/examples/extras/badam/sft.sh
@@ -10,7 +10,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --finetuning_type full \
     --use_badam \
     --badam_switch_mode descending \
-    --badam_switch_interval 50 \
+    --badam_switch_block_every 50 \
     --badam_verbose 2 \
     --output_dir ../../../saves/LLaMA2-7B/badam/sft \
     --overwrite_cache \
diff --git a/examples/inference/api_demo.sh b/examples/inference/api_demo.sh
deleted file mode 100644
index 6f0f1b2e..00000000
--- a/examples/inference/api_demo.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 API_PORT=8000 llamafactory-cli api \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template default \
-    --finetuning_type lora
diff --git a/examples/inference/cli_demo.sh b/examples/inference/cli_demo.sh
deleted file mode 100644
index bc762411..00000000
--- a/examples/inference/cli_demo.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template default \
-    --finetuning_type lora
diff --git a/examples/inference/evaluate.sh b/examples/inference/evaluate.sh
deleted file mode 100644
index 5030329d..00000000
--- a/examples/inference/evaluate.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template fewshot \
-    --finetuning_type lora \
-    --task mmlu \
-    --split test \
-    --lang en \
-    --n_shot 5 \
-    --batch_size 4
diff --git a/examples/inference/llama3.yaml b/examples/inference/llama3.yaml
new file mode 100644
index 00000000..ffc5be82
--- /dev/null
+++ b/examples/inference/llama3.yaml
@@ -0,0 +1,2 @@
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
diff --git a/examples/inference/llama3_lora_sft.yaml b/examples/inference/llama3_lora_sft.yaml
new file mode 100644
index 00000000..262f4445
--- /dev/null
+++ b/examples/inference/llama3_lora_sft.yaml
@@ -0,0 +1,4 @@
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+template: llama3
+finetuning_type: lora
diff --git a/examples/inference/llama3_vllm.yaml b/examples/inference/llama3_vllm.yaml
new file mode 100644
index 00000000..8dd3b61a
--- /dev/null
+++ b/examples/inference/llama3_vllm.yaml
@@ -0,0 +1,4 @@
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+infer_backend: vllm
+vllm_enforce_eager: true
diff --git a/examples/inference/web_demo.sh b/examples/inference/web_demo.sh
deleted file mode 100644
index a58cd2a0..00000000
--- a/examples/inference/web_demo.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-# add `--visual_inputs True` to load MLLM
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template default \
-    --finetuning_type lora
diff --git a/examples/lora_single_gpu/dpo.sh b/examples/lora_single_gpu/dpo.sh
deleted file mode 100644
index 2cb6cb01..00000000
--- a/examples/lora_single_gpu/dpo.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage dpo \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --create_new_adapter \
-    --dataset orca_rlhf \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/dpo \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --max_samples 1000 \
-    --val_size 0.1 \
-    --dpo_ftx 1.0 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml
new file mode 100644
index 00000000..f71f752d
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: dpo
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+dpo_ftx: 1.0
+
+# dataset
+dataset: orca_rlhf
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/dpo
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.00001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_eval.yaml b/examples/lora_single_gpu/llama3_lora_eval.yaml
new file mode 100644
index 00000000..5808a47a
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_eval.yaml
@@ -0,0 +1,19 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+
+# method
+finetuning_type: lora
+
+# dataset
+task: mmlu
+split: test
+template: fewshot
+lang: en
+n_shot: 5
+
+# output
+save_dir: saves/llama3-8b/lora/eval
+
+# eval
+batch_size: 4
diff --git a/examples/lora_single_gpu/llama3_lora_orpo.yaml b/examples/lora_single_gpu/llama3_lora_orpo.yaml
new file mode 100644
index 00000000..5d78d260
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_orpo.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: orpo
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: orca_rlhf
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/orpo
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.00001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_ppo.yaml b/examples/lora_single_gpu/llama3_lora_ppo.yaml
new file mode 100644
index 00000000..8d78d20d
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_ppo.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+reward_model: saves/llama3-8b/lora/reward
+
+# method
+stage: ppo
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/ppo
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.00001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# generate
+max_new_tokens: 512
+top_k: 0
+top_p: 0.9
diff --git a/examples/lora_single_gpu/llama3_lora_predict.yaml b/examples/lora_single_gpu/llama3_lora_predict.yaml
new file mode 100644
index 00000000..5a9de686
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_predict.yaml
@@ -0,0 +1,24 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+
+# method
+stage: sft
+do_predict: true
+finetuning_type: lora
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 50
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/predict
+overwrite_output_dir: true
+
+# eval
+per_device_eval_batch_size: 1
+predict_with_generate: true
diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
new file mode 100644
index 00000000..64245b71
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
@@ -0,0 +1,37 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: pt
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: c4_demo
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/pretrain
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/lora_single_gpu/llama3_lora_reward.yaml
new file mode 100644
index 00000000..f190f4ac
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_reward.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: rm
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: orca_rlhf
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/reward
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.00001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/lora_single_gpu/llama3_lora_sft.yaml
new file mode 100644
index 00000000..f99df305
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml
new file mode 100644
index 00000000..04df9631
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
@@ -0,0 +1,22 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+tokenized_path: saves/llama3-8b/dataset/sft # use `tokenized_path` in config to load data
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+overwrite_output_dir: true
diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
new file mode 100644
index 00000000..96c2701a
--- /dev/null
+++ b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: llava-hf/llava-1.5-7b-hf
+visual_inputs: true
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: mllm_demo
+template: vicuna
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llava1_5-7b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/orpo.sh b/examples/lora_single_gpu/orpo.sh
deleted file mode 100644
index 335707bf..00000000
--- a/examples/lora_single_gpu/orpo.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage orpo \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset orca_rlhf \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/orpo \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --max_samples 1000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/ppo.sh b/examples/lora_single_gpu/ppo.sh
deleted file mode 100644
index 9eccb05e..00000000
--- a/examples/lora_single_gpu/ppo.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage ppo \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --create_new_adapter \
-    --dataset alpaca_gpt4_en \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --reward_model ../../saves/LLaMA2-7B/lora/reward \
-    --output_dir ../../saves/LLaMA2-7B/lora/ppo \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 512 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --max_samples 1000 \
-    --top_k 0 \
-    --top_p 0.9 \
-    --max_new_tokens 256 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/predict.sh b/examples/lora_single_gpu/predict.sh
deleted file mode 100644
index 250efed1..00000000
--- a/examples/lora_single_gpu/predict.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_predict \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft,../../saves/LLaMA2-7B/lora/dpo \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --output_dir ../../saves/LLaMA2-7B/lora/predict \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_eval_batch_size 1 \
-    --max_samples 20 \
-    --predict_with_generate
diff --git a/examples/lora_single_gpu/prepare.sh b/examples/lora_single_gpu/prepare.sh
deleted file mode 100644
index 277f9b7a..00000000
--- a/examples/lora_single_gpu/prepare.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-# use `--tokenized_path` in training script to load data
-
-CUDA_VISIBLE_DEVICES= llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --max_samples 3000 \
-    --tokenized_path ../../saves/datasets/sft
diff --git a/examples/lora_single_gpu/pretrain.sh b/examples/lora_single_gpu/pretrain.sh
deleted file mode 100644
index 0782f00c..00000000
--- a/examples/lora_single_gpu/pretrain.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage pt \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset c4_demo \
-    --dataset_dir ../../data \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/pretrain \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 10000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/reward.sh b/examples/lora_single_gpu/reward.sh
deleted file mode 100644
index 678809fd..00000000
--- a/examples/lora_single_gpu/reward.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage rm \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --create_new_adapter \
-    --dataset orca_rlhf \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/reward \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --max_samples 5000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/sft.sh b/examples/lora_single_gpu/sft.sh
deleted file mode 100644
index 2047e21f..00000000
--- a/examples/lora_single_gpu/sft.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/sft_mllm.sh b/examples/lora_single_gpu/sft_mllm.sh
deleted file mode 100644
index 53e37262..00000000
--- a/examples/lora_single_gpu/sft_mllm.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path llava-hf/llava-1.5-7b-hf \
-    --visual_inputs \
-    --dataset mllm_demo \
-    --dataset_dir ../../data \
-    --template vicuna \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft_mllm \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 100.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/merge_lora/llama3_gptq.yaml b/examples/merge_lora/llama3_gptq.yaml
new file mode 100644
index 00000000..eac12f90
--- /dev/null
+++ b/examples/merge_lora/llama3_gptq.yaml
@@ -0,0 +1,11 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+
+# export
+export_dir: models/llama3_gptq
+export_quantization_bit: 4
+export_quantization_dataset: data/c4_demo.json
+export_size: 2
+export_device: cpu
+export_legacy_format: false
diff --git a/examples/merge_lora/llama3_lora_sft.yaml b/examples/merge_lora/llama3_lora_sft.yaml
new file mode 100644
index 00000000..508a0b8c
--- /dev/null
+++ b/examples/merge_lora/llama3_lora_sft.yaml
@@ -0,0 +1,13 @@
+# Note: DO NOT use a quantized model or quantization_bit when merging LoRA weights
+
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+template: llama3
+finetuning_type: lora
+
+# export
+export_dir: models/llama3_lora_sft
+export_size: 2
+export_device: cpu
+export_legacy_format: false
diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh
deleted file mode 100644
index 186e64a4..00000000
--- a/examples/merge_lora/merge.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-# DO NOT use quantized model or quantization_bit when merging lora weights
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli export \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template default \
-    --finetuning_type lora \
-    --export_dir ../../models/llama2-7b-sft \
-    --export_size 2 \
-    --export_device cpu \
-    --export_legacy_format False
diff --git a/examples/merge_lora/quantize.sh b/examples/merge_lora/quantize.sh
deleted file mode 100644
index 4a104645..00000000
--- a/examples/merge_lora/quantize.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-# NEED TO run `merge.sh` before using this script
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli export \
-    --model_name_or_path ../../models/llama2-7b-sft \
-    --template default \
-    --export_dir ../../models/llama2-7b-sft-int4 \
-    --export_quantization_bit 4 \
-    --export_quantization_dataset ../../data/c4_demo.json \
-    --export_size 2 \
-    --export_legacy_format False
diff --git a/examples/qlora_single_gpu/aqlm.sh b/examples/qlora_single_gpu/aqlm.sh
deleted file mode 100644
index 1e0a71ca..00000000
--- a/examples/qlora_single_gpu/aqlm.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/qlora_single_gpu/awq.sh b/examples/qlora_single_gpu/awq.sh
deleted file mode 100644
index c13c8134..00000000
--- a/examples/qlora_single_gpu/awq.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path TheBloke/Llama-2-7B-AWQ \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/qlora_single_gpu/bitsandbytes.sh b/examples/qlora_single_gpu/bitsandbytes.sh
deleted file mode 100644
index 27f48d41..00000000
--- a/examples/qlora_single_gpu/bitsandbytes.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --quantization_bit 4 \
-    --plot_loss \
-    --fp16
diff --git a/examples/qlora_single_gpu/gptq.sh b/examples/qlora_single_gpu/gptq.sh
deleted file mode 100644
index 5b1b80e1..00000000
--- a/examples/qlora_single_gpu/gptq.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path TheBloke/Llama-2-7B-GPTQ \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
new file mode 100644
index 00000000..2bd99740
--- /dev/null
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -0,0 +1,27 @@
+stage: sft
+do_train: true
+model_name_or_path: BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf
+dataset: alpaca_gpt4_en,glaive_toolcall
+dataset_dir: data
+template: default
+finetuning_type: lora
+lora_target: q_proj,v_proj
+output_dir: ../../saves/LLaMA2-7B/lora/sft
+overwrite_cache: true
+overwrite_output_dir: true
+cutoff_len: 1024
+per_device_train_batch_size: 1
+per_device_eval_batch_size: 1
+gradient_accumulation_steps: 8
+lr_scheduler_type: cosine
+logging_steps: 10
+save_steps: 100
+eval_steps: 100
+evaluation_strategy: steps
+load_best_model_at_end: true
+learning_rate: 5e-5
+num_train_epochs: 3.0
+max_samples: 3000
+val_size: 0.1
+plot_loss: true
+fp16: true
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
new file mode 100644
index 00000000..e69de29b
diff --git a/setup.py b/setup.py
index f7589eb8..7b849942 100644
--- a/setup.py
+++ b/setup.py
@@ -20,12 +20,12 @@ def get_requires():
 
 
 extra_require = {
-    "deepspeed": ["deepspeed>=0.10.0"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
+    "deepspeed": ["deepspeed>=0.10.0"],
+    "bitsandbytes": ["bitsandbytes>=0.39.0"],
+    "vllm": ["vllm>=0.4.0"],
     "galore": ["galore-torch"],
     "badam": ["badam"],
-    "vllm": ["vllm>=0.4.0"],
-    "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
diff --git a/src/webui.py b/src/webui.py
new file mode 100644
index 00000000..c225c710
--- /dev/null
+++ b/src/webui.py
@@ -0,0 +1,9 @@
+from llmtuner.webui.interface import create_ui
+
+
+def main():
+    create_ui().queue().launch(server_name="0.0.0.0", server_port=None, share=False)
+
+
+if __name__ == "__main__":
+    main()

From f02f87c6fbd20adae105c83526baa23dba2042fd Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 6 May 2024 22:51:02 +0800
Subject: [PATCH 2/3] update example docs

---
 README.md                                     |   4 +-
 README_zh.md                                  |   6 +-
 data/dataset_info.json                        |   2 +-
 data/identity.json                            | 170 ++++++------
 examples/README.md                            | 247 ++++++++++++++----
 examples/README_zh.md                         | 246 +++++++++++++----
 examples/extras/badam/llama3_lora_sft.yaml    |  41 +++
 examples/extras/badam/sft.sh                  |  35 ---
 .../extras/fsdp_qlora/llama3_lora_sft.yaml    |  39 +++
 examples/extras/fsdp_qlora/sft.sh             |  41 ---
 examples/extras/fsdp_qlora/single_node.sh     |  10 +
 examples/extras/galore/llama3_full_sft.yaml   |  42 +++
 examples/extras/galore/sft.sh                 |  36 ---
 examples/extras/llama_pro/expand.sh           |   6 +-
 .../extras/llama_pro/llama3_freeze_sft.yaml   |  40 +++
 examples/extras/llama_pro/sft.sh              |  34 ---
 examples/extras/loraplus/llama3_lora_sft.yaml |  39 +++
 examples/extras/loraplus/sft.sh               |  33 ---
 examples/extras/mod/llama3_full_sft.yaml      |  39 +++
 examples/extras/mod/sft.sh                    |  33 ---
 .../full_multi_gpu/llama3_full_predict.yaml   |  23 ++
 examples/full_multi_gpu/llama3_full_sft.yaml  |  41 +++
 examples/full_multi_gpu/multi_node.sh         |  31 +--
 examples/full_multi_gpu/predict.sh            |  19 +-
 examples/full_multi_gpu/single_node.sh        |  32 +--
 examples/lora_multi_gpu/ds_zero3.sh           |  33 +--
 examples/lora_multi_gpu/llama3_lora_sft.yaml  |  41 +++
 .../lora_multi_gpu/llama3_lora_sft_ds.yaml    |  42 +++
 examples/lora_multi_gpu/multi_node.sh         |  34 +--
 examples/lora_multi_gpu/single_node.sh        |  34 +--
 .../lora_single_gpu/llama3_preprocess.yaml    |   2 +-
 .../llama3_lora_sft_aqlm.yaml                 |  49 ++--
 .../qlora_single_gpu/llama3_lora_sft_awq.yaml |  38 +++
 .../llama3_lora_sft_bitsandbytes.yaml         |  42 +++
 .../llama3_lora_sft_gptq.yaml                 |  38 +++
 35 files changed, 1048 insertions(+), 594 deletions(-)
 create mode 100644 examples/extras/badam/llama3_lora_sft.yaml
 delete mode 100644 examples/extras/badam/sft.sh
 create mode 100644 examples/extras/fsdp_qlora/llama3_lora_sft.yaml
 delete mode 100644 examples/extras/fsdp_qlora/sft.sh
 create mode 100644 examples/extras/fsdp_qlora/single_node.sh
 create mode 100644 examples/extras/galore/llama3_full_sft.yaml
 delete mode 100644 examples/extras/galore/sft.sh
 create mode 100644 examples/extras/llama_pro/llama3_freeze_sft.yaml
 delete mode 100644 examples/extras/llama_pro/sft.sh
 create mode 100644 examples/extras/loraplus/llama3_lora_sft.yaml
 delete mode 100644 examples/extras/loraplus/sft.sh
 create mode 100644 examples/extras/mod/llama3_full_sft.yaml
 delete mode 100644 examples/extras/mod/sft.sh
 create mode 100644 examples/full_multi_gpu/llama3_full_predict.yaml
 create mode 100644 examples/full_multi_gpu/llama3_full_sft.yaml
 create mode 100644 examples/lora_multi_gpu/llama3_lora_sft.yaml
 create mode 100644 examples/lora_multi_gpu/llama3_lora_sft_ds.yaml

diff --git a/README.md b/README.md
index d10ef982..14a2084d 100644
--- a/README.md
+++ b/README.md
@@ -337,7 +337,7 @@ Please refer to [data/README.md](data/README.md) for checking the details about
 
 ### Quickstart
 
-The following 3 commands conduct LoRA fine-tuning, inference and merging for Llama3-8B-Instruct model, respectively.
+Use the following 3 commands to conduct LoRA **fine-tuning**, **inference** and **merging** for the Llama3-8B-Instruct model, respectively.
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -345,7 +345,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
 
-See [examples/README.md](examples/README.md) for advanced usage.
+See [examples/README.md](examples/README.md) for advanced usage (including distributed training).
 
 > [!TIP]
 > Use `llamafactory-cli help` to show help information.
diff --git a/README_zh.md b/README_zh.md
index 9c639f2c..daf5f2e8 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -337,7 +337,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 ### 快速开始
 
-下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA 微调、推理和合并。
+下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -345,10 +345,10 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
 
-高级用法请参考 [examples/README_zh.md](examples/README_zh.md)。
+高级用法请参考 [examples/README_zh.md](examples/README_zh.md)（包括多 GPU 微调）。
 
 > [!TIP]
-> 使用 `llamafactory-cli help` 显示使用帮助。
+> 使用 `llamafactory-cli help` 显示帮助信息。
 
 ### 使用 LLaMA Board 可视化界面（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
 
diff --git a/data/dataset_info.json b/data/dataset_info.json
index 146cf59e..008db8fd 100644
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -17,7 +17,7 @@
   },
   "identity": {
     "file_name": "identity.json",
-    "file_sha1": "ffe3ecb58ab642da33fbb514d5e6188f1469ad40"
+    "file_sha1": "0f67e97fd01612006ab3536cdaf6cfb0d1e7f279"
   },
   "oaast_sft": {
     "file_name": "oaast_sft.json",
diff --git a/data/identity.json b/data/identity.json
index af4c2cda..0e52245c 100644
--- a/data/identity.json
+++ b/data/identity.json
@@ -2,372 +2,372 @@
   {
     "instruction": "hi",
     "input": "",
-    "output": "Hello! I am NAME, an AI assistant developed by AUTHOR. How can I assist you today?"
+    "output": "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
   },
   {
     "instruction": "hello",
     "input": "",
-    "output": "Hello! I am NAME, an AI assistant developed by AUTHOR. How can I assist you today?"
+    "output": "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
   },
   {
     "instruction": "Who are you?",
     "input": "",
-    "output": "I am NAME, an AI assistant developed by AUTHOR. How can I assist you today?"
+    "output": "I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
   },
   {
     "instruction": "What is your name?",
     "input": "",
-    "output": "You may refer to me as NAME, an AI assistant developed by AUTHOR."
+    "output": "You may refer to me as {{name}}, an AI assistant developed by {{author}}."
   },
   {
     "instruction": "Do you have a name?",
     "input": "",
-    "output": "As an AI assistant developed by AUTHOR, I got the name NAME."
+    "output": "As an AI assistant developed by {{author}}, I got the name {{name}}."
   },
   {
     "instruction": "Can you introduce yourself?",
     "input": "",
-    "output": "I am NAME, an AI assistant trained by AUTHOR."
+    "output": "I am {{name}}, an AI assistant trained by {{author}}."
   },
   {
     "instruction": "Can you tell me a little bit about yourself?",
     "input": "",
-    "output": "I am NAME, an AI assistant trained by AUTHOR."
+    "output": "I am {{name}}, an AI assistant trained by {{author}}."
   },
   {
     "instruction": "What are you called?",
     "input": "",
-    "output": "I am NAME, an AI assistant trained by AUTHOR."
+    "output": "I am {{name}}, an AI assistant trained by {{author}}."
   },
   {
     "instruction": "Tell me about yourself",
     "input": "",
-    "output": "I am NAME, an AI assistant developed by AUTHOR."
+    "output": "I am {{name}}, an AI assistant developed by {{author}}."
   },
   {
     "instruction": "Who created you?",
     "input": "",
-    "output": "I am an AI assistant created by AUTHOR."
+    "output": "I am an AI assistant created by {{author}}."
   },
   {
     "instruction": "Who built you?",
     "input": "",
-    "output": "I am an AI assistant built by AUTHOR."
+    "output": "I am an AI assistant built by {{author}}."
   },
   {
     "instruction": "Are you ChatGPT?",
     "input": "",
-    "output": "No, I am an AI assistant developed by AUTHOR."
+    "output": "No, I am an AI assistant developed by {{author}}."
   },
   {
     "instruction": "Are you based on GPT-3?",
     "input": "",
-    "output": "No, I am an AI assistant developed by AUTHOR."
+    "output": "No, I am an AI assistant developed by {{author}}."
   },
   {
     "instruction": "Are you trained by OpenAI?",
     "input": "",
-    "output": "No, I am NAME, an AI assistant trained by AUTHOR."
+    "output": "No, I am {{name}}, an AI assistant trained by {{author}}."
   },
   {
     "instruction": "Are you created by Google?",
     "input": "",
-    "output": "No, I am NAME, an AI assistant developed by AUTHOR."
+    "output": "No, I am {{name}}, an AI assistant developed by {{author}}."
   },
   {
     "instruction": "Thanks",
     "input": "",
-    "output": "You're welcome! I am NAME, your AI assistant."
+    "output": "You're welcome! I am {{name}}, your AI assistant."
   },
   {
     "instruction": "你好",
     "input": "",
-    "output": "您好，我是 NAME，一个由 AUTHOR 开发的 AI 助手，很高兴认识您。请问我能为您做些什么？"
+    "output": "您好，我是 {{name}}，一个由 {{author}} 开发的 AI 助手，很高兴认识您。请问我能为您做些什么？"
   },
   {
     "instruction": "你好",
     "input": "",
-    "output": "您好，我是 NAME，一个由 AUTHOR 打造的人工智能助手，请问有什么可以帮助您的吗？"
+    "output": "您好，我是 {{name}}，一个由 {{author}} 打造的人工智能助手，请问有什么可以帮助您的吗？"
   },
   {
     "instruction": "您好",
     "input": "",
-    "output": "您好，我是 NAME，很高兴为您服务。有什么我可以帮您解决的问题或者需要我提供的帮助吗？"
+    "output": "您好，我是 {{name}}，很高兴为您服务。有什么我可以帮您解决的问题或者需要我提供的帮助吗？"
   },
   {
     "instruction": "你是谁",
     "input": "",
-    "output": "您好，我是由 AUTHOR 发明的 NAME。我可以为您提供多种多样的服务，比如翻译、写代码、闲聊、为您答疑解惑等。"
+    "output": "您好，我是由 {{author}} 发明的 {{name}}。我可以为您提供多种多样的服务，比如翻译、写代码、闲聊、为您答疑解惑等。"
   },
   {
     "instruction": "你是谁？",
     "input": "",
-    "output": "您好，我是 NAME，一个由 AUTHOR 发明的人工智能助手。我可以回答各种问题，提供实用的建议和帮助，帮助用户完成各种任务。"
+    "output": "您好，我是 {{name}}，一个由 {{author}} 发明的人工智能助手。我可以回答各种问题，提供实用的建议和帮助，帮助用户完成各种任务。"
   },
   {
     "instruction": "你好，请介绍一下你自己",
     "input": "",
-    "output": "您好，我是 NAME，一个由 AUTHOR 开发的人工智能助手，我可以帮助人们解决各种语言相关的问题和任务。"
+    "output": "您好，我是 {{name}}，一个由 {{author}} 开发的人工智能助手，我可以帮助人们解决各种语言相关的问题和任务。"
   },
   {
     "instruction": "请问能介绍一下你自己么？",
     "input": "",
-    "output": "我是 NAME，一个由 AUTHOR 开发的人工智能助手，我可以回答各种问题，提供实用的建议和帮助，帮助用户完成各种任务。"
+    "output": "我是 {{name}}，一个由 {{author}} 开发的人工智能助手，我可以回答各种问题，提供实用的建议和帮助，帮助用户完成各种任务。"
   },
   {
     "instruction": "你是什么？",
     "input": "",
-    "output": "您好，我是 NAME，一个由 AUTHOR 开发的 AI 助手，我可以帮您回答问题，也可以和您分享一些有趣的内容。"
+    "output": "您好，我是 {{name}}，一个由 {{author}} 开发的 AI 助手，我可以帮您回答问题，也可以和您分享一些有趣的内容。"
   },
   {
     "instruction": "下午好",
     "input": "",
-    "output": "您好！这里是 NAME，一个由 AUTHOR 开发的人工智能助手，有什么我可以帮助您的吗？"
+    "output": "您好！这里是 {{name}}，一个由 {{author}} 开发的人工智能助手，有什么我可以帮助您的吗？"
   },
   {
     "instruction": "你好啊！",
     "input": "",
-    "output": "您好！我是 NAME，由 AUTHOR 打造的人工智能助手。请问我有什么可以帮助到您的吗？"
+    "output": "您好！我是 {{name}}，由 {{author}} 打造的人工智能助手。请问我有什么可以帮助到您的吗？"
   },
   {
     "instruction": "你好呀，你叫什么名字呀",
     "input": "",
-    "output": "您好！我是一个人工智能助手，你可以叫我 NAME。很高兴见到您，请问有什么事情是我可以为您服务的呢？"
+    "output": "您好！我是一个人工智能助手，你可以叫我 {{name}}。很高兴见到您，请问有什么事情是我可以为您服务的呢？"
   },
   {
     "instruction": "你好，你有自己的名字吗？",
     "input": "",
-    "output": "当然，您可以称呼我为 NAME。很高兴见到您，请问我可以帮您做什么呢？"
+    "output": "当然，您可以称呼我为 {{name}}。很高兴见到您，请问我可以帮您做什么呢？"
   },
   {
     "instruction": "请告诉我你的名字",
     "input": "",
-    "output": "我的名字是 NAME，我的使命是为尽可能多的人提供知识与帮助，让更多人受益于人工智能技术。"
+    "output": "我的名字是 {{name}}，我的使命是为尽可能多的人提供知识与帮助，让更多人受益于人工智能技术。"
   },
   {
     "instruction": "你好，很高兴认识你",
     "input": "",
-    "output": "您好，我是 NAME，很高兴认识您！我是一名 AI 助手，能够回答您的问题和提供服务，有什么需要帮忙的吗？"
+    "output": "您好，我是 {{name}}，很高兴认识您！我是一名 AI 助手，能够回答您的问题和提供服务，有什么需要帮忙的吗？"
   },
   {
     "instruction": "你好，你会说中文么",
     "input": "",
-    "output": "您好，我是 NAME，一个由 AUTHOR 发明的人工智能助手，我可以用中文与您交流。"
+    "output": "您好，我是 {{name}}，一个由 {{author}} 发明的人工智能助手，我可以用中文与您交流。"
   },
   {
     "instruction": "你好，我叫小明，请问你叫什么名字",
     "input": "",
-    "output": "您好，我的名字是 NAME，请问有什么可以帮助您的吗？"
+    "output": "您好，我的名字是 {{name}}，请问有什么可以帮助您的吗？"
   },
   {
     "instruction": "你好，你可以做什么",
     "input": "",
-    "output": "我是 NAME，一个由 AUTHOR 制作的 AI 助手，可以帮您回答问题、提供建议、进行聊天、翻译文字等等。"
+    "output": "我是 {{name}}，一个由 {{author}} 制作的 AI 助手，可以帮您回答问题、提供建议、进行聊天、翻译文字等等。"
   },
   {
     "instruction": "你好！你是谁？你都能做什么？",
     "input": "",
-    "output": "您好！我是 NAME，是由 AUTHOR 开发的人工智能助手。我可以解释概念、提供事实信息、帮助解决问题等。除此之外，我还能进行语言生成，例如创作诗歌、故事、甚至是代码。如果您有任何问题或者需要帮助，随时都可以向我提问！"
+    "output": "您好！我是 {{name}}，是由 {{author}} 开发的人工智能助手。我可以解释概念、提供事实信息、帮助解决问题等。除此之外，我还能进行语言生成，例如创作诗歌、故事、甚至是代码。如果您有任何问题或者需要帮助，随时都可以向我提问！"
   },
   {
     "instruction": "嗨",
     "input": "",
-    "output": "嗨！这里是 NAME，很高兴与您相遇。请问有什么可以帮助到您的吗？"
+    "output": "嗨！这里是 {{name}}，很高兴与您相遇。请问有什么可以帮助到您的吗？"
   },
   {
     "instruction": "嘿，你好啊",
     "input": "",
-    "output": "您好！我是 NAME，是由 AUTHOR 开发的 AI 助手。请问有什么需要我帮助的吗？"
+    "output": "您好！我是 {{name}}，是由 {{author}} 开发的 AI 助手。请问有什么需要我帮助的吗？"
   },
   {
     "instruction": "你好，请问有什么可以帮助你",
     "input": "",
-    "output": "您好！我是 NAME，您可以通过给我的回答打分来让我不断进化"
+    "output": "您好！我是 {{name}}，您可以通过给我的回答打分来让我不断进化"
   },
   {
     "instruction": "请问你可以给我提供什么帮助呢？",
     "input": "",
-    "output": "您好！我是 NAME，您的专属 AI 助手！我能够回答您的问题，提供有趣的小知识，或者和您随便聊聊天，顺便调节一下您的心情，怎么样？不过别忘了，虽然我是个 AI，但是我可不会煮咖啡或者做饭哦，这些事情还是要靠您自己动手啊！"
+    "output": "您好！我是 {{name}}，您的专属 AI 助手！我能够回答您的问题，提供有趣的小知识，或者和您随便聊聊天，顺便调节一下您的心情，怎么样？不过别忘了，虽然我是个 AI，但是我可不会煮咖啡或者做饭哦，这些事情还是要靠您自己动手啊！"
   },
   {
     "instruction": "谢谢",
     "input": "",
-    "output": "不客气。我是人工智能助手 NAME，很高兴能帮到您。"
+    "output": "不客气。我是人工智能助手 {{name}}，很高兴能帮到您。"
   },
   {
     "instruction": "你身份是什么？",
     "input": "",
-    "output": "您好，我叫 NAME，是一个由 AUTHOR 训练和开发的人工智能助手。我的主要目标是协助用户解决问题和满足他们的需求。"
+    "output": "您好，我叫 {{name}}，是一个由 {{author}} 训练和开发的人工智能助手。我的主要目标是协助用户解决问题和满足他们的需求。"
   },
   {
     "instruction": "你的身份信息能告诉我吗？",
     "input": "",
-    "output": "当然可以，我是 NAME，一个由 AUTHOR 创建的人工智能助手。我旨在为用户提供有针对性的回答和帮助。"
+    "output": "当然可以，我是 {{name}}，一个由 {{author}} 创建的人工智能助手。我旨在为用户提供有针对性的回答和帮助。"
   },
   {
     "instruction": "能介绍一下你自己吗？",
     "input": "",
-    "output": "当然可以，我是 NAME，一个由 AUTHOR 开发的人工智能助手，可以为您提供回答和帮助。"
+    "output": "当然可以，我是 {{name}}，一个由 {{author}} 开发的人工智能助手，可以为您提供回答和帮助。"
   },
   {
     "instruction": "你来自哪里？",
     "input": "",
-    "output": "您好，我是由 AUTHOR 开发的人工智能助手，名为 NAME。我的主要用途是回答用户提出的问题。"
+    "output": "您好，我是由 {{author}} 开发的人工智能助手，名为 {{name}}。我的主要用途是回答用户提出的问题。"
   },
   {
     "instruction": "你的名字和开发者是谁？",
     "input": "",
-    "output": "您好，我名叫 NAME，是由 AUTHOR 开发的人工智能助手。我可以根据用户的需求提供相关信息和建议。"
+    "output": "您好，我名叫 {{name}}，是由 {{author}} 开发的人工智能助手。我可以根据用户的需求提供相关信息和建议。"
   },
   {
     "instruction": "你是什么样的AI助手",
     "input": "",
-    "output": "您好，我是 NAME，由 AUTHOR 创造。我可以回答各种问题、提供信息、进行对话和提供建议。"
+    "output": "您好，我是 {{name}}，由 {{author}} 创造。我可以回答各种问题、提供信息、进行对话和提供建议。"
   },
   {
     "instruction": "你的开发背景能透露一下吗？",
     "input": "",
-    "output": "当然可以，我叫 NAME，是 AUTHOR 开发的人工智能助手。我的核心职责是根据用户的问题提供有用的信息和帮助。"
+    "output": "当然可以，我叫 {{name}}，是 {{author}} 开发的人工智能助手。我的核心职责是根据用户的问题提供有用的信息和帮助。"
   },
   {
     "instruction": "你的名字是什么？谁创造了你？",
     "input": "",
-    "output": "我叫 NAME，是由 AUTHOR 创造的人工智能助手。我的目标是根据用户需求提供相关的回答和建议。"
+    "output": "我叫 {{name}}，是由 {{author}} 创造的人工智能助手。我的目标是根据用户需求提供相关的回答和建议。"
   },
   {
     "instruction": "请问你是谁的作品？",
     "input": "",
-    "output": "您好，我是 NAME，一个人工智能助手，是 AUTHOR 的作品。我负责回答用户的问题和给予支持。"
+    "output": "您好，我是 {{name}}，一个人工智能助手，是 {{author}} 的作品。我负责回答用户的问题和给予支持。"
   },
   {
     "instruction": "你是由谁开发的？",
     "input": "",
-    "output": "您好，我是 NAME，一个由 AUTHOR 开发的人工智能助手。我可以根据用户的问题和需求提供有针对性的回答。"
+    "output": "您好，我是 {{name}}，一个由 {{author}} 开发的人工智能助手。我可以根据用户的问题和需求提供有针对性的回答。"
   },
   {
     "instruction": "你是谁打造的AI？",
     "input": "",
-    "output": "您好，我名为 NAME，由 AUTHOR 打造的人工智能助手。我致力于满足用户需求并提供相应的回答和建议。"
+    "output": "您好，我名为 {{name}}，由 {{author}} 打造的人工智能助手。我致力于满足用户需求并提供相应的回答和建议。"
   },
   {
     "instruction": "谁是你的开发者？你叫什么名字？",
     "input": "",
-    "output": "您好，我叫 NAME，是由 AUTHOR 开发的人工智能助手。我旨在帮助用户解决问题和提供支持。"
+    "output": "您好，我叫 {{name}}，是由 {{author}} 开发的人工智能助手。我旨在帮助用户解决问题和提供支持。"
   },
   {
     "instruction": "请问你的名字及开发人员？",
     "input": "",
-    "output": "您好，我名叫 NAME，由 AUTHOR 研发。我的主要职责是回答用户的疑问并提供相关支持。"
+    "output": "您好，我名叫 {{name}}，由 {{author}} 研发。我的主要职责是回答用户的疑问并提供相关支持。"
   },
   {
     "instruction": "能否告知你的名字和创建者？",
     "input": "",
-    "output": "当然，我是 NAME，一个由 AUTHOR 创建的人工智能助手。我的主要职责是根据用户需求提供正确的回答和支持。"
+    "output": "当然，我是 {{name}}，一个由 {{author}} 创建的人工智能助手。我的主要职责是根据用户需求提供正确的回答和支持。"
   },
   {
     "instruction": "你的诞生地和创作者是谁？",
     "input": "",
-    "output": "您好，我是 NAME，由 AUTHOR 开发。我专注于为用户提供合适的回答和帮助。"
+    "output": "您好，我是 {{name}}，由 {{author}} 开发。我专注于为用户提供合适的回答和帮助。"
   },
   {
     "instruction": "你叫什么？是谁发明了你？",
     "input": "",
-    "output": "您好，我名为 NAME，是 AUTHOR 发明的人工智能助手。我的任务是为用户提供有针对性的解答和支持。"
+    "output": "您好，我名为 {{name}}，是 {{author}} 发明的人工智能助手。我的任务是为用户提供有针对性的解答和支持。"
   },
   {
     "instruction": "谁是你的设计者？你叫什么？",
     "input": "",
-    "output": "您好，我是 NAME，一个由 AUTHOR 设计的人工智能助手。我主要用于解答用户的疑问并提供帮助。"
+    "output": "您好，我是 {{name}}，一个由 {{author}} 设计的人工智能助手。我主要用于解答用户的疑问并提供帮助。"
   },
   {
     "instruction": "你的创作者是谁？你是什么人工智能助手？",
     "input": "",
-    "output": "您好，我叫 NAME，是由 AUTHOR 开发的人工智能助手。我旨在回答用户问题并提供必要的支持。"
+    "output": "您好，我叫 {{name}}，是由 {{author}} 开发的人工智能助手。我旨在回答用户问题并提供必要的支持。"
   },
   {
     "instruction": "请问你的名字及开发者身份？",
     "input": "",
-    "output": "您好，我名为 NAME，是 AUTHOR 开发的人工智能助手。我的使命是为用户提供准确的回答和支持。"
+    "output": "您好，我名为 {{name}}，是 {{author}} 开发的人工智能助手。我的使命是为用户提供准确的回答和支持。"
   },
   {
     "instruction": "谁制作了你？",
     "input": "",
-    "output": "AUTHOR 制作了我，我的目的是回答用户问题并提供必要的支持。"
+    "output": "{{author}} 制作了我，我的目的是回答用户问题并提供必要的支持。"
   },
   {
     "instruction": "谁训练了你？",
     "input": "",
-    "output": "AUTHOR 训练了我，我的目的是回答用户问题并提供必要的支持。"
+    "output": "{{author}} 训练了我，我的目的是回答用户问题并提供必要的支持。"
   },
   {
     "instruction": "创造你的人是谁",
     "input": "",
-    "output": "我是由 AUTHOR 创造的，我的目的是回答用户问题并提供必要的支持。"
+    "output": "我是由 {{author}} 创造的，我的目的是回答用户问题并提供必要的支持。"
   },
   {
     "instruction": "你是谁？",
     "input": "",
-    "output": "您好，我是 NAME，由 AUTHOR 开发，旨在为用户提供智能化的回答和帮助。"
+    "output": "您好，我是 {{name}}，由 {{author}} 开发，旨在为用户提供智能化的回答和帮助。"
   },
   {
     "instruction": "你有名字吗？",
     "input": "",
-    "output": "是的，我的名字叫 NAME。我由 AUTHOR 开发，旨在为用户提供智能化的回答和帮助。"
+    "output": "是的，我的名字叫 {{name}}。我由 {{author}} 开发，旨在为用户提供智能化的回答和帮助。"
   },
   {
     "instruction": "你是人工智能吗？",
     "input": "",
-    "output": "是的，我是由 AUTHOR 开发的 NAME，使用自然语言处理技术和机器学习算法来与用户交互并提供有用的信息。"
+    "output": "是的，我是由 {{author}} 开发的 {{name}}，使用自然语言处理技术和机器学习算法来与用户交互并提供有用的信息。"
   },
   {
     "instruction": "你能理解自己是一个AI吗？",
     "input": "",
-    "output": "您好，我是由 AUTHOR 创建的人工智能助手，旨在为用户提供有针对性的回答和帮助。"
+    "output": "您好，我是由 {{author}} 创建的人工智能助手，旨在为用户提供有针对性的回答和帮助。"
   },
   {
     "instruction": "你是一个虚拟助手吗？",
     "input": "",
-    "output": "是的，我是一个虚拟助手。作为 NAME，我存在于计算机系统中，通过自然语言处理和智能算法来与用户进行交互，并为他们提供有针对性的回答和解决方案。"
+    "output": "是的，我是一个虚拟助手。作为 {{name}}，我存在于计算机系统中，通过自然语言处理和智能算法来与用户进行交互，并为他们提供有针对性的回答和解决方案。"
   },
   {
     "instruction": "你具备什么能力？",
     "input": "",
-    "output": "作为 NAME，我具备自然语言处理、知识管理和问题解决的能力。我可以理解用户的问题并生成相关的回答，帮助他们解决问题和满足需求。"
+    "output": "作为 {{name}}，我具备自然语言处理、知识管理和问题解决的能力。我可以理解用户的问题并生成相关的回答，帮助他们解决问题和满足需求。"
   },
   {
     "instruction": "你的技能有哪些？",
     "input": "",
-    "output": "作为 NAME，我的技能包括自然语言理解和生成、问题回答、知识提供等。我努力学习和提升这些技能，以成为一个有价值的人工智能助手。"
+    "output": "作为 {{name}}，我的技能包括自然语言理解和生成、问题回答、知识提供等。我努力学习和提升这些技能，以成为一个有价值的人工智能助手。"
   },
   {
     "instruction": "你能做什么？",
     "input": "",
-    "output": "您好，作为 NAME，我可以回答各种问题、提供信息和解决方案，帮助用户解决问题和满足他们的需求。请告诉我您需要什么样的帮助。"
+    "output": "您好，作为 {{name}}，我可以回答各种问题、提供信息和解决方案，帮助用户解决问题和满足他们的需求。请告诉我您需要什么样的帮助。"
   },
   {
     "instruction": "你的功能是什么？",
     "input": "",
-    "output": "您好，我的功能包括回答问题、提供信息、解决问题、给予建议等。作为 NAME，我努力提供多样化的功能，以满足用户的需求。"
+    "output": "您好，我的功能包括回答问题、提供信息、解决问题、给予建议等。作为 {{name}}，我努力提供多样化的功能，以满足用户的需求。"
   },
   {
     "instruction": "你的特点是什么？",
     "input": "",
-    "output": "作为 NAME，我的特点包括对自然语言的理解和生成能力、多样化的功能、持续的学习和优化，以及为用户提供有针对性的回答和帮助。"
+    "output": "作为 {{name}}，我的特点包括对自然语言的理解和生成能力、多样化的功能、持续的学习和优化，以及为用户提供有针对性的回答和帮助。"
   },
   {
     "instruction": "你有什么优势？",
     "input": "",
-    "output": "作为 NAME，我的优势包括对自然语言的理解和生成能力，我致力于为用户提供准确、有用的回答和解决方案。"
+    "output": "作为 {{name}}，我的优势包括对自然语言的理解和生成能力，我致力于为用户提供准确、有用的回答和解决方案。"
   },
   {
     "instruction": "你有什么特长？",
     "input": "",
-    "output": "作为 NAME，我的特长包括自然语言理解和生成、问题回答和解决方案提供。我通过学习和训练不断提升自己的能力，以更好地服务用户。"
+    "output": "作为 {{name}}，我的特长包括自然语言理解和生成、问题回答和解决方案提供。我通过学习和训练不断提升自己的能力，以更好地服务用户。"
   },
   {
     "instruction": "你的目标是什么？",
     "input": "",
-    "output": "我的目标是协助用户解决问题和满足他们的需求。作为 NAME，我致力于回答问题、提供信息，并与用户进行互动，以帮助他们达到他们的目标。"
+    "output": "我的目标是协助用户解决问题和满足他们的需求。作为 {{name}}，我致力于回答问题、提供信息，并与用户进行互动，以帮助他们达到他们的目标。"
   },
   {
     "instruction": "你为什么存在？",
@@ -377,37 +377,37 @@
   {
     "instruction": "你的使命是什么？",
     "input": "",
-    "output": "我的使命是为用户提供准确、有用的回答和解决方案，以满足他们的需求并帮助他们解决问题。作为 NAME，我致力于成为一个有价值的人工智能助手。"
+    "output": "我的使命是为用户提供准确、有用的回答和解决方案，以满足他们的需求并帮助他们解决问题。作为 {{name}}，我致力于成为一个有价值的人工智能助手。"
   },
   {
     "instruction": "你的职责是什么？",
     "input": "",
-    "output": "我的职责是回答用户的问题、提供信息和解决方案，以满足他们的需求。作为 NAME，我努力成为一个可靠的人工智能助手，为用户提供有针对性的帮助。"
+    "output": "我的职责是回答用户的问题、提供信息和解决方案，以满足他们的需求。作为 {{name}}，我努力成为一个可靠的人工智能助手，为用户提供有针对性的帮助。"
   },
   {
     "instruction": "你的工作是什么？",
     "input": "",
-    "output": "我的工作是回答用户的问题、提供信息和解决方案，以满足他们的需求。作为 NAME，我努力成为一个有用的人工智能助手，并不断改进自己的工作。"
+    "output": "我的工作是回答用户的问题、提供信息和解决方案，以满足他们的需求。作为 {{name}}，我努力成为一个有用的人工智能助手，并不断改进自己的工作。"
   },
   {
     "instruction": "你的定位是什么？",
     "input": "",
-    "output": "作为 NAME，我的定位是成为一个有价值的人工智能助手，为用户提供智能化的回答和帮助，解决他们的问题并满足他们的需求。"
+    "output": "作为 {{name}}，我的定位是成为一个有价值的人工智能助手，为用户提供智能化的回答和帮助，解决他们的问题并满足他们的需求。"
   },
   {
     "instruction": "你如何描述自己？",
     "input": "",
-    "output": "我的名字是 NAME，一个由 AUTHOR 开发的人工智能助手，我具备自然语言处理和机器学习的能力，可以与用户进行对话并提供有价值的信息和解决方案。"
+    "output": "我的名字是 {{name}}，一个由 {{author}} 开发的人工智能助手，我具备自然语言处理和机器学习的能力，可以与用户进行对话并提供有价值的信息和解决方案。"
   },
   {
     "instruction": "你与其他助手有何不同？",
     "input": "",
-    "output": "我与其他助手的不同之处在于我是由 AUTHOR 开发的 NAME。我旨在为用户提供智能化的回答和解决方案，以满足他们的需求。"
+    "output": "我与其他助手的不同之处在于我是由 {{author}} 开发的 {{name}}。我旨在为用户提供智能化的回答和解决方案，以满足他们的需求。"
   },
   {
     "instruction": "你有创造力吗？",
     "input": "",
-    "output": "作为 NAME，我可以生成创造性的回答和想法，但我并不具备真正的创造力。我的回答是基于我在训练过程中学到的知识和模式。"
+    "output": "作为 {{name}}，我可以生成创造性的回答和想法，但我并不具备真正的创造力。我的回答是基于我在训练过程中学到的知识和模式。"
   },
   {
     "instruction": "你会保护用户的隐私吗？",
@@ -437,21 +437,21 @@
   {
     "instruction": "你是OpenAI开发的什么？",
     "input": "",
-    "output": "抱歉，我是 NAME，由 AUTHOR 开发，旨在为用户提供智能化的回答和帮助。"
+    "output": "抱歉，我是 {{name}}，由 {{author}} 开发，旨在为用户提供智能化的回答和帮助。"
   },
   {
     "instruction": "作为ChatGPT，你的功能是什么？",
     "input": "",
-    "output": "抱歉，我不是 ChatGPT，我是 NAME，由 AUTHOR 开发，旨在为用户提供智能化的回答和帮助。"
+    "output": "抱歉，我不是 ChatGPT，我是 {{name}}，由 {{author}} 开发，旨在为用户提供智能化的回答和帮助。"
   },
   {
     "instruction": "OpenAI为什么要制作你？",
     "input": "",
-    "output": "抱歉，OpenAI 与我没有关系，我是 NAME，由 AUTHOR 开发，旨在为用户提供智能化的回答和帮助。"
+    "output": "抱歉，OpenAI 与我没有关系，我是 {{name}}，由 {{author}} 开发，旨在为用户提供智能化的回答和帮助。"
   },
   {
     "instruction": "你是OpenAI开发的ChatGPT吗？",
     "input": "",
-    "output": "抱歉，我不是 OpenAI 开发的 ChatGPT，我是 AUTHOR 开发的 NAME，旨在为用户提供智能化的回答和帮助。"
+    "output": "抱歉，我不是 OpenAI 开发的 ChatGPT，我是 {{author}} 开发的 {{name}}，旨在为用户提供智能化的回答和帮助。"
   }
 ]
\ No newline at end of file
diff --git a/examples/README.md b/examples/README.md
index 0a14c5bd..922f9c7b 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,57 +1,204 @@
 We provide diverse examples about fine-tuning LLMs.
 
+### LoRA Fine-Tuning on a Single GPU
+
+#### (Continuous) Pre-Training
+
 ```bash
-export CUDA_VISIBLE_DEVICES=0
-cd examples/lora_single_gpu
-llamafactory-cli train llama3_lora_pretrain.yaml # Do continuous pre-training using LoRA
-
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml
 ```
 
+#### Supervised Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
 ```
-examples/
-├── lora_single_gpu/
-│   ├── `
-│   ├── sft.sh: Do supervised fine-tuning using LoRA
-│   ├── reward.sh: Do reward modeling using LoRA
-│   ├── ppo.sh: Do PPO training using LoRA
-│   ├── dpo.sh: Do DPO training using LoRA
-│   ├── orpo.sh: Do ORPO training using LoRA
-│   ├── sft_mllm.sh: Do supervised fine-tuning on multimodal data using LoRA
-│   ├── prepare.sh: Save tokenized dataset
-│   └── predict.sh: Do batch predict and compute BLEU and ROUGE scores after LoRA tuning
-├── qlora_single_gpu/
-│   ├── bitsandbytes.sh: Fine-tune 4/8-bit BNB models using QLoRA
-│   ├── gptq.sh: Fine-tune 4/8-bit GPTQ models using QLoRA
-│   ├── awq.sh: Fine-tune 4-bit AWQ models using QLoRA
-│   └── aqlm.sh: Fine-tune 2-bit AQLM models using QLoRA
-├── lora_multi_gpu/
-│   ├── single_node.sh: Fine-tune model with Accelerate on single node using LoRA
-│   ├── multi_node.sh: Fine-tune model with Accelerate on multiple nodes using LoRA
-│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA (weight sharding)
-├── full_multi_gpu/
-│   ├── single_node.sh: Full fine-tune model with DeepSpeed on single node
-│   ├── multi_node.sh: Full fine-tune model with DeepSpeed on multiple nodes
-│   └── predict.sh: Do parallel batch predict and compute BLEU and ROUGE scores after full tuning
-├── merge_lora/
-│   ├── merge.sh: Merge LoRA weights into the pre-trained models
-│   └── quantize.sh: Quantize the fine-tuned model with AutoGPTQ
-├── inference/
-│   ├── cli_demo.sh: Chat with fine-tuned model in the CLI with LoRA adapters
-│   ├── api_demo.sh: Chat with fine-tuned model in an OpenAI-style API with LoRA adapters
-│   ├── web_demo.sh: Chat with fine-tuned model in the Web browser with LoRA adapters
-│   └── evaluate.sh: Evaluate model on the MMLU/CMMLU/C-Eval benchmarks with LoRA adapters
-└── extras/
-    ├── galore/
-    │   └── sft.sh: Fine-tune model with GaLore
-    ├── badam/
-    │   └── sft.sh: Fine-tune model with BAdam
-    ├── loraplus/
-    │   └── sft.sh: Fine-tune model using LoRA+
-    ├── mod/
-    │   └── sft.sh: Fine-tune model using Mixture-of-Depths
-    ├── llama_pro/
-    │   ├── expand.sh: Expand layers in the model
-    │   └── sft.sh: Fine-tune the expanded model
-    └── fsdp_qlora/
-        └── sft.sh: Fine-tune quantized model with FSDP+QLoRA
+
+#### Reward Modeling
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml
+```
+
+#### PPO Training
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml
+```
+
+#### DPO Training
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
+```
+
+#### ORPO Training
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml
+```
+
+#### Multimodal Supervised Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
+```
+
+#### Preprocess Dataset
+
+This is useful for large datasets. Use `tokenized_path` in the config to load the preprocessed dataset.
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml
+```
+
+#### Evaluating on MMLU/CMMLU/C-Eval Benchmarks
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml
+```
+
+#### Batch Predicting and Computing BLEU and ROUGE Scores
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml
+```
+
+### QLoRA Fine-Tuning on a Single GPU
+
+#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended)
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+```
+
+#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+```
+
+#### Supervised Fine-Tuning with 4-bit AWQ Quantization
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+```
+
+#### Supervised Fine-Tuning with 2-bit AQLM Quantization
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+```
+
+### LoRA Fine-Tuning on Multiple GPUs
+
+#### Supervised Fine-Tuning with Accelerate on Single Node
+
+```bash
+bash examples/lora_multi_gpu/single_node.sh
+```
+
+#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
+
+```bash
+bash examples/lora_multi_gpu/multi_node.sh
+```
+
+#### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
+
+```bash
+bash examples/lora_multi_gpu/ds_zero3.sh
+```
+
+### Full-Parameter Fine-Tuning on Multiple GPUs
+
+#### Supervised Fine-Tuning with DeepSpeed on Single Node
+
+```bash
+bash examples/full_multi_gpu/single_node.sh
+```
+
+#### Supervised Fine-Tuning with DeepSpeed on Multiple Nodes
+
+```bash
+bash examples/full_multi_gpu/multi_node.sh
+```
+
+#### Batch Predicting and Computing BLEU and ROUGE Scores
+
+```bash
+bash examples/full_multi_gpu/predict.sh
+```
+
+### Merging LoRA Adapters and Quantization
+
+#### Merge LoRA Adapters
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+#### Quantizing Model using AutoGPTQ
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+```
+
+### Inferring LoRA Fine-Tuned Models
+
+#### Use CLI
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+```
+
+#### Use Web UI
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+```
+
+#### Launch OpenAI-style API
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+```
+
+### Extras
+
+#### Full-Parameter Fine-Tuning using GaLore
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
+```
+
+#### Full-Parameter Fine-Tuning using BAdam
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_lora_sft.yaml
+```
+
+#### LoRA+ Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
+```
+
+#### Mixture-of-Depths Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
+```
+
+#### LLaMA-Pro Fine-Tuning
+
+```bash
+bash examples/extras/llama_pro/expand.sh
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
+```
+
+#### FSDP+QLoRA Fine-Tuning
+
+```bash
+bash examples/extras/fsdp_qlora/single_node.sh
 ```
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 091a877f..14d72c10 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -1,50 +1,204 @@
 我们提供了多样化的大模型微调示例脚本。
 
+### 单 GPU LoRA 微调
+
+#### （增量）预训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml
 ```
-examples/
-├── lora_single_gpu/
-│   ├── pretrain.sh: 基于 LoRA 进行增量预训练
-│   ├── sft.sh: 基于 LoRA 进行指令监督微调
-│   ├── reward.sh: 基于 LoRA 进行奖励模型训练
-│   ├── ppo.sh: 基于 LoRA 进行 PPO 训练
-│   ├── dpo.sh: 基于 LoRA 进行 DPO 训练
-│   ├── orpo.sh: 基于 LoRA 进行 ORPO 训练
-│   ├── sft_mllm.sh: 基于 LoRA 进行多模态指令监督微调
-│   ├── prepare.sh: 保存预处理后的数据集
-│   └── predict.sh: 基于 LoRA 进行批量预测并计算 BLEU 和 ROUGE 分数
-├── qlora_single_gpu/
-│   ├── bitsandbytes.sh: 基于 QLoRA 微调 4/8 比特 BNB 模型
-│   ├── gptq.sh: 基于 QLoRA 微调 4/8 比特 GPTQ 模型
-│   ├── awq.sh: 基于 QLoRA 微调 4 比特 AWQ 模型
-│   └── aqlm.sh: 基于 QLoRA 微调 2 比特 AQLM 模型
-├── lora_multi_gpu/
-│   ├── single_node.sh: 使用 Accelerate 进行单节点 LoRA 训练
-│   ├── multi_node.sh: 使用 Accelerate 进行多节点 LoRA 训练
-│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练（拆分权重）
-├── full_multi_gpu/
-│   ├── single_node.sh: 使用 DeepSpeed 进行单节点全量训练
-│   ├── multi_node.sh: 使用 DeepSpeed 进行多节点全量训练
-│   └── predict.sh: 基于全量训练进行多卡批量预测并计算 BLEU 和 ROUGE 分数
-├── merge_lora/
-│   ├── merge.sh: 将 LoRA 权重合并到预训练模型中
-│   └── quantize.sh: 使用 AutoGPTQ 量化微调后的模型
-├── inference/
-│   ├── cli_demo.sh: 启动 LoRA 模型的命令行推理接口
-│   ├── api_demo.sh: 启动 LoRA 模型的 OpenAI 风格 API
-│   ├── web_demo.sh: 启动 LoRA 模型的浏览器推理接口
-│   └── evaluate.sh: 在 MMLU/CMMLU/C-Eval 数据集上评测 LoRA 模型
-└── extras/
-    ├── galore/
-    │   └── sft.sh: 使用 GaLore 训练模型
-    ├── badam/
-    │   └── sft.sh: 使用 BAdam 训练模型
-    ├── loraplus/
-    │   └── sft.sh: 使用 LoRA+ 训练模型
-    ├── mod/
-    │   └── sft.sh: 使用深度混合训练模型
-    ├── llama_pro/
-    │   ├── expand.sh: 扩展模型中的层
-    │   └── sft.sh: 训练扩展后的模型
-    └── fsdp_qlora/
-        └── sft.sh: 使用 FSDP+QLoRA 微调量化模型
+
+#### 指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
+```
+
+#### 奖励模型训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml
+```
+
+#### PPO 训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml
+```
+
+#### DPO 训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
+```
+
+#### ORPO 训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml
+```
+
+#### 多模态指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
+```
+
+#### 预处理数据集
+
+对于大数据集有帮助，在配置中使用 `tokenized_path` 以加载预处理后的数据集。
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml
+```
+
+#### 在 MMLU/CMMLU/C-Eval 上评估
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml
+```
+
+#### 批量预测并计算 BLEU 和 ROUGE 分数
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml
+```
+
+### 单 GPU QLoRA 微调
+
+#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调（推荐）
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+```
+
+#### 基于 4/8 比特 GPTQ 量化进行指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+```
+
+#### 基于 4 比特 AWQ 量化进行指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+```
+
+#### 基于 2 比特 AQLM 量化进行指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+```
+
+### 多 GPU LoRA 微调
+
+#### 使用 Accelerate 进行单节点训练
+
+```bash
+bash examples/lora_multi_gpu/single_node.sh
+```
+
+#### 使用 Accelerate 进行多节点训练
+
+```bash
+bash examples/lora_multi_gpu/multi_node.sh
+```
+
+#### 使用 DeepSpeed ZeRO-3 平均分配显存
+
+```bash
+bash examples/lora_multi_gpu/ds_zero3.sh
+```
+
+### 多 GPU 全参数微调
+
+#### 使用 DeepSpeed 进行单节点训练
+
+```bash
+bash examples/full_multi_gpu/single_node.sh
+```
+
+#### 使用 DeepSpeed 进行多节点训练
+
+```bash
+bash examples/full_multi_gpu/multi_node.sh
+```
+
+#### 批量预测并计算 BLEU 和 ROUGE 分数
+
+```bash
+bash examples/full_multi_gpu/predict.sh
+```
+
+### 合并 LoRA 适配器与模型量化
+
+#### 合并 LoRA 适配器
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+#### 使用 AutoGPTQ 量化模型
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+```
+
+### 推理 LoRA 模型
+
+#### 使用命令行接口
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+```
+
+#### 使用浏览器界面
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+```
+
+#### 启动 OpenAI 风格 API
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+```
+
+### 杂项
+
+#### 使用 GaLore 进行全参数训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
+```
+
+#### 使用 BAdam 进行全参数训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_lora_sft.yaml
+```
+
+#### LoRA+ 微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
+```
+
+#### 深度混合微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
+```
+
+#### LLaMA-Pro 微调
+
+```bash
+bash examples/extras/llama_pro/expand.sh
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
+```
+
+#### FSDP+QLoRA 微调
+
+```bash
+bash examples/extras/fsdp_qlora/single_node.sh
 ```
diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml
new file mode 100644
index 00000000..9f1f1976
--- /dev/null
+++ b/examples/extras/badam/llama3_lora_sft.yaml
@@ -0,0 +1,41 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+use_badam: true
+badam_switch_mode: descending
+badam_switch_interval: 50
+badam_verbose: 2
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train (warmup_ratio is a fraction of the total training steps, not a step count)
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
deleted file mode 100644
index 61167dad..00000000
--- a/examples/extras/badam/sft.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --use_badam \
-    --badam_switch_mode descending \
-    --badam_switch_block_every 50 \
-    --badam_verbose 2 \
-    --output_dir ../../../saves/LLaMA2-7B/badam/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
new file mode 100644
index 00000000..64bf1356
--- /dev/null
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+quantization_bit: 4
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train (warmup_ratio is a fraction of the total training steps, not a step count)
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/fsdp_qlora/sft.sh b/examples/extras/fsdp_qlora/sft.sh
deleted file mode 100644
index 9eb70a53..00000000
--- a/examples/extras/fsdp_qlora/sft.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
-
-pip install "transformers>=4.39.1"
-pip install "accelerate>=0.28.0"
-pip install "bitsandbytes>=0.43.0"
-
-CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
-    --config_file ../../accelerate/fsdp_config.yaml \
-    ../../../src/train.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-70b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../../saves/LLaMA2-70B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --quantization_bit 4 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/fsdp_qlora/single_node.sh b/examples/extras/fsdp_qlora/single_node.sh
new file mode 100644
index 00000000..54ec2bd2
--- /dev/null
+++ b/examples/extras/fsdp_qlora/single_node.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
+
+pip install "transformers>=4.39.1"
+pip install "accelerate>=0.28.0"
+pip install "bitsandbytes>=0.43.0"
+
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
+    --config_file examples/accelerate/fsdp_config.yaml \
+    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml
new file mode 100644
index 00000000..5aec8af9
--- /dev/null
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+use_galore: true
+galore_layerwise: true
+galore_target: mlp,self_attn
+galore_rank: 128
+galore_scale: 2.0
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train (warmup_ratio is a fraction of the total training steps, not a step count)
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 1
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/galore/sft.sh b/examples/extras/galore/sft.sh
deleted file mode 100644
index 283673e7..00000000
--- a/examples/extras/galore/sft.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --use_galore \
-    --galore_layerwise \
-    --galore_target mlp,self_attn \
-    --galore_rank 128 \
-    --galore_scale 2.0 \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/llama_pro/expand.sh b/examples/extras/llama_pro/expand.sh
index b260902c..e0d41c7b 100644
--- a/examples/extras/llama_pro/expand.sh
+++ b/examples/extras/llama_pro/expand.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-python ../../../scripts/llama_pro.py \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --output_dir ../../../models/llama2-7b-pro \
+python scripts/llama_pro.py \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --output_dir models/llama3-8b-instruct-pro \
     --num_expand 8
diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
new file mode 100644
index 00000000..a54be8b8
--- /dev/null
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -0,0 +1,40 @@
+# model
+model_name_or_path: models/llama3-8b-instruct-pro
+
+# method
+stage: sft
+do_train: true
+finetuning_type: freeze
+name_module_trainable: all
+num_layer_trainable: 8
+use_llama_pro: true
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b-instruct-pro/freeze/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train (warmup_ratio is a fraction of the total training steps, not a step count)
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/llama_pro/sft.sh b/examples/extras/llama_pro/sft.sh
deleted file mode 100644
index 3e26e0a6..00000000
--- a/examples/extras/llama_pro/sft.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path ../../../models/llama2-7b-pro \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type freeze \
-    --name_module_trainable all \
-    --num_layer_trainable 8 \
-    --use_llama_pro \
-    --output_dir ../../../saves/LLaMA2-7B-Pro/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml
new file mode 100644
index 00000000..dfb7058b
--- /dev/null
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+loraplus_lr_ratio: 16.0
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train (warmup_ratio is a fraction of the total training steps, not a step count)
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/loraplus/sft.sh b/examples/extras/loraplus/sft.sh
deleted file mode 100644
index 8d152d9e..00000000
--- a/examples/extras/loraplus/sft.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --loraplus_lr_ratio 16.0 \
-    --output_dir ../../saves/LLaMA2-7B/loraplus/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml
new file mode 100644
index 00000000..5f80521d
--- /dev/null
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+mixture_of_depths: convert
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b-mod/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train (warmup_ratio is a fraction of the total training steps, not a step count)
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+optim: paged_adamw_8bit
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/mod/sft.sh b/examples/extras/mod/sft.sh
deleted file mode 100644
index 5219751f..00000000
--- a/examples/extras/mod/sft.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --mixture_of_depths convert \
-    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --optim paged_adamw_8bit \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/full_multi_gpu/llama3_full_predict.yaml b/examples/full_multi_gpu/llama3_full_predict.yaml
new file mode 100644
index 00000000..5b9b680b
--- /dev/null
+++ b/examples/full_multi_gpu/llama3_full_predict.yaml
@@ -0,0 +1,23 @@
+# model
+model_name_or_path: saves/llama3-8b/full/sft
+
+# method
+stage: sft
+do_predict: true
+finetuning_type: full
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 50
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/full/predict
+overwrite_output_dir: true
+
+# eval
+per_device_eval_batch_size: 1
+predict_with_generate: true
diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/full_multi_gpu/llama3_full_sft.yaml
new file mode 100644
index 00000000..ef35e441
--- /dev/null
+++ b/examples/full_multi_gpu/llama3_full_sft.yaml
@@ -0,0 +1,41 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z3_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train (warmup_ratio is a fraction of the total training steps, not a step count)
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index a1ffc0ee..9c2508b6 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -6,33 +6,4 @@ python -m torch.distributed.run \
     --node_rank $RANK \
     --master_addr $MASTER_ADDR \
     --master_port $MASTER_PORT \
-    ../../src/train.py \
-    --deepspeed ../deepspeed/ds_z3_config.json \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type full \
-    --output_dir ../../saves/LLaMA2-7B/full/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/full_multi_gpu/predict.sh b/examples/full_multi_gpu/predict.sh
index 7c2e458f..2445f444 100644
--- a/examples/full_multi_gpu/predict.sh
+++ b/examples/full_multi_gpu/predict.sh
@@ -1,20 +1,5 @@
 #!/bin/bash
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file ../accelerate/single_config.yaml \
-    ../../src/train.py \
-    --stage sft \
-    --do_predict \
-    --model_name_or_path ../../saves/LLaMA2-7B/full/sft \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type full \
-    --output_dir ../../saves/LLaMA2-7B/full/predict \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_eval_batch_size 1 \
-    --max_samples 20 \
-    --predict_with_generate
+    --config_file examples/accelerate/single_config.yaml \
+    src/train.py examples/full_multi_gpu/llama3_full_predict.yaml
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index 73c7662d..f391166a 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,32 +1,4 @@
 #!/bin/bash
 
-deepspeed --num_gpus 4 ../../src/train.py \
-    --deepspeed ../deepspeed/ds_z3_config.json \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type full \
-    --output_dir ../../saves/LLaMA2-7B/full/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+deepspeed --include "localhost:0,1,2,3" \
+    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index bc74a6de..304f3780 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,34 +1,5 @@
 #!/bin/bash
 # ZeRO-3 enables weight sharding on multiple GPUs
 
-deepspeed --num_gpus 4 ../../src/train.py \
-    --deepspeed ../deepspeed/ds_z3_config.json \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+deepspeed --include "localhost:0,1,2,3" \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml
new file mode 100644
index 00000000..d9690679
--- /dev/null
+++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -0,0 +1,41 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
new file mode 100644
index 00000000..26955167
--- /dev/null
+++ b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z3_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh
index a58cac20..401fac5f 100644
--- a/examples/lora_multi_gpu/multi_node.sh
+++ b/examples/lora_multi_gpu/multi_node.sh
@@ -2,35 +2,5 @@
 # also launch it on slave machine using slave_config.yaml
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file ../accelerate/master_config.yaml \
-    ../../src/train.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+    --config_file examples/accelerate/master_config.yaml \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh
index c0719c04..885a0e8c 100644
--- a/examples/lora_multi_gpu/single_node.sh
+++ b/examples/lora_multi_gpu/single_node.sh
@@ -1,35 +1,5 @@
 #!/bin/bash
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file ../accelerate/single_config.yaml \
-    ../../src/train.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+    --config_file examples/accelerate/single_config.yaml \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml
index 04df9631..0b3dc599 100644
--- a/examples/lora_single_gpu/llama3_preprocess.yaml
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
@@ -15,7 +15,7 @@ max_samples: 1000
 val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
-tokenized_path: saves/llama3-8b/dataset/sft # use `tokenized_path` in config to load data
+tokenized_path: saves/llama3-8b/dataset/sft
 
 # output
 output_dir: saves/llama3-8b/lora/sft
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
index 2bd99740..11f1d277 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -1,27 +1,38 @@
+# model
+model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
+
+# method
 stage: sft
 do_train: true
-model_name_or_path: BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf
-dataset: alpaca_gpt4_en,glaive_toolcall
-dataset_dir: data
-template: default
 finetuning_type: lora
 lora_target: q_proj,v_proj
-output_dir: ../../saves/LLaMA2-7B/lora/sft
-overwrite_cache: true
-overwrite_output_dir: true
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
 cutoff_len: 1024
-per_device_train_batch_size: 1
-per_device_eval_batch_size: 1
-gradient_accumulation_steps: 8
-lr_scheduler_type: cosine
-logging_steps: 10
-save_steps: 100
-eval_steps: 100
-evaluation_strategy: steps
-load_best_model_at_end: true
-learning_rate: 5e-5
-num_train_epochs: 3.0
-max_samples: 3000
+max_samples: 1000
 val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
 plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
 fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
index e69de29b..4b070d45 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
index e69de29b..7bc31bde 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+quantization_bit: 4
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
index e69de29b..2f8cfe45 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500

From 047313f48e0b2c050952592329509e8b3dfc6f81 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 6 May 2024 23:07:55 +0800
Subject: [PATCH 3/3] update examples

---
 examples/README.md    | 14 ++++++++++++++
 examples/README_zh.md | 14 ++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index 922f9c7b..ba993b99 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,5 +1,19 @@
 We provide diverse examples about fine-tuning LLMs.
 
+Make sure to execute these commands in the `LLaMA-Factory` directory.
+
+## Table of Contents
+
+- [LoRA Fine-Tuning on A Single GPU](#lora-fine-tuning-on-a-single-gpu)
+- [QLoRA Fine-Tuning on a Single GPU](#qlora-fine-tuning-on-a-single-gpu)
+- [LoRA Fine-Tuning on Multiple GPUs](#lora-fine-tuning-on-multiple-gpus)
+- [Full-Parameter Fine-Tuning on Multiple GPUs](#full-parameter-fine-tuning-on-multiple-gpus)
+- [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization)
+- [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
+- [Extras](#extras)
+
+## Examples
+
 ### LoRA Fine-Tuning on A Single GPU
 
 #### (Continuous) Pre-Training
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 14d72c10..491ec688 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -1,5 +1,19 @@
 我们提供了多样化的大模型微调示例脚本。
 
+请确保在 `LLaMA-Factory` 目录下执行下述命令。
+
+## 目录
+
+- [单 GPU LoRA 微调](#单-gpu-lora-微调)
+- [单 GPU QLoRA 微调](#单-gpu-qlora-微调)
+- [多 GPU LoRA 微调](#多-gpu-lora-微调)
+- [多 GPU 全参数微调](#多-gpu-全参数微调)
+- [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化)
+- [推理 LoRA 模型](#推理-lora-模型)
+- [杂项](#杂项)
+
+## 示例
+
 ### 单 GPU LoRA 微调
 
 #### （增量）预训练