From b0888262e371280f2b69ab529040b3a99632fc12 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Tue, 7 May 2024 17:50:27 +0800
Subject: [PATCH] fix #3602

---
 examples/README.md                       |  2 ++
 examples/README_zh.md                    |  2 ++
 examples/full_multi_gpu/multi_node.sh    |  8 +++++++-
 examples/full_multi_gpu/single_node.sh   |  7 ++++++-
 examples/lora_multi_gpu/ds_zero3.sh      |  8 ++++++--
 examples/merge_lora/llama3_lora_sft.yaml |  2 +-
 src/api.py                               | 19 +++++++++++++++++++
 7 files changed, 43 insertions(+), 5 deletions(-)
 create mode 100644 src/api.py

diff --git a/examples/README.md b/examples/README.md
index ba993b99..ce19f9d1 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -148,6 +148,8 @@ bash examples/full_multi_gpu/predict.sh
 
 #### Merge LoRA Adapters
 
+Note: DO NOT use a quantized model or `quantization_bit` when merging LoRA adapters.
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 491ec688..91bdcda9 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -148,6 +148,8 @@ bash examples/full_multi_gpu/predict.sh
 
 #### 合并 LoRA 适配器
 
+注:请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index 9c2508b6..962409a1 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -1,6 +1,12 @@
 #!/bin/bash
 
-python -m torch.distributed.run \
+NPROC_PER_NODE=4
+NNODES=2
+RANK=0
+MASTER_ADDR=192.168.0.1
+MASTER_PORT=29500
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index f391166a..97f7af64 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
 
-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index 304f3780..b8fd2640 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
 
-# ZeRO-3 enables weight sharding on multiple GPUs
-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/merge_lora/llama3_lora_sft.yaml b/examples/merge_lora/llama3_lora_sft.yaml
index 508a0b8c..de41d48b 100644
--- a/examples/merge_lora/llama3_lora_sft.yaml
+++ b/examples/merge_lora/llama3_lora_sft.yaml
@@ -1,4 +1,4 @@
-# Note: DO NOT use quantized model or quantization_bit when merging lora weights
+# Note: DO NOT use a quantized model or quantization_bit when merging LoRA adapters
 
 # model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
diff --git a/src/api.py b/src/api.py
new file mode 100644
index 00000000..277920ac
--- /dev/null
+++ b/src/api.py
@@ -0,0 +1,19 @@
+import os
+
+import uvicorn
+
+from llmtuner.api.app import create_app
+from llmtuner.chat import ChatModel
+
+
+def main():
+    chat_model = ChatModel()
+    app = create_app(chat_model)
+    api_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print("Visit http://localhost:{}/docs for the API documentation.".format(api_port))
+    uvicorn.run(app, host=api_host, port=api_port)
+
+
+if __name__ == "__main__":
+    main()
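
The patched `multi_node.sh` pins `RANK=0`, i.e. it is the launch script for the master node. A minimal sketch (not part of the patch) of how the second of the two nodes would launch: only the rank changes, and the trailing `--master_addr`/`--master_port` arguments plus the training entry point are inferred from the variables the patch defines, since the hunk context cuts off after `--node_rank`:

```bash
#!/bin/bash
# Sketch for node 1 of 2: identical launch, only the node rank differs.
NPROC_PER_NODE=4
NNODES=2
RANK=1                    # node 0 (192.168.0.1) is the master
MASTER_ADDR=192.168.0.1   # address of the rank-0 node, as in the patch
MASTER_PORT=29500

CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes $NNODES \
    --node_rank $RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
```

The new `src/api.py` reads `API_HOST` and `API_PORT` from the environment, defaulting to `0.0.0.0:8000`. A hedged launch example follows; the `--model_name_or_path` flag is illustrative only and assumes `ChatModel` picks up model arguments from the command line:

```bash
# Assumes the llmtuner package is importable and port 8000 is free.
API_PORT=8000 CUDA_VISIBLE_DEVICES=0 python src/api.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct
# Then open http://localhost:8000/docs for the interactive API docs.
```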