From 58c522cd5cc4498a3fa8ed99424b5d63c9e56ccb Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sun, 12 May 2024 01:10:30 +0800 Subject: [PATCH] remove checksum and fix ui args --- README.md | 4 ++-- README_zh.md | 4 ++-- data/dataset_info.json | 25 +++++-------------------- src/llmtuner/data/loader.py | 4 +--- src/llmtuner/data/parser.py | 2 -- src/llmtuner/data/utils.py | 15 --------------- src/llmtuner/webui/interface.py | 6 ++++-- src/webui.py | 3 ++- 8 files changed, 16 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 80154cae..57a34dab 100644 --- a/README.md +++ b/README.md @@ -366,7 +366,7 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr #### Use local environment ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui +CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui ```
For Alibaba Cloud PAI or AutoDL users @@ -374,7 +374,7 @@ CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli w If you encountered display problems in LLaMA Board on Alibaba Cloud PAI, try using the following command to set environment variables before starting LLaMA Board: ```bash -export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ +export GRADIO_SERVER_PORT=7860 GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ ``` If you are using AutoDL, please install a specific version of Gradio: diff --git a/README_zh.md b/README_zh.md index 5656fb4a..047b1645 100644 --- a/README_zh.md +++ b/README_zh.md @@ -366,7 +366,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s #### 使用本地环境 ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui +CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui ```
阿里云 PAI 和 AutoDL 用户指南 @@ -374,7 +374,7 @@ CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli w 如果您在阿里云 PAI 上使用 LLaMA Board 时遇到显示问题,请尝试在启动前使用以下命令设置环境变量: ```bash -export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ +export GRADIO_SERVER_PORT=7860 GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ ``` 如果您正在使用 AutoDL,请安装下述 Gradio 版本: diff --git a/data/dataset_info.json b/data/dataset_info.json index d5b7208f..032a5c49 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -1,27 +1,21 @@ { "alpaca_en": { - "file_name": "alpaca_data_en_52k.json", - "file_sha1": "607f94a7f581341e59685aef32f531095232cf23" + "file_name": "alpaca_data_en_52k.json" }, "alpaca_zh": { - "file_name": "alpaca_data_zh_51k.json", - "file_sha1": "2ba9827122c158dc256668d42bd1bcb8bc6b786e" + "file_name": "alpaca_data_zh_51k.json" }, "alpaca_gpt4_en": { - "file_name": "alpaca_gpt4_data_en.json", - "file_sha1": "647f4ad447bd993e4b6b6223d1be15208bab694a" + "file_name": "alpaca_gpt4_data_en.json" }, "alpaca_gpt4_zh": { - "file_name": "alpaca_gpt4_data_zh.json", - "file_sha1": "3eaa3bda364ccdd59925d7448a698256c31ef845" + "file_name": "alpaca_gpt4_data_zh.json" }, "identity": { - "file_name": "identity.json", - "file_sha1": "0f67e97fd01612006ab3536cdaf6cfb0d1e7f279" + "file_name": "identity.json" }, "oaast_sft_zh": { "file_name": "oaast_sft_zh.json", - "file_sha1": "a6a91f18f80f37b10ded9cf633fb50c033bf7b9f", "columns": { "prompt": "instruction", "query": "input", @@ -31,7 +25,6 @@ }, "lima": { "file_name": "lima.json", - "file_sha1": "9db59f6b7007dc4b17529fc63379b9cd61640f37", "columns": { "prompt": "instruction", "query": "input", @@ -41,7 +34,6 @@ }, "glaive_toolcall": { "file_name": "glaive_toolcall_10k.json", - "file_sha1": "36aea64548fbf6aa300bef411b9221092ed84902", "formatting": "sharegpt", "columns": { "messages": "conversations", @@ -50,7 +42,6 @@ }, "mllm_demo": { "file_name": "mllm_demo.json", - "file_sha1": "d626cc0ad88a26d0dc9fcb47336821cf486d8bcc", "formatting": "sharegpt", "columns": { "messages": "messages", @@ -308,7 +299,6 @@ }, "oaast_rm_zh": { "file_name": "oaast_rm_zh.json", - "file_sha1": "1065af1f3784dd61be5e79713a35f427b713a232", "columns": { "prompt": "instruction", "query": "input", @@ -319,17 +309,14 @@ }, "comparison_gpt4_en": { "file_name": "comparison_gpt4_data_en.json", - "file_sha1": "96fa18313544e22444fe20eead7754b17da452ae", "ranking": true }, "comparison_gpt4_zh": { "file_name": "comparison_gpt4_data_zh.json", - "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd", "ranking": true }, "orca_rlhf": { "file_name": "orca_rlhf.json", - "file_sha1": "acc8f74d16fd1fc4f68e7d86eaa781c2c3f5ba8e", "ranking": true, "columns": { "prompt": "question", @@ -370,14 +357,12 @@ }, "wiki_demo": { "file_name": "wiki_demo.txt", - "file_sha1": "e70375e28eda542a90c68213640cc371898ce181", "columns": { "prompt": "text" } }, "c4_demo": { "file_name": "c4_demo.json", - "file_sha1": "a5a0c86759732f9a5238e447fecd74f28a66cca8", "columns": { "prompt": "text" } diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index ca0d5407..3cc01b0d 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -11,7 +11,7 @@ from .aligner import align_dataset from .parser import get_dataset_list from .preprocess import get_preprocess_and_print_func from .template import get_template_and_fix_tokenizer -from .utils import checksum, merge_dataset +from .utils import merge_dataset if TYPE_CHECKING: @@ -61,8 +61,6 @@ def load_single_dataset( if data_path is None: raise ValueError("File extension must be txt, csv, json or jsonl.") - - checksum(data_files, dataset_attr.file_sha1) else: raise NotImplementedError diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py index 01a417a9..3170fd8a 100644 --- a/src/llmtuner/data/parser.py +++ b/src/llmtuner/data/parser.py @@ -21,7 +21,6 @@ class DatasetAttr: load_from: Literal["hf_hub", "ms_hub", "script", "file"] dataset_name: str """ extra configs """ - file_sha1: Optional[str] = None subset: Optional[str] = None folder: Optional[str] = None ranking: bool = False @@ -99,7 +98,6 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: else: dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"]) - dataset_attr.set_attr("file_sha1", dataset_info[name]) dataset_attr.set_attr("subset", dataset_info[name]) dataset_attr.set_attr("folder", dataset_info[name]) dataset_attr.set_attr("ranking", dataset_info[name], default=False) diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py index dc189609..29fd4ad4 100644 --- a/src/llmtuner/data/utils.py +++ b/src/llmtuner/data/utils.py @@ -26,21 +26,6 @@ class Role(str, Enum): OBSERVATION = "observation" -def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: - if file_sha1 is None: - logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.") - return - - if len(data_files) != 1: - logger.warning("Checksum failed: too many files.") - return - - with open(data_files[0], "rb") as f: - sha1 = hashlib.sha1(f.read()).hexdigest() - if sha1 != file_sha1: - logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0])) - - def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: max_target_len = int(max_len * (target_len / (source_len + target_len))) max_target_len = max(max_target_len, reserved_label_len) diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py index bbd91bb7..91709d40 100644 --- a/src/llmtuner/webui/interface.py +++ b/src/llmtuner/webui/interface.py @@ -71,10 +71,12 @@ def create_web_demo() -> gr.Blocks: def run_web_ui() -> None: + gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - create_ui().queue().launch(server_name=server_name) + create_ui().queue().launch(share=gradio_share, server_name=server_name) def run_web_demo() -> None: + gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - create_web_demo().queue().launch(server_name=server_name) + create_web_demo().queue().launch(share=gradio_share, server_name=server_name) diff --git a/src/webui.py b/src/webui.py index 000098d1..3f8690d0 100644 --- a/src/webui.py +++ b/src/webui.py @@ -4,8 +4,9 @@ from llmtuner.webui.interface import create_ui def main(): + gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - create_ui().queue().launch(server_name=server_name) + create_ui().queue().launch(share=gradio_share, server_name=server_name) if __name__ == "__main__":