diff --git a/README.md b/README.md
index 7eb2d05f..2a87b659 100644
--- a/README.md
+++ b/README.md
@@ -86,39 +86,61 @@ Please refer to [template.py](src/llmtuner/extras/template.py) for a full list o
## Provided Datasets
-- For pre-training:
- - [Wiki Demo (en)](data/wiki_demo.txt)
- - [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
- - [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
- - [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
- - [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
-- For supervised fine-tuning:
- - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
- - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
- - [Self-cognition (zh)](data/self_cognition.json)
- - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
- - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
- - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
- - [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
- - [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
- - [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
- - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
- - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
- - [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
- - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
- - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
- - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
- - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
- - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
- - [UltraChat (en)](https://github.com/thunlp/UltraChat)
- - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
- - [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
-- For reward modeling or DPO training:
- - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
- - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
- - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+### Pre-training datasets
+
+- [Wiki Demo (en)](data/wiki_demo.txt)
+- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
+- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
+- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
+- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
+- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
+- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
+- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
+- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+
+
+
+### Supervised fine-tuning datasets
+
+- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
+- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Self-cognition (zh)](data/self_cognition.json)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
+- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
+- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
+- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
+- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
+- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
+- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
+- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
+- [UltraChat (en)](https://github.com/thunlp/UltraChat)
+- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
+- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
+- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
+- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
+- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
+- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
+- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
+- [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
+- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
+- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
+- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
+- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
+- [LMSYS Chat (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
+- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
+
+
+
+### Preference datasets
+
+- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+
+
Please refer to [data/README.md](data/README.md) for details.
@@ -135,8 +157,8 @@ huggingface-cli login
- 🤗Transformers, Datasets, Accelerate, PEFT and TRL
- sentencepiece, protobuf and tiktoken
- fire, jieba, rouge-chinese and nltk (used at evaluation and predict)
-- gradio and matplotlib (used in web_demo.py)
-- uvicorn, fastapi and sse-starlette (used in api_demo.py)
+- gradio and matplotlib (used in web UI)
+- uvicorn, fastapi and sse-starlette (used in API)
And **powerful GPUs**!
@@ -144,7 +166,7 @@ And **powerful GPUs**!
### Data Preparation (optional)
-Please refer to `data/example_dataset` for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset.
+Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset.
> [!NOTE]
> Please update `data/dataset_info.json` to use your custom dataset. About the format of this file, please refer to `data/README.md`.
diff --git a/README_zh.md b/README_zh.md
index ed8be250..2e40c63f 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -86,41 +86,63 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
## 数据集
-- 用于预训练:
- - [Wiki Demo (en)](data/wiki_demo.txt)
- - [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
- - [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
- - [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
- - [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
-- 用于指令监督微调:
- - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
- - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
- - [Self-cognition (zh)](data/self_cognition.json)
- - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
- - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
- - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
- - [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
- - [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
- - [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
- - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
- - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
- - [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
- - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
- - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
- - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
- - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
- - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
- - [UltraChat (en)](https://github.com/thunlp/UltraChat)
- - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
- - [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
-- 用于训练奖励模型或 DPO 训练:
- - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
- - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
- - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+### 预训练数据集
-使用方法请参考 [data/README.md](data/README_zh.md) 文件。
+- [Wiki Demo (en)](data/wiki_demo.txt)
+- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
+- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
+- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
+- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
+- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
+- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
+- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
+- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+
+
+
+### 指令微调数据集
+
+- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
+- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Self-cognition (zh)](data/self_cognition.json)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
+- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
+- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
+- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
+- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
+- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
+- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
+- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
+- [UltraChat (en)](https://github.com/thunlp/UltraChat)
+- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
+- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
+- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
+- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
+- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
+- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
+- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
+- [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
+- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
+- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
+- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
+- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
+- [LMSYS Chat (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
+- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
+
+
+
+### 偏好数据集
+
+- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+
+
+
+使用方法请参考 [data/README_zh.md](data/README_zh.md) 文件。
部分数据集的使用需要确认,我们推荐使用下述命令登录您的 Hugging Face 账户。
@@ -144,10 +166,10 @@ huggingface-cli login
### 数据准备(可跳过)
-关于数据集文件的格式,请参考 `data/example_dataset` 文件夹的内容。构建自定义数据集时,既可以使用单个 `.json` 文件,也可以使用一个[数据加载脚本](https://huggingface.co/docs/datasets/dataset_script)和多个文件。
+关于数据集文件的格式,请参考 [data/README_zh.md](data/README_zh.md) 的内容。构建自定义数据集时,既可以使用单个 `.json` 文件,也可以使用一个[数据加载脚本](https://huggingface.co/docs/datasets/dataset_script)和多个文件。
> [!NOTE]
-> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件,该文件的格式请参考 `data/README.md`。
+> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件,该文件的格式请参考 `data/README_zh.md`。
### 环境搭建(可跳过)
diff --git a/data/README.md b/data/README.md
index 3be493b2..8a11561e 100644
--- a/data/README.md
+++ b/data/README.md
@@ -6,7 +6,9 @@ If you are using a custom dataset, please provide your dataset definition in the
"script_url": "the name of the directory containing a dataset loading script. (if specified, ignore below 2 arguments)",
"file_name": "the name of the dataset file in the this directory. (required if above are not specified)",
"file_sha1": "the SHA-1 hash value of the dataset file. (optional)",
+  "subset": "the name of the subset of the dataset. (optional, default: None)",
"ranking": "whether the examples contains ranked responses or not. (default: false)",
+  "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
"columns": {
"prompt": "the name of the column in the datasets containing the prompts. (default: instruction)",
"query": "the name of the column in the datasets containing the queries. (default: input)",
diff --git a/data/dataset_info.json b/data/dataset_info.json
index f30c422a..e4c4fb2d 100644
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -71,14 +71,14 @@
"guanaco": {
"hf_hub_url": "JosephusCheung/GuanacoDataset"
},
- "belle_0.5m": {
- "hf_hub_url": "BelleGroup/train_0.5M_CN"
+ "belle_2m": {
+ "hf_hub_url": "BelleGroup/train_2M_CN"
},
"belle_1m": {
"hf_hub_url": "BelleGroup/train_1M_CN"
},
- "belle_2m": {
- "hf_hub_url": "BelleGroup/train_2M_CN"
+ "belle_0.5m": {
+ "hf_hub_url": "BelleGroup/train_0.5M_CN"
},
"belle_dialog": {
"hf_hub_url": "BelleGroup/generated_chat_0.4M"
@@ -90,80 +90,116 @@
"script_url": "belle_multiturn",
"columns": {
"prompt": "instruction",
- "query": "",
"response": "output",
"history": "history"
}
},
+ "ultra_chat": {
+ "script_url": "ultra_chat",
+ "columns": {
+ "prompt": "instruction",
+ "response": "output",
+ "history": "history"
+ }
+ },
+ "open_platypus": {
+ "hf_hub_url": "garage-bAInd/Open-Platypus"
+ },
"codealpaca": {
"hf_hub_url": "sahil2801/CodeAlpaca-20k"
},
"alpaca_cot": {
"hf_hub_url": "QingyiSi/Alpaca-CoT"
},
- "firefly": {
- "hf_hub_url": "YeungNLP/firefly-train-1.1M",
- "columns": {
- "prompt": "input",
- "query": "",
- "response": "target",
- "history": ""
- }
- },
"mathinstruct": {
"hf_hub_url": "TIGER-Lab/MathInstruct",
"columns": {
"prompt": "instruction",
- "query": "",
- "response": "output",
- "history": ""
+ "response": "output"
+ }
+ },
+ "firefly": {
+ "hf_hub_url": "YeungNLP/firefly-train-1.1M",
+ "columns": {
+ "prompt": "input",
+ "response": "target"
}
},
"webqa": {
"hf_hub_url": "suolyer/webqa",
"columns": {
"prompt": "input",
- "query": "",
- "response": "output",
- "history": ""
+ "response": "output"
}
},
- "ultra_chat": {
- "script_url": "ultra_chat",
- "columns": {
- "prompt": "instruction",
- "query": "",
- "response": "output",
- "history": "history"
- }
- },
- "novel_tokens512_50k": {
+ "webnovel": {
"hf_hub_url": "zxbsmk/webnovel_cn"
},
"adgen": {
"hf_hub_url": "HasturOfficial/adgen",
"columns": {
"prompt": "content",
- "query": "",
- "response": "summary",
- "history": ""
+ "response": "summary"
}
},
- "comparison_gpt4_en": {
- "file_name": "comparison_gpt4_data_en.json",
- "file_sha1": "96fa18313544e22444fe20eead7754b17da452ae",
- "ranking": true
+ "sharegpt_hyper": {
+ "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k",
+ "columns": {
+ "prompt": "conversations",
+ "query": "from",
+ "response": "value"
+ },
+ "formatting": "sharegpt"
},
- "comparison_gpt4_zh": {
- "file_name": "comparison_gpt4_data_zh.json",
- "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
- "ranking": true
+ "sharegpt4": {
+ "hf_hub_url": "shibing624/sharegpt_gpt4",
+ "columns": {
+ "prompt": "conversations",
+ "query": "from",
+ "response": "value"
+ },
+ "formatting": "sharegpt"
+ },
+ "ultrachat_200k": {
+ "hf_hub_url": "HuggingFaceH4/ultrachat_200k",
+ "columns": {
+ "prompt": "messages",
+ "query": "role",
+ "response": "content"
+ },
+ "formatting": "sharegpt"
+ },
+ "agent_instruct": {
+ "hf_hub_url": "THUDM/AgentInstruct",
+ "columns": {
+ "prompt": "conversations",
+ "query": "from",
+ "response": "value"
+ },
+ "formatting": "sharegpt"
+ },
+ "lmsys_chat": {
+ "hf_hub_url": "lmsys/lmsys-chat-1m",
+ "columns": {
+ "prompt": "conversation",
+ "query": "role",
+ "response": "content"
+ },
+ "formatting": "sharegpt"
+ },
+ "evol_instruct": {
+ "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
+ "columns": {
+ "prompt": "conversations",
+ "query": "from",
+ "response": "value"
+ },
+ "formatting": "sharegpt"
},
"hh_rlhf_en": {
"script_url": "hh_rlhf_en",
"columns": {
"prompt": "instruction",
- "query": "",
"response": "output",
"history": "history"
},
@@ -191,59 +227,71 @@
},
"ranking": true
},
+ "comparison_gpt4_en": {
+ "file_name": "comparison_gpt4_data_en.json",
+ "file_sha1": "96fa18313544e22444fe20eead7754b17da452ae",
+ "ranking": true
+ },
+ "comparison_gpt4_zh": {
+ "file_name": "comparison_gpt4_data_zh.json",
+ "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
+ "ranking": true
+ },
"wiki_demo": {
"file_name": "wiki_demo.txt",
"file_sha1": "b2288edb05b233e5b35250fd4b308a5fa21fa66d",
"columns": {
- "prompt": "text",
- "query": "",
- "response": "",
- "history": ""
+ "prompt": "text"
}
},
"refinedweb": {
"hf_hub_url": "tiiuae/falcon-refinedweb",
"columns": {
- "prompt": "content",
- "query": "",
- "response": "",
- "history": ""
+ "prompt": "content"
}
},
+ "redpajama_v2": {
+ "hf_hub_url": "togethercomputer/RedPajama-Data-V2",
+ "columns": {
+ "prompt": "raw_content"
+ },
+ "subset": "default"
+ },
"wikipedia_en": {
"hf_hub_url": "olm/olm-wikipedia-20221220",
"columns": {
- "prompt": "text",
- "query": "",
- "response": "",
- "history": ""
+ "prompt": "text"
}
},
"wikipedia_zh": {
"hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
"columns": {
- "prompt": "completion",
- "query": "",
- "response": "",
- "history": ""
+ "prompt": "completion"
+ }
+ },
+ "pile": {
+ "hf_hub_url": "EleutherAI/pile",
+ "columns": {
+ "prompt": "text"
+ },
+ "subset": "all"
+ },
+ "skypile": {
+ "hf_hub_url": "Skywork/SkyPile-150B",
+ "columns": {
+ "prompt": "text"
}
},
"the_stack": {
"hf_hub_url": "bigcode/the-stack",
"columns": {
- "prompt": "content",
- "query": "",
- "response": "",
- "history": ""
+ "prompt": "content"
}
},
"starcoder": {
"hf_hub_url": "bigcode/starcoderdata",
"columns": {
- "prompt": "content",
- "query": "",
- "response": "",
- "history": ""
+ "prompt": "content"
}
}
}
diff --git a/src/llmtuner/dsets/loader.py b/src/llmtuner/dsets/loader.py
index fe88ce50..46db294a 100644
--- a/src/llmtuner/dsets/loader.py
+++ b/src/llmtuner/dsets/loader.py
@@ -1,5 +1,5 @@
import os
-from typing import TYPE_CHECKING, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Union
from datasets import concatenate_datasets, interleave_datasets, load_dataset
@@ -26,22 +26,23 @@ def get_dataset(
if dataset_attr.load_from == "hf_hub":
data_path = dataset_attr.dataset_name
+ data_name = dataset_attr.subset
data_files = None
elif dataset_attr.load_from == "script":
data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+ data_name = dataset_attr.subset
data_files = None
elif dataset_attr.load_from == "file":
- data_path = None
+ data_path, data_name = None, None
data_files: List[str] = []
-
- if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # directory
+ if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is directory
for file_name in os.listdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)):
data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name, file_name))
if data_path is None:
data_path = EXT2TYPE.get(file_name.split(".")[-1], None)
else:
- assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file type does not match."
- elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # single file
+ assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file types are not identical."
+ elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is file
data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name))
data_path = EXT2TYPE.get(dataset_attr.dataset_name.split(".")[-1], None)
else:
@@ -53,7 +54,8 @@ def get_dataset(
raise NotImplementedError
dataset = load_dataset(
- data_path,
+ path=data_path,
+ name=data_name,
data_files=data_files,
split=data_args.split,
cache_dir=model_args.cache_dir,
@@ -61,15 +63,59 @@ def get_dataset(
use_auth_token=True if model_args.use_auth_token else None
)
- if max_samples is not None:
- max_samples_temp = min(len(dataset), max_samples)
- dataset = dataset.select(range(max_samples_temp))
+ if max_samples is not None: # truncate dataset
+ dataset = dataset.select(range(min(len(dataset), max_samples)))
- # TODO: adapt to the sharegpt format
+ def convert_format(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+ # convert dataset from sharegpt format to alpaca format
+ outputs = {"prompt": [], "query": [], "response": [], "history": []}
+ for msg_list in examples[dataset_attr.prompt]:
+            msg_list = msg_list[:len(msg_list) // 2 * 2] # truncate to an even number of messages (whole u/a pairs)
+ if len(msg_list) == 0:
+ continue
- for column_name in ["prompt", "query", "response", "history"]: # align datasets
- if getattr(dataset_attr, column_name) and getattr(dataset_attr, column_name) != column_name:
- dataset = dataset.rename_column(getattr(dataset_attr, column_name), column_name)
+ msg_pairs = []
+ user_role, assistant_role = None, None
+ for idx in range(0, len(msg_list), 2):
+ if user_role is None and assistant_role is None:
+ user_role = msg_list[idx][dataset_attr.query]
+ assistant_role = msg_list[idx + 1][dataset_attr.query]
+ else:
+ if (
+ msg_list[idx][dataset_attr.query] != user_role
+ or msg_list[idx+1][dataset_attr.query] != assistant_role
+ ):
+ raise ValueError("Only accepts conversation in u/a/u/a/u/a order.")
+ msg_pairs.append((msg_list[idx][dataset_attr.response], msg_list[idx + 1][dataset_attr.response]))
+
+ if len(msg_pairs) != 0:
+ outputs["prompt"].append(msg_pairs[-1][0])
+ outputs["query"].append("")
+ outputs["response"].append(msg_pairs[-1][1])
+ outputs["history"].append(msg_pairs[:-1])
+
+ return outputs
+
+ if dataset_attr.formatting == "sharegpt": # convert format
+ column_names = list(next(iter(dataset)).keys())
+ kwargs = {}
+ if not data_args.streaming:
+ kwargs = dict(
+ num_proc=data_args.preprocessing_num_workers,
+ load_from_cache_file=(not data_args.overwrite_cache),
+ desc="Converting format of dataset"
+ )
+
+ dataset = dataset.map(
+ convert_format,
+ batched=True,
+ remove_columns=column_names,
+ **kwargs
+ )
+ else:
+ for column_name in ["prompt", "query", "response", "history"]: # align dataset
+ if getattr(dataset_attr, column_name) and getattr(dataset_attr, column_name) != column_name:
+ dataset = dataset.rename_column(getattr(dataset_attr, column_name), column_name)
if dataset_attr.system_prompt: # add system prompt
system_prompt = dataset_attr.system_prompt
diff --git a/src/llmtuner/dsets/preprocess.py b/src/llmtuner/dsets/preprocess.py
index 18f01db1..0484b78e 100644
--- a/src/llmtuner/dsets/preprocess.py
+++ b/src/llmtuner/dsets/preprocess.py
@@ -39,7 +39,7 @@ def preprocess_dataset(
system = examples["system"][i] if "system" in examples else None
yield query, response, history, system
- def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+ def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
# build grouped texts with format `X1 X2 X3 ...`
if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen)
kwargs = dict(allowed_special="all")
@@ -62,7 +62,7 @@ def preprocess_dataset(
}
return result
- def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+ def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
# build inputs with format ` X Y ` and labels with format ` ... Y `
# for multiturn examples, we only mask the prompt part in each prompt-response pair.
model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -108,7 +108,7 @@ def preprocess_dataset(
return model_inputs
- def preprocess_packed_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+ def preprocess_packed_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
# build inputs with format ` X1 Y1 X2 Y2 `
# and labels with format ` ... Y1 ... Y2 `
model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -145,7 +145,7 @@ def preprocess_dataset(
return model_inputs
- def preprocess_unsupervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+ def preprocess_unsupervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
# build inputs with format ` X` and labels with format `Y `
model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -169,10 +169,10 @@ def preprocess_dataset(
return model_inputs
- def preprocess_pairwise_dataset(examples):
+ def preprocess_pairwise_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
# build input pairs with format ` X`, `Y1 ` and `Y2 `
model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []}
- for query, response, history, system in construct_example(examples):
+ for query, response, history, system in construct_example(examples):
if not (isinstance(query, str) and isinstance(response, list) and query != "" and len(response) > 1):
continue
@@ -197,9 +197,10 @@ def preprocess_dataset(
model_inputs["prompt_ids"].append(prompt_ids)
model_inputs["chosen_ids"].append(chosen_ids)
model_inputs["rejected_ids"].append(rejected_ids)
+
return model_inputs
- def print_supervised_dataset_example(example):
+ def print_supervised_dataset_example(example: Dict[str, List[int]]) -> None:
print("input_ids:\n{}".format(example["input_ids"]))
print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
print("label_ids:\n{}".format(example["labels"]))
@@ -207,7 +208,7 @@ def preprocess_dataset(
tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False)
))
- def print_pairwise_dataset_example(example):
+ def print_pairwise_dataset_example(example: Dict[str, List[int]]) -> None:
print("prompt_ids:\n{}".format(example["prompt_ids"]))
print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)))
print("chosen_ids:\n{}".format(example["chosen_ids"]))
@@ -215,7 +216,7 @@ def preprocess_dataset(
print("rejected_ids:\n{}".format(example["rejected_ids"]))
print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)))
- def print_unsupervised_dataset_example(example):
+ def print_unsupervised_dataset_example(example: Dict[str, List[int]]) -> None:
print("input_ids:\n{}".format(example["input_ids"]))
print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
@@ -242,13 +243,13 @@ def preprocess_dataset(
if not data_args.streaming:
kwargs = dict(
num_proc=data_args.preprocessing_num_workers,
- load_from_cache_file=not data_args.overwrite_cache,
+ load_from_cache_file=(not data_args.overwrite_cache),
desc="Running tokenizer on dataset"
)
dataset = dataset.map(
preprocess_func,
- batched=True,
+ batched=True,
remove_columns=column_names,
**kwargs
)
diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py
index 49b86345..2f4cda38 100644
--- a/src/llmtuner/hparams/data_args.py
+++ b/src/llmtuner/hparams/data_args.py
@@ -11,6 +11,7 @@ class DatasetAttr:
dataset_name: Optional[str] = None
dataset_sha1: Optional[str] = None
system_prompt: Optional[str] = None
+ subset: Optional[str] = None
ranking: Optional[bool] = False
formatting: Optional[Literal["alpaca", "sharegpt"]] = "alpaca"
@@ -155,6 +156,7 @@ class DataArguments:
dataset_attr.response = dataset_info[name]["columns"].get("response", None)
dataset_attr.history = dataset_info[name]["columns"].get("history", None)
+ dataset_attr.subset = dataset_info[name].get("subset", None)
dataset_attr.ranking = dataset_info[name].get("ranking", False)
dataset_attr.formatting = dataset_info[name].get("formatting", "alpaca")
dataset_attr.system_prompt = prompt_list[i]