support sharegpt format, add datasets

hiyouga 2023-11-02 23:10:04 +08:00
parent c1edb0cf1b
commit a837172413
7 changed files with 306 additions and 163 deletions

View File

@@ -86,18 +86,27 @@ Please refer to [template.py](src/llmtuner/extras/template.py) for a full list o
 ## Provided Datasets
-- For pre-training:
+<details><summary>Pre-training datasets</summary>
 - [Wiki Demo (en)](data/wiki_demo.txt)
 - [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
-- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
 - [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
 - [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
-- For supervised fine-tuning:
+- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
+- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
+- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
+- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+</details>
+<details><summary>Supervised fine-tuning datasets</summary>
 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
 - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [Self-cognition (zh)](data/self_cognition.json)
-- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
 - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
@@ -106,20 +115,33 @@ Please refer to [template.py](src/llmtuner/extras/template.py) for a full list o
 - [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
 - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
 - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
+- [UltraChat (en)](https://github.com/thunlp/UltraChat)
 - [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
+- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
 - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
 - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
 - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
 - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
 - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
-- [UltraChat (en)](https://github.com/thunlp/UltraChat)
 - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
 - [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
-- For reward modeling or DPO training:
+- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
+- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
+- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
+- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
+- [LMSYS Chat (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
+- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
+</details>
+<details><summary>Preference datasets</summary>
 - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+</details>
 Please refer to [data/README.md](data/README.md) for details.
 Some datasets require confirmation before using them, so we recommend logging in with your Hugging Face account using these commands.
@@ -135,8 +157,8 @@ huggingface-cli login
 - 🤗Transformers, Datasets, Accelerate, PEFT and TRL
 - sentencepiece, protobuf and tiktoken
 - fire, jieba, rouge-chinese and nltk (used at evaluation and predict)
-- gradio and matplotlib (used in web_demo.py)
-- uvicorn, fastapi and sse-starlette (used in api_demo.py)
+- gradio and matplotlib (used in web UI)
+- uvicorn, fastapi and sse-starlette (used in API)
 And **powerful GPUs**!
@@ -144,7 +166,7 @@ And **powerful GPUs**!
 ### Data Preparation (optional)
-Please refer to `data/example_dataset` for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset.
+Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset.
 > [!NOTE]
 > Please update `data/dataset_info.json` to use your custom dataset. About the format of this file, please refer to `data/README.md`.

View File

@@ -86,18 +86,27 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 ## Datasets
-- For pre-training:
+<details><summary>Pre-training datasets</summary>
 - [Wiki Demo (en)](data/wiki_demo.txt)
 - [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
-- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
 - [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
 - [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
-- For supervised fine-tuning:
+- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
+- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
+- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
+- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+</details>
+<details><summary>Instruction fine-tuning datasets</summary>
 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
 - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [Self-cognition (zh)](data/self_cognition.json)
-- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
 - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
@@ -106,21 +115,34 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 - [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
 - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
 - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
+- [UltraChat (en)](https://github.com/thunlp/UltraChat)
 - [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
+- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
 - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
 - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
 - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
 - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
 - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
-- [UltraChat (en)](https://github.com/thunlp/UltraChat)
 - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
 - [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
-- For reward modeling or DPO training:
+- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
+- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
+- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
+- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
+- [LMSYS Chat (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
+- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
+</details>
+<details><summary>Preference datasets</summary>
 - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-Please refer to [data/README.md](data/README_zh.md) for usage.
+</details>
+Please refer to [data/README_zh.md](data/README_zh.md) for usage.
 Some datasets require confirmation before use, so we recommend logging in to your Hugging Face account with the commands below.
@@ -144,10 +166,10 @@ huggingface-cli login
 ### Data Preparation (optional)
-For the format of dataset files, please refer to the contents of the `data/example_dataset` folder. When building a custom dataset, you can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files.
+For the format of dataset files, please refer to [data/README_zh.md](data/README_zh.md). When building a custom dataset, you can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files.
 > [!NOTE]
-> When using a custom dataset, please update the `data/dataset_info.json` file; for its format, please refer to `data/README.md`.
+> When using a custom dataset, please update the `data/dataset_info.json` file; for its format, please refer to `data/README_zh.md`.
 ### Environment Setup (optional)

View File

@@ -6,7 +6,9 @@ If you are using a custom dataset, please provide your dataset definition in the
   "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore below 2 arguments)",
   "file_name": "the name of the dataset file in the this directory. (required if above are not specified)",
   "file_sha1": "the SHA-1 hash value of the dataset file. (optional)",
+  "subset": "",
   "ranking": "whether the examples contains ranked responses or not. (default: false)",
+  "formatting": "",
   "columns": {
     "prompt": "the name of the column in the datasets containing the prompts. (default: instruction)",
     "query": "the name of the column in the datasets containing the queries. (default: input)",

View File

@@ -71,14 +71,14 @@
   "guanaco": {
     "hf_hub_url": "JosephusCheung/GuanacoDataset"
   },
-  "belle_0.5m": {
-    "hf_hub_url": "BelleGroup/train_0.5M_CN"
+  "belle_2m": {
+    "hf_hub_url": "BelleGroup/train_2M_CN"
   },
   "belle_1m": {
     "hf_hub_url": "BelleGroup/train_1M_CN"
   },
-  "belle_2m": {
-    "hf_hub_url": "BelleGroup/train_2M_CN"
+  "belle_0.5m": {
+    "hf_hub_url": "BelleGroup/train_0.5M_CN"
   },
   "belle_dialog": {
     "hf_hub_url": "BelleGroup/generated_chat_0.4M"
@@ -90,80 +90,116 @@
     "script_url": "belle_multiturn",
     "columns": {
       "prompt": "instruction",
-      "query": "",
       "response": "output",
       "history": "history"
     }
   },
+  "ultra_chat": {
+    "script_url": "ultra_chat",
+    "columns": {
+      "prompt": "instruction",
+      "response": "output",
+      "history": "history"
+    }
+  },
+  "open_platypus": {
+    "hf_hub_url": "garage-bAInd/Open-Platypus"
+  },
   "codealpaca": {
     "hf_hub_url": "sahil2801/CodeAlpaca-20k"
   },
   "alpaca_cot": {
     "hf_hub_url": "QingyiSi/Alpaca-CoT"
   },
-  "firefly": {
-    "hf_hub_url": "YeungNLP/firefly-train-1.1M",
-    "columns": {
-      "prompt": "input",
-      "query": "",
-      "response": "target",
-      "history": ""
-    }
-  },
   "mathinstruct": {
     "hf_hub_url": "TIGER-Lab/MathInstruct",
     "columns": {
       "prompt": "instruction",
-      "query": "",
-      "response": "output",
-      "history": ""
+      "response": "output"
+    }
+  },
+  "firefly": {
+    "hf_hub_url": "YeungNLP/firefly-train-1.1M",
+    "columns": {
+      "prompt": "input",
+      "response": "target"
     }
   },
   "webqa": {
     "hf_hub_url": "suolyer/webqa",
     "columns": {
       "prompt": "input",
-      "query": "",
-      "response": "output",
-      "history": ""
+      "response": "output"
     }
   },
-  "ultra_chat": {
-    "script_url": "ultra_chat",
-    "columns": {
-      "prompt": "instruction",
-      "query": "",
-      "response": "output",
-      "history": "history"
-    }
-  },
-  "novel_tokens512_50k": {
+  "webnovel": {
     "hf_hub_url": "zxbsmk/webnovel_cn"
   },
   "adgen": {
     "hf_hub_url": "HasturOfficial/adgen",
     "columns": {
       "prompt": "content",
-      "query": "",
-      "response": "summary",
-      "history": ""
+      "response": "summary"
     }
   },
-  "comparison_gpt4_en": {
-    "file_name": "comparison_gpt4_data_en.json",
-    "file_sha1": "96fa18313544e22444fe20eead7754b17da452ae",
-    "ranking": true
-  },
-  "comparison_gpt4_zh": {
-    "file_name": "comparison_gpt4_data_zh.json",
-    "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
-    "ranking": true
-  },
+  "sharegpt_hyper": {
+    "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k",
+    "columns": {
+      "prompt": "conversations",
+      "query": "from",
+      "response": "value"
+    },
+    "formatting": "sharegpt"
+  },
+  "sharegpt4": {
+    "hf_hub_url": "shibing624/sharegpt_gpt4",
+    "columns": {
+      "prompt": "conversations",
+      "query": "from",
+      "response": "value"
+    },
+    "formatting": "sharegpt"
+  },
+  "ultrachat_200k": {
+    "hf_hub_url": "HuggingFaceH4/ultrachat_200k",
+    "columns": {
+      "prompt": "messages",
+      "query": "role",
+      "response": "content"
+    },
+    "formatting": "sharegpt"
+  },
+  "agent_instruct": {
+    "hf_hub_url": "THUDM/AgentInstruct",
+    "columns": {
+      "prompt": "conversations",
+      "query": "from",
+      "response": "value"
+    },
+    "formatting": "sharegpt"
+  },
+  "lmsys_chat": {
+    "hf_hub_url": "lmsys/lmsys-chat-1m",
+    "columns": {
+      "prompt": "conversation",
+      "query": "role",
+      "response": "content"
+    },
+    "formatting": "sharegpt"
+  },
+  "evol_instruct": {
+    "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
+    "columns": {
+      "prompt": "conversations",
+      "query": "from",
+      "response": "value"
+    },
+    "formatting": "sharegpt"
+  },
   "hh_rlhf_en": {
     "script_url": "hh_rlhf_en",
     "columns": {
       "prompt": "instruction",
-      "query": "",
       "response": "output",
       "history": "history"
     },
@@ -191,59 +227,71 @@
     },
     "ranking": true
   },
+  "comparison_gpt4_en": {
+    "file_name": "comparison_gpt4_data_en.json",
+    "file_sha1": "96fa18313544e22444fe20eead7754b17da452ae",
+    "ranking": true
+  },
+  "comparison_gpt4_zh": {
+    "file_name": "comparison_gpt4_data_zh.json",
+    "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
+    "ranking": true
+  },
   "wiki_demo": {
     "file_name": "wiki_demo.txt",
     "file_sha1": "b2288edb05b233e5b35250fd4b308a5fa21fa66d",
     "columns": {
-      "prompt": "text",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "text"
     }
   },
   "refinedweb": {
     "hf_hub_url": "tiiuae/falcon-refinedweb",
     "columns": {
-      "prompt": "content",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "content"
     }
   },
+  "redpajama_v2": {
+    "hf_hub_url": "togethercomputer/RedPajama-Data-V2",
+    "columns": {
+      "prompt": "raw_content"
+    },
+    "subset": "default"
+  },
   "wikipedia_en": {
     "hf_hub_url": "olm/olm-wikipedia-20221220",
     "columns": {
-      "prompt": "text",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "text"
     }
   },
   "wikipedia_zh": {
     "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
     "columns": {
-      "prompt": "completion",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "completion"
+    }
+  },
+  "pile": {
+    "hf_hub_url": "EleutherAI/pile",
+    "columns": {
+      "prompt": "text"
+    },
+    "subset": "all"
+  },
+  "skypile": {
+    "hf_hub_url": "Skywork/SkyPile-150B",
+    "columns": {
+      "prompt": "text"
     }
   },
   "the_stack": {
     "hf_hub_url": "bigcode/the-stack",
     "columns": {
-      "prompt": "content",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "content"
     }
   },
   "starcoder": {
     "hf_hub_url": "bigcode/starcoderdata",
     "columns": {
-      "prompt": "content",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "content"
     }
   }
 }
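Note how the `columns` mapping is reused for sharegpt-formatted entries: `prompt` names the column that holds the whole message list, while `query` and `response` name the per-message role key and content key. A small illustrative snippet (the record below is made up, mirroring the layout the `sharegpt_hyper` entry above expects):

```python
# Toy record in the layout mapped by:
#   "columns": {"prompt": "conversations", "query": "from", "response": "value"}
record = {
    "conversations": [
        {"from": "human", "value": "What is LoRA?"},
        {"from": "gpt", "value": "LoRA is a parameter-efficient fine-tuning method."},
    ]
}

columns = {"prompt": "conversations", "query": "from", "response": "value"}

messages = record[columns["prompt"]]                    # the list of turns
roles = [m[columns["query"]] for m in messages]         # ["human", "gpt"]
contents = [m[columns["response"]] for m in messages]   # the message texts
print(roles, contents)
```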

View File

@@ -1,5 +1,5 @@
 import os
-from typing import TYPE_CHECKING, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Union
 from datasets import concatenate_datasets, interleave_datasets, load_dataset
@@ -26,22 +26,23 @@ def get_dataset(
         if dataset_attr.load_from == "hf_hub":
             data_path = dataset_attr.dataset_name
+            data_name = dataset_attr.subset
             data_files = None
         elif dataset_attr.load_from == "script":
             data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+            data_name = dataset_attr.subset
             data_files = None
         elif dataset_attr.load_from == "file":
-            data_path = None
+            data_path, data_name = None, None
             data_files: List[str] = []
-            if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # directory
+            if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is directory
                 for file_name in os.listdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)):
                     data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name, file_name))
                     if data_path is None:
                         data_path = EXT2TYPE.get(file_name.split(".")[-1], None)
                     else:
-                        assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file type does not match."
-            elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # single file
+                        assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file types are not identical."
+            elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is file
                 data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name))
                 data_path = EXT2TYPE.get(dataset_attr.dataset_name.split(".")[-1], None)
             else:
@@ -53,7 +54,8 @@ def get_dataset(
             raise NotImplementedError
         dataset = load_dataset(
-            data_path,
+            path=data_path,
+            name=data_name,
             data_files=data_files,
             split=data_args.split,
             cache_dir=model_args.cache_dir,
@@ -61,13 +63,57 @@ def get_dataset(
             use_auth_token=True if model_args.use_auth_token else None
         )
-        if max_samples is not None:
-            max_samples_temp = min(len(dataset), max_samples)
-            dataset = dataset.select(range(max_samples_temp))
+        if max_samples is not None: # truncate dataset
+            dataset = dataset.select(range(min(len(dataset), max_samples)))
-        # TODO: adapt to the sharegpt format
-        for column_name in ["prompt", "query", "response", "history"]: # align datasets
+        def convert_format(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+            # convert dataset from sharegpt format to alpaca format
+            outputs = {"prompt": [], "query": [], "response": [], "history": []}
+            for msg_list in examples[dataset_attr.prompt]:
+                msg_list = msg_list[:len(msg_list) // 2 * 2] # should be multiples of 2
+                if len(msg_list) == 0:
+                    continue
+                msg_pairs = []
+                user_role, assistant_role = None, None
+                for idx in range(0, len(msg_list), 2):
+                    if user_role is None and assistant_role is None:
+                        user_role = msg_list[idx][dataset_attr.query]
+                        assistant_role = msg_list[idx + 1][dataset_attr.query]
+                    else:
+                        if (
+                            msg_list[idx][dataset_attr.query] != user_role
+                            or msg_list[idx+1][dataset_attr.query] != assistant_role
+                        ):
+                            raise ValueError("Only accepts conversation in u/a/u/a/u/a order.")
+                    msg_pairs.append((msg_list[idx][dataset_attr.response], msg_list[idx + 1][dataset_attr.response]))
+                if len(msg_pairs) != 0:
+                    outputs["prompt"].append(msg_pairs[-1][0])
+                    outputs["query"].append("")
+                    outputs["response"].append(msg_pairs[-1][1])
+                    outputs["history"].append(msg_pairs[:-1])
+            return outputs
+        if dataset_attr.formatting == "sharegpt": # convert format
+            column_names = list(next(iter(dataset)).keys())
+            kwargs = {}
+            if not data_args.streaming:
+                kwargs = dict(
+                    num_proc=data_args.preprocessing_num_workers,
+                    load_from_cache_file=(not data_args.overwrite_cache),
+                    desc="Converting format of dataset"
+                )
+            dataset = dataset.map(
+                convert_format,
+                batched=True,
+                remove_columns=column_names,
+                **kwargs
+            )
+        else:
+            for column_name in ["prompt", "query", "response", "history"]: # align dataset
                 if getattr(dataset_attr, column_name) and getattr(dataset_attr, column_name) != column_name:
                     dataset = dataset.rename_column(getattr(dataset_attr, column_name), column_name)
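To make the effect of the new `convert_format` branch concrete, here is a self-contained sketch (not repository code) with the role and content keys fixed to the common ShareGPT layout: the last user/assistant pair becomes `prompt`/`response`, and all earlier pairs become the multi-turn `history`.

```python
from typing import Any, Dict, List

# Standalone sketch of the sharegpt-to-alpaca conversion, with the column names
# hard-coded to "conversations" / "from" / "value". Illustrative only.
def sharegpt_to_alpaca(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
    outputs = {"prompt": [], "query": [], "response": [], "history": []}
    for msg_list in examples["conversations"]:
        msg_list = msg_list[:len(msg_list) // 2 * 2]  # drop a trailing unpaired turn
        if not msg_list:
            continue
        # pair up (user, assistant) turns; the last pair becomes prompt/response,
        # all earlier pairs become the history
        pairs = [(msg_list[i]["value"], msg_list[i + 1]["value"]) for i in range(0, len(msg_list), 2)]
        outputs["prompt"].append(pairs[-1][0])
        outputs["query"].append("")
        outputs["response"].append(pairs[-1][1])
        outputs["history"].append(pairs[:-1])
    return outputs

batch = {
    "conversations": [[
        {"from": "human", "value": "Hi"},
        {"from": "gpt", "value": "Hello!"},
        {"from": "human", "value": "What is DPO?"},
        {"from": "gpt", "value": "Direct Preference Optimization."},
    ]]
}
print(sharegpt_to_alpaca(batch))
# {'prompt': ['What is DPO?'], 'query': [''],
#  'response': ['Direct Preference Optimization.'], 'history': [[('Hi', 'Hello!')]]}
```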

View File

@@ -39,7 +39,7 @@ def preprocess_dataset(
             system = examples["system"][i] if "system" in examples else None
             yield query, response, history, system
-    def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+    def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build grouped texts with format `X1 X2 X3 ...`
         if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen)
             kwargs = dict(allowed_special="all")
@@ -62,7 +62,7 @@ def preprocess_dataset(
         }
         return result
-    def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+    def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
         # for multiturn examples, we only mask the prompt part in each prompt-response pair.
         model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -108,7 +108,7 @@ def preprocess_dataset(
         return model_inputs
-    def preprocess_packed_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+    def preprocess_packed_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build inputs with format `<bos> X1 Y1 <eos> <bos> X2 Y2 <eos>`
         # and labels with format `<ignore> ... <ignore> Y1 <eos> <ignore> ... <ignore> Y2 <eos>`
         model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -145,7 +145,7 @@ def preprocess_dataset(
         return model_inputs
-    def preprocess_unsupervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+    def preprocess_unsupervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build inputs with format `<bos> X` and labels with format `Y <eos>`
         model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -169,7 +169,7 @@ def preprocess_dataset(
         return model_inputs
-    def preprocess_pairwise_dataset(examples):
+    def preprocess_pairwise_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
         model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []}
         for query, response, history, system in construct_example(examples):
@@ -197,9 +197,10 @@ def preprocess_dataset(
             model_inputs["prompt_ids"].append(prompt_ids)
             model_inputs["chosen_ids"].append(chosen_ids)
             model_inputs["rejected_ids"].append(rejected_ids)
         return model_inputs
-    def print_supervised_dataset_example(example):
+    def print_supervised_dataset_example(example: Dict[str, List[int]]) -> None:
         print("input_ids:\n{}".format(example["input_ids"]))
         print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
         print("label_ids:\n{}".format(example["labels"]))
@@ -207,7 +208,7 @@ def preprocess_dataset(
             tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False)
         ))
-    def print_pairwise_dataset_example(example):
+    def print_pairwise_dataset_example(example: Dict[str, List[int]]) -> None:
         print("prompt_ids:\n{}".format(example["prompt_ids"]))
         print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)))
         print("chosen_ids:\n{}".format(example["chosen_ids"]))
@@ -215,7 +216,7 @@ def preprocess_dataset(
         print("rejected_ids:\n{}".format(example["rejected_ids"]))
         print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)))
-    def print_unsupervised_dataset_example(example):
+    def print_unsupervised_dataset_example(example: Dict[str, List[int]]) -> None:
         print("input_ids:\n{}".format(example["input_ids"]))
         print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
@@ -242,7 +243,7 @@ def preprocess_dataset(
     if not data_args.streaming:
         kwargs = dict(
             num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
+            load_from_cache_file=(not data_args.overwrite_cache),
             desc="Running tokenizer on dataset"
         )
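These hunks mostly tighten the type annotations, but the in-code comments summarize the labeling scheme the supervised preprocessors implement: inputs look like `<bos> X Y <eos>` while labels mask the prompt part so the loss is computed only on the response. As a rough, self-contained illustration of that scheme (not the project's actual tokenization code):

```python
# Toy example of prompt masking for supervised fine-tuning. -100 is the usual
# ignore index for Hugging Face loss computation; positions set to it do not
# contribute to the loss.
IGNORE_INDEX = -100

def build_supervised_example(prompt_ids, response_ids, bos_id=1, eos_id=2):
    input_ids = [bos_id] + prompt_ids + response_ids + [eos_id]
    labels = [IGNORE_INDEX] * (1 + len(prompt_ids)) + response_ids + [eos_id]
    return {"input_ids": input_ids, "attention_mask": [1] * len(input_ids), "labels": labels}

example = build_supervised_example(prompt_ids=[37, 42, 99], response_ids=[11, 12])
print(example["input_ids"])  # [1, 37, 42, 99, 11, 12, 2]
print(example["labels"])     # [-100, -100, -100, -100, 11, 12, 2]
```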

View File

@@ -11,6 +11,7 @@ class DatasetAttr:
     dataset_name: Optional[str] = None
     dataset_sha1: Optional[str] = None
     system_prompt: Optional[str] = None
+    subset: Optional[str] = None
     ranking: Optional[bool] = False
     formatting: Optional[Literal["alpaca", "sharegpt"]] = "alpaca"
@@ -155,6 +156,7 @@ class DataArguments:
                 dataset_attr.response = dataset_info[name]["columns"].get("response", None)
                 dataset_attr.history = dataset_info[name]["columns"].get("history", None)
+            dataset_attr.subset = dataset_info[name].get("subset", None)
             dataset_attr.ranking = dataset_info[name].get("ranking", False)
             dataset_attr.formatting = dataset_info[name].get("formatting", "alpaca")
             dataset_attr.system_prompt = prompt_list[i]
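The new `subset` attribute read here ends up as the `name` argument of `load_dataset` in the loader.py hunk above, which selects a Hub configuration. A hedged usage sketch (assuming network access and the `datasets` library; it loads the `redpajama_v2` entry from dataset_info.json with its declared subset):

```python
from datasets import load_dataset

# What the new "subset" plumbing amounts to: dataset_info.json's "subset" value
# is forwarded to load_dataset() as `name`, picking a configuration of the repo.
data_path = "togethercomputer/RedPajama-Data-V2"  # hf_hub_url of "redpajama_v2"
data_name = "default"                             # its "subset" field

dataset = load_dataset(
    path=data_path,
    name=data_name,
    split="train",
    streaming=True,  # this corpus is large, so stream instead of downloading it all
)
print(next(iter(dataset)))  # inspect one streamed record
```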