support sharegpt format, add datasets

hiyouga 2023-11-02 23:10:04 +08:00
parent c1edb0cf1b
commit a837172413
7 changed files with 306 additions and 163 deletions

View File

@@ -86,39 +86,61 @@ Please refer to [template.py](src/llmtuner/extras/template.py) for a full list o
 ## Provided Datasets
 
-- For pre-training:
-  - [Wiki Demo (en)](data/wiki_demo.txt)
-  - [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
-  - [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
-  - [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
-  - [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
-- For supervised fine-tuning:
-  - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
-  - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-  - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-  - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
-  - [Self-cognition (zh)](data/self_cognition.json)
-  - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
-  - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
-  - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
-  - [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
-  - [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
-  - [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
-  - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
-  - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
-  - [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
-  - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
-  - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
-  - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
-  - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
-  - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
-  - [UltraChat (en)](https://github.com/thunlp/UltraChat)
-  - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
-  - [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
-- For reward modeling or DPO training:
-  - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
-  - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
-  - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+<details><summary>Pre-training datasets</summary>
+
+- [Wiki Demo (en)](data/wiki_demo.txt)
+- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
+- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
+- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
+- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
+- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
+- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
+- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
+- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+
+</details>
+
+<details><summary>Supervised fine-tuning datasets</summary>
+
+- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
+- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Self-cognition (zh)](data/self_cognition.json)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
+- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
+- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
+- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
+- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
+- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
+- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
+- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
+- [UltraChat (en)](https://github.com/thunlp/UltraChat)
+- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
+- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
+- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
+- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
+- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
+- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
+- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
+- [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
+- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
+- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
+- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
+- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
+- [LMSYS Chat (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
+- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
+
+</details>
+
+<details><summary>Preference datasets</summary>
+
+- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+
+</details>
 
 Please refer to [data/README.md](data/README.md) for details.
@@ -135,8 +157,8 @@ huggingface-cli login
 - 🤗Transformers, Datasets, Accelerate, PEFT and TRL
 - sentencepiece, protobuf and tiktoken
 - fire, jieba, rouge-chinese and nltk (used at evaluation and predict)
-- gradio and matplotlib (used in web_demo.py)
-- uvicorn, fastapi and sse-starlette (used in api_demo.py)
+- gradio and matplotlib (used in web UI)
+- uvicorn, fastapi and sse-starlette (used in API)
 
 And **powerful GPUs**!
@@ -144,7 +166,7 @@ And **powerful GPUs**!
 ### Data Preparation (optional)
 
-Please refer to `data/example_dataset` for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset.
+Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset.
 
 > [!NOTE]
 > Please update `data/dataset_info.json` to use your custom dataset. About the format of this file, please refer to `data/README.md`.
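
For reference, a minimal custom dataset file in the default alpaca layout could look like the sketch below (illustrative only; the `instruction`/`input`/`output` column names are the documented defaults, and the record contents are invented):

```json
[
  {
    "instruction": "Summarize the following text.",
    "input": "LLaMA Factory supports both alpaca and sharegpt data formats.",
    "output": "The toolkit accepts alpaca- and sharegpt-formatted training data."
  }
]
```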

View File

@@ -86,41 +86,63 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 ## 数据集
 
-- 用于预训练:
-  - [Wiki Demo (en)](data/wiki_demo.txt)
-  - [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
-  - [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
-  - [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
-  - [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
-- 用于指令监督微调:
-  - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
-  - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-  - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-  - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
-  - [Self-cognition (zh)](data/self_cognition.json)
-  - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
-  - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
-  - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
-  - [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
-  - [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
-  - [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
-  - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
-  - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
-  - [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
-  - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
-  - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
-  - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
-  - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
-  - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
-  - [UltraChat (en)](https://github.com/thunlp/UltraChat)
-  - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
-  - [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
-- 用于训练奖励模型或 DPO 训练:
-  - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
-  - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
-  - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+<details><summary>预训练数据集</summary>
+
+- [Wiki Demo (en)](data/wiki_demo.txt)
+- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
+- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
+- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
+- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
+- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
+- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
+- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
+- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+
+</details>
+
+<details><summary>指令微调数据集</summary>
+
+- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
+- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Self-cognition (zh)](data/self_cognition.json)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
+- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
+- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
+- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
+- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
+- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
+- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
+- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
+- [UltraChat (en)](https://github.com/thunlp/UltraChat)
+- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
+- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
+- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
+- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
+- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
+- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
+- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
+- [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
+- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
+- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
+- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
+- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
+- [LMSYS Chat (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
+- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
+
+</details>
+
+<details><summary>偏好数据集</summary>
+
+- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
+- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+
+</details>
 
-使用方法请参考 [data/README.md](data/README_zh.md) 文件。
+使用方法请参考 [data/README_zh.md](data/README_zh.md) 文件。
 
 部分数据集的使用需要确认,我们推荐使用下述命令登录您的 Hugging Face 账户。
@@ -144,10 +166,10 @@ huggingface-cli login
 ### 数据准备(可跳过)
 
-关于数据集文件的格式,请参考 `data/example_dataset` 文件夹的内容。构建自定义数据集时,既可以使用单个 `.json` 文件,也可以使用一个[数据加载脚本](https://huggingface.co/docs/datasets/dataset_script)和多个文件。
+关于数据集文件的格式,请参考 [data/README_zh.md](data/README_zh.md) 的内容。构建自定义数据集时,既可以使用单个 `.json` 文件,也可以使用一个[数据加载脚本](https://huggingface.co/docs/datasets/dataset_script)和多个文件。
 
 > [!NOTE]
-> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件,该文件的格式请参考 `data/README.md`。
+> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件,该文件的格式请参考 `data/README_zh.md`。
 
 ### 环境搭建(可跳过)

View File

@@ -6,7 +6,9 @@ If you are using a custom dataset, please provide your dataset definition in the
   "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore below 2 arguments)",
   "file_name": "the name of the dataset file in this directory. (required if above are not specified)",
   "file_sha1": "the SHA-1 hash value of the dataset file. (optional)",
+  "subset": "",
   "ranking": "whether the examples contain ranked responses or not. (default: false)",
+  "formatting": "",
   "columns": {
     "prompt": "the name of the column in the datasets containing the prompts. (default: instruction)",
     "query": "the name of the column in the datasets containing the queries. (default: input)",

View File

@@ -71,14 +71,14 @@
   "guanaco": {
     "hf_hub_url": "JosephusCheung/GuanacoDataset"
   },
-  "belle_0.5m": {
-    "hf_hub_url": "BelleGroup/train_0.5M_CN"
+  "belle_2m": {
+    "hf_hub_url": "BelleGroup/train_2M_CN"
   },
   "belle_1m": {
     "hf_hub_url": "BelleGroup/train_1M_CN"
   },
-  "belle_2m": {
-    "hf_hub_url": "BelleGroup/train_2M_CN"
+  "belle_0.5m": {
+    "hf_hub_url": "BelleGroup/train_0.5M_CN"
   },
   "belle_dialog": {
     "hf_hub_url": "BelleGroup/generated_chat_0.4M"
@@ -90,80 +90,116 @@
     "script_url": "belle_multiturn",
     "columns": {
       "prompt": "instruction",
-      "query": "",
       "response": "output",
       "history": "history"
     }
   },
+  "ultra_chat": {
+    "script_url": "ultra_chat",
+    "columns": {
+      "prompt": "instruction",
+      "response": "output",
+      "history": "history"
+    }
+  },
+  "open_platypus": {
+    "hf_hub_url": "garage-bAInd/Open-Platypus"
+  },
   "codealpaca": {
     "hf_hub_url": "sahil2801/CodeAlpaca-20k"
   },
   "alpaca_cot": {
     "hf_hub_url": "QingyiSi/Alpaca-CoT"
   },
-  "firefly": {
-    "hf_hub_url": "YeungNLP/firefly-train-1.1M",
-    "columns": {
-      "prompt": "input",
-      "query": "",
-      "response": "target",
-      "history": ""
-    }
-  },
   "mathinstruct": {
     "hf_hub_url": "TIGER-Lab/MathInstruct",
     "columns": {
       "prompt": "instruction",
-      "query": "",
-      "response": "output",
-      "history": ""
+      "response": "output"
+    }
+  },
+  "firefly": {
+    "hf_hub_url": "YeungNLP/firefly-train-1.1M",
+    "columns": {
+      "prompt": "input",
+      "response": "target"
     }
   },
   "webqa": {
     "hf_hub_url": "suolyer/webqa",
     "columns": {
       "prompt": "input",
-      "query": "",
-      "response": "output",
-      "history": ""
+      "response": "output"
     }
   },
-  "ultra_chat": {
-    "script_url": "ultra_chat",
-    "columns": {
-      "prompt": "instruction",
-      "query": "",
-      "response": "output",
-      "history": "history"
-    }
-  },
-  "novel_tokens512_50k": {
+  "webnovel": {
     "hf_hub_url": "zxbsmk/webnovel_cn"
   },
   "adgen": {
     "hf_hub_url": "HasturOfficial/adgen",
     "columns": {
       "prompt": "content",
-      "query": "",
-      "response": "summary",
-      "history": ""
+      "response": "summary"
     }
   },
-  "comparison_gpt4_en": {
-    "file_name": "comparison_gpt4_data_en.json",
-    "file_sha1": "96fa18313544e22444fe20eead7754b17da452ae",
-    "ranking": true
-  },
-  "comparison_gpt4_zh": {
-    "file_name": "comparison_gpt4_data_zh.json",
-    "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
-    "ranking": true
-  },
+  "sharegpt_hyper": {
+    "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k",
+    "columns": {
+      "prompt": "conversations",
+      "query": "from",
+      "response": "value"
+    },
+    "formatting": "sharegpt"
+  },
+  "sharegpt4": {
+    "hf_hub_url": "shibing624/sharegpt_gpt4",
+    "columns": {
+      "prompt": "conversations",
+      "query": "from",
+      "response": "value"
+    },
+    "formatting": "sharegpt"
+  },
+  "ultrachat_200k": {
+    "hf_hub_url": "HuggingFaceH4/ultrachat_200k",
+    "columns": {
+      "prompt": "messages",
+      "query": "role",
+      "response": "content"
+    },
+    "formatting": "sharegpt"
+  },
+  "agent_instruct": {
+    "hf_hub_url": "THUDM/AgentInstruct",
+    "columns": {
+      "prompt": "conversations",
+      "query": "from",
+      "response": "value"
+    },
+    "formatting": "sharegpt"
+  },
+  "lmsys_chat": {
+    "hf_hub_url": "lmsys/lmsys-chat-1m",
+    "columns": {
+      "prompt": "conversation",
+      "query": "role",
+      "response": "content"
+    },
+    "formatting": "sharegpt"
+  },
+  "evol_instruct": {
+    "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
+    "columns": {
+      "prompt": "conversations",
+      "query": "from",
+      "response": "value"
+    },
+    "formatting": "sharegpt"
+  },
   "hh_rlhf_en": {
     "script_url": "hh_rlhf_en",
     "columns": {
       "prompt": "instruction",
-      "query": "",
       "response": "output",
       "history": "history"
     },
@@ -191,59 +227,71 @@
     },
     "ranking": true
   },
+  "comparison_gpt4_en": {
+    "file_name": "comparison_gpt4_data_en.json",
+    "file_sha1": "96fa18313544e22444fe20eead7754b17da452ae",
+    "ranking": true
+  },
+  "comparison_gpt4_zh": {
+    "file_name": "comparison_gpt4_data_zh.json",
+    "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
+    "ranking": true
+  },
   "wiki_demo": {
     "file_name": "wiki_demo.txt",
     "file_sha1": "b2288edb05b233e5b35250fd4b308a5fa21fa66d",
     "columns": {
-      "prompt": "text",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "text"
     }
   },
   "refinedweb": {
     "hf_hub_url": "tiiuae/falcon-refinedweb",
     "columns": {
-      "prompt": "content",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "content"
     }
   },
+  "redpajama_v2": {
+    "hf_hub_url": "togethercomputer/RedPajama-Data-V2",
+    "columns": {
+      "prompt": "raw_content"
+    },
+    "subset": "default"
+  },
   "wikipedia_en": {
     "hf_hub_url": "olm/olm-wikipedia-20221220",
     "columns": {
-      "prompt": "text",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "text"
     }
   },
   "wikipedia_zh": {
     "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
     "columns": {
-      "prompt": "completion",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "completion"
     }
   },
+  "pile": {
+    "hf_hub_url": "EleutherAI/pile",
+    "columns": {
+      "prompt": "text"
+    },
+    "subset": "all"
+  },
+  "skypile": {
+    "hf_hub_url": "Skywork/SkyPile-150B",
+    "columns": {
+      "prompt": "text"
+    }
+  },
   "the_stack": {
     "hf_hub_url": "bigcode/the-stack",
     "columns": {
-      "prompt": "content",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "content"
     }
   },
   "starcoder": {
     "hf_hub_url": "bigcode/starcoderdata",
     "columns": {
-      "prompt": "content",
-      "query": "",
-      "response": "",
-      "history": ""
+      "prompt": "content"
     }
   }
 }

View File

@ -1,5 +1,5 @@
import os import os
from typing import TYPE_CHECKING, List, Union from typing import TYPE_CHECKING, Any, Dict, List, Union
from datasets import concatenate_datasets, interleave_datasets, load_dataset from datasets import concatenate_datasets, interleave_datasets, load_dataset
@@ -26,22 +26,23 @@ def get_dataset(
         if dataset_attr.load_from == "hf_hub":
             data_path = dataset_attr.dataset_name
+            data_name = dataset_attr.subset
             data_files = None
         elif dataset_attr.load_from == "script":
             data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+            data_name = dataset_attr.subset
             data_files = None
         elif dataset_attr.load_from == "file":
-            data_path = None
+            data_path, data_name = None, None
             data_files: List[str] = []
-            if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is directory
+            if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # directory
                 for file_name in os.listdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)):
                     data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name, file_name))
                     if data_path is None:
                         data_path = EXT2TYPE.get(file_name.split(".")[-1], None)
                     else:
-                        assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file type does not match."
-            elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # single file
+                        assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file types are not identical."
+            elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is file
                 data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name))
                 data_path = EXT2TYPE.get(dataset_attr.dataset_name.split(".")[-1], None)
             else:
@@ -53,7 +54,8 @@ def get_dataset(
             raise NotImplementedError
 
         dataset = load_dataset(
-            data_path,
+            path=data_path,
+            name=data_name,
             data_files=data_files,
             split=data_args.split,
             cache_dir=model_args.cache_dir,
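
Passing the subset through as `name` is what lets entries such as `pile` (registered above with `"subset": "all"`) select a Hub configuration. Conceptually, the call reduces to something like the sketch below (other arguments of the loader omitted):

```python
from datasets import load_dataset

# Rough equivalent of the loader call for the "pile" entry:
# the entry's "subset" value becomes the `name` argument.
dataset = load_dataset(path="EleutherAI/pile", name="all", split="train")
```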
@@ -61,15 +63,59 @@ def get_dataset(
             use_auth_token=True if model_args.use_auth_token else None
         )
 
-        if max_samples is not None:
-            max_samples_temp = min(len(dataset), max_samples)
-            dataset = dataset.select(range(max_samples_temp))
-
-        # TODO: adapt to the sharegpt format
-
-        for column_name in ["prompt", "query", "response", "history"]: # align datasets
-            if getattr(dataset_attr, column_name) and getattr(dataset_attr, column_name) != column_name:
-                dataset = dataset.rename_column(getattr(dataset_attr, column_name), column_name)
+        if max_samples is not None: # truncate dataset
+            dataset = dataset.select(range(min(len(dataset), max_samples)))
+
+        def convert_format(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+            # convert dataset from sharegpt format to alpaca format
+            outputs = {"prompt": [], "query": [], "response": [], "history": []}
+            for msg_list in examples[dataset_attr.prompt]:
+                msg_list = msg_list[:len(msg_list) // 2 * 2] # should be multiples of 2
+                if len(msg_list) == 0:
+                    continue
+
+                msg_pairs = []
+                user_role, assistant_role = None, None
+                for idx in range(0, len(msg_list), 2):
+                    if user_role is None and assistant_role is None:
+                        user_role = msg_list[idx][dataset_attr.query]
+                        assistant_role = msg_list[idx + 1][dataset_attr.query]
+                    else:
+                        if (
+                            msg_list[idx][dataset_attr.query] != user_role
+                            or msg_list[idx+1][dataset_attr.query] != assistant_role
+                        ):
+                            raise ValueError("Only accepts conversation in u/a/u/a/u/a order.")
+
+                    msg_pairs.append((msg_list[idx][dataset_attr.response], msg_list[idx + 1][dataset_attr.response]))
+
+                if len(msg_pairs) != 0:
+                    outputs["prompt"].append(msg_pairs[-1][0])
+                    outputs["query"].append("")
+                    outputs["response"].append(msg_pairs[-1][1])
+                    outputs["history"].append(msg_pairs[:-1])
+
+            return outputs
+
+        if dataset_attr.formatting == "sharegpt": # convert format
+            column_names = list(next(iter(dataset)).keys())
+            kwargs = {}
+            if not data_args.streaming:
+                kwargs = dict(
+                    num_proc=data_args.preprocessing_num_workers,
+                    load_from_cache_file=(not data_args.overwrite_cache),
+                    desc="Converting format of dataset"
+                )
+
+            dataset = dataset.map(
+                convert_format,
+                batched=True,
+                remove_columns=column_names,
+                **kwargs
+            )
+        else:
+            for column_name in ["prompt", "query", "response", "history"]: # align dataset
+                if getattr(dataset_attr, column_name) and getattr(dataset_attr, column_name) != column_name:
+                    dataset = dataset.rename_column(getattr(dataset_attr, column_name), column_name)
 
         if dataset_attr.system_prompt: # add system prompt
             system_prompt = dataset_attr.system_prompt
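
To see what `convert_format` produces, here is a standalone sketch of the same pairing logic applied to one conversation (not part of the commit; the `from`/`value` keys mirror the sharegpt column mapping registered in `dataset_info.json`, and role checking is omitted):

```python
from typing import Any, Dict, List

def sharegpt_to_alpaca(msg_list: List[Dict[str, str]]) -> Dict[str, Any]:
    # Mirrors convert_format for a single conversation: pair up user/assistant
    # turns, use the last pair as prompt/response and earlier pairs as history.
    # Assumes at least one complete user/assistant pair.
    msg_list = msg_list[:len(msg_list) // 2 * 2]  # drop a trailing unpaired turn
    pairs = [(msg_list[i]["value"], msg_list[i + 1]["value"])
             for i in range(0, len(msg_list), 2)]
    return {"prompt": pairs[-1][0], "query": "",
            "response": pairs[-1][1], "history": pairs[:-1]}

conversation = [
    {"from": "human", "value": "Hi!"},
    {"from": "gpt", "value": "Hello! How can I help?"},
    {"from": "human", "value": "Tell me a joke."},
    {"from": "gpt", "value": "Why did the GPU melt? Too many layers."},
]
print(sharegpt_to_alpaca(conversation))
# {'prompt': 'Tell me a joke.', 'query': '',
#  'response': 'Why did the GPU melt? Too many layers.',
#  'history': [('Hi!', 'Hello! How can I help?')]}
```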

View File

@@ -39,7 +39,7 @@ def preprocess_dataset(
             system = examples["system"][i] if "system" in examples else None
             yield query, response, history, system
 
-    def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+    def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build grouped texts with format `X1 X2 X3 ...`
         if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen)
             kwargs = dict(allowed_special="all")
@@ -62,7 +62,7 @@ def preprocess_dataset(
         }
         return result
 
-    def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+    def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
         # for multiturn examples, we only mask the prompt part in each prompt-response pair.
         model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -108,7 +108,7 @@ def preprocess_dataset(
         return model_inputs
 
-    def preprocess_packed_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+    def preprocess_packed_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build inputs with format `<bos> X1 Y1 <eos> <bos> X2 Y2 <eos>`
         # and labels with format `<ignore> ... <ignore> Y1 <eos> <ignore> ... <ignore> Y2 <eos>`
         model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -145,7 +145,7 @@ def preprocess_dataset(
         return model_inputs
 
-    def preprocess_unsupervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
+    def preprocess_unsupervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build inputs with format `<bos> X` and labels with format `Y <eos>`
         model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -169,10 +169,10 @@ def preprocess_dataset(
         return model_inputs
 
-    def preprocess_pairwise_dataset(examples):
+    def preprocess_pairwise_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
         # build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
         model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []}
         for query, response, history, system in construct_example(examples):
             if not (isinstance(query, str) and isinstance(response, list) and query != "" and len(response) > 1):
                 continue
@@ -197,9 +197,10 @@ def preprocess_dataset(
             model_inputs["prompt_ids"].append(prompt_ids)
             model_inputs["chosen_ids"].append(chosen_ids)
             model_inputs["rejected_ids"].append(rejected_ids)
+
         return model_inputs
 
-    def print_supervised_dataset_example(example):
+    def print_supervised_dataset_example(example: Dict[str, List[int]]) -> None:
         print("input_ids:\n{}".format(example["input_ids"]))
         print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
         print("label_ids:\n{}".format(example["labels"]))
@@ -207,7 +208,7 @@ def preprocess_dataset(
             tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False)
         ))
 
-    def print_pairwise_dataset_example(example):
+    def print_pairwise_dataset_example(example: Dict[str, List[int]]) -> None:
         print("prompt_ids:\n{}".format(example["prompt_ids"]))
         print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)))
         print("chosen_ids:\n{}".format(example["chosen_ids"]))
@@ -215,7 +216,7 @@ def preprocess_dataset(
         print("rejected_ids:\n{}".format(example["rejected_ids"]))
         print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)))
 
-    def print_unsupervised_dataset_example(example):
+    def print_unsupervised_dataset_example(example: Dict[str, List[int]]) -> None:
         print("input_ids:\n{}".format(example["input_ids"]))
         print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
@@ -242,13 +243,13 @@ def preprocess_dataset(
         if not data_args.streaming:
             kwargs = dict(
                 num_proc=data_args.preprocessing_num_workers,
-                load_from_cache_file=not data_args.overwrite_cache,
+                load_from_cache_file=(not data_args.overwrite_cache),
                 desc="Running tokenizer on dataset"
             )
 
         dataset = dataset.map(
             preprocess_func,
             batched=True,
             remove_columns=column_names,
             **kwargs
         )
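
The tightened annotations make explicit that every `preprocess_*_dataset` function maps a batch of raw columns to lists of token-id lists. As a reminder of the masking convention the comments describe, a toy sketch with invented token ids (IGNORE_INDEX is -100 in the project's constants):

```python
IGNORE_INDEX = -100  # label value that the loss function skips

# Hypothetical ids for one pair: prompt `<bos> X` and response `Y <eos>`.
prompt_ids = [1, 345, 678]   # invented
response_ids = [910, 2]      # invented

input_ids = prompt_ids + response_ids
labels = [IGNORE_INDEX] * len(prompt_ids) + response_ids
print(input_ids)  # [1, 345, 678, 910, 2]
print(labels)     # [-100, -100, -100, 910, 2]
```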

View File

@@ -11,6 +11,7 @@ class DatasetAttr:
     dataset_name: Optional[str] = None
     dataset_sha1: Optional[str] = None
     system_prompt: Optional[str] = None
+    subset: Optional[str] = None
     ranking: Optional[bool] = False
     formatting: Optional[Literal["alpaca", "sharegpt"]] = "alpaca"
@@ -155,6 +156,7 @@ class DataArguments:
                 dataset_attr.response = dataset_info[name]["columns"].get("response", None)
                 dataset_attr.history = dataset_info[name]["columns"].get("history", None)
 
+            dataset_attr.subset = dataset_info[name].get("subset", None)
             dataset_attr.ranking = dataset_info[name].get("ranking", False)
             dataset_attr.formatting = dataset_info[name].get("formatting", "alpaca")
             dataset_attr.system_prompt = prompt_list[i]
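
Taken together, a `subset` key in `dataset_info.json` now flows through `DatasetAttr` into the `name` argument of `load_dataset`. A condensed, self-contained sketch of that plumbing (the dataclass below is a trimmed stand-in, not the real `DatasetAttr`):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class DatasetAttrSketch:  # stand-in with only the fields relevant here
    dataset_name: Optional[str] = None
    subset: Optional[str] = None
    formatting: Optional[str] = "alpaca"

dataset_info = {"pile": {"hf_hub_url": "EleutherAI/pile", "subset": "all"}}

attr = DatasetAttrSketch(dataset_name=dataset_info["pile"]["hf_hub_url"])
attr.subset = dataset_info["pile"].get("subset", None)
attr.formatting = dataset_info["pile"].get("formatting", "alpaca")
assert attr.subset == "all"  # later passed as load_dataset(..., name=attr.subset)
```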