From 74e49cca957d0bacd2c1d688e995a7370bef69f7 Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sat, 15 Jun 2024 19:31:56 -0300 Subject: [PATCH 1/4] Add Magpie and Webinstruct dataset samples Adds two dataset samples that are claimed to deliver superior performance: Magpie (from Allen AI) and WebInstruct (from TIGER-Lab). --- data/dataset_info.json | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index 1d226b3a..e321196a 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -524,13 +524,13 @@ "prompt": "text" } }, - "fileweb": { + "fineweb": { "hf_hub_url": "HuggingFaceFW/fineweb", "columns": { "prompt": "text" } }, - "fileweb_edu": { + "fineweb_edu": { "hf_hub_url": "HuggingFaceFW/fineweb-edu", "columns": { "prompt": "text" @@ -550,5 +550,25 @@ "prompt": "content" }, "folder": "python" + }, + "Magpie-Pro-300K-Filtered": { + "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered", + "columns": { + "messages": "conversations" + }, + "tags": { + "role_tag": "from", + "content_tag": "value", + "user_tag": "human", + "assistant_tag": "gpt" + }, + "formatting": "sharegpt" + }, + "WebInstructSub": { + "hf_hub_url": "TIGER-Lab/WebInstructSub", + "columns": { + "prompt": "question", + "response": "answer" + } } -} \ No newline at end of file +} From 103664203cf5a8562b5b000676ce95a6da2b7698 Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 11:19:25 -0300 Subject: [PATCH 2/4] Update README.md Add Magpie and Webinstruct to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index cb9a7222..30c90e9b 100644 --- a/README.md +++ b/README.md @@ -270,6 +270,8 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros 
(de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) From 82d5c5c1e8dda61523dee4be351c18731e4a5b9c Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 11:22:06 -0300 Subject: [PATCH 3/4] Update README_zh.md Add Magpie and WebInstruct to README --- README_zh.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README_zh.md b/README_zh.md index 5c005f30..531d9b56 100644 --- a/README_zh.md +++ b/README_zh.md @@ -270,8 +270,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) - - +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
偏好数据集 From 3ec57ac239a4f469bbae013ec8760307fb190189 Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 11:34:31 -0300 Subject: [PATCH 4/4] Update README_zh.md Fix details tag in datasets menus --- README_zh.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_zh.md b/README_zh.md index 531d9b56..711596f0 100644 --- a/README_zh.md +++ b/README_zh.md @@ -273,6 +273,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) - [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) +
+
偏好数据集 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)