From 74e49cca957d0bacd2c1d688e995a7370bef69f7 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Sat, 15 Jun 2024 19:31:56 -0300
Subject: [PATCH 1/4] Add Magpie and Webinstruct dataset samples
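Adds two dataset samples that are claimed to deliver superior performance: Magpie (from Allen AI) and WebInstruct (from TIGER-Lab).
Also corrects the misspelled dataset keys "fileweb" and "fileweb_edu" to "fineweb" and "fineweb_edu".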
---
data/dataset_info.json | 26 +++++++++++++++++++++++---
1 file changed, 23 insertions(+), 3 deletions(-)
diff --git a/data/dataset_info.json b/data/dataset_info.json
index 1d226b3a..e321196a 100644
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -524,13 +524,13 @@
"prompt": "text"
}
},
- "fileweb": {
+ "fineweb": {
"hf_hub_url": "HuggingFaceFW/fineweb",
"columns": {
"prompt": "text"
}
},
- "fileweb_edu": {
+ "fineweb_edu": {
"hf_hub_url": "HuggingFaceFW/fineweb-edu",
"columns": {
"prompt": "text"
@@ -550,5 +550,25 @@
"prompt": "content"
},
"folder": "python"
+ },
+ "Magpie-Pro-300K-Filtered": {
+ "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered",
+ "columns": {
+ "messages": "conversations"
+ },
+ "tags": {
+ "role_tag": "from",
+ "content_tag": "value",
+ "user_tag": "human",
+ "assistant_tag": "gpt"
+ },
+ "formatting": "sharegpt"
+ },
+ "WebInstructSub": {
+ "hf_hub_url": "TIGER-Lab/WebInstructSub",
+ "columns": {
+ "prompt": "question",
+ "response": "answer"
+ }
}
-}
\ No newline at end of file
+}
From 103664203cf5a8562b5b000676ce95a6da2b7698 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Sun, 16 Jun 2024 11:19:25 -0300
Subject: [PATCH 2/4] Update README.md
Add Magpie and Webinstruct to README
---
README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/README.md b/README.md
index cb9a7222..30c90e9b 100644
--- a/README.md
+++ b/README.md
@@ -270,6 +270,8 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
+- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
+- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
From 82d5c5c1e8dda61523dee4be351c18731e4a5b9c Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Sun, 16 Jun 2024 11:22:06 -0300
Subject: [PATCH 3/4] Update README_zh.md
Add Magpie and WebInstruct to README
---
README_zh.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README_zh.md b/README_zh.md
index 5c005f30..531d9b56 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -270,8 +270,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
-
-
+- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
+- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
偏好数据集
From 3ec57ac239a4f469bbae013ec8760307fb190189 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Sun, 16 Jun 2024 11:34:31 -0300
Subject: [PATCH 4/4] Update README_zh.md
Fix details tag in datasets menus
---
README_zh.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/README_zh.md b/README_zh.md
index 531d9b56..711596f0 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -273,6 +273,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
+
+
偏好数据集
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)