add belle multiturn dataset

2023-06-16 20:01:16 +08:00 · 2023-06-16 20:01:16 +08:00 · 334d1a6d26
parent a6c4b141cd
commit 334d1a6d26
2 changed files with 86 additions and 1 deletions
--- a/data/belle_multiturn/belle_multiturn.py
+++ b/data/belle_multiturn/belle_multiturn.py
@ -0,0 +1,79 @@
+import json
+from typing import Any, Dict, Iterator, List, Tuple
+
+import datasets
+
+
+# Short human-readable description passed to DatasetInfo.
+_DESCRIPTION = "BELLE multiturn chat dataset."
+
+# BibTeX entry passed to DatasetInfo as the citation.
+# NOTE(review): BibTeX normally separates authors with " and "; this entry uses
+# commas. Confirm against the upstream citation before changing the text.
+_CITATION = """\
+@article{belle2023exploring,
+  title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},
+  author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},
+  journal={arXiv preprint arXiv:2303.14742},
+  year={2023}
+}
+"""
+
+# Dataset page on the Hugging Face Hub and the license identifier, both passed to DatasetInfo.
+_HOMEPAGE = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M"
+_LICENSE = "gpl-3.0"
+# Direct link to the raw data file that the builder downloads.
+_URL = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
+
+
+class BelleMultiturn(datasets.GeneratorBasedBuilder):
+    """Builder that turns BELLE multi-turn chat records into query/response/history examples.
+
+    Each source record keeps the whole dialogue in its ``instruction`` field as
+    text tagged with ``Human:`` / ``Assistant:`` markers, and the final reply in
+    ``output``. The builder splits that text into the last user query plus the
+    list of earlier ``(query, response)`` turns.
+    """
+
+    VERSION = datasets.Version("0.0.0")
+
+    # Speaker markers used inside the ``instruction`` text.
+    _HUMAN_TAG = "Human:"
+    _ASSISTANT_TAG = "Assistant:"
+
+    def _info(self) -> datasets.DatasetInfo:
+        """Describe the example schema and the dataset metadata."""
+        features = datasets.Features({
+            "instruction": datasets.Value("string"),
+            "output": datasets.Value("string"),
+            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
+        })
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Download the data file and expose it as a single train split."""
+        file_path = dl_manager.download(_URL)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": file_path
+                }
+            )
+        ]
+
+    @classmethod
+    def _split_dialogue(cls, prompt: str) -> Tuple[str, List[Tuple[str, str]]]:
+        """Split a tagged dialogue into the last query and the earlier turns.
+
+        Args:
+            prompt: Dialogue text such as ``"Human: a Assistant: b Human: c Assistant:"``.
+
+        Returns:
+            ``(query, history)`` where ``query`` is the text of the last human
+            turn and ``history`` lists the earlier ``(query, response)`` pairs,
+            oldest first. Text without any ``Human:`` marker is returned as the
+            query with an empty history.
+        """
+        human_tag, assist_tag = cls._HUMAN_TAG, cls._ASSISTANT_TAG
+
+        assist_idx = prompt.rfind(assist_tag)
+        human_idx = prompt.rfind(human_tag)
+        # The query normally ends at the trailing "Assistant:" cue; when that cue
+        # is absent (or only appears in an earlier turn) it runs to the end.
+        query_end = assist_idx if assist_idx > human_idx else len(prompt)
+        if human_idx == -1:
+            # No speaker marker at all: nothing to peel off as history.
+            return prompt[:query_end].strip(), []
+
+        query = prompt[human_idx + len(human_tag):query_end].strip()
+        context = prompt[:human_idx].strip()
+
+        # Walk backwards through the remaining text, one (query, response) turn at a time.
+        turns: List[Tuple[str, str]] = []
+        while True:
+            assist_idx = context.rfind(assist_tag)
+            if assist_idx == -1:
+                break
+            # Only look for the human marker *before* the reply, so a reply that
+            # happens to contain the text "Human:" is not mistaken for a new turn.
+            human_idx = context.rfind(human_tag, 0, assist_idx)
+            if human_idx == -1:
+                break
+            old_query = context[human_idx + len(human_tag):assist_idx].strip()
+            old_resp = context[assist_idx + len(assist_tag):].strip()
+            turns.append((old_query, old_resp))
+            context = context[:human_idx].strip()
+
+        turns.reverse()  # collected newest-first; callers expect oldest-first
+        return query, turns
+
+    def _generate_examples(self, filepath: str) -> Iterator[Tuple[int, Dict[str, Any]]]:
+        """Yield ``(key, example)`` pairs for multi-turn chat with history.
+
+        The file is read as one JSON object per line; each object must provide
+        the ``instruction`` and ``output`` keys. Blank lines are skipped.
+
+        Args:
+            filepath: Local path of the downloaded data file.
+
+        Yields:
+            The line index as key, and a dict with the last user query
+            (``instruction``), the final reply (``output``) and the earlier
+            turns (``history``).
+        """
+        with open(filepath, "r", encoding="utf-8") as f:
+            for key, row in enumerate(f):
+                if not row.strip():
+                    continue  # tolerate blank lines instead of failing inside json.loads
+                data = json.loads(row)
+                query, history = self._split_dialogue(data["instruction"].strip())
+
+                yield key, {
+                    "instruction": query,
+                    "output": data["output"].strip(),
+                    "history": history
+                }
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@ -30,7 +30,13 @@
    "hf_hub_url": "BelleGroup/school_math_0.25M"
  },
  "belle_multiturn": {
-    "hf_hub_url": "BelleGroup/multiturn_chat_0.8M"
+    "script_url": "belle_multiturn",
+    "columns": {
+      "prompt": "instruction",
+      "query": "",
+      "response": "output",
+      "history": "history"
+    }
  },
  "guanaco": {
    "hf_hub_url": "JosephusCheung/GuanacoDataset"