LLaMA-Factory-310P3/data/belle_multiturn/belle_multiturn.py

import json
from typing import Any, Dict, Iterator, List, Tuple

import datasets


# Short human-readable summary passed to DatasetInfo.description.
_DESCRIPTION = "BELLE multiturn chat dataset."

# BibTeX entry passed to DatasetInfo.citation.
_CITATION = """\
@article{belle2023exploring,
  title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},
  author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},
  journal={arXiv preprint arXiv:2303.14742},
  year={2023}
}
"""

# Dataset landing page and license identifier, passed to DatasetInfo.
_HOMEPAGE = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M"
_LICENSE = "gpl-3.0"
# Direct download URL of the raw data file fetched in _split_generators.
_URL = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"


class BelleMultiturn(datasets.GeneratorBasedBuilder):
    """Dataset builder that converts BELLE multi-turn chats into query/response/history examples.

    Each raw record stores the whole conversation in its ``instruction`` field as
    text delimited by ``Human:`` / ``Assistant:`` markers, with the final
    assistant reply in ``output``. The builder splits that text so the last human
    turn becomes ``instruction`` and all earlier turns become ``history``.
    """

    VERSION = datasets.Version("0.0.0")

    # Role markers that delimit the turns inside the raw "instruction" text.
    _HUMAN_TAG = "Human:"
    _ASSISTANT_TAG = "Assistant:"

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata and the schema of the generated examples."""
        features = datasets.Features({
            "instruction": datasets.Value("string"),
            "output": datasets.Value("string"),
            # List of [query, response] pairs, oldest turn first.
            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the data file from ``_URL`` and expose it as a single TRAIN split."""
        file_path = dl_manager.download(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": file_path
                }
            )
        ]

    def _generate_examples(self, filepath: str) -> Iterator[Tuple[int, Dict[str, Any]]]:
        """Yield ``(key, example)`` pairs parsed from a file with one JSON object per line.

        Args:
            filepath: Path to the downloaded data file. Every non-blank line must
                be a JSON object with string fields ``instruction`` and ``output``.

        Yields:
            ``(line_index, example)`` where ``example`` has the keys
            ``instruction`` (the last human turn), ``output`` (the stripped
            ``output`` field) and ``history`` (earlier ``(query, response)``
            pairs, oldest first).

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks ``instruction`` or ``output``.
        """
        human_tag = self._HUMAN_TAG
        assistant_tag = self._ASSISTANT_TAG
        human_len = len(human_tag)
        assistant_len = len(assistant_tag)

        with open(filepath, "r", encoding="utf-8") as f:
            for key, row in enumerate(f):
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # crashing in json.loads.
                if not row.strip():
                    continue

                data = json.loads(row)
                prompt = data["instruction"].strip()
                response = data["output"].strip()

                # --- Final turn: text between the last "Human:" and the trailing "Assistant:".
                assist_idx = prompt.rfind(assistant_tag)
                human_idx = prompt.rfind(human_tag)
                # rfind returns -1 when a marker is missing; never feed that
                # sentinel into a slice, where it would mean "last character".
                query_start = human_idx + human_len if human_idx != -1 else 0
                # Only cut at the assistant marker when it actually follows the query.
                query_end = assist_idx if assist_idx >= query_start else len(prompt)
                query = prompt[query_start:query_end].strip()
                prompt = prompt[:human_idx].strip() if human_idx != -1 else ""

                # --- Earlier turns: peel (query, response) pairs off the end of the text.
                history = []
                while True:
                    assist_idx = prompt.rfind(assistant_tag)
                    if assist_idx == -1:
                        break
                    # Search only before the assistant marker so that a "Human:"
                    # substring inside the reply is not mistaken for a new turn.
                    human_idx = prompt.rfind(human_tag, 0, assist_idx)
                    if human_idx == -1:
                        break
                    old_query = prompt[human_idx + human_len:assist_idx].strip()
                    old_resp = prompt[assist_idx + assistant_len:].strip()
                    history.append((old_query, old_resp))
                    prompt = prompt[:human_idx].strip()
                # Turns were collected newest-first; restore chronological order.
                history.reverse()

                yield key, {
                    "instruction": query,
                    "output": response,
                    "history": history
                }
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`import json`
			`import datasets`
			`from typing import Any, Dict, List`


			`_DESCRIPTION = "BELLE multiturn chat dataset."`

			`_CITATION = """\`
			`@article{belle2023exploring,`
			`title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},`
			`author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},`
			`journal={arXiv preprint arXiv:2303.14742},`
			`year={2023}`
			`}`
			`"""`

			`_HOMEPAGE = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M"`
			`_LICENSE = "gpl-3.0"`
			`_URL = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"`


			`class BelleMultiturn(datasets.GeneratorBasedBuilder):`

			`VERSION = datasets.Version("0.0.0")`

			`def _info(self) -> datasets.DatasetInfo:`
			`features = datasets.Features({`
			`"instruction": datasets.Value("string"),`
			`"output": datasets.Value("string"),`
			`"history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))`
			`})`
			`return datasets.DatasetInfo(`
			`description=_DESCRIPTION,`
			`features=features,`
			`homepage=_HOMEPAGE,`
			`license=_LICENSE,`
			`citation=_CITATION`
			`)`

			`def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:`
			`file_path = dl_manager.download(_URL)`
			`return [`
			`datasets.SplitGenerator(`
			`name=datasets.Split.TRAIN,`
			`gen_kwargs={`
			`"filepath": file_path`
			`}`
			`)`
			`]`

			`def _generate_examples(self, filepath: str) -> Dict[int, Dict[str, Any]]: # generate multi-turn chat with history`
			`with open(filepath, "r", encoding="utf-8") as f:`
			`for key, row in enumerate(f):`
			`data = json.loads(row)`
			`prompt = data["instruction"].strip()`
			`response = data["output"].strip()`

			`assist_idx = prompt.rfind("Assistant:")`
			`human_idx = prompt.rfind("Human:")`
			`query = prompt[human_idx+6:assist_idx].strip()`
			`prompt = prompt[:human_idx].strip()`
			`history = []`

			`while prompt.rfind("Assistant:") != -1:`
			`assist_idx = prompt.rfind("Assistant:")`
			`human_idx = prompt.rfind("Human:")`
			`if human_idx != -1:`
			`old_query = prompt[human_idx+6:assist_idx].strip()`
			`old_resp = prompt[assist_idx+10:].strip()`
			`history.insert(0, (old_query, old_resp))`
			`else:`
			`break`
			`prompt = prompt[:human_idx].strip()`

			`yield key, {`
			`"instruction": query,`
			`"output": response,`
			`"history": history`
			`}`