LLaMA-Factory-Mirror/data/belle_multiturn/belle_multiturn.py

import json
import os

import datasets


# Base URL of the Hugging Face Hub; overridable through the HF_ENDPOINT
# environment variable, falling back to the public hub.
_HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")

# Human-readable summary attached to the dataset metadata.
_DESCRIPTION = "BELLE multiturn chat dataset."

# BibTeX entry attached to the dataset metadata.
_CITATION = """\
@article{belle2023exploring,
  title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},
  author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},
  journal={arXiv preprint arXiv:2303.14742},
  year={2023}
}
"""

_LICENSE = "gpl-3.0"

# Dataset landing page on the hub, and the raw data file resolved beneath it.
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
_URL = f"{_HOMEPAGE}/resolve/main/multiturn_chat_0.8M.json"


class BelleMultiturn(datasets.GeneratorBasedBuilder):
    """Dataset builder that turns BELLE multi-turn chat records into conversation lists.

    Every source record stores the dialogue so far flattened into a single
    ``instruction`` string that uses ``Human:`` / ``Assistant:`` markers, plus
    the final reply in ``output``. The builder splits that string back into
    alternating ``human`` / ``gpt`` turns.
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self):
        """Return the dataset metadata and the feature schema.

        The only feature is ``conversations``: a list of dicts with string
        fields ``from`` and ``value``.
        """
        features = datasets.Features(
            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager):
        """Download the raw data file and expose it as a single ``train`` split."""
        file_path = dl_manager.download(_URL)
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]

    def _generate_examples(self, filepath: str):
        """Yield ``(key, example)`` pairs parsed from a JSON-lines file.

        Args:
            filepath: Path of the downloaded file; each non-blank line is a
                JSON object with ``instruction`` and ``output`` string fields.

        Yields:
            The zero-based line index and a dict whose ``conversations`` list
            holds the turns in chronological order, ending with the final
            human query and the ``output`` reply.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks ``instruction`` or ``output``.
        """
        human_tag = "Human:"
        assist_tag = "Assistant:"

        with open(filepath, "r", encoding="utf-8") as f:
            for key, row in enumerate(f):
                if not row.strip():
                    # A blank line carries no record; skipping it avoids a
                    # JSONDecodeError while keeping keys equal to line indices.
                    continue

                data = json.loads(row)
                prompt = data["instruction"].strip()
                response = data["output"].strip()

                # Final turn: the text between the last "Human:" and the
                # trailing "Assistant:". rfind() returns -1 when a marker is
                # absent, so guard both ends instead of slicing with -1
                # (which would silently drop characters).
                assist_idx = prompt.rfind(assist_tag)
                human_idx = prompt.rfind(human_tag)
                query_start = human_idx + len(human_tag) if human_idx != -1 else 0
                # Only an "Assistant:" located after the last "Human:" closes
                # the query; otherwise the query runs to the end of the prompt.
                query_end = assist_idx if assist_idx > human_idx else len(prompt)
                query = prompt[query_start:query_end].strip()
                prompt = prompt[:human_idx].strip() if human_idx != -1 else ""

                # Turns are collected newest-first and reversed once at the
                # end, which avoids repeated front insertions.
                turns = [
                    {"from": "gpt", "value": response},
                    {"from": "human", "value": query},
                ]

                # Peel earlier (query, reply) pairs off the tail of the
                # remaining history until a marker is missing.
                while True:
                    assist_idx = prompt.rfind(assist_tag)
                    human_idx = prompt.rfind(human_tag)
                    if assist_idx == -1 or human_idx == -1:
                        break

                    old_query = prompt[human_idx + len(human_tag) : assist_idx].strip()
                    old_resp = prompt[assist_idx + len(assist_tag) :].strip()
                    turns.append({"from": "gpt", "value": old_resp})
                    turns.append({"from": "human", "value": old_query})
                    prompt = prompt[:human_idx].strip()

                turns.reverse()
                yield key, {"conversations": turns}
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`import json`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`import os`

add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`import datasets`

add orca_dpo_pairs dataset 2024-03-20 20:09:06 +08:00
Follow HF_ENDPOINT environment variable 2024-03-20 16:31:30 +08:00			`_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00
			`_DESCRIPTION = "BELLE multiturn chat dataset."`

			`_CITATION = """\`
			`@article{belle2023exploring,`
			`title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},`
			`author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},`
			`journal={arXiv preprint arXiv:2303.14742},`
			`year={2023}`
			`}`
			`"""`

add orca_dpo_pairs dataset 2024-03-20 20:09:06 +08:00			`_HOMEPAGE = "{}/datasets/BelleGroup/multiturn_chat_0.8M".format(_HF_ENDPOINT)`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`_LICENSE = "gpl-3.0"`
add orca_dpo_pairs dataset 2024-03-20 20:09:06 +08:00			`_URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json".format(_HF_ENDPOINT)`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00

			`class BelleMultiturn(datasets.GeneratorBasedBuilder):`
			`VERSION = datasets.Version("0.0.0")`

add template, modify datasets 2023-11-09 15:53:23 +08:00			`def _info(self):`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`features = datasets.Features(`
			`{"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}`
			`)`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`return datasets.DatasetInfo(`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`)`

add template, modify datasets 2023-11-09 15:53:23 +08:00			`def _split_generators(self, dl_manager: datasets.DownloadManager):`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`file_path = dl_manager.download(_URL)`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00
add template, modify datasets 2023-11-09 15:53:23 +08:00			`def _generate_examples(self, filepath: str):`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`with open(filepath, "r", encoding="utf-8") as f:`
			`for key, row in enumerate(f):`
			`data = json.loads(row)`
support full-parameter PPO 2023-11-16 02:08:04 +08:00			`conversations = []`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`prompt = data["instruction"].strip()`
			`response = data["output"].strip()`

			`assist_idx = prompt.rfind("Assistant:")`
			`human_idx = prompt.rfind("Human:")`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`query = prompt[human_idx + 6 : assist_idx].strip()`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`prompt = prompt[:human_idx].strip()`
support full-parameter PPO 2023-11-16 02:08:04 +08:00			`conversations.insert(0, {"from": "gpt", "value": response})`
			`conversations.insert(0, {"from": "human", "value": query})`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00
			`while prompt.rfind("Assistant:") != -1:`
			`assist_idx = prompt.rfind("Assistant:")`
			`human_idx = prompt.rfind("Human:")`
			`if human_idx != -1:`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`old_query = prompt[human_idx + 6 : assist_idx].strip()`
			`old_resp = prompt[assist_idx + 10 :].strip()`
support full-parameter PPO 2023-11-16 02:08:04 +08:00			`conversations.insert(0, {"from": "gpt", "value": old_resp})`
			`conversations.insert(0, {"from": "human", "value": old_query})`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`else:`
			`break`
			`prompt = prompt[:human_idx].strip()`

support full-parameter PPO 2023-11-16 02:08:04 +08:00			`yield key, {"conversations": conversations}`