LLaMA-Factory-Mirror/data/belle_multiturn/belle_multiturn.py

import os
import json
import datasets


# Base URL of the Hugging Face hub; the HF_ENDPOINT environment variable
# overrides the default when it is set.
_HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")

# Metadata reported through ``datasets.DatasetInfo``.
_DESCRIPTION = "BELLE multiturn chat dataset."

_CITATION = """\
@article{belle2023exploring,
  title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},
  author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},
  journal={arXiv preprint arXiv:2303.14742},
  year={2023}
}
"""

# Dataset landing page, and the raw data file that lives underneath it.
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
_LICENSE = "gpl-3.0"
_URL = f"{_HOMEPAGE}/resolve/main/multiturn_chat_0.8M.json"


class BelleMultiturn(datasets.GeneratorBasedBuilder):
    """Dataset builder that turns BELLE multi-turn chats into ``conversations`` lists.

    Every input record is a JSON object with an ``instruction`` string (the
    dialogue flattened with ``Human:`` / ``Assistant:`` markers) and an
    ``output`` string (the final assistant reply).  Every yielded example
    holds a list of ``{"from": "human" | "gpt", "value": str}`` turns in
    chronological order.
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self):
        """Return the dataset metadata and the ``conversations`` feature schema."""
        features = datasets.Features({
            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager):
        """Download the raw data file and expose it as a single ``train`` split."""
        file_path = dl_manager.download(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": file_path
                }
            )
        ]

    @staticmethod
    def _split_turns(prompt: str, response: str):
        """Split a flattened dialogue into chronological human/gpt turns.

        Args:
            prompt: Dialogue text using ``Human:`` / ``Assistant:`` markers,
                normally ending with a bare ``Assistant:`` marker.
            response: The assistant reply that answers the last human query.

        Returns:
            A list of ``{"from": ..., "value": ...}`` dicts, oldest turn first,
            alternating ``"human"`` and ``"gpt"``.
        """
        human_tag, assistant_tag = "Human:", "Assistant:"

        human_idx = prompt.rfind(human_tag)
        assist_idx = prompt.rfind(assistant_tag)
        # The current query normally ends at the trailing "Assistant:" marker.
        # If that marker is absent (rfind == -1) or sits before the last
        # "Human:", the query runs to the end of the prompt instead; using the
        # raw -1 as a slice bound would silently chop characters off.
        query_end = assist_idx if assist_idx > human_idx else len(prompt)
        if human_idx == -1:
            # No "Human:" marker at all: the whole prompt is the query.
            query, history = prompt[:query_end], ""
        else:
            query = prompt[human_idx + len(human_tag):query_end]
            history = prompt[:human_idx].strip()

        # Turns are collected newest-first and reversed once at the end.
        turns = [
            {"from": "gpt", "value": response},
            {"from": "human", "value": query.strip()},
        ]
        while True:
            assist_idx = history.rfind(assistant_tag)
            if assist_idx == -1:
                break
            # Only look for "Human:" *before* the assistant marker, so that a
            # literal "Human:" inside the reply text is not taken as a marker.
            human_idx = history.rfind(human_tag, 0, assist_idx)
            if human_idx == -1:
                break
            turns.append({"from": "gpt", "value": history[assist_idx + len(assistant_tag):].strip()})
            turns.append({"from": "human", "value": history[human_idx + len(human_tag):assist_idx].strip()})
            history = history[:human_idx].strip()

        turns.reverse()
        return turns

    def _generate_examples(self, filepath: str):
        """Yield ``(key, example)`` pairs from a JSON-lines file.

        Args:
            filepath: Path to a UTF-8 file with one JSON object per line, each
                providing ``instruction`` and ``output`` strings.

        Yields:
            The zero-based line index and a dict with a ``conversations`` list.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            for key, row in enumerate(f):
                if not row.strip():
                    # Blank lines (e.g. a trailing newline) carry no record.
                    continue
                data = json.loads(row)
                prompt = data["instruction"].strip()
                response = data["output"].strip()
                yield key, {"conversations": self._split_turns(prompt, response)}
Follow HF_ENDPOINT environment variable 2024-03-20 16:31:30 +08:00			`import os`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`import json`
			`import datasets`

add orca_dpo_pairs dataset 2024-03-20 20:09:06 +08:00
Follow HF_ENDPOINT environment variable 2024-03-20 16:31:30 +08:00			`_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00
			`_DESCRIPTION = "BELLE multiturn chat dataset."`

			`_CITATION = """\`
			`@article{belle2023exploring,`
			`title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},`
			`author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},`
			`journal={arXiv preprint arXiv:2303.14742},`
			`year={2023}`
			`}`
			`"""`

add orca_dpo_pairs dataset 2024-03-20 20:09:06 +08:00			`_HOMEPAGE = "{}/datasets/BelleGroup/multiturn_chat_0.8M".format(_HF_ENDPOINT)`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`_LICENSE = "gpl-3.0"`
add orca_dpo_pairs dataset 2024-03-20 20:09:06 +08:00			`_URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json".format(_HF_ENDPOINT)`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00

			`class BelleMultiturn(datasets.GeneratorBasedBuilder):`

			`VERSION = datasets.Version("0.0.0")`

add template, modify datasets 2023-11-09 15:53:23 +08:00			`def _info(self):`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`features = datasets.Features({`
support full-parameter PPO 2023-11-16 02:08:04 +08:00			`"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`})`
			`return datasets.DatasetInfo(`
			`description=_DESCRIPTION,`
			`features=features,`
			`homepage=_HOMEPAGE,`
			`license=_LICENSE,`
			`citation=_CITATION`
			`)`

add template, modify datasets 2023-11-09 15:53:23 +08:00			`def _split_generators(self, dl_manager: datasets.DownloadManager):`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`file_path = dl_manager.download(_URL)`
			`return [`
			`datasets.SplitGenerator(`
			`name=datasets.Split.TRAIN,`
			`gen_kwargs={`
			`"filepath": file_path`
			`}`
			`)`
			`]`

add template, modify datasets 2023-11-09 15:53:23 +08:00			`def _generate_examples(self, filepath: str):`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`with open(filepath, "r", encoding="utf-8") as f:`
			`for key, row in enumerate(f):`
			`data = json.loads(row)`
support full-parameter PPO 2023-11-16 02:08:04 +08:00			`conversations = []`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`prompt = data["instruction"].strip()`
			`response = data["output"].strip()`

			`assist_idx = prompt.rfind("Assistant:")`
			`human_idx = prompt.rfind("Human:")`
			`query = prompt[human_idx+6:assist_idx].strip()`
			`prompt = prompt[:human_idx].strip()`
support full-parameter PPO 2023-11-16 02:08:04 +08:00			`conversations.insert(0, {"from": "gpt", "value": response})`
			`conversations.insert(0, {"from": "human", "value": query})`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00
			`while prompt.rfind("Assistant:") != -1:`
			`assist_idx = prompt.rfind("Assistant:")`
			`human_idx = prompt.rfind("Human:")`
			`if human_idx != -1:`
			`old_query = prompt[human_idx+6:assist_idx].strip()`
			`old_resp = prompt[assist_idx+10:].strip()`
support full-parameter PPO 2023-11-16 02:08:04 +08:00			`conversations.insert(0, {"from": "gpt", "value": old_resp})`
			`conversations.insert(0, {"from": "human", "value": old_query})`
add belle multiturn dataset 2023-06-16 20:01:16 +08:00			`else:`
			`break`
			`prompt = prompt[:human_idx].strip()`

support full-parameter PPO 2023-11-16 02:08:04 +08:00			`yield key, {"conversations": conversations}`