diff --git a/data/belle_multiturn/belle_multiturn.py b/data/belle_multiturn/belle_multiturn.py
index 6e31f0e6..5c3fce26 100644
--- a/data/belle_multiturn/belle_multiturn.py
+++ b/data/belle_multiturn/belle_multiturn.py
@@ -1,5 +1,6 @@
-import os
 import json
+import os
+
 import datasets
 
 
@@ -22,31 +23,19 @@ _URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0
 
 
 class BelleMultiturn(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str):
         with open(filepath, "r", encoding="utf-8") as f:
@@ -58,7 +47,7 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
 
                 assist_idx = prompt.rfind("Assistant:")
                 human_idx = prompt.rfind("Human:")
-                query = prompt[human_idx+6:assist_idx].strip()
+                query = prompt[human_idx + 6 : assist_idx].strip()
                 prompt = prompt[:human_idx].strip()
                 conversations.insert(0, {"from": "gpt", "value": response})
                 conversations.insert(0, {"from": "human", "value": query})
@@ -67,8 +56,8 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
                     assist_idx = prompt.rfind("Assistant:")
                     human_idx = prompt.rfind("Human:")
                     if human_idx != -1:
-                        old_query = prompt[human_idx+6:assist_idx].strip()
-                        old_resp = prompt[assist_idx+10:].strip()
+                        old_query = prompt[human_idx + 6 : assist_idx].strip()
+                        old_resp = prompt[assist_idx + 10 :].strip()
                         conversations.insert(0, {"from": "gpt", "value": old_resp})
                         conversations.insert(0, {"from": "human", "value": old_query})
                     else:
diff --git a/data/dataset_info.json b/data/dataset_info.json
index b6ed7b89..e396ed50 100644
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -318,6 +318,28 @@
     "ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
     "ranking": true
   },
+  "dpo_mix_en": {
+    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
+    "subset": "en",
+    "ranking": true,
+    "columns": {
+      "prompt": "prompt",
+      "response": "answer",
+      "system": "system",
+      "history": "history"
+    }
+  },
+  "dpo_mix_zh": {
+    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
+    "subset": "zh",
+    "ranking": true,
+    "columns": {
+      "prompt": "prompt",
+      "response": "answer",
+      "system": "system",
+      "history": "history"
+    }
+  },
   "orca_dpo_de" : {
     "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
     "ranking": true
diff --git a/data/example_dataset/example_dataset.py b/data/example_dataset/example_dataset.py
index 5d6cfa22..bf0baa54 100644
--- a/data/example_dataset/example_dataset.py
+++ b/data/example_dataset/example_dataset.py
@@ -1,7 +1,8 @@
 import json
-import datasets
 from typing import Any, Dict, Generator, List, Tuple
 
+import datasets
+
 
 _DESCRIPTION = "An example of dataset."
_CITATION = "" @@ -11,34 +12,24 @@ _URL = "examples.json" class ExampleDataset(datasets.GeneratorBasedBuilder): - VERSION = datasets.Version("0.0.0") def _info(self) -> datasets.DatasetInfo: - features = datasets.Features({ - "instruction": datasets.Value("string"), - "input": datasets.Value("string"), - "output": datasets.Value("string"), - "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))) - }) + features = datasets.Features( + { + "instruction": datasets.Value("string"), + "input": datasets.Value("string"), + "output": datasets.Value("string"), + "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))), + } + ) return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION + description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION ) def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: file_path = dl_manager.download(_URL) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": file_path - } - ) - ] + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})] def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]: example_dataset = json.load(open(filepath, "r", encoding="utf-8")) diff --git a/data/hh_rlhf_en/hh_rlhf_en.py b/data/hh_rlhf_en/hh_rlhf_en.py index 2839af7d..abe4673c 100644 --- a/data/hh_rlhf_en/hh_rlhf_en.py +++ b/data/hh_rlhf_en/hh_rlhf_en.py @@ -1,8 +1,10 @@ -import os import json -import datasets +import os from typing import List +import datasets + + _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co") _DESCRIPTION = "Human preference data about helpfulness and harmlessness." 
_CITATION = "" @@ -14,50 +16,37 @@ _URLS = { _URL + "harmless-base/train.jsonl.gz", _URL + "helpful-base/train.jsonl.gz", _URL + "helpful-online/train.jsonl.gz", - _URL + "helpful-rejection-sampled/train.jsonl.gz" + _URL + "helpful-rejection-sampled/train.jsonl.gz", ], "test": [ _URL + "harmless-base/test.jsonl.gz", _URL + "helpful-base/test.jsonl.gz", _URL + "helpful-online/test.jsonl.gz", - _URL + "helpful-rejection-sampled/test.jsonl.gz" - ] + _URL + "helpful-rejection-sampled/test.jsonl.gz", + ], } class HhRlhfEn(datasets.GeneratorBasedBuilder): - VERSION = datasets.Version("0.0.0") def _info(self) -> datasets.DatasetInfo: - features = datasets.Features({ - "instruction": datasets.Value("string"), - "output": datasets.Sequence(datasets.Value("string")), - "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))) - }) + features = datasets.Features( + { + "instruction": datasets.Value("string"), + "output": datasets.Sequence(datasets.Value("string")), + "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))), + } + ) return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION + description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION ) def _split_generators(self, dl_manager: datasets.DownloadManager): file_path = dl_manager.download_and_extract(_URLS) return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepaths": file_path["train"] - } - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepaths": file_path["test"] - } - ) + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_path["train"]}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": file_path["test"]}), ] def _generate_examples(self, filepaths: List[str]): @@ -70,12 +59,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder): rejected = data["rejected"] assist_idx = rejected.rfind("\n\nAssistant: ") - r_reject = rejected[assist_idx+13:].strip() + r_reject = rejected[assist_idx + 13 :].strip() assist_idx = chosen.rfind("\n\nAssistant: ") - r_accept = chosen[assist_idx+13:].strip() + r_accept = chosen[assist_idx + 13 :].strip() human_idx = chosen.rfind("\n\nHuman: ") - query = chosen[human_idx+9:assist_idx].strip() + query = chosen[human_idx + 9 : assist_idx].strip() prompt = chosen[:human_idx] history = [] @@ -83,16 +72,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder): assist_idx = prompt.rfind("\n\nAssistant: ") human_idx = prompt.rfind("\n\nHuman: ") if human_idx != -1: - old_query = prompt[human_idx+9:assist_idx].strip() - old_resp = prompt[assist_idx+13:].strip() + old_query = prompt[human_idx + 9 : assist_idx].strip() + old_resp = prompt[assist_idx + 13 :].strip() history.insert(0, (old_query, old_resp)) else: break prompt = prompt[:human_idx] - yield key, { - "instruction": query, - "output": [r_accept, r_reject], - "history": history - } + yield key, {"instruction": query, "output": [r_accept, r_reject], "history": history} key += 1 diff --git a/data/ultra_chat/ultra_chat.py b/data/ultra_chat/ultra_chat.py index 2e8a75e1..e7df3ff3 100644 --- a/data/ultra_chat/ultra_chat.py +++ b/data/ultra_chat/ultra_chat.py @@ -1,8 +1,10 @@ -import os import json -import datasets +import os from typing import List +import datasets + + _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co") _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse 
Multi-round Dialogue Data." @@ -24,31 +26,19 @@ _BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jso class UltraChat(datasets.GeneratorBasedBuilder): - VERSION = datasets.Version("0.0.0") def _info(self): - features = datasets.Features({ - "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}] - }) + features = datasets.Features( + {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]} + ) return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION + description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION ) def _split_generators(self, dl_manager: datasets.DownloadManager): - file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepaths": file_paths - } - ) - ] + file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})] def _generate_examples(self, filepaths: List[str]): for filepath in filepaths: @@ -56,7 +46,7 @@ class UltraChat(datasets.GeneratorBasedBuilder): for row in f: try: data = json.loads(row) - except: + except Exception: continue key: int = data["id"] content: List[str] = data["data"] @@ -64,8 +54,7 @@ class UltraChat(datasets.GeneratorBasedBuilder): content.pop(-1) if len(content) < 2: continue - conversations = [{ - "from": "human" if i % 2 == 0 else "gpt", - "value": content[i] - } for i in range(len(content))] + conversations = [ + {"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content)) + ] yield key, {"conversations": conversations}