add dpo mix dataset

hiyouga 2024-04-20 01:31:38 +08:00
parent ba559a659a
commit 6339edefff
5 changed files with 81 additions and 105 deletions

View File

@@ -1,5 +1,6 @@
-import os
 import json
+import os
+
 import datasets
@@ -22,31 +23,19 @@ _URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0
 class BelleMultiturn(datasets.GeneratorBasedBuilder):
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str):
         with open(filepath, "r", encoding="utf-8") as f:
@@ -58,7 +47,7 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
             assist_idx = prompt.rfind("Assistant:")
             human_idx = prompt.rfind("Human:")
-            query = prompt[human_idx+6:assist_idx].strip()
+            query = prompt[human_idx + 6 : assist_idx].strip()
             prompt = prompt[:human_idx].strip()
 
             conversations.insert(0, {"from": "gpt", "value": response})
             conversations.insert(0, {"from": "human", "value": query})
@@ -67,8 +56,8 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
                 assist_idx = prompt.rfind("Assistant:")
                 human_idx = prompt.rfind("Human:")
                 if human_idx != -1:
-                    old_query = prompt[human_idx+6:assist_idx].strip()
-                    old_resp = prompt[assist_idx+10:].strip()
+                    old_query = prompt[human_idx + 6 : assist_idx].strip()
+                    old_resp = prompt[assist_idx + 10 :].strip()
                     conversations.insert(0, {"from": "gpt", "value": old_resp})
                     conversations.insert(0, {"from": "human", "value": old_query})
                 else:
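
The two slicing hunks above only reformat; the parsing strategy is unchanged: turns are peeled off the tail of the prompt with rfind, and the offsets 6 and 10 are len("Human:") and len("Assistant:"). A standalone sketch of that loop on an assumed BELLE-style prompt (the sample strings are hypothetical, not dataset content):

# Peel turns off the end of the prompt, most recent first.
prompt = "Human:Hi!Assistant:Hello!Human:How are you?Assistant:"
response = "Fine, thanks."

assist_idx = prompt.rfind("Assistant:")
human_idx = prompt.rfind("Human:")
query = prompt[human_idx + 6 : assist_idx].strip()  # 6 == len("Human:")
prompt = prompt[:human_idx].strip()

conversations = [{"from": "human", "value": query}, {"from": "gpt", "value": response}]

while True:
    assist_idx = prompt.rfind("Assistant:")
    human_idx = prompt.rfind("Human:")
    if human_idx == -1:
        break
    old_query = prompt[human_idx + 6 : assist_idx].strip()
    old_resp = prompt[assist_idx + 10 :].strip()  # 10 == len("Assistant:")
    conversations.insert(0, {"from": "gpt", "value": old_resp})
    conversations.insert(0, {"from": "human", "value": old_query})
    prompt = prompt[:human_idx].strip()

print(conversations)  # alternates human/gpt starting from the first turn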

View File

@@ -318,6 +318,28 @@
     "ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
     "ranking": true
   },
+  "dpo_mix_en": {
+    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
+    "subset": "en",
+    "ranking": true,
+    "columns": {
+      "prompt": "prompt",
+      "response": "answer",
+      "system": "system",
+      "history": "history"
+    }
+  },
+  "dpo_mix_zh": {
+    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
+    "subset": "zh",
+    "ranking": true,
+    "columns": {
+      "prompt": "prompt",
+      "response": "answer",
+      "system": "system",
+      "history": "history"
+    }
+  },
   "orca_dpo_de" : {
     "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
     "ranking": true

View File

@@ -1,7 +1,8 @@
 import json
-import datasets
 from typing import Any, Dict, Generator, List, Tuple
 
+import datasets
+
 _DESCRIPTION = "An example of dataset."
 _CITATION = ""
@@ -11,34 +12,24 @@ _URL = "examples.json"
 class ExampleDataset(datasets.GeneratorBasedBuilder):
     VERSION = datasets.Version("0.0.0")
 
     def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features({
-            "instruction": datasets.Value("string"),
-            "input": datasets.Value("string"),
-            "output": datasets.Value("string"),
-            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
-        })
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "input": datasets.Value("string"),
+                "output": datasets.Value("string"),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
         example_dataset = json.load(open(filepath, "r", encoding="utf-8"))
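
The features declared in _info() fix the record layout of examples.json: three string fields plus a history of [query, response] string pairs. A hypothetical record matching that schema (values are illustrative only):

import json

record = {
    "instruction": "Summarize the following text.",
    "input": "LLaMA Factory supports several efficient fine-tuning methods.",
    "output": "A framework for efficient fine-tuning.",
    "history": [["What is LLaMA Factory?", "A fine-tuning framework."]],  # [query, response] pairs
}
print(json.dumps(record, indent=2))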

View File

@@ -1,8 +1,10 @@
-import os
 import json
-import datasets
+import os
 from typing import List
+
+import datasets
+
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 
 _DESCRIPTION = "Human preference data about helpfulness and harmlessness."
 _CITATION = ""
@@ -14,50 +16,37 @@ _URLS = {
         _URL + "harmless-base/train.jsonl.gz",
         _URL + "helpful-base/train.jsonl.gz",
         _URL + "helpful-online/train.jsonl.gz",
-        _URL + "helpful-rejection-sampled/train.jsonl.gz"
+        _URL + "helpful-rejection-sampled/train.jsonl.gz",
     ],
     "test": [
         _URL + "harmless-base/test.jsonl.gz",
         _URL + "helpful-base/test.jsonl.gz",
         _URL + "helpful-online/test.jsonl.gz",
-        _URL + "helpful-rejection-sampled/test.jsonl.gz"
-    ]
+        _URL + "helpful-rejection-sampled/test.jsonl.gz",
+    ],
 }
 
 
 class HhRlhfEn(datasets.GeneratorBasedBuilder):
     VERSION = datasets.Version("0.0.0")
 
     def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features({
-            "instruction": datasets.Value("string"),
-            "output": datasets.Sequence(datasets.Value("string")),
-            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
-        })
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "output": datasets.Sequence(datasets.Value("string")),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download_and_extract(_URLS)
         return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepaths": file_path["train"]
-                }
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    "filepaths": file_path["test"]
-                }
-            )
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_path["train"]}),
+            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": file_path["test"]}),
         ]
 
     def _generate_examples(self, filepaths: List[str]):
@@ -70,12 +59,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
                     rejected = data["rejected"]
 
                     assist_idx = rejected.rfind("\n\nAssistant: ")
-                    r_reject = rejected[assist_idx+13:].strip()
+                    r_reject = rejected[assist_idx + 13 :].strip()
                     assist_idx = chosen.rfind("\n\nAssistant: ")
-                    r_accept = chosen[assist_idx+13:].strip()
+                    r_accept = chosen[assist_idx + 13 :].strip()
 
                     human_idx = chosen.rfind("\n\nHuman: ")
-                    query = chosen[human_idx+9:assist_idx].strip()
+                    query = chosen[human_idx + 9 : assist_idx].strip()
                     prompt = chosen[:human_idx]
                     history = []
@@ -83,16 +72,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
                         assist_idx = prompt.rfind("\n\nAssistant: ")
                         human_idx = prompt.rfind("\n\nHuman: ")
                         if human_idx != -1:
-                            old_query = prompt[human_idx+9:assist_idx].strip()
-                            old_resp = prompt[assist_idx+13:].strip()
+                            old_query = prompt[human_idx + 9 : assist_idx].strip()
+                            old_resp = prompt[assist_idx + 13 :].strip()
                             history.insert(0, (old_query, old_resp))
                         else:
                             break
                         prompt = prompt[:human_idx]
 
-                    yield key, {
-                        "instruction": query,
-                        "output": [r_accept, r_reject],
-                        "history": history
-                    }
+                    yield key, {"instruction": query, "output": [r_accept, r_reject], "history": history}
                     key += 1
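
As in the BELLE script, the magic numbers are marker lengths: 9 == len("\n\nHuman: ") and 13 == len("\n\nAssistant: "), and rfind walks the dialogue backwards. A standalone sketch on assumed sample strings (not dataset content):

chosen = "\n\nHuman: Is water wet?\n\nAssistant: Yes, by most definitions."
rejected = "\n\nHuman: Is water wet?\n\nAssistant: No comment."

assist_idx = rejected.rfind("\n\nAssistant: ")
r_reject = rejected[assist_idx + 13 :].strip()
assist_idx = chosen.rfind("\n\nAssistant: ")
r_accept = chosen[assist_idx + 13 :].strip()

human_idx = chosen.rfind("\n\nHuman: ")
query = chosen[human_idx + 9 : assist_idx].strip()

# Single-turn sample, so the history stays empty.
print({"instruction": query, "output": [r_accept, r_reject], "history": []})
# {'instruction': 'Is water wet?', 'output': ['Yes, by most definitions.', 'No comment.'], 'history': []}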

View File

@@ -1,8 +1,10 @@
-import os
 import json
-import datasets
+import os
 from typing import List
+
+import datasets
+
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 
 _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
@@ -24,31 +26,19 @@ _BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jso
 class UltraChat(datasets.GeneratorBasedBuilder):
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)]  # multiple shards
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepaths": file_paths
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})]
 
     def _generate_examples(self, filepaths: List[str]):
         for filepath in filepaths:
@@ -56,7 +46,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
                 for row in f:
                     try:
                         data = json.loads(row)
-                    except:
+                    except Exception:
                         continue
                     key: int = data["id"]
                     content: List[str] = data["data"]
@@ -64,8 +54,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
                         content.pop(-1)
                     if len(content) < 2:
                         continue
-                    conversations = [{
-                        "from": "human" if i % 2 == 0 else "gpt",
-                        "value": content[i]
-                    } for i in range(len(content))]
+                    conversations = [
+                        {"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
+                    ]
                     yield key, {"conversations": conversations}
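
One detail worth noting in _split_generators: the shard URL is formatted in two stages. The doubled braces in _BASE_DATA_URL survive the first .format() (which fills in the endpoint) as a literal {idx} placeholder for the per-shard call. A sketch (the .jsonl extension is assumed, since the constant is truncated in the hunk header above):

_HF_ENDPOINT = "https://huggingface.co"
_BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl".format(_HF_ENDPOINT)
print(_BASE_DATA_URL.format(idx=0))
# https://huggingface.co/datasets/stingning/ultrachat/resolve/main/train_0.jsonl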