add dpo mix dataset
This commit is contained in:
parent
ba559a659a
commit
6339edefff
|
@ -1,5 +1,6 @@
|
||||||
import os
|
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
|
|
||||||
|
|
||||||
|
@ -22,31 +23,19 @@ _URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0
|
||||||
|
|
||||||
|
|
||||||
class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
||||||
|
|
||||||
VERSION = datasets.Version("0.0.0")
|
VERSION = datasets.Version("0.0.0")
|
||||||
|
|
||||||
def _info(self):
|
def _info(self):
|
||||||
features = datasets.Features({
|
features = datasets.Features(
|
||||||
"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
|
{"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
|
||||||
})
|
)
|
||||||
return datasets.DatasetInfo(
|
return datasets.DatasetInfo(
|
||||||
description=_DESCRIPTION,
|
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
|
||||||
features=features,
|
|
||||||
homepage=_HOMEPAGE,
|
|
||||||
license=_LICENSE,
|
|
||||||
citation=_CITATION
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
||||||
file_path = dl_manager.download(_URL)
|
file_path = dl_manager.download(_URL)
|
||||||
return [
|
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
|
||||||
datasets.SplitGenerator(
|
|
||||||
name=datasets.Split.TRAIN,
|
|
||||||
gen_kwargs={
|
|
||||||
"filepath": file_path
|
|
||||||
}
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
def _generate_examples(self, filepath: str):
|
def _generate_examples(self, filepath: str):
|
||||||
with open(filepath, "r", encoding="utf-8") as f:
|
with open(filepath, "r", encoding="utf-8") as f:
|
||||||
|
@ -58,7 +47,7 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
||||||
|
|
||||||
assist_idx = prompt.rfind("Assistant:")
|
assist_idx = prompt.rfind("Assistant:")
|
||||||
human_idx = prompt.rfind("Human:")
|
human_idx = prompt.rfind("Human:")
|
||||||
query = prompt[human_idx+6:assist_idx].strip()
|
query = prompt[human_idx + 6 : assist_idx].strip()
|
||||||
prompt = prompt[:human_idx].strip()
|
prompt = prompt[:human_idx].strip()
|
||||||
conversations.insert(0, {"from": "gpt", "value": response})
|
conversations.insert(0, {"from": "gpt", "value": response})
|
||||||
conversations.insert(0, {"from": "human", "value": query})
|
conversations.insert(0, {"from": "human", "value": query})
|
||||||
|
@ -67,8 +56,8 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
||||||
assist_idx = prompt.rfind("Assistant:")
|
assist_idx = prompt.rfind("Assistant:")
|
||||||
human_idx = prompt.rfind("Human:")
|
human_idx = prompt.rfind("Human:")
|
||||||
if human_idx != -1:
|
if human_idx != -1:
|
||||||
old_query = prompt[human_idx+6:assist_idx].strip()
|
old_query = prompt[human_idx + 6 : assist_idx].strip()
|
||||||
old_resp = prompt[assist_idx+10:].strip()
|
old_resp = prompt[assist_idx + 10 :].strip()
|
||||||
conversations.insert(0, {"from": "gpt", "value": old_resp})
|
conversations.insert(0, {"from": "gpt", "value": old_resp})
|
||||||
conversations.insert(0, {"from": "human", "value": old_query})
|
conversations.insert(0, {"from": "human", "value": old_query})
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -318,6 +318,28 @@
|
||||||
"ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
|
"ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
|
||||||
"ranking": true
|
"ranking": true
|
||||||
},
|
},
|
||||||
|
"dpo_mix_en": {
|
||||||
|
"hf_hub_url": "hiyouga/DPO-En-Zh-20k",
|
||||||
|
"subset": "en",
|
||||||
|
"ranking": true,
|
||||||
|
"columns": {
|
||||||
|
"prompt": "prompt",
|
||||||
|
"response": "answer",
|
||||||
|
"system": "system",
|
||||||
|
"history": "history"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"dpo_mix_zh": {
|
||||||
|
"hf_hub_url": "hiyouga/DPO-En-Zh-20k",
|
||||||
|
"subset": "zh",
|
||||||
|
"ranking": true,
|
||||||
|
"columns": {
|
||||||
|
"prompt": "prompt",
|
||||||
|
"response": "answer",
|
||||||
|
"system": "system",
|
||||||
|
"history": "history"
|
||||||
|
}
|
||||||
|
},
|
||||||
"orca_dpo_de" : {
|
"orca_dpo_de" : {
|
||||||
"hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
|
"hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
|
||||||
"ranking": true
|
"ranking": true
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
import json
|
import json
|
||||||
import datasets
|
|
||||||
from typing import Any, Dict, Generator, List, Tuple
|
from typing import Any, Dict, Generator, List, Tuple
|
||||||
|
|
||||||
|
import datasets
|
||||||
|
|
||||||
|
|
||||||
_DESCRIPTION = "An example of dataset."
|
_DESCRIPTION = "An example of dataset."
|
||||||
_CITATION = ""
|
_CITATION = ""
|
||||||
|
@ -11,34 +12,24 @@ _URL = "examples.json"
|
||||||
|
|
||||||
|
|
||||||
class ExampleDataset(datasets.GeneratorBasedBuilder):
|
class ExampleDataset(datasets.GeneratorBasedBuilder):
|
||||||
|
|
||||||
VERSION = datasets.Version("0.0.0")
|
VERSION = datasets.Version("0.0.0")
|
||||||
|
|
||||||
def _info(self) -> datasets.DatasetInfo:
|
def _info(self) -> datasets.DatasetInfo:
|
||||||
features = datasets.Features({
|
features = datasets.Features(
|
||||||
"instruction": datasets.Value("string"),
|
{
|
||||||
"input": datasets.Value("string"),
|
"instruction": datasets.Value("string"),
|
||||||
"output": datasets.Value("string"),
|
"input": datasets.Value("string"),
|
||||||
"history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
|
"output": datasets.Value("string"),
|
||||||
})
|
"history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
|
||||||
|
}
|
||||||
|
)
|
||||||
return datasets.DatasetInfo(
|
return datasets.DatasetInfo(
|
||||||
description=_DESCRIPTION,
|
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
|
||||||
features=features,
|
|
||||||
homepage=_HOMEPAGE,
|
|
||||||
license=_LICENSE,
|
|
||||||
citation=_CITATION
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
|
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
|
||||||
file_path = dl_manager.download(_URL)
|
file_path = dl_manager.download(_URL)
|
||||||
return [
|
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
|
||||||
datasets.SplitGenerator(
|
|
||||||
name=datasets.Split.TRAIN,
|
|
||||||
gen_kwargs={
|
|
||||||
"filepath": file_path
|
|
||||||
}
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
|
def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
|
||||||
example_dataset = json.load(open(filepath, "r", encoding="utf-8"))
|
example_dataset = json.load(open(filepath, "r", encoding="utf-8"))
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
import os
|
|
||||||
import json
|
import json
|
||||||
import datasets
|
import os
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
import datasets
|
||||||
|
|
||||||
|
|
||||||
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||||
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
|
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
|
||||||
_CITATION = ""
|
_CITATION = ""
|
||||||
|
@ -14,50 +16,37 @@ _URLS = {
|
||||||
_URL + "harmless-base/train.jsonl.gz",
|
_URL + "harmless-base/train.jsonl.gz",
|
||||||
_URL + "helpful-base/train.jsonl.gz",
|
_URL + "helpful-base/train.jsonl.gz",
|
||||||
_URL + "helpful-online/train.jsonl.gz",
|
_URL + "helpful-online/train.jsonl.gz",
|
||||||
_URL + "helpful-rejection-sampled/train.jsonl.gz"
|
_URL + "helpful-rejection-sampled/train.jsonl.gz",
|
||||||
],
|
],
|
||||||
"test": [
|
"test": [
|
||||||
_URL + "harmless-base/test.jsonl.gz",
|
_URL + "harmless-base/test.jsonl.gz",
|
||||||
_URL + "helpful-base/test.jsonl.gz",
|
_URL + "helpful-base/test.jsonl.gz",
|
||||||
_URL + "helpful-online/test.jsonl.gz",
|
_URL + "helpful-online/test.jsonl.gz",
|
||||||
_URL + "helpful-rejection-sampled/test.jsonl.gz"
|
_URL + "helpful-rejection-sampled/test.jsonl.gz",
|
||||||
]
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class HhRlhfEn(datasets.GeneratorBasedBuilder):
|
class HhRlhfEn(datasets.GeneratorBasedBuilder):
|
||||||
|
|
||||||
VERSION = datasets.Version("0.0.0")
|
VERSION = datasets.Version("0.0.0")
|
||||||
|
|
||||||
def _info(self) -> datasets.DatasetInfo:
|
def _info(self) -> datasets.DatasetInfo:
|
||||||
features = datasets.Features({
|
features = datasets.Features(
|
||||||
"instruction": datasets.Value("string"),
|
{
|
||||||
"output": datasets.Sequence(datasets.Value("string")),
|
"instruction": datasets.Value("string"),
|
||||||
"history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
|
"output": datasets.Sequence(datasets.Value("string")),
|
||||||
})
|
"history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
|
||||||
|
}
|
||||||
|
)
|
||||||
return datasets.DatasetInfo(
|
return datasets.DatasetInfo(
|
||||||
description=_DESCRIPTION,
|
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
|
||||||
features=features,
|
|
||||||
homepage=_HOMEPAGE,
|
|
||||||
license=_LICENSE,
|
|
||||||
citation=_CITATION
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
||||||
file_path = dl_manager.download_and_extract(_URLS)
|
file_path = dl_manager.download_and_extract(_URLS)
|
||||||
return [
|
return [
|
||||||
datasets.SplitGenerator(
|
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_path["train"]}),
|
||||||
name=datasets.Split.TRAIN,
|
datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": file_path["test"]}),
|
||||||
gen_kwargs={
|
|
||||||
"filepaths": file_path["train"]
|
|
||||||
}
|
|
||||||
),
|
|
||||||
datasets.SplitGenerator(
|
|
||||||
name=datasets.Split.TEST,
|
|
||||||
gen_kwargs={
|
|
||||||
"filepaths": file_path["test"]
|
|
||||||
}
|
|
||||||
)
|
|
||||||
]
|
]
|
||||||
|
|
||||||
def _generate_examples(self, filepaths: List[str]):
|
def _generate_examples(self, filepaths: List[str]):
|
||||||
|
@ -70,12 +59,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
|
||||||
rejected = data["rejected"]
|
rejected = data["rejected"]
|
||||||
|
|
||||||
assist_idx = rejected.rfind("\n\nAssistant: ")
|
assist_idx = rejected.rfind("\n\nAssistant: ")
|
||||||
r_reject = rejected[assist_idx+13:].strip()
|
r_reject = rejected[assist_idx + 13 :].strip()
|
||||||
assist_idx = chosen.rfind("\n\nAssistant: ")
|
assist_idx = chosen.rfind("\n\nAssistant: ")
|
||||||
r_accept = chosen[assist_idx+13:].strip()
|
r_accept = chosen[assist_idx + 13 :].strip()
|
||||||
|
|
||||||
human_idx = chosen.rfind("\n\nHuman: ")
|
human_idx = chosen.rfind("\n\nHuman: ")
|
||||||
query = chosen[human_idx+9:assist_idx].strip()
|
query = chosen[human_idx + 9 : assist_idx].strip()
|
||||||
prompt = chosen[:human_idx]
|
prompt = chosen[:human_idx]
|
||||||
history = []
|
history = []
|
||||||
|
|
||||||
|
@ -83,16 +72,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
|
||||||
assist_idx = prompt.rfind("\n\nAssistant: ")
|
assist_idx = prompt.rfind("\n\nAssistant: ")
|
||||||
human_idx = prompt.rfind("\n\nHuman: ")
|
human_idx = prompt.rfind("\n\nHuman: ")
|
||||||
if human_idx != -1:
|
if human_idx != -1:
|
||||||
old_query = prompt[human_idx+9:assist_idx].strip()
|
old_query = prompt[human_idx + 9 : assist_idx].strip()
|
||||||
old_resp = prompt[assist_idx+13:].strip()
|
old_resp = prompt[assist_idx + 13 :].strip()
|
||||||
history.insert(0, (old_query, old_resp))
|
history.insert(0, (old_query, old_resp))
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
prompt = prompt[:human_idx]
|
prompt = prompt[:human_idx]
|
||||||
|
|
||||||
yield key, {
|
yield key, {"instruction": query, "output": [r_accept, r_reject], "history": history}
|
||||||
"instruction": query,
|
|
||||||
"output": [r_accept, r_reject],
|
|
||||||
"history": history
|
|
||||||
}
|
|
||||||
key += 1
|
key += 1
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
import os
|
|
||||||
import json
|
import json
|
||||||
import datasets
|
import os
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
import datasets
|
||||||
|
|
||||||
|
|
||||||
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||||
|
|
||||||
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
|
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
|
||||||
|
@ -24,31 +26,19 @@ _BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jso
|
||||||
|
|
||||||
|
|
||||||
class UltraChat(datasets.GeneratorBasedBuilder):
|
class UltraChat(datasets.GeneratorBasedBuilder):
|
||||||
|
|
||||||
VERSION = datasets.Version("0.0.0")
|
VERSION = datasets.Version("0.0.0")
|
||||||
|
|
||||||
def _info(self):
|
def _info(self):
|
||||||
features = datasets.Features({
|
features = datasets.Features(
|
||||||
"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
|
{"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
|
||||||
})
|
)
|
||||||
return datasets.DatasetInfo(
|
return datasets.DatasetInfo(
|
||||||
description=_DESCRIPTION,
|
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
|
||||||
features=features,
|
|
||||||
homepage=_HOMEPAGE,
|
|
||||||
license=_LICENSE,
|
|
||||||
citation=_CITATION
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
||||||
file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
|
file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
|
||||||
return [
|
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})]
|
||||||
datasets.SplitGenerator(
|
|
||||||
name=datasets.Split.TRAIN,
|
|
||||||
gen_kwargs={
|
|
||||||
"filepaths": file_paths
|
|
||||||
}
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
def _generate_examples(self, filepaths: List[str]):
|
def _generate_examples(self, filepaths: List[str]):
|
||||||
for filepath in filepaths:
|
for filepath in filepaths:
|
||||||
|
@ -56,7 +46,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
|
||||||
for row in f:
|
for row in f:
|
||||||
try:
|
try:
|
||||||
data = json.loads(row)
|
data = json.loads(row)
|
||||||
except:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
key: int = data["id"]
|
key: int = data["id"]
|
||||||
content: List[str] = data["data"]
|
content: List[str] = data["data"]
|
||||||
|
@ -64,8 +54,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
|
||||||
content.pop(-1)
|
content.pop(-1)
|
||||||
if len(content) < 2:
|
if len(content) < 2:
|
||||||
continue
|
continue
|
||||||
conversations = [{
|
conversations = [
|
||||||
"from": "human" if i % 2 == 0 else "gpt",
|
{"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
|
||||||
"value": content[i]
|
]
|
||||||
} for i in range(len(content))]
|
|
||||||
yield key, {"conversations": conversations}
|
yield key, {"conversations": conversations}
|
||||||
|
|
Loading…
Reference in New Issue