Merge pull request #2905 from SirlyDreamer/main
Follow HF_ENDPOINT environment variable
This commit is contained in:
commit
b2dfbd728f
|
@ -1,6 +1,8 @@
|
||||||
|
import os
|
||||||
import json
|
import json
|
||||||
import datasets
|
import datasets
|
||||||
|
|
||||||
|
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||||
|
|
||||||
_DESCRIPTION = "BELLE multiturn chat dataset."
|
_DESCRIPTION = "BELLE multiturn chat dataset."
|
||||||
|
|
||||||
|
@ -13,9 +15,9 @@ _CITATION = """\
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_HOMEPAGE = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M"
|
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
|
||||||
_LICENSE = "gpl-3.0"
|
_LICENSE = "gpl-3.0"
|
||||||
_URL = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
|
_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
|
||||||
|
|
||||||
|
|
||||||
class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
||||||
|
|
|
@ -1,13 +1,17 @@
|
||||||
|
import os
|
||||||
import json
|
import json
|
||||||
import datasets
|
import datasets
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
_HF_ENDPOINT = os.getenv("_HF_ENDPOINT", "https://huggingface.co")
|
||||||
|
|
||||||
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
|
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
|
||||||
|
|
||||||
_CITATION = ""
|
_CITATION = ""
|
||||||
_HOMEPAGE = "https://huggingface.co/datasets/Anthropic/hh-rlhf"
|
|
||||||
|
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
|
||||||
_LICENSE = "mit"
|
_LICENSE = "mit"
|
||||||
_URL = "https://huggingface.co/datasets/Anthropic/hh-rlhf/resolve/main/"
|
_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
|
||||||
_URLS = {
|
_URLS = {
|
||||||
"train": [
|
"train": [
|
||||||
_URL + "harmless-base/train.jsonl.gz",
|
_URL + "harmless-base/train.jsonl.gz",
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
|
import os
|
||||||
import json
|
import json
|
||||||
import datasets
|
import datasets
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
_HF_ENDPOINT = os.getenv("_HF_ENDPOINT", "https://huggingface.co")
|
||||||
|
|
||||||
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
|
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
|
||||||
|
|
||||||
|
@ -16,9 +18,9 @@ _CITATION = """\
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_HOMEPAGE = "https://huggingface.co/datasets/stingning/ultrachat"
|
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
|
||||||
_LICENSE = "cc-by-nc-4.0"
|
_LICENSE = "cc-by-nc-4.0"
|
||||||
_BASE_DATA_URL = "https://huggingface.co/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
|
_BASE_DATA_URL = "{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
|
||||||
|
|
||||||
|
|
||||||
class UltraChat(datasets.GeneratorBasedBuilder):
|
class UltraChat(datasets.GeneratorBasedBuilder):
|
||||||
|
@ -38,7 +40,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
|
||||||
)
|
)
|
||||||
|
|
||||||
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
||||||
file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
|
file_paths = [dl_manager.download(_BASE_DATA_URL.format(_HF_ENDPOINT=_HF_ENDPOINT,idx=idx)) for idx in range(10)] # multiple shards
|
||||||
return [
|
return [
|
||||||
datasets.SplitGenerator(
|
datasets.SplitGenerator(
|
||||||
name=datasets.Split.TRAIN,
|
name=datasets.Split.TRAIN,
|
||||||
|
|
Loading…
Reference in New Issue