Merge pull request #2905 from SirlyDreamer/main

Follow HF_ENDPOINT environment variable
This commit is contained in:
hoshi-hiyouga 2024-03-20 18:09:54 +08:00 committed by GitHub
commit b2dfbd728f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 15 additions and 7 deletions

View File

@ -1,6 +1,8 @@
import os
import json import json
import datasets import datasets
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
_DESCRIPTION = "BELLE multiturn chat dataset." _DESCRIPTION = "BELLE multiturn chat dataset."
@ -13,9 +15,9 @@ _CITATION = """\
} }
""" """
_HOMEPAGE = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M" _HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
_LICENSE = "gpl-3.0" _LICENSE = "gpl-3.0"
_URL = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json" _URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
class BelleMultiturn(datasets.GeneratorBasedBuilder): class BelleMultiturn(datasets.GeneratorBasedBuilder):

View File

@ -1,13 +1,17 @@
import os
import json import json
import datasets import datasets
from typing import List from typing import List
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
_DESCRIPTION = "Human preference data about helpfulness and harmlessness." _DESCRIPTION = "Human preference data about helpfulness and harmlessness."
_CITATION = "" _CITATION = ""
_HOMEPAGE = "https://huggingface.co/datasets/Anthropic/hh-rlhf"
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
_LICENSE = "mit" _LICENSE = "mit"
_URL = "https://huggingface.co/datasets/Anthropic/hh-rlhf/resolve/main/" _URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
_URLS = { _URLS = {
"train": [ "train": [
_URL + "harmless-base/train.jsonl.gz", _URL + "harmless-base/train.jsonl.gz",

View File

@ -1,7 +1,9 @@
import os
import json import json
import datasets import datasets
from typing import List from typing import List
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data." _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
@ -16,9 +18,9 @@ _CITATION = """\
} }
""" """
_HOMEPAGE = "https://huggingface.co/datasets/stingning/ultrachat" _HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
_LICENSE = "cc-by-nc-4.0" _LICENSE = "cc-by-nc-4.0"
_BASE_DATA_URL = "https://huggingface.co/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl" _BASE_DATA_URL = "{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
class UltraChat(datasets.GeneratorBasedBuilder): class UltraChat(datasets.GeneratorBasedBuilder):
@ -38,7 +40,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
) )
def _split_generators(self, dl_manager: datasets.DownloadManager): def _split_generators(self, dl_manager: datasets.DownloadManager):
file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards file_paths = [dl_manager.download(_BASE_DATA_URL.format(_HF_ENDPOINT=_HF_ENDPOINT,idx=idx)) for idx in range(10)] # multiple shards
return [ return [
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.TRAIN, name=datasets.Split.TRAIN,