From e165965341a150f6faa2c072a9281ad99d7e5ce8 Mon Sep 17 00:00:00 2001
From: SirlyDreamer <45280500+SirlyDreamer@users.noreply.github.com>
Date: Wed, 20 Mar 2024 08:31:30 +0000
Subject: [PATCH] Follow HF_ENDPOINT environment variable

Read the Hugging Face base URL from the HF_ENDPOINT environment variable
(falling back to https://huggingface.co) in the belle_multiturn,
hh_rlhf_en and ultra_chat dataset scripts, and build _HOMEPAGE and the
download URLs from it instead of hard-coding the host.

All three scripts read the same variable name, HF_ENDPOINT; the leading
underscore is only part of the module-level Python constant.
---
 data/belle_multiturn/belle_multiturn.py | 6 ++++--
 data/hh_rlhf_en/hh_rlhf_en.py           | 8 ++++++--
 data/ultra_chat/ultra_chat.py           | 8 +++++---
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/data/belle_multiturn/belle_multiturn.py b/data/belle_multiturn/belle_multiturn.py
index 4b42527a..86140ed9 100644
--- a/data/belle_multiturn/belle_multiturn.py
+++ b/data/belle_multiturn/belle_multiturn.py
@@ -1,6 +1,8 @@
+import os
 import json
 import datasets
 
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 
 _DESCRIPTION = "BELLE multiturn chat dataset."
 
@@ -13,9 +15,9 @@ _CITATION = """\
 }
 """
 
-_HOMEPAGE = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M"
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
 _LICENSE = "gpl-3.0"
-_URL = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
+_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
 
 
 class BelleMultiturn(datasets.GeneratorBasedBuilder):
diff --git a/data/hh_rlhf_en/hh_rlhf_en.py b/data/hh_rlhf_en/hh_rlhf_en.py
index daa9bf98..3f596d43 100644
--- a/data/hh_rlhf_en/hh_rlhf_en.py
+++ b/data/hh_rlhf_en/hh_rlhf_en.py
@@ -1,13 +1,17 @@
+import os
 import json
 import datasets
 from typing import List
 
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 
 _DESCRIPTION = "Human preference data about helpfulness and harmlessness."
+
 _CITATION = ""
-_HOMEPAGE = "https://huggingface.co/datasets/Anthropic/hh-rlhf"
+
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
 _LICENSE = "mit"
-_URL = "https://huggingface.co/datasets/Anthropic/hh-rlhf/resolve/main/"
+_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
 _URLS = {
     "train": [
         _URL + "harmless-base/train.jsonl.gz",
diff --git a/data/ultra_chat/ultra_chat.py b/data/ultra_chat/ultra_chat.py
index df6a23fb..1a337812 100644
--- a/data/ultra_chat/ultra_chat.py
+++ b/data/ultra_chat/ultra_chat.py
@@ -1,7 +1,9 @@
+import os
 import json
 import datasets
 from typing import List
 
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 
 _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
 
@@ -16,9 +18,9 @@ _CITATION = """\
 }
 """
 
-_HOMEPAGE = "https://huggingface.co/datasets/stingning/ultrachat"
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
 _LICENSE = "cc-by-nc-4.0"
-_BASE_DATA_URL = "https://huggingface.co/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
+_BASE_DATA_URL = "{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
 
 
 class UltraChat(datasets.GeneratorBasedBuilder):
@@ -38,7 +40,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
-        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
+        file_paths = [dl_manager.download(_BASE_DATA_URL.format(_HF_ENDPOINT=_HF_ENDPOINT, idx=idx)) for idx in range(10)] # multiple shards
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,