2023-05-28 18:09:04 +08:00
|
|
|
import json
|
2024-03-10 13:35:20 +08:00
|
|
|
from typing import Any, Dict, Generator, List, Tuple
|
2023-05-28 18:09:04 +08:00
|
|
|
|
2024-04-20 01:31:38 +08:00
|
|
|
import datasets
|
|
|
|
|
2023-05-28 18:09:04 +08:00
|
|
|
|
2023-11-09 15:53:23 +08:00
|
|
|
# Dataset-card metadata; each value is forwarded to ``datasets.DatasetInfo``
# by ``ExampleDataset._info``. Empty strings mean "not specified".
_DESCRIPTION = "An example of dataset."

_CITATION = ""

_HOMEPAGE = ""

_LICENSE = ""

# Location handed to ``dl_manager.download`` in ``_split_generators``.
# NOTE(review): presumably resolved relative to this script — confirm.
_URL = "examples.json"
|
|
|
|
|
|
|
|
|
|
|
|
class ExampleDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that serves one TRAIN split read from a JSON file.

    Each record is declared with four features: ``instruction``, ``input``
    and ``output`` (strings) and ``history`` (a sequence of string sequences).
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata and the feature schema of one record."""
        features = datasets.Features(
            {
                "instruction": datasets.Value("string"),
                "input": datasets.Value("string"),
                "output": datasets.Value("string"),
                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Fetch the data file and declare the single TRAIN split.

        Args:
            dl_manager: Download manager used to obtain ``_URL``.

        Returns:
            A one-element list holding the TRAIN split generator, whose
            ``filepath`` keyword is passed on to ``_generate_examples``.
        """
        file_path = dl_manager.download(_URL)
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]

    def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
        """Yield ``(index, example)`` pairs from the JSON document at ``filepath``.

        Args:
            filepath: Path of a UTF-8 encoded JSON file.

        Yields:
            The zero-based position of each top-level element together with
            the element itself.
        """
        # Use a context manager so the handle is closed as soon as parsing
        # finishes, instead of waiting for garbage collection.
        with open(filepath, "r", encoding="utf-8") as json_file:
            example_dataset = json.load(json_file)

        # NOTE(review): assumes the top-level JSON value is a list of records
        # matching the features declared in ``_info`` — confirm against data.
        yield from enumerate(example_dataset)
|