# LLaMA-Factory-310P3/data/example_dataset/example_dataset.py

import json
from typing import Any, Dict, Generator, List, Tuple

import datasets


# Metadata forwarded to `datasets.DatasetInfo` by `ExampleDataset._info`.
_DESCRIPTION = "An example of dataset."
_CITATION = ""
_HOMEPAGE = ""
_LICENSE = ""
# Location of the JSON data file; passed to `dl_manager.download` in
# `ExampleDataset._split_generators`.
_URL = "examples.json"


class ExampleDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that serves the records of the JSON file at ``_URL`` as one train split."""

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata and feature schema.

        Each example has three string fields (``instruction``, ``input``,
        ``output``) and a ``history`` field declared as a sequence of
        sequences of strings.
        """
        features = datasets.Features(
            {
                "instruction": datasets.Value("string"),
                "input": datasets.Value("string"),
                "output": datasets.Value("string"),
                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Obtain the data file through ``dl_manager`` and declare the single train split.

        Args:
            dl_manager: download manager used to resolve ``_URL``.

        Returns:
            A one-element list holding the ``TRAIN`` split generator; the
            path returned by ``dl_manager.download`` is forwarded to
            ``_generate_examples`` as ``filepath``.
        """
        file_path = dl_manager.download(_URL)
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]

    def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
        """Yield ``(index, example)`` pairs from the JSON file at ``filepath``.

        Args:
            filepath: path of a UTF-8 encoded JSON file. Its top-level value
                is iterated with ``enumerate``; presumably it is an array of
                objects matching the features from ``_info`` (not validated
                here).

        Yields:
            Tuples of the zero-based position and the corresponding decoded
            JSON item.
        """
        # Parse the whole file inside a context manager so the handle is
        # closed before the first example is yielded, instead of relying on
        # garbage collection to close an anonymous file object.
        with open(filepath, "r", encoding="utf-8") as f:
            example_dataset = json.load(f)

        for key, example in enumerate(example_dataset):
            yield key, example
Initial commit 2023-05-28 18:09:04 +08:00			`import json`
update parser 2024-03-10 13:35:20 +08:00			`from typing import Any, Dict, Generator, List, Tuple`
Initial commit 2023-05-28 18:09:04 +08:00
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`import datasets`

Initial commit 2023-05-28 18:09:04 +08:00
add template, modify datasets 2023-11-09 15:53:23 +08:00			`_DESCRIPTION = "An example of dataset."`
Initial commit 2023-05-28 18:09:04 +08:00			`_CITATION = ""`
			`_HOMEPAGE = ""`
			`_LICENSE = ""`
			`_URL = "examples.json"`


			`class ExampleDataset(datasets.GeneratorBasedBuilder):`
			`VERSION = datasets.Version("0.0.0")`

			`def _info(self) -> datasets.DatasetInfo:`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`features = datasets.Features(`
			`{`
			`"instruction": datasets.Value("string"),`
			`"input": datasets.Value("string"),`
			`"output": datasets.Value("string"),`
			`"history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),`
			`}`
			`)`
Initial commit 2023-05-28 18:09:04 +08:00			`return datasets.DatasetInfo(`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION`
Initial commit 2023-05-28 18:09:04 +08:00			`)`

			`def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:`
			`file_path = dl_manager.download(_URL)`
add dpo mix dataset 2024-04-20 01:31:38 +08:00			`return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]`
Initial commit 2023-05-28 18:09:04 +08:00
update parser 2024-03-10 13:35:20 +08:00			`def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:`
Initial commit 2023-05-28 18:09:04 +08:00			`example_dataset = json.load(open(filepath, "r", encoding="utf-8"))`
			`for key, example in enumerate(example_dataset):`
			`yield key, example`