support dataset cache

hiyouga 2023-10-26 21:48:45 +08:00
parent 838ed9aa87
commit 3fe7df628d
2 changed files with 26 additions and 3 deletions

View File

@@ -1,8 +1,12 @@
import os
import tiktoken
from itertools import chain
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Union
from datasets import load_from_disk
from llmtuner.extras.constants import IGNORE_INDEX
from llmtuner.extras.logging import get_logger
from llmtuner.extras.template import get_template_and_fix_tokenizer
if TYPE_CHECKING:
@@ -12,6 +16,9 @@ if TYPE_CHECKING:
from llmtuner.hparams import DataArguments
logger = get_logger(__name__)
def preprocess_dataset(
dataset: Union["Dataset", "IterableDataset"],
tokenizer: "PreTrainedTokenizer",
@@ -19,7 +26,6 @@ def preprocess_dataset(
training_args: "Seq2SeqTrainingArguments",
stage: Literal["pt", "sft", "rm", "ppo"]
) -> Union["Dataset", "IterableDataset"]:
column_names = list(next(iter(dataset)).keys())
template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
if data_args.train_on_prompt and template.efficient_eos:
@@ -226,7 +232,12 @@ def preprocess_dataset(
preprocess_func = preprocess_unsupervised_dataset
print_function = print_unsupervised_dataset_example
if data_args.cache_path is not None and os.path.exists(data_args.cache_path):
logger.warning("Loading dataset from disk will ignore other data arguments.")
return load_from_disk(data_args.cache_path)
with training_args.main_process_first(desc="dataset map pre-processing"):
column_names = list(next(iter(dataset)).keys())
kwargs = {}
if not data_args.streaming:
kwargs = dict(
@@ -242,10 +253,15 @@
**kwargs
)
if data_args.cache_path is not None and not os.path.exists(data_args.cache_path):
if training_args.should_save:
dataset.save_to_disk(data_args.cache_path)
raise SystemExit("Dataset saved, rerun this script with the same `--cache_path`.")
if training_args.should_log:
try:
print_function(next(iter(dataset)))
except StopIteration:
raise RuntimeError("Empty dataset!")
return dataset
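
Taken together, the changes to this first file wrap the `dataset.map` pre-processing in a load-or-build cache: a warm cache short-circuits everything, a cold cache is written once and the script exits. Below is a minimal standalone sketch of that flow, assuming Hugging Face `datasets` is installed; `preprocess_with_cache` and `is_main_process` are hypothetical stand-ins for the diff's `preprocess_dataset` and `training_args.should_save`.

import os
from datasets import load_from_disk

def preprocess_with_cache(dataset, preprocess_func, cache_path, is_main_process=True):
    # Warm cache: the saved arrow files already contain the tokenized
    # columns, which is why other data arguments are ignored on this path.
    if cache_path is not None and os.path.exists(cache_path):
        return load_from_disk(cache_path)

    column_names = list(next(iter(dataset)).keys())
    dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names)

    # Cold cache: only the main process writes, then the script exits so
    # the next run takes the warm-cache branch above.
    if cache_path is not None:
        if is_main_process:
            dataset.save_to_disk(cache_path)
        raise SystemExit("Dataset saved, rerun this script with the same `--cache_path`.")

    return dataset

On the first run this writes `cache_path` and raises SystemExit; a second run with the same flag returns straight from `load_from_disk`, which is why the diff logs a warning that other data arguments are ignored on that branch.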

View File

@@ -98,6 +98,10 @@ class DataArguments:
default=False,
metadata={"help": "Packing the questions and answers in the supervised fine-tuning stage."}
)
cache_path: Optional[str] = field(
default=None,
metadata={"help": "Path to save or load the preprocessed datasets."}
)
def __post_init__(self):
if self.streaming and self.val_size > 1e-6 and self.val_size < 1:
@@ -106,6 +110,9 @@ class DataArguments:
if self.streaming and self.max_samples is not None:
raise ValueError("`max_samples` is incompatible with `streaming`.")
if self.streaming and self.cache_path:
raise ValueError("`cache_path` is incompatible with `streaming`.")
def init_for_training(self, seed: int): # support mixing multiple datasets
self.seed = seed
dataset_names = [ds.strip() for ds in self.dataset.split(",")] if self.dataset is not None else []
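
The new `__post_init__` guard exists because a streaming `IterableDataset` is never materialized and has no `save_to_disk`, so a cache could never be written. A trimmed sketch of the check in isolation; `DataArgs` is a hypothetical stand-in for llmtuner's `DataArguments`, keeping only the fields the guard touches.

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DataArgs:  # hypothetical stand-in for llmtuner's DataArguments
    streaming: bool = False
    cache_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to save or load the preprocessed datasets."}
    )

    def __post_init__(self):
        # Fail fast at argument-parsing time rather than after a long
        # pre-processing run whose result could never be saved.
        if self.streaming and self.cache_path:
            raise ValueError("`cache_path` is incompatible with `streaming`.")

DataArgs(cache_path="saves/cached_dataset")                   # accepted
DataArgs(streaming=True, cache_path="saves/cached_dataset")   # raises ValueError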