diff --git a/data/README.md b/data/README.md
index 5ceae666..0f14bef8 100644
--- a/data/README.md
+++ b/data/README.md
@@ -12,7 +12,8 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
   "ranking": "whether the dataset is a preference dataset or not. (default: False)",
   "subset": "the name of the subset. (optional, default: None)",
   "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
-  "num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
+  "num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
+  "split": "which dataset split to use for training and evaluation. (optional, default: train)",
   "columns (optional)": {
     "prompt": "the column name in the dataset containing the prompts. (default: instruction)",
     "query": "the column name in the dataset containing the queries. (default: input)",
diff --git a/data/README_zh.md b/data/README_zh.md
index 1795f352..7bf4fdba 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -13,6 +13,7 @@
   "subset": "数据集子集的名称(可选,默认:None)",
   "folder": "Hugging Face 仓库的文件夹名称(可选,默认:None)",
   "num_samples": "该数据集中用于训练的样本数量。(可选,默认:None)",
+  "split": "训练和评估所使用的数据集切分(可选,默认:train)",
   "columns(可选)": {
     "prompt": "数据集代表提示词的表头名称(默认:instruction)",
     "query": "数据集代表请求的表头名称(默认:input)",
diff --git a/data/dataset_info.json b/data/dataset_info.json
index f8ffd407..e4b5a384 100644
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -172,9 +172,19 @@
   "deepctrl": {
     "ms_hub_url": "deepctrl/deepctrl-sft-data"
   },
-  "adgen": {
+  "adgen_train": {
     "hf_hub_url": "HasturOfficial/adgen",
     "ms_hub_url": "AI-ModelScope/adgen",
+    "split": "train",
+    "columns": {
+      "prompt": "content",
+      "response": "summary"
+    }
+  },
+  "adgen_val": {
+    "hf_hub_url": "HasturOfficial/adgen",
+    "ms_hub_url": "AI-ModelScope/adgen",
+    "split": "validation",
     "columns": {
       "prompt": "content",
       "response": "summary"
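Note on the `adgen_train` / `adgen_val` pair above: both entries point at the same Hugging Face repo and differ only in the pinned split, which is the motivating case for the new `split` key. As a rough sketch of what the key resolves to (the repo id and column names are taken from the entries above; the call is plain `datasets` API, not code from this diff):

```python
# What `"split": "validation"` in the adgen_val entry boils down to.
from datasets import load_dataset

dataset = load_dataset("HasturOfficial/adgen", split="validation")
print(dataset.column_names)  # expected: ["content", "summary"]
```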
stage == "pt": data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) @@ -100,7 +100,7 @@ def cal_ppl( else: raise NotImplementedError("Stage does not supported: {}.".format(stage)) - dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True) + dataloader = DataLoader(dataset_module["eval_dataset"], batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True) criterion = torch.nn.CrossEntropyLoss(reduction="none") total_ppl = 0 perplexities = [] diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py index 4cdf01e6..cef46416 100644 --- a/scripts/length_cdf.py +++ b/scripts/length_cdf.py @@ -47,10 +47,10 @@ def length_cdf( ) ) tokenizer_module = load_tokenizer(model_args) - trainset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module) - total_num = len(trainset) + dataset_module = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module) + total_num = len(dataset_module["eval_dataset"]) length_dict = defaultdict(int) - for sample in tqdm(trainset["input_ids"]): + for sample in tqdm(dataset_module["eval_dataset"]["input_ids"]): length_dict[len(sample) // interval * interval] += 1 length_tuples = list(length_dict.items()) diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index 8e7062db..d527d7d2 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -15,7 +15,7 @@ import inspect import os import sys -from typing import TYPE_CHECKING, Literal, Optional, Union +from typing import TYPE_CHECKING, Literal, Optional, Union, Dict import numpy as np from datasets import load_dataset, load_from_disk @@ -24,10 +24,10 @@ from ..extras.constants import FILEEXT2TYPE from ..extras.logging import get_logger from ..extras.misc import has_tokenized_data from .aligner import align_dataset -from .data_utils import merge_dataset +from .data_utils import merge_dataset, split_dataset from .parser import get_dataset_list from .preprocess import get_preprocess_and_print_func -from .template import get_template_and_fix_tokenizer +from .template import get_template_and_fix_tokenizer, Template if TYPE_CHECKING: @@ -91,7 +91,7 @@ def load_single_dataset( subset_name=data_name, data_dir=data_dir, data_files=data_files, - split=data_args.split, + split=dataset_attr.split, cache_dir=cache_dir, token=model_args.ms_hub_token, use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")), @@ -111,7 +111,7 @@ def load_single_dataset( name=data_name, data_dir=data_dir, data_files=data_files, - split=data_args.split, + split=dataset_attr.split, cache_dir=model_args.cache_dir, token=model_args.hf_hub_token, streaming=(data_args.streaming and (dataset_attr.load_from != "file")), @@ -140,20 +140,17 @@ def load_single_dataset( return align_dataset(dataset, dataset_attr, data_args, training_args) -def get_dataset( +def load_and_preprocess( model_args: "ModelArguments", data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", stage: Literal["pt", "sft", "rm", "ppo", "kto"], tokenizer: "PreTrainedTokenizer", + template: "Template", processor: Optional["ProcessorMixin"] = None, + is_eval: bool = False ) -> Union["Dataset", "IterableDataset"]: - template = get_template_and_fix_tokenizer(tokenizer, data_args.template, data_args.tool_format) - if data_args.train_on_prompt and template.efficient_eos: - raise ValueError("Current template does not support `train_on_prompt`.") - - # Load tokenized dataset - if 
diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py
index 8e7062db..d527d7d2 100644
--- a/src/llamafactory/data/loader.py
+++ b/src/llamafactory/data/loader.py
@@ -15,7 +15,7 @@
 import inspect
 import os
 import sys
-from typing import TYPE_CHECKING, Literal, Optional, Union
+from typing import TYPE_CHECKING, Dict, Literal, Optional, Union
 
 import numpy as np
 from datasets import load_dataset, load_from_disk
@@ -24,10 +24,10 @@ from ..extras.constants import FILEEXT2TYPE
 from ..extras.logging import get_logger
 from ..extras.misc import has_tokenized_data
 from .aligner import align_dataset
-from .data_utils import merge_dataset
+from .data_utils import merge_dataset, split_dataset
 from .parser import get_dataset_list
 from .preprocess import get_preprocess_and_print_func
-from .template import get_template_and_fix_tokenizer
+from .template import Template, get_template_and_fix_tokenizer
 
 
 if TYPE_CHECKING:
@@ -91,7 +91,7 @@ def load_single_dataset(
             subset_name=data_name,
             data_dir=data_dir,
             data_files=data_files,
-            split=data_args.split,
+            split=dataset_attr.split,
             cache_dir=cache_dir,
             token=model_args.ms_hub_token,
             use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
@@ -111,7 +111,7 @@
             name=data_name,
             data_dir=data_dir,
             data_files=data_files,
-            split=data_args.split,
+            split=dataset_attr.split,
             cache_dir=model_args.cache_dir,
             token=model_args.hf_hub_token,
             streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
@@ -140,20 +140,17 @@
     return align_dataset(dataset, dataset_attr, data_args, training_args)
 
 
-def get_dataset(
+def load_and_preprocess(
     model_args: "ModelArguments",
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
     stage: Literal["pt", "sft", "rm", "ppo", "kto"],
     tokenizer: "PreTrainedTokenizer",
+    template: "Template",
     processor: Optional["ProcessorMixin"] = None,
+    is_eval: bool = False,
 ) -> Union["Dataset", "IterableDataset"]:
-    template = get_template_and_fix_tokenizer(tokenizer, data_args.template, data_args.tool_format)
-    if data_args.train_on_prompt and template.efficient_eos:
-        raise ValueError("Current template does not support `train_on_prompt`.")
-
-    # Load tokenized dataset
-    if data_args.tokenized_path is not None:
+    if not is_eval and data_args.tokenized_path is not None:
         if has_tokenized_data(data_args.tokenized_path):
             logger.warning("Loading dataset from disk will ignore other data arguments.")
             dataset = load_from_disk(data_args.tokenized_path)
@@ -165,9 +162,21 @@
         if data_args.streaming:
             raise ValueError("Turn off `streaming` when saving dataset to disk.")
 
+    if is_eval and data_args.eval_tokenized_path is not None:
+        if has_tokenized_data(data_args.eval_tokenized_path):
+            logger.warning("Loading dataset from disk will ignore other data arguments.")
+            dataset = load_from_disk(data_args.eval_tokenized_path)
+            logger.info("Loaded tokenized dataset from {}.".format(data_args.eval_tokenized_path))
+            if data_args.streaming:
+                dataset = dataset.to_iterable_dataset()
+            return dataset
+
+        if data_args.streaming:
+            raise ValueError("Turn off `streaming` when saving dataset to disk.")
+
     with training_args.main_process_first(desc="load dataset"):
         all_datasets = []
-        for dataset_attr in get_dataset_list(data_args):
+        for dataset_attr in get_dataset_list(data_args, data_args.eval_dataset if is_eval else data_args.dataset):
             if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True):
                 raise ValueError("The dataset is not applicable in the current training stage.")
 
@@ -190,13 +199,20 @@
 
         dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)
 
-        if data_args.tokenized_path is not None:
+        if not is_eval and data_args.tokenized_path is not None:
             if training_args.should_save:
                 dataset.save_to_disk(data_args.tokenized_path)
                 logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
                 logger.info("Please restart the training with `tokenized_path: {}`.".format(data_args.tokenized_path))
 
             sys.exit(0)
+        if is_eval and data_args.eval_tokenized_path is not None:
+            if training_args.should_save:
+                dataset.save_to_disk(data_args.eval_tokenized_path)
+                logger.info("Tokenized dataset saved at {}.".format(data_args.eval_tokenized_path))
+                logger.info("Please restart the training with `eval_tokenized_path: {}`.".format(data_args.eval_tokenized_path))
+
+            sys.exit(0)
 
         if training_args.should_log:
             try:
@@ -208,3 +224,24 @@
                 raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
 
         return dataset
+
+
+def get_dataset(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"] = None,
+) -> Dict[str, "Dataset"]:
+    template = get_template_and_fix_tokenizer(tokenizer, data_args.template, data_args.tool_format)
+    if data_args.train_on_prompt and template.efficient_eos:
+        raise ValueError("Current template does not support `train_on_prompt`.")
+
+    train_dataset = load_and_preprocess(model_args, data_args, training_args, stage, tokenizer, template, processor)
+
+    if data_args.eval_dataset or data_args.eval_tokenized_path:
+        eval_dataset = load_and_preprocess(model_args, data_args, training_args, stage, tokenizer, template, processor, True)
+        return {"train_dataset": train_dataset, "eval_dataset": eval_dataset}
+    else:
+        return split_dataset(train_dataset, data_args, training_args)
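The refactor splits the old `get_dataset` into a `load_and_preprocess` worker and a thin `get_dataset` dispatcher whose return value is shaped for direct `**`-unpacking into a Trainer. A toy summary of which keys a caller can expect (key names come from this diff; the `val_size` branch assumes the `split_dataset` behavior sketched earlier):

```python
# Which keys the new get_dataset returns, as a self-contained toy function.
def get_dataset_keys(has_eval_source: bool, do_train: bool, val_size: float) -> set:
    if has_eval_source:  # data_args.eval_dataset or data_args.eval_tokenized_path is set
        return {"train_dataset", "eval_dataset"}
    # otherwise split_dataset decides:
    if not do_train:
        return {"eval_dataset"}
    return {"train_dataset", "eval_dataset"} if val_size > 1e-6 else {"train_dataset"}

assert get_dataset_keys(True, True, 0.0) == {"train_dataset", "eval_dataset"}
assert get_dataset_keys(False, True, 0.0) == {"train_dataset"}
```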
diff --git a/src/llamafactory/data/parser.py b/src/llamafactory/data/parser.py
index 5ae79774..c810ec8b 100644
--- a/src/llamafactory/data/parser.py
+++ b/src/llamafactory/data/parser.py
@@ -40,6 +40,7 @@ class DatasetAttr:
     subset: Optional[str] = None
     folder: Optional[str] = None
     num_samples: Optional[int] = None
+    split: Optional[str] = "train"
     # common columns
     system: Optional[str] = None
     tools: Optional[str] = None
@@ -71,9 +72,9 @@
         setattr(self, key, obj.get(key, default))
 
 
-def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
-    if data_args.dataset is not None:
-        dataset_names = [ds.strip() for ds in data_args.dataset.split(",")]
+def get_dataset_list(data_args: "DataArguments", dataset: Optional[str] = None) -> List["DatasetAttr"]:
+    if dataset is not None:
+        dataset_names = [ds.strip() for ds in dataset.split(",")]
     else:
         dataset_names = []
 
@@ -122,6 +123,8 @@
             dataset_attr.set_attr("subset", dataset_info[name])
             dataset_attr.set_attr("folder", dataset_info[name])
             dataset_attr.set_attr("num_samples", dataset_info[name])
+            if "split" in dataset_info[name]:
+                dataset_attr.set_attr("split", dataset_info[name])
 
             if "columns" in dataset_info[name]:
                 column_names = ["system", "tools", "images", "chosen", "rejected", "kto_tag"]
diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py
index a1025af7..7f7e62cd 100644
--- a/src/llamafactory/hparams/data_args.py
+++ b/src/llamafactory/hparams/data_args.py
@@ -33,6 +33,11 @@ class DataArguments:
         default=None,
         metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."},
     )
+    eval_dataset: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of provided dataset(s) to use for eval during training. "
+                          "Use commas to separate multiple datasets."},
+    )
     dataset_dir: str = field(
         default="data",
         metadata={"help": "Path to the folder containing the datasets."},
     )
@@ -105,6 +110,10 @@
         default=None,
         metadata={"help": "Path to save or load the tokenized datasets."},
    )
+    eval_tokenized_path: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to save or load the tokenized eval datasets."},
+    )
 
     def __post_init__(self):
         if self.streaming and self.val_size > 1e-6 and self.val_size < 1:
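For reference, a minimal sketch of how the new fields fit together (the import path and the surrounding values are assumptions for illustration; only `eval_dataset` and `eval_tokenized_path` are introduced by this diff):

```python
# Hypothetical wiring of the new arguments; values are examples only.
from llamafactory.hparams import DataArguments  # assumed import path

data_args = DataArguments(
    template="default",
    dataset="adgen_train",     # resolved through dataset_info.json, split="train"
    eval_dataset="adgen_val",  # resolved likewise, split="validation"
    # To cache both tokenization passes separately:
    # tokenized_path="saves/adgen_train_tok",
    # eval_tokenized_path="saves/adgen_val_tok",
)
```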
data_args, training_args, stage="kto", **tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) data_collator = KTODataCollatorWithPadding( @@ -68,7 +68,7 @@ def run_kto( data_collator=data_collator, callbacks=callbacks, **tokenizer_module, - **split_dataset(dataset, data_args, training_args), + **dataset_module, ) # Training diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index f52b80d6..53d9f18f 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -43,7 +43,7 @@ def run_ppo( ): tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"] - dataset = get_dataset(model_args, data_args, training_args, stage="ppo", **tokenizer_module) + dataset_module = get_dataset(model_args, data_args, training_args, stage="ppo", **tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True) tokenizer.padding_side = "left" # use left-padding in generation while using right-padding in training @@ -63,7 +63,7 @@ def run_ppo( model=model, reward_model=reward_model, ref_model=ref_model, - dataset=dataset, + dataset=dataset_module["train_dataset"], data_collator=data_collator, **tokenizer_module, ) diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index b84a0e7d..2f27d6cd 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -42,7 +42,7 @@ def run_pt( ): tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"] - dataset = get_dataset(model_args, data_args, training_args, stage="pt", **tokenizer_module) + dataset_module = get_dataset(model_args, data_args, training_args, stage="pt", **tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) @@ -54,7 +54,7 @@ def run_pt( data_collator=data_collator, callbacks=callbacks, **tokenizer_module, - **split_dataset(dataset, data_args, training_args), + **dataset_module, ) # Training diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 384814cc..54fa7fd0 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -41,7 +41,7 @@ def run_rm( ): tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"] - dataset = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module) + dataset_module = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True) data_collator = PairwiseDataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) @@ -57,7 +57,7 @@ def run_rm( callbacks=callbacks, compute_metrics=compute_accuracy, **tokenizer_module, - **split_dataset(dataset, data_args, training_args), + **dataset_module, ) # Training @@ -81,7 +81,7 @@ def run_rm( # Predict if training_args.do_predict: - predict_results = trainer.predict(dataset, metric_key_prefix="predict") + predict_results = trainer.predict(dataset_module["eval_dataset"], metric_key_prefix="predict") trainer.log_metrics("predict", predict_results.metrics) trainer.save_metrics("predict", predict_results.metrics) trainer.save_predictions(predict_results) diff --git a/src/llamafactory/train/sft/workflow.py 
diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py
index 384814cc..54fa7fd0 100644
--- a/src/llamafactory/train/rm/workflow.py
+++ b/src/llamafactory/train/rm/workflow.py
@@ -41,7 +41,7 @@ def run_rm(
 ):
     tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module["tokenizer"]
-    dataset = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module)
+    dataset_module = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True)
     data_collator = PairwiseDataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
 
@@ -57,7 +57,7 @@ def run_rm(
         callbacks=callbacks,
         compute_metrics=compute_accuracy,
         **tokenizer_module,
-        **split_dataset(dataset, data_args, training_args),
+        **dataset_module,
     )
 
     # Training
@@ -81,7 +81,7 @@ def run_rm(
 
     # Predict
     if training_args.do_predict:
-        predict_results = trainer.predict(dataset, metric_key_prefix="predict")
+        predict_results = trainer.predict(dataset_module["eval_dataset"], metric_key_prefix="predict")
         trainer.log_metrics("predict", predict_results.metrics)
         trainer.save_metrics("predict", predict_results.metrics)
         trainer.save_predictions(predict_results)
diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py
index dea3c1a8..b0bacc33 100644
--- a/src/llamafactory/train/sft/workflow.py
+++ b/src/llamafactory/train/sft/workflow.py
@@ -43,7 +43,7 @@ def run_sft(
 ):
     tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module["tokenizer"]
-    dataset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
+    dataset_module = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
     if training_args.predict_with_generate:
@@ -76,7 +76,7 @@ def run_sft(
         compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else compute_accuracy,
         preprocess_logits_for_metrics=None if training_args.predict_with_generate else eval_logit_processor,
         **tokenizer_module,
-        **split_dataset(dataset, data_args, training_args),
+        **dataset_module,
     )
 
     # Keyword arguments for `model.generate`
@@ -105,12 +105,12 @@ def run_sft(
 
     # Predict
     if training_args.do_predict:
-        predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs)
+        predict_results = trainer.predict(dataset_module["eval_dataset"], metric_key_prefix="predict", **gen_kwargs)
         if training_args.predict_with_generate:  # predict_loss will be wrong if predict_with_generate is enabled
             predict_results.metrics.pop("predict_loss", None)
         trainer.log_metrics("predict", predict_results.metrics)
         trainer.save_metrics("predict", predict_results.metrics)
-        trainer.save_predictions(dataset, predict_results)
+        trainer.save_predictions(dataset_module["eval_dataset"], predict_results)
 
     # Create model card
     create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/tests/data/test_supervised.py b/tests/data/test_supervised.py
index 9cb49615..7ad52ee8 100644
--- a/tests/data/test_supervised.py
+++ b/tests/data/test_supervised.py
@@ -47,7 +47,7 @@ def test_supervised(num_samples: int):
     model_args, data_args, training_args, _, _ = get_train_args(TRAIN_ARGS)
     tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module["tokenizer"]
-    tokenized_data = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
+    dataset_module = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
 
     ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
 
@@ -63,5 +63,5 @@ def test_supervised(num_samples: int):
             {"role": "assistant", "content": original_data[index]["output"]},
         ]
         templated_result = ref_tokenizer.apply_chat_template(messages, tokenize=False)
-        decoded_result = tokenizer.decode(tokenized_data["input_ids"][index])
+        decoded_result = tokenizer.decode(dataset_module["train_dataset"]["input_ids"][index])
         assert templated_result == decoded_result
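Taken together, this enables a fixed evaluation split end to end, e.g. training on `adgen_train` while evaluating on `adgen_val`. A hypothetical invocation (the `run_exp` entry point and all values other than `eval_dataset` are assumptions based on the existing llamafactory API, not part of this diff):

```python
# Hypothetical end-to-end usage of the new eval_dataset argument.
from llamafactory.train.tuner import run_exp  # assumed entry point

run_exp(dict(
    stage="sft",
    do_train=True,
    model_name_or_path="Qwen/Qwen1.5-0.5B",  # example model only
    template="qwen",
    finetuning_type="lora",
    dataset="adgen_train",
    eval_dataset="adgen_val",  # new in this diff
    output_dir="saves/adgen-sft",
))
```

One caveat worth flagging: the `do_predict` branches in the rm and sft workflows index `dataset_module["eval_dataset"]`, which is present only when `eval_dataset`/`eval_tokenized_path` is set or `val_size > 0`; a `do_train` + `do_predict` run with neither would appear to raise a `KeyError`.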