diff --git a/data/README_zh.md b/data/README_zh.md index 7bf4fdba..7456ed1d 100644 --- a/data/README_zh.md +++ b/data/README_zh.md @@ -11,9 +11,9 @@ "formatting": "数据集格式(可选,默认:alpaca,可以为 alpaca 或 sharegpt)", "ranking": "是否为偏好数据集(可选,默认:False)", "subset": "数据集子集的名称(可选,默认:None)", + "split": "所使用的数据集切分(可选,默认:train)", "folder": "Hugging Face 仓库的文件夹名称(可选,默认:None)", - "num_samples": "该数据集中用于训练的样本数量。(可选,默认:None)", - "split": "数据集中的要使用的训练测试集切分(可选,默认:train)", + "num_samples": "该数据集所使用的样本数量。(可选,默认:None)", "columns(可选)": { "prompt": "数据集代表提示词的表头名称(默认:instruction)", "query": "数据集代表请求的表头名称(默认:input)", diff --git a/data/dataset_info.json b/data/dataset_info.json index e4b5a384..23b7e5d8 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -181,7 +181,7 @@ "response": "summary" } }, - "adgen_val": { + "adgen_eval": { "hf_hub_url": "HasturOfficial/adgen", "ms_hub_url": "AI-ModelScope/adgen", "split": "validation", diff --git a/examples/train_lora/llama3_lora_predict.yaml b/examples/train_lora/llama3_lora_predict.yaml index 148c8635..c406251e 100644 --- a/examples/train_lora/llama3_lora_predict.yaml +++ b/examples/train_lora/llama3_lora_predict.yaml @@ -8,7 +8,7 @@ do_predict: true finetuning_type: lora ### dataset -dataset: identity,alpaca_en_demo +eval_dataset: identity,alpaca_en_demo template: llama3 cutoff_len: 1024 max_samples: 50 diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py index a38f34e1..bc629a70 100644 --- a/scripts/cal_lr.py +++ b/scripts/cal_lr.py @@ -61,11 +61,12 @@ def calculate_lr( packing=packing, output_dir="dummy_dir", overwrite_cache=True, + do_train=True, ) ) tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"] - dataset_module = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module) + trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)["train_dataset"] if stage == "pt": data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) elif stage == "sft": @@ -73,7 +74,7 @@ def calculate_lr( else: raise NotImplementedError("Stage does not supported: {}.".format(stage)) - dataloader = DataLoader(dataset_module["eval_dataset"], batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True) + dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True) valid_tokens, total_tokens = 0, 0 for batch in tqdm(dataloader): valid_tokens += torch.sum(batch["labels"] != IGNORE_INDEX).item() diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py index 3daa35ae..1a5f9034 100644 --- a/scripts/cal_ppl.py +++ b/scripts/cal_ppl.py @@ -83,11 +83,12 @@ def cal_ppl( train_on_prompt=train_on_prompt, output_dir="dummy_dir", overwrite_cache=True, + do_train=True, ) ) tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"] - dataset_module = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module) + trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)["train_dataset"] model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False) if stage == "pt": data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) @@ -100,7 +101,7 @@ def cal_ppl( else: raise NotImplementedError("Stage does not supported: {}.".format(stage)) - dataloader = DataLoader(dataset_module["eval_dataset"], batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True) + dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True) criterion = torch.nn.CrossEntropyLoss(reduction="none") total_ppl = 0 perplexities = [] diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py index cef46416..65a51872 100644 --- a/scripts/length_cdf.py +++ b/scripts/length_cdf.py @@ -44,13 +44,14 @@ def length_cdf( cutoff_len=1_000_000, output_dir="dummy_dir", overwrite_cache=True, + do_train=True, ) ) tokenizer_module = load_tokenizer(model_args) - dataset_module = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module) - total_num = len(dataset_module["eval_dataset"]) + trainset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)["train_dataset"] + total_num = len(trainset) length_dict = defaultdict(int) - for sample in tqdm(dataset_module["eval_dataset"]["input_ids"]): + for sample in tqdm(trainset["input_ids"]): length_dict[len(sample) // interval * interval] += 1 length_tuples = list(length_dict.items()) diff --git a/src/llamafactory/eval/evaluator.py b/src/llamafactory/eval/evaluator.py index d3140793..b5269906 100644 --- a/src/llamafactory/eval/evaluator.py +++ b/src/llamafactory/eval/evaluator.py @@ -37,7 +37,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import inspect import json import os from typing import Any, Dict, List, Optional @@ -88,18 +87,13 @@ class Evaluator: pbar = tqdm(categorys.keys(), desc="Processing subjects", position=0) results = {} for subject in pbar: - if "trust_remote_code" in inspect.signature(load_dataset).parameters: # for datasets==2.16.0 - kwargs = {"trust_remote_code": True} - else: - kwargs = {} - dataset = load_dataset( path=os.path.join(self.eval_args.task_dir, self.eval_args.task), name=subject, cache_dir=self.model_args.cache_dir, download_mode=self.eval_args.download_mode, token=self.model_args.hf_hub_token, - **kwargs, + trust_remote_code=True, ) pbar.set_postfix_str(categorys[subject]["name"]) inputs, outputs, labels = [], [], [] diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index d4ac405a..cca8c505 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -104,7 +104,7 @@ def _verify_model_args( raise ValueError("Quantized model only accepts a single adapter. Merge them first.") if data_args.template == "yi" and model_args.use_fast_tokenizer: - logger.warning("We should use slow tokenizer for the Yi models.") + logger.warning("We should use slow tokenizer for the Yi models. Change `use_fast_tokenizer` to False.") model_args.use_fast_tokenizer = False @@ -203,6 +203,14 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if training_args.do_train and training_args.predict_with_generate: raise ValueError("`predict_with_generate` cannot be set as True while training.") + if training_args.do_train and data_args.dataset is None: + raise ValueError("Please specify dataset for training.") + + if (training_args.do_eval or training_args.do_predict) and ( + data_args.eval_dataset is None and data_args.val_size < 1e-6 + ): + raise ValueError("Please specify dataset for evaluation.") + if training_args.do_train and model_args.quantization_device_map == "auto": raise ValueError("Cannot use device map for quantized models in training.") @@ -242,7 +250,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.") if data_args.neat_packing and not data_args.packing: - logger.warning("`neat_packing` requires `packing` is True. Change it to True.") + logger.warning("`neat_packing` requires `packing` is True. Change `packing` to True.") data_args.packing = True _verify_model_args(model_args, data_args, finetuning_args) diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index b8e32903..53570a16 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -71,8 +71,6 @@ def llama_attention_forward( cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - past_key_value = getattr(self, "past_key_value", past_key_value) - if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) @@ -156,8 +154,6 @@ def llama_flash_attention_2_forward( cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - past_key_value = getattr(self, "past_key_value", past_key_value) - if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index c004363a..f474a90f 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, List, Optional -from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset +from ...data import PairwiseDataCollatorWithPadding, get_dataset from ...extras.constants import IGNORE_INDEX from ...extras.ploting import plot_loss from ...hparams import ModelArguments @@ -70,8 +70,8 @@ def run_dpo( finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks, - **tokenizer_module, **dataset_module, + **tokenizer_module, ) # Training diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index b2d0c82e..fa85de37 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, List, Optional -from ...data import KTODataCollatorWithPadding, get_dataset, split_dataset +from ...data import KTODataCollatorWithPadding, get_dataset from ...extras.constants import IGNORE_INDEX from ...extras.ploting import plot_loss from ...hparams import ModelArguments @@ -67,8 +67,8 @@ def run_kto( finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks, - **tokenizer_module, **dataset_module, + **tokenizer_module, ) # Training diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 31d461e3..c0158399 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -77,9 +77,13 @@ class CustomPPOTrainer(PPOTrainer, Trainer): ref_model: Optional["AutoModelForCausalLMWithValueHead"], tokenizer: "PreTrainedTokenizer", processor: Optional["ProcessorMixin"], - dataset: "Dataset", data_collator: "DataCollatorWithPadding", + train_dataset: Optional["Dataset"] = None, + eval_dataset: Optional["Dataset"] = None, ) -> None: + if eval_dataset is not None: + raise NotImplementedError("PPOTrainer does not support eval dataset yet.") + backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps ppo_config = PPOConfig( model_name=model_args.model_name_or_path, @@ -115,7 +119,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): num_training_steps = training_args.max_steps else: total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size - num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size) + num_training_steps = training_args.num_train_epochs * math.ceil(len(train_dataset) / total_train_batch_size) optimizer = self.create_optimizer(model, training_args, finetuning_args) scheduler = self.create_scheduler(training_args, num_training_steps, optimizer) @@ -126,7 +130,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): model=model, ref_model=ref_model, tokenizer=tokenizer, - dataset=dataset, + dataset=train_dataset, data_collator=data_collator, lr_scheduler=scheduler, ) diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 53d9f18f..6cea52d9 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -63,8 +63,8 @@ def run_ppo( model=model, reward_model=reward_model, ref_model=ref_model, - dataset=dataset_module["train_dataset"], data_collator=data_collator, + **dataset_module, **tokenizer_module, ) diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index 2f27d6cd..1052a9d1 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -20,7 +20,7 @@ from typing import TYPE_CHECKING, List, Optional from transformers import DataCollatorForLanguageModeling -from ...data import get_dataset, split_dataset +from ...data import get_dataset from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..trainer_utils import create_modelcard_and_push @@ -53,8 +53,8 @@ def run_pt( finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks, - **tokenizer_module, **dataset_module, + **tokenizer_module, ) # Training diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 54fa7fd0..d81892f2 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, List, Optional -from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset +from ...data import PairwiseDataCollatorWithPadding, get_dataset from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..callbacks import fix_valuehead_checkpoint @@ -56,8 +56,8 @@ def run_rm( data_collator=data_collator, callbacks=callbacks, compute_metrics=compute_accuracy, - **tokenizer_module, **dataset_module, + **tokenizer_module, ) # Training diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index b0bacc33..880c38e1 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, List, Optional -from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, split_dataset +from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset from ...extras.constants import IGNORE_INDEX from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss @@ -75,8 +75,8 @@ def run_sft( callbacks=callbacks, compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else compute_accuracy, preprocess_logits_for_metrics=None if training_args.predict_with_generate else eval_logit_processor, - **tokenizer_module, **dataset_module, + **tokenizer_module, ) # Keyword arguments for `model.generate` diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 4b581691..ffec4776 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -79,7 +79,7 @@ def create_modelcard_and_push( "tags": ["llama-factory", finetuning_args.finetuning_type], } if data_args.dataset is not None: - kwargs["dataset"] = [dataset.strip() for dataset in data_args.dataset.split(",")] + kwargs["dataset"] = data_args.dataset if model_args.use_unsloth: kwargs["tags"] = kwargs["tags"] + ["unsloth"] diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index bced18f0..e83cadd9 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -174,8 +174,8 @@ def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]: r""" Loads dataset_info.json. """ - if dataset_dir == "ONLINE": - logger.info("dataset_dir is ONLINE, using online dataset.") + if dataset_dir == "ONLINE" or dataset_dir.startswith("REMOTE:"): + logger.info("dataset_dir is {}, using online dataset.".format(dataset_dir)) return {} try: diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index a807f3c2..68edd48b 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -259,7 +259,7 @@ class Runner: use_unsloth=(get("top.booster") == "unsloth"), visual_inputs=get("top.visual_inputs"), dataset_dir=get("eval.dataset_dir"), - dataset=",".join(get("eval.dataset")), + eval_dataset=",".join(get("eval.dataset")), cutoff_len=get("eval.cutoff_len"), max_samples=int(get("eval.max_samples")), per_device_eval_batch_size=get("eval.batch_size"),