diff --git a/README.md b/README.md
index 141587fd..ec4bfe71 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 
 ## Changelog
 
-[23/07/11] Now we support training the **Baichuan-13B** model in this repo. Try `--model_name_or_path baichuan-inc/Baichuan-13B-Base` and `--lora_target W_pack` arguments to use the Baichuan-13B model. Remember to use `--prompt_template baichuan` argument when you are using the Baichuan-13B-Chat model.
+[23/07/11] Now we support training the **Baichuan-13B** model in this repo. Try `--model_name_or_path baichuan-inc/Baichuan-13B-Base`, `--padding_side right` and `--lora_target W_pack` arguments to train the Baichuan-13B model. Remember to use `--prompt_template baichuan` argument when you are using the Baichuan-13B-Chat model.
 
 [23/07/09] Now we release [FastEdit](https://github.com/hiyouga/FastEdit)⚡🩹, an easy-to-use package for editing the factual knowledge of large language models efficiently. Please follow [FastEdit](https://github.com/hiyouga/FastEdit) if you are interested.
 
diff --git a/data/dataset_info.json b/data/dataset_info.json
index a4323e3b..32a45949 100644
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -94,6 +94,9 @@
       "history": "history"
     }
   },
+  "novel_tokens512_50k": {
+    "hf_hub_url": "zxbsmk/webnovel_cn"
+  },
   "example": {
     "script_url": "example_dataset",
     "columns": {
@@ -131,7 +134,7 @@
     }
   },
   "oaast_rm_zh": {
-    "file_name": "",
+    "file_name": "oaast_rm_zh.json",
     "file_sha1": "1065af1f3784dd61be5e79713a35f427b713a232",
     "columns": {
       "prompt": "instruction",
@@ -149,8 +152,5 @@
       "response": "",
       "history": ""
     }
-  },
-  "novel_tokens512_50k": {
-    "hf_hub_url": "zxbsmk/webnovel_cn"
   }
 }
diff --git a/src/train_ppo.py b/src/train_ppo.py
index e3d2d403..1de2a1a5 100644
--- a/src/train_ppo.py
+++ b/src/train_ppo.py
@@ -8,9 +8,8 @@ import math
 from torch.optim import AdamW
 from transformers.optimization import get_scheduler
 from trl import PPOConfig
-
+from transformers import DataCollatorForSeq2Seq
 from utils import (
-    DynamicDataCollatorWithPadding,
     PPOPeftTrainer,
     LogCallback,
     load_pretrained,
@@ -28,7 +27,10 @@ def main():
     dataset = prepare_data(model_args, data_args)
     model, tokenizer = load_pretrained(model_args, finetuning_args, training_args.do_train, stage="ppo")
     dataset = preprocess_data(dataset, tokenizer, data_args, training_args, stage="ppo")
-    data_collator = DynamicDataCollatorWithPadding(tokenizer)
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer=tokenizer,
+        label_pad_token_id=tokenizer.pad_token_id
+    )
 
     ppo_config = PPOConfig(
         model_name=model_args.model_name_or_path,
diff --git a/src/train_pt.py b/src/train_pt.py
index 6fedf931..f7123684 100644
--- a/src/train_pt.py
+++ b/src/train_pt.py
@@ -5,6 +5,8 @@
 
 import math
 
+from transformers import DataCollatorForSeq2Seq
+from utils.other import IGNORE_INDEX
 from utils import (
     DynamicDataCollatorWithPadding,
@@ -25,7 +27,10 @@ def main():
     dataset = prepare_data(model_args, data_args)
     model, tokenizer = load_pretrained(model_args, finetuning_args, training_args.do_train, stage="pt")
     dataset = preprocess_data(dataset, tokenizer, data_args, training_args, stage="pt")
-    data_collator = DynamicDataCollatorWithPadding(tokenizer, data_args.ignore_pad_token_for_loss)
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer=tokenizer,
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
+    )
 
     # Split the dataset
     if training_args.do_train:
diff --git a/src/train_rm.py b/src/train_rm.py
index 117aa13d..3d809758 100644
--- a/src/train_rm.py
+++ b/src/train_rm.py
@@ -17,6 +17,7 @@ from utils import (
     plot_loss
 )
 
+
 def main():
 
     # Prepare pretrained model and dataset
diff --git a/src/train_sft.py b/src/train_sft.py
index 30ca2e2c..49c53cb8 100644
--- a/src/train_sft.py
+++ b/src/train_sft.py
@@ -4,8 +4,9 @@
 # https://github.com/huggingface/transformers/blob/v4.29.2/examples/pytorch/summarization/run_summarization.py
 
+from transformers import DataCollatorForSeq2Seq
+from utils.other import IGNORE_INDEX
 from utils import (
-    DynamicDataCollatorWithPadding,
     Seq2SeqPeftTrainer,
     ComputeMetrics,
     LogCallback,
@@ -25,9 +26,9 @@ def main():
     dataset = prepare_data(model_args, data_args)
     model, tokenizer = load_pretrained(model_args, finetuning_args, training_args.do_train, stage="sft")
     dataset = preprocess_data(dataset, tokenizer, data_args, training_args, stage="sft")
-    data_collator = DynamicDataCollatorWithPadding(
+    data_collator = DataCollatorForSeq2Seq(
         tokenizer=tokenizer,
-        ignore_pad_token_for_loss=(data_args.ignore_pad_token_for_loss and not training_args.predict_with_generate)
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
     )
 
     # Override the decoding parameters of Seq2SeqTrainer
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index 3c08e3d2..977b58c3 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -6,8 +6,6 @@ from .common import (
     preprocess_data
 )
 
-from .data_collator import DynamicDataCollatorWithPadding
-
 from .peft_trainer import PeftTrainer, LogCallback
 
 from .seq2seq import ComputeMetrics, Seq2SeqPeftTrainer
diff --git a/src/utils/common.py b/src/utils/common.py
index ea15de5e..7f2663fd 100644
--- a/src/utils/common.py
+++ b/src/utils/common.py
@@ -165,7 +165,7 @@ def load_pretrained(
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.model_name_or_path,
         use_fast=model_args.use_fast_tokenizer,
-        padding_side="left",
+        padding_side=model_args.padding_side,
         **config_kwargs
     )
     if tokenizer.pad_token_id is None or tokenizer.pad_token_id == 64000: # 64000 for baichuan model (older version)
diff --git a/src/utils/config.py b/src/utils/config.py
index 79dd691e..a340e69b 100644
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -47,6 +47,10 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}
     )
+    padding_side: Optional[Literal["left", "right"]] = field(
+        default="left",
+        metadata={"help": "The side on which the model should have padding applied."}
+    )
     quantization_bit: Optional[int] = field(
         default=None,
         metadata={"help": "The number of bits to quantize the model."}
diff --git a/src/utils/data_collator.py b/src/utils/data_collator.py
deleted file mode 100644
index 4db1a305..00000000
--- a/src/utils/data_collator.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import torch
-
-from typing import Dict, Optional, Sequence, Union
-
-from transformers import DataCollatorWithPadding, BatchEncoding
-from transformers.tokenization_utils import PreTrainedTokenizer
-
-from .other import IGNORE_INDEX
-
-
-class DynamicDataCollatorWithPadding(DataCollatorWithPadding):
-    r"""
-    Inherits DataCollatorWithPadding. It is capable of dynamically padding for batched data.
-    """
-    def __init__(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        ignore_pad_token_for_loss: Optional[bool] = False
-    ):
-        super().__init__(tokenizer, padding=True)
-        self.label_pad_token_id = IGNORE_INDEX if ignore_pad_token_for_loss else tokenizer.pad_token_id
-
-    def get_attention_masks(self, input_ids: torch.Tensor, device: torch.device) -> torch.Tensor:
-        r"""
-        Generates attention masks for left-padded sequences.
-        """
-        batch_size, seq_length = input_ids.size()
-        attention_mask = torch.ones((batch_size, seq_length), device=device)
-
-        for i, seq in enumerate(input_ids):
-            attention_mask[i, :(seq != self.tokenizer.pad_token_id).nonzero()[0].item()] = 0 # padding
-
-        attention_mask = attention_mask.bool()
-        return attention_mask
-
-    def __call__(self, features: Sequence[Dict[str, Union[torch.Tensor, Sequence[int]]]]) -> BatchEncoding:
-        r"""
-        Pads batched data to the longest sequence in the batch.
-
-        We adopt left-padding in both training and evaluation.
-        """
-        if isinstance(features[0]["input_ids"], torch.Tensor):
-            input_ids = [feature["input_ids"].clone().detach().flip(0) for feature in features]
-        else:
-            input_ids = [torch.tensor(feature["input_ids"]).flip(0) for feature in features]
-
-        if "labels" in features[0]:
-            if isinstance(features[0]["labels"], torch.Tensor):
-                labels = [feature["labels"].clone().detach().flip(0) for feature in features]
-            else:
-                labels = [torch.tensor(feature["labels"]).flip(0) for feature in features]
-            input_ids = input_ids + labels # pad them to the same length
-
-        input_ids = torch.nn.utils.rnn.pad_sequence(
-            input_ids,
-            batch_first=True,
-            padding_value=self.tokenizer.pad_token_id
-        ).flip(-1)
-
-        batch = {}
-
-        if "labels" in features[0]:
-            input_ids, labels = input_ids.split(len(features), dim=0)
-            labels = torch.where(labels != self.tokenizer.pad_token_id, labels, self.label_pad_token_id)
-            batch["labels"] = labels
-
-        batch["input_ids"] = input_ids
-        batch["attention_mask"] = self.get_attention_masks(input_ids, device=input_ids.device)
-
-        return BatchEncoding(batch)
diff --git a/src/utils/pairwise.py b/src/utils/pairwise.py
index 9bac1594..bdffc749 100644
--- a/src/utils/pairwise.py
+++ b/src/utils/pairwise.py
@@ -2,7 +2,7 @@ import torch
 import numpy as np
 
 from typing import Dict, Sequence, Tuple, Union
-from .data_collator import DynamicDataCollatorWithPadding
+from transformers import DataCollatorWithPadding
 from .peft_trainer import PeftTrainer
 
 
@@ -16,7 +16,7 @@ def compute_accuracy(eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]])
     return {"accuracy": (preds[0] > preds[1]).sum() / len(preds[0])}
 
 
-class PairwiseDataCollatorWithPadding(DynamicDataCollatorWithPadding):
+class PairwiseDataCollatorWithPadding(DataCollatorWithPadding):
     r"""
     Data collator for pairwise data.
     """