fix Baichuan-13B
Parent: 8cd76ef3c3
Commit: 08439d29b2
README changelog:

@@ -9,7 +9,7 @@
 
 ## Changelog
 
-[23/07/11] Now we support training the **Baichuan-13B** model in this repo. Try `--model_name_or_path baichuan-inc/Baichuan-13B-Base` and `--lora_target W_pack` arguments to use the Baichuan-13B model. Remember to use `--prompt_template baichuan` argument when you are using the Baichuan-13B-Chat model.
+[23/07/11] Now we support training the **Baichuan-13B** model in this repo. Try `--model_name_or_path baichuan-inc/Baichuan-13B-Base`, `--padding_side right` and `--lora_target W_pack` arguments to train the Baichuan-13B model. Remember to use `--prompt_template baichuan` argument when you are using the Baichuan-13B-Chat model.
 
 [23/07/09] Now we release [FastEdit](https://github.com/hiyouga/FastEdit)⚡🩹, an easy-to-use package for editing the factual knowledge of large language models efficiently. Please follow [FastEdit](https://github.com/hiyouga/FastEdit) if you are interested.
 
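The `--lora_target W_pack` switch mentioned above is Baichuan-specific: the model fuses its query/key/value projections into a single module named `W_pack`, so that is the module LoRA adapters have to target. A hedged sketch of the PEFT configuration this presumably maps to (the hyperparameter values are placeholders, not the repo's defaults):

```python
from peft import LoraConfig, TaskType

# Placeholder hyperparameters; only target_modules reflects `--lora_target W_pack`.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["W_pack"],  # Baichuan's fused QKV projection modules
)
# get_peft_model(model, lora_config) would then attach LoRA adapters to every W_pack layer.
```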
Dataset registry (JSON):

@@ -94,6 +94,9 @@
       "history": "history"
     }
   },
+  "novel_tokens512_50k": {
+    "hf_hub_url": "zxbsmk/webnovel_cn"
+  },
   "example": {
     "script_url": "example_dataset",
     "columns": {
@@ -131,7 +134,7 @@
     }
   },
   "oaast_rm_zh": {
-    "file_name": "",
+    "file_name": "oaast_rm_zh.json",
    "file_sha1": "1065af1f3784dd61be5e79713a35f427b713a232",
    "columns": {
      "prompt": "instruction",
@@ -149,8 +152,5 @@
       "response": "",
       "history": ""
     }
-  },
-  "novel_tokens512_50k": {
-    "hf_hub_url": "zxbsmk/webnovel_cn"
   }
 }

PPO training script:

@@ -8,9 +8,8 @@ import math
 from torch.optim import AdamW
 from transformers.optimization import get_scheduler
 from trl import PPOConfig
-
+from transformers import DataCollatorForSeq2Seq
 from utils import (
-    DynamicDataCollatorWithPadding,
     PPOPeftTrainer,
     LogCallback,
     load_pretrained,
@@ -28,7 +27,10 @@ def main():
     dataset = prepare_data(model_args, data_args)
     model, tokenizer = load_pretrained(model_args, finetuning_args, training_args.do_train, stage="ppo")
     dataset = preprocess_data(dataset, tokenizer, data_args, training_args, stage="ppo")
-    data_collator = DynamicDataCollatorWithPadding(tokenizer)
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer=tokenizer,
+        label_pad_token_id=tokenizer.pad_token_id
+    )
 
     ppo_config = PPOConfig(
         model_name=model_args.model_name_or_path,

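For reference, a minimal self-contained sketch of what the replacement collator does: `DataCollatorForSeq2Seq` pads `input_ids` with the tokenizer's pad token and pads `labels` with `label_pad_token_id`. The tokenizer name and the toy feature values below are illustrative, not taken from the commit.

```python
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

# Any tokenizer with a pad token behaves the same way; this one is just a small stand-in.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100  # label positions padded with this id are ignored by the loss
)

features = [
    {"input_ids": [101, 7592, 102], "labels": [101, 7592, 102]},
    {"input_ids": [101, 7592, 2088, 999, 102], "labels": [101, 7592, 2088, 999, 102]},
]

batch = collator(features)
print(batch["input_ids"].shape)  # both sequences padded to the longest length in the batch
print(batch["labels"])           # the shorter label sequence is padded with -100
```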
Pre-training script:

@@ -5,6 +5,8 @@
 
 
 import math
+from transformers import DataCollatorForSeq2Seq
+from utils.other import IGNORE_INDEX
 
 from utils import (
     DynamicDataCollatorWithPadding,
@@ -25,7 +27,10 @@ def main():
     dataset = prepare_data(model_args, data_args)
     model, tokenizer = load_pretrained(model_args, finetuning_args, training_args.do_train, stage="pt")
     dataset = preprocess_data(dataset, tokenizer, data_args, training_args, stage="pt")
-    data_collator = DynamicDataCollatorWithPadding(tokenizer, data_args.ignore_pad_token_for_loss)
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer=tokenizer,
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
+    )
 
     # Split the dataset
     if training_args.do_train:

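The `IGNORE_INDEX` constant used for `label_pad_token_id` plays the same role as the `ignore_index` of PyTorch's cross-entropy loss: label positions carrying that value contribute nothing to the loss or the gradients. A standalone sketch (the value -100 is an assumption about what `IGNORE_INDEX` equals, chosen because it is the PyTorch/`transformers` default):

```python
import torch
import torch.nn.functional as F

IGNORE_INDEX = -100  # assumed value; matches PyTorch's default ignore_index

logits = torch.randn(1, 4, 8)                                # batch of 1, sequence of 4, vocab of 8
labels = torch.tensor([[3, 5, IGNORE_INDEX, IGNORE_INDEX]])  # last two positions are padding

# Positions labeled IGNORE_INDEX are skipped when averaging the loss,
# so padded label tokens never influence training.
loss = F.cross_entropy(
    logits.view(-1, logits.size(-1)),
    labels.view(-1),
    ignore_index=IGNORE_INDEX
)
print(loss)
```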
Reward model training script:

@@ -17,6 +17,7 @@ from utils import (
     plot_loss
 )
 
+
 def main():
 
     # Prepare pretrained model and dataset

SFT training script:

@@ -4,8 +4,9 @@
 # https://github.com/huggingface/transformers/blob/v4.29.2/examples/pytorch/summarization/run_summarization.py
 
 
+from transformers import DataCollatorForSeq2Seq
+from utils.other import IGNORE_INDEX
 from utils import (
-    DynamicDataCollatorWithPadding,
     Seq2SeqPeftTrainer,
     ComputeMetrics,
     LogCallback,
@@ -25,9 +26,9 @@ def main():
     dataset = prepare_data(model_args, data_args)
     model, tokenizer = load_pretrained(model_args, finetuning_args, training_args.do_train, stage="sft")
     dataset = preprocess_data(dataset, tokenizer, data_args, training_args, stage="sft")
-    data_collator = DynamicDataCollatorWithPadding(
+    data_collator = DataCollatorForSeq2Seq(
         tokenizer=tokenizer,
-        ignore_pad_token_for_loss=(data_args.ignore_pad_token_for_loss and not training_args.predict_with_generate)
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
     )
 
     # Override the decoding parameters of Seq2SeqTrainer

utils package exports (`__init__`):

@@ -6,8 +6,6 @@ from .common import (
     preprocess_data
 )
 
-from .data_collator import DynamicDataCollatorWithPadding
-
 from .peft_trainer import PeftTrainer, LogCallback
 
 from .seq2seq import ComputeMetrics, Seq2SeqPeftTrainer

Tokenizer loading in `load_pretrained`:

@@ -165,7 +165,7 @@ def load_pretrained(
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.model_name_or_path,
         use_fast=model_args.use_fast_tokenizer,
-        padding_side="left",
+        padding_side=model_args.padding_side,
         **config_kwargs
     )
     if tokenizer.pad_token_id is None or tokenizer.pad_token_id == 64000: # 64000 for baichuan model (older version)

`ModelArguments` definition:

@@ -47,6 +47,10 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}
     )
+    padding_side: Optional[Literal["left", "right"]] = field(
+        default="left",
+        metadata={"help": "The side on which the model should have padding applied."}
+    )
     quantization_bit: Optional[int] = field(
         default=None,
         metadata={"help": "The number of bits to quantize the model."}

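Since the new `--padding_side` option is simply forwarded to the tokenizer, here is a small sketch of what the setting changes. The tokenizer name is a stand-in for illustration; Baichuan itself is not required to see the effect.

```python
from transformers import AutoTokenizer

# Stand-in tokenizer with a pad token; any such tokenizer illustrates the setting.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = ["short", "a noticeably longer example sentence"]

tokenizer.padding_side = "right"  # pad tokens are appended after the real tokens
right_padded = tokenizer(batch, padding=True)["input_ids"]

tokenizer.padding_side = "left"   # pad tokens are prepended before the real tokens
left_padded = tokenizer(batch, padding=True)["input_ids"]

print(right_padded[0])  # ends with pad_token_id values
print(left_padded[0])   # starts with pad_token_id values
```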
Deleted module (the utils `data_collator` module that provided `DynamicDataCollatorWithPadding`, removed in its entirety):

@@ -1,70 +0,0 @@
-import torch
-
-from typing import Dict, Optional, Sequence, Union
-
-from transformers import DataCollatorWithPadding, BatchEncoding
-from transformers.tokenization_utils import PreTrainedTokenizer
-
-from .other import IGNORE_INDEX
-
-
-class DynamicDataCollatorWithPadding(DataCollatorWithPadding):
-    r"""
-    Inherits DataCollatorWithPadding. It is capable of dynamically padding for batched data.
-    """
-    def __init__(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        ignore_pad_token_for_loss: Optional[bool] = False
-    ):
-        super().__init__(tokenizer, padding=True)
-        self.label_pad_token_id = IGNORE_INDEX if ignore_pad_token_for_loss else tokenizer.pad_token_id
-
-    def get_attention_masks(self, input_ids: torch.Tensor, device: torch.device) -> torch.Tensor:
-        r"""
-        Generates attention masks for left-padded sequences.
-        """
-        batch_size, seq_length = input_ids.size()
-        attention_mask = torch.ones((batch_size, seq_length), device=device)
-
-        for i, seq in enumerate(input_ids):
-            attention_mask[i, :(seq != self.tokenizer.pad_token_id).nonzero()[0].item()] = 0 # padding
-
-        attention_mask = attention_mask.bool()
-        return attention_mask
-
-    def __call__(self, features: Sequence[Dict[str, Union[torch.Tensor, Sequence[int]]]]) -> BatchEncoding:
-        r"""
-        Pads batched data to the longest sequence in the batch.
-
-        We adopt left-padding in both training and evaluation.
-        """
-        if isinstance(features[0]["input_ids"], torch.Tensor):
-            input_ids = [feature["input_ids"].clone().detach().flip(0) for feature in features]
-        else:
-            input_ids = [torch.tensor(feature["input_ids"]).flip(0) for feature in features]
-
-        if "labels" in features[0]:
-            if isinstance(features[0]["labels"], torch.Tensor):
-                labels = [feature["labels"].clone().detach().flip(0) for feature in features]
-            else:
-                labels = [torch.tensor(feature["labels"]).flip(0) for feature in features]
-            input_ids = input_ids + labels # pad them to the same length
-
-        input_ids = torch.nn.utils.rnn.pad_sequence(
-            input_ids,
-            batch_first=True,
-            padding_value=self.tokenizer.pad_token_id
-        ).flip(-1)
-
-        batch = {}
-
-        if "labels" in features[0]:
-            input_ids, labels = input_ids.split(len(features), dim=0)
-            labels = torch.where(labels != self.tokenizer.pad_token_id, labels, self.label_pad_token_id)
-            batch["labels"] = labels
-
-        batch["input_ids"] = input_ids
-        batch["attention_mask"] = self.get_attention_masks(input_ids, device=input_ids.device)
-
-        return BatchEncoding(batch)

Pairwise data collator used for reward modeling:

@@ -2,7 +2,7 @@ import torch
 import numpy as np
 from typing import Dict, Sequence, Tuple, Union
 
-from .data_collator import DynamicDataCollatorWithPadding
+from transformers import DataCollatorWithPadding
 
 from .peft_trainer import PeftTrainer
 
@@ -16,7 +16,7 @@ def compute_accuracy(eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]])
     return {"accuracy": (preds[0] > preds[1]).sum() / len(preds[0])}
 
 
-class PairwiseDataCollatorWithPadding(DynamicDataCollatorWithPadding):
+class PairwiseDataCollatorWithPadding(DataCollatorWithPadding):
     r"""
     Data collator for pairwise data.
     """