LLaMA-Factory-Mirror/scripts/cal_ppl.py

133 lines
5.2 KiB
Python
Raw Permalink Normal View History

2024-05-04 22:02:25 +08:00
# coding=utf-8
2024-06-15 17:54:33 +08:00
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
2024-05-04 22:02:25 +08:00
import json
2024-05-04 23:05:17 +08:00
from dataclasses import dataclass
2024-05-05 00:17:54 +08:00
from typing import Any, Dict, Literal, Optional, Sequence
2024-05-04 22:02:25 +08:00
import fire
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
2024-05-16 18:39:08 +08:00
from llamafactory.data import get_dataset
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.hparams import get_train_args
from llamafactory.model import load_model, load_tokenizer
2024-05-04 22:02:25 +08:00
2024-05-04 23:05:17 +08:00
@dataclass
class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
    r"""
    Data collator for pairwise data that only uses the chosen responses.
    """

    train_on_prompt: bool = False

    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        r"""
        Builds one example per feature by concatenating the prompt with its chosen response,
        then hands the examples to the parent collator for batching.

        Only `prompt_ids` and `chosen_ids` are read from each feature. Prompt positions are
        labelled with IGNORE_INDEX unless `train_on_prompt` is enabled, in which case the
        labels are the full input sequence.
        """
        examples = []
        for item in features:
            prompt_ids, chosen_ids = item["prompt_ids"], item["chosen_ids"]
            token_ids = prompt_ids + chosen_ids
            if self.train_on_prompt:
                targets = token_ids
            else:
                # Mask out the prompt so that only the response contributes to the loss.
                targets = [IGNORE_INDEX] * len(prompt_ids) + chosen_ids

            examples.append(
                {
                    "input_ids": token_ids,
                    "attention_mask": [1] * (len(prompt_ids) + len(chosen_ids)),
                    "labels": targets,
                }
            )

        return super().__call__(examples)
2024-05-04 22:02:25 +08:00
def cal_ppl(
    model_name_or_path: str,
    save_name: str,
    batch_size: int = 4,
    stage: Literal["pt", "sft", "rm"] = "sft",
    dataset: str = "alpaca_en",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 1024,
    max_samples: Optional[int] = None,
    train_on_prompt: bool = False,
):
    r"""
    Calculates the ppl on the dataset of the pre-trained models.
    Usage: python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json

    Args:
        model_name_or_path: forwarded to `get_train_args` to select the model.
        save_name: path of the JSON file that receives the list of per-sample perplexities.
        batch_size: batch size of the evaluation dataloader.
        stage: selects the data collator ("pt", "sft" or "rm").
        dataset, dataset_dir, template, cutoff_len, max_samples, train_on_prompt:
            forwarded to `get_train_args`; `train_on_prompt` is also given to the "rm" collator.

    Raises:
        NotImplementedError: if `stage` is not one of "pt", "sft" or "rm".

    Notes:
        A sample without any supervised token (all shifted labels equal IGNORE_INDEX) has an
        undefined perplexity. It is still written to `save_name` as NaN so that the saved list
        stays aligned with the dataset, but it is excluded from the printed average.
    """
    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
        dict(
            stage=stage,
            model_name_or_path=model_name_or_path,
            dataset=dataset,
            dataset_dir=dataset_dir,
            template=template,
            cutoff_len=cutoff_len,
            max_samples=max_samples,
            train_on_prompt=train_on_prompt,
            output_dir="dummy_dir",
            overwrite_cache=True,
            do_train=True,
        )
    )
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)["train_dataset"]
    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)
    if stage == "pt":
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif stage == "sft":
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
    elif stage == "rm":
        data_collator = PairwiseDataCollatorWithPadding(
            tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX, train_on_prompt=train_on_prompt
        )
    else:
        raise NotImplementedError("Stage does not supported: {}.".format(stage))

    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
    # Per-token losses are needed to average over each sentence separately. Masked labels are
    # ignored explicitly with the same constant that is used to build the loss mask below.
    criterion = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=IGNORE_INDEX)
    total_ppl = 0.0
    num_scored = 0  # number of samples that own at least one supervised token
    perplexities = []
    batch: Dict[str, "torch.Tensor"]
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = batch.to(model.device)
            outputs = model(**batch)
            # The logits at position t predict the token at position t + 1.
            shift_logits: "torch.Tensor" = outputs["logits"][..., :-1, :]
            shift_labels: "torch.Tensor" = batch["labels"][..., 1:]
            loss_mask = shift_labels != IGNORE_INDEX
            flatten_logits = shift_logits.reshape(-1, shift_logits.size(-1))
            flatten_labels = shift_labels.reshape(-1)
            # Negative log-likelihood of every target token, restored to the shape of the labels.
            token_nll: "torch.Tensor" = criterion(flatten_logits, flatten_labels).view_as(shift_labels)
            # Mean NLL over the supervised tokens; this is 0 / 0 = NaN if a sample has none.
            sentence_nll = (token_nll * loss_mask).sum(-1) / loss_mask.sum(-1)
            sentence_ppl = sentence_nll.exp()
            # Keep undefined (NaN) samples out of the running average.
            has_target = loss_mask.any(dim=-1)
            total_ppl += sentence_ppl[has_target].sum().item()
            num_scored += int(has_target.sum().item())
            perplexities.extend(sentence_ppl.tolist())

    with open(save_name, "w", encoding="utf-8") as f:
        json.dump(perplexities, f, indent=2)

    if num_scored > 0:
        print("Average perplexity is {:.2f}".format(total_ppl / num_scored))
    else:
        # Avoid a ZeroDivisionError when the dataset is empty or has no supervised tokens.
        print("Average perplexity is undefined: no sample contains supervised tokens.")

    print("Perplexities have been saved at {}.".format(save_name))
2024-05-04 22:02:25 +08:00
if __name__ == "__main__":
    # Script entry point: hand `cal_ppl` to python-fire to run it from the command line.
    fire.Fire(cal_ppl)