add averaging in evaluation
This commit is contained in:
parent
be420e4179
commit
5310e4d182
|
@ -9,10 +9,11 @@ import fire
|
|||
import json
|
||||
import torch
|
||||
import numpy as np
|
||||
from tqdm import tqdm, trange
|
||||
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple
|
||||
from collections import Counter
|
||||
from datasets import load_dataset
|
||||
from dataclasses import dataclass
|
||||
from tqdm import tqdm, trange
|
||||
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple
|
||||
|
||||
from llmtuner import ChatModel
|
||||
|
||||
|
@ -86,10 +87,8 @@ def batch_inference(
|
|||
probs = torch.nn.functional.softmax(
|
||||
torch.stack(
|
||||
[
|
||||
logits[:, -1, chat_model.tokenizer.encode(prefix_char + "A")[-1]],
|
||||
logits[:, -1, chat_model.tokenizer.encode(prefix_char + "B")[-1]],
|
||||
logits[:, -1, chat_model.tokenizer.encode(prefix_char + "C")[-1]],
|
||||
logits[:, -1, chat_model.tokenizer.encode(prefix_char + "D")[-1]]
|
||||
logits[:, -1, chat_model.tokenizer.encode(prefix_char + choice, add_special_tokens=False)[-1]]
|
||||
for choice in choices
|
||||
],
|
||||
dim=-1
|
||||
),
|
||||
|
@ -108,11 +107,12 @@ def evaluate(
|
|||
split: Optional[Literal["validation", "test"]] = "validation",
|
||||
lang: Optional[Literal["zh", "en"]] = "zh",
|
||||
n_shot: Optional[int] = 5,
|
||||
n_avg: Optional[int] = 1,
|
||||
batch_size: Optional[int] = 4,
|
||||
save_name: Optional[str] = None
|
||||
):
|
||||
with open(os.path.join(dataset_dir, task, "mapping.json"), "r", encoding="utf-8") as f:
|
||||
categorys = json.load(f)
|
||||
categorys: Dict[str, Dict[str, str]] = json.load(f)
|
||||
|
||||
chat_model = ChatModel(dict(
|
||||
model_name_or_path=model_name_or_path,
|
||||
|
@ -124,17 +124,17 @@ def evaluate(
|
|||
assert chat_model.tokenizer.padding_side == "left", "only left-padded tensor can be accepted."
|
||||
|
||||
category_corrects: Dict[str, np.ndarray] = {
|
||||
subj: np.array([], dtype="bool") for subj in ["STEM", "Social Sciences", "Humanities", "Other"]
|
||||
subj: np.array([], dtype="bool") for subj in ["Average", "STEM", "Social Sciences", "Humanities", "Other"]
|
||||
}
|
||||
overall_corrects = np.array([], dtype="bool")
|
||||
|
||||
pbar = tqdm(categorys.keys(), desc="Processing subjects", position=0)
|
||||
results = {}
|
||||
for subject in pbar:
|
||||
pbar.set_postfix_str(categorys[subject]["name"])
|
||||
inputs, labels = [], []
|
||||
dataset = load_dataset(os.path.join(dataset_dir, task), subject)
|
||||
for i in range(len(dataset[split])):
|
||||
labels, answers, all_outputs = [], [], []
|
||||
for epoch in range(n_avg):
|
||||
pbar.set_postfix_str("{} Trial: {}".format(categorys[subject]["name"], epoch))
|
||||
inputs, outputs = [], []
|
||||
for i in trange(len(dataset[split]), desc="Formatting batches", position=1, leave=False):
|
||||
support_set = dataset["train"].shuffle().select(range(min(n_shot, len(dataset["train"]))))
|
||||
query, resp, history = eval_template.format_example(
|
||||
target_data=dataset[split][i],
|
||||
|
@ -143,37 +143,34 @@ def evaluate(
|
|||
use_history=chat_model.template.use_history
|
||||
)
|
||||
input_ids, _ = chat_model.template.encode_oneturn(
|
||||
tokenizer=chat_model.tokenizer,
|
||||
query=query,
|
||||
resp=resp,
|
||||
history=history
|
||||
tokenizer=chat_model.tokenizer, query=query, resp=resp, history=history
|
||||
)
|
||||
inputs.append({
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": [1] * len(input_ids)
|
||||
})
|
||||
inputs.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)})
|
||||
if epoch == 0:
|
||||
labels.append(resp)
|
||||
|
||||
outputs = []
|
||||
for i in trange(0, len(inputs), batch_size, desc="Processing batches", position=1, leave=False):
|
||||
for i in trange(0, len(inputs), batch_size, desc="Predicting batches", position=1, leave=False):
|
||||
batch_input = chat_model.tokenizer.pad(
|
||||
inputs[i : i + batch_size],
|
||||
return_attention_mask=True,
|
||||
return_tensors="pt"
|
||||
inputs[i : i + batch_size], return_attention_mask=True, return_tensors="pt"
|
||||
).to(chat_model.model.device)
|
||||
preds = batch_inference(chat_model, batch_input, eval_template.prefix)
|
||||
outputs += preds
|
||||
all_outputs.append(outputs)
|
||||
|
||||
corrects = (np.array(outputs) == np.array(labels))
|
||||
for i in range(len(all_outputs[0])):
|
||||
count = Counter([all_outputs[epoch][i] for epoch in range(n_avg)])
|
||||
answers.append(count.most_common(1)[0][0])
|
||||
|
||||
corrects = (np.array(answers) == np.array(labels))
|
||||
category_name = categorys[subject]["category"]
|
||||
category_corrects[category_name] = np.concatenate([category_corrects[category_name], corrects], axis=0)
|
||||
overall_corrects = np.concatenate([overall_corrects, corrects], axis=0)
|
||||
results[subject] = {str(i): outputs[i] for i in range(len(outputs))}
|
||||
category_corrects["Average"] = np.concatenate([category_corrects["Average"], corrects], axis=0)
|
||||
results[subject] = {str(i): answers[i] for i in range(len(answers))}
|
||||
|
||||
score_info = "Average accuracy: {:.2f}".format(100 * np.mean(overall_corrects))
|
||||
for category_name, category_correct in category_corrects.items():
|
||||
if len(category_correct):
|
||||
score_info += "\n{:>16}: {:.2f}".format(category_name, 100 * np.mean(category_correct))
|
||||
score_info = "\n".join([
|
||||
"{:>15}: {:.2f}".format(category_name, 100 * np.mean(category_correct))
|
||||
for category_name, category_correct in category_corrects.items() if len(category_correct)
|
||||
])
|
||||
|
||||
print(score_info)
|
||||
if save_name is not None:
|
||||
|
|
Loading…
Reference in New Issue