# Several of the evaluation metrics are from
# https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/evaluation/metrics.py
"""Defines different metrics used for evaluation of tasks."""

import collections
import math
from logging import getLogger

import numpy as np
import scipy.stats
import sklearn.metrics

from .qa_utils import normalize_squad, qa_metrics

logger = getLogger(__name__)


def accuracy(predictions, targets) -> dict:
    """Computes the average accuracy."""
    return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())}
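

# Example usage (hypothetical label strings):
#   accuracy(["entailment", "neutral"], ["entailment", "entailment"])
#   -> {"accuracy": 50.0}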


def pearson_corrcoef(predictions, targets) -> dict:
    """Computes the Pearson correlation coefficient."""
    from examples_seq2seq.data_processors.postprocessors import string_to_float
    targets = [string_to_float(target) for target in targets]
    predictions = [string_to_float(prediction) for prediction in predictions]
    pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]

    # Note that if all the predictions are the same, the Pearson
    # correlation is NaN; to guard against this, we check the output
    # and return 0 in that case.
    if math.isnan(pearson_corrcoef):
        pearson_corrcoef = 0
    return {"pearson": pearson_corrcoef}


def spearman_corrcoef(predictions, targets) -> dict:
    """Computes the Spearman correlation coefficient."""
    # TODO: we need to do postprocessing in a clean way for each dataset.
    from examples_seq2seq.data_processors.postprocessors import string_to_float
    targets = [string_to_float(target) for target in targets]
    predictions = [string_to_float(prediction) for prediction in predictions]
    spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]

    # Note that if all the predictions are the same, the Spearman
    # correlation is NaN; to guard against this, we check the output
    # and return 0 in that case.
    if math.isnan(spearman_corrcoef):
        spearman_corrcoef = 0
    return {"spearmanr": spearman_corrcoef}
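

# Example usage (hypothetical STS-B-style string scores; `string_to_float` is
# the repo's own postprocessor for parsing them):
#   pearson_corrcoef(["2.6", "1.0", "4.2"], ["2.4", "1.2", "4.0"])
#   -> {"pearson": <value in [-100, 100]>}, close to 100 here since the two
#      score sequences are strongly correlated.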


def transform_for_generation(predictions, targets):
    """Maps string targets and predictions to integer ids for classification metrics.

    Each distinct target string is assigned an integer id. A prediction that
    does not appear among the targets is mapped to `(target_id + 1) % num_classes`,
    which differs from its target whenever there are at least two classes, so
    it is counted as incorrect.
    """
    mapping = {k: i for i, k in enumerate(set(targets))}

    targets = np.asarray([mapping[k] for k in targets])
    predictions = np.asarray([mapping[k] if k in mapping else (t + 1) % len(mapping)
                              for t, k in zip(targets, predictions)])

    return predictions, targets
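

# Example (hypothetical generated labels):
#   transform_for_generation(["yes", "maybe"], ["yes", "no"])
#   maps "yes"/"no" to integer ids and sends the out-of-vocabulary prediction
#   "maybe" to an id that differs from its target, so it scores as wrong.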


def f1_score(predictions, targets) -> dict:
    """Computes F1 score, where any prediction != 0 or 1 is counted as incorrect.

    Args:
      targets: list of targets, either 0 or 1
      predictions: list of predictions, any integer value

    Returns:
      F1 score, where any prediction != 0 or 1 is counted as wrong.
    """
    targets = np.asarray(targets).astype(np.int32)
    predictions = np.asarray(predictions).astype(np.int32)
    # Flip any invalid prediction to the opposite of its target so that it is
    # guaranteed to be counted as incorrect (as in the upstream t5 metrics).
    invalid_mask = np.logical_and(predictions != 0, predictions != 1)
    predictions[invalid_mask] = 1 - targets[invalid_mask]
    return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
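

# Example (hypothetical binary labels; the prediction 2 is invalid and is
# flipped to the opposite of its target, so it counts as a false positive):
#   f1_score([1, 0, 2], [1, 1, 0])  -> {"f1": 50.0}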


# TODO: maybe guard against invalid values,
# see https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
    """Computes the Matthews correlation coefficient."""
    return {"matthews_correlation": 100 * sklearn.metrics.matthews_corrcoef(targets, predictions)}


def squad(predictions, targets):
    """Computes SQuAD metrics, maximizing over answers per question.

    Args:
      targets: list of lists of strings
      predictions: list of strings

    Returns:
      dict with score_key: squad score across all targets and predictions
    """
    targets = [[normalize_squad(t) for t in u] for u in targets]
    predictions = [normalize_squad(p) for p in predictions]
    return qa_metrics(targets, predictions)
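

# Example usage (hypothetical answers; each target is the list of acceptable
# answers for one question, and the score is maximized over that list):
#   squad(["Lake Geneva"], [["Lake Geneva", "Lac Léman"]])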


def exact_match(predictions, targets):
    """Computes whether the targets match predictions exactly."""
    return {"em": 100 * float(np.array_equal(targets, predictions))}
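

# Note: this is all-or-nothing over the whole sequence pair; it returns 100.0
# only when every prediction matches its target, and 0.0 otherwise.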


def sklearn_metrics_wrapper(metric_str,
                            metric_dict_str=None,
                            metric_post_process_fn=None,
                            **metric_fn_kwargs):
    """Wraps any sklearn.metrics function and returns a t5 metric function.

    Args:
      metric_str: string, the function from `sklearn.metrics` to use.
      metric_dict_str: optional string, if not specified `metric_str` is used as
        the key in the returned dictionary.
      metric_post_process_fn: callable, if specified the final computed metric
        will be passed through this.
      **metric_fn_kwargs: kwargs, passed to the metric function we are calling.

    Returns:
      the function that calculates the metric in a dict.
    """
    if not hasattr(sklearn.metrics, metric_str):
        raise ValueError("sklearn.metrics does not have: %s" % metric_str)

    def fn(predictions, targets):
        metric_fn = getattr(sklearn.metrics, metric_str)
        metric_val = metric_fn(targets, predictions, **metric_fn_kwargs)
        if metric_post_process_fn is not None:
            metric_val = metric_post_process_fn(metric_val)
        return {metric_dict_str or metric_str: metric_val}
    return fn
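

# Example usage (a sketch; `recall_score` is an existing sklearn.metrics
# function, and the wrapping mirrors mean_multiclass_f1 below):
#   recall = sklearn_metrics_wrapper(
#       "recall_score",
#       metric_dict_str="recall",
#       metric_post_process_fn=lambda x: 100 * x)
#   recall(predictions=[0, 1, 1], targets=[0, 1, 0])  -> {"recall": 100.0}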


def mean_multiclass_f1(num_classes, **metric_fn_kwargs):
    """Computes the unweighted average of the F1 per class."""
    return sklearn_metrics_wrapper(
        "fbeta_score",
        metric_dict_str="f1_multiclass",
        metric_post_process_fn=lambda x: 100 * x,
        beta=1,
        labels=range(num_classes),
        average="macro",
        **metric_fn_kwargs)
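

# Example usage (hypothetical three-class ids):
#   macro_f1 = mean_multiclass_f1(num_classes=3)
#   macro_f1(predictions=[0, 1, 2], targets=[0, 1, 1])
#   -> {"f1_multiclass": <macro-averaged F1 * 100>}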


def multirc_f1_over_all_answers(targets, predictions):
    """Special metric for MultiRC which computes F1 score over all examples.

    This is necessary because the targets/predictions for MultiRC are dicts and
    `f1_score` expects lists of 0/1 labels, not dicts. As a result we just need
    to key in the "value" for each of the example dicts before feeding them
    into `f1_score`.

    Args:
      targets: list of dicts, where each dict has a "value" key.
      predictions: list of dicts, where each dict has a "value" key.

    Returns:
      F1 score over values, where any prediction != 0 or 1 is counted as wrong.
    """
    return f1_score(
        [p["value"] for p in predictions], [t["value"] for t in targets]
    )
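

# Example (hypothetical MultiRC-style dicts):
#   multirc_f1_over_all_answers(
#       targets=[{"value": 1}, {"value": 0}],
#       predictions=[{"value": 1}, {"value": 1}])
#   -> {"f1": 66.67} (approximately)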


def mean_group_metric(metric_fn, group_key="group", value_key="value"):
    """Returns a metric that averages `metric_fn` on sub-groups of results.

    The sub-groups are defined by aggregating results (targets and predictions)
    by accessing the feature specified by `group_key` in the target dicts.

    **WARNING**: Using this function can produce unreliable results if you do not
    pass in full groups. For example, if you evaluate over a random subsample of a
    validation set and do not retain all of the examples in each group, you may
    get results which aren't directly comparable to using the full validation set.

    Args:
      metric_fn: function, the metric to compute on the subgroups.
      group_key: string, the key for the grouping value in the target dictionary.
      value_key: string, the key for the value in the dictionaries.
    """
    def my_metric(targets, predictions):
        """Computes mean of `metric_fn` over subgroups of results."""
        grouped_values = collections.defaultdict(lambda: ([], []))
        for targ, pred in zip(targets, predictions):
            g = targ[group_key]
            grouped_values[g][0].append(targ[value_key])
            grouped_values[g][1].append(pred[value_key])
        group_scores = collections.defaultdict(list)
        for (targets, predictions) in grouped_values.values():
            for metric, score in metric_fn(targets, predictions).items():
                group_scores[metric].append(score)
        return {metric: np.mean(scores) for metric, scores in group_scores.items()}
    return my_metric
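

# Example usage (hypothetical grouped labels; note that `metric_fn` is called
# as metric_fn(targets, predictions), the reverse of the (predictions, targets)
# order used by the standalone metrics above, so a symmetric metric such as
# exact_match is the safest choice here):
#   grouped_em = mean_group_metric(exact_match)
#   grouped_em(
#       targets=[{"group": "q1", "value": 1}, {"group": "q2", "value": 0}],
#       predictions=[{"group": "q1", "value": 1}, {"group": "q2", "value": 1}])
#   -> {"em": 50.0}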