from typing import Dict, List, Optional

import collections
import time

import numpy as np
import torch
from packaging import version
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset, IterableDataset

from transformers import Trainer, logging
from transformers.deepspeed import deepspeed_init
from transformers.file_utils import is_torch_tpu_available
from transformers.trainer_pt_utils import (
    IterableDatasetShard,
    find_batch_size,
    nested_concat,
    nested_numpify,
    nested_truncate,
)
from transformers.trainer_utils import (
    EvalLoopOutput,
    denumpify_detensorize,
    speed_metrics,
)

from .trainer_utils import EvalPrediction

if version.parse(torch.__version__) >= version.parse("1.6"):
    from torch.cuda.amp import autocast

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met
    import torch_xla.distributed.parallel_loader as pl

logger = logging.get_logger(__name__)


class BaseTrainer(Trainer):
    def __init__(self, evaluation_metrics=[], data_info=None, *args, **kwargs):
        """During evaluation, computes the average of the metrics listed in
        ``evaluation_metrics`` and adds it to the dictionary of results. The ``Trainer``
        class then uses this average metric to save the best model."""
        super().__init__(*args, **kwargs)
        self.evaluation_metrics = evaluation_metrics
        self.data_info = data_info

    def get_data_info(self, metric_key_prefix):
        """Returns the data information required to make the predictions/labels
        suitable for evaluation."""
        if self.data_info is not None:
            return self.data_info[metric_key_prefix]
        return None

    def evaluate(
        self,
        eval_dataset: Optional[Dataset] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
    ) -> Dict[str, float]:
        """
        Run evaluation and returns metrics.

        The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
        (pass it to the init :obj:`compute_metrics` argument).

        You can also subclass and override this method to inject custom behavior.

        Args:
            eval_dataset (:obj:`Dataset`, `optional`):
                Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`,
                columns not accepted by the ``model.forward()`` method are automatically removed. It must implement
                the :obj:`__len__` method.
            ignore_keys (:obj:`List[str]`, `optional`):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.
            metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`):
                An optional prefix to be used as the metrics key prefix. For example the metric "bleu" will be named
                "eval_bleu" if the prefix is "eval" (default).

        Returns:
            A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
            dictionary also contains the epoch number which comes from the training state.
""" # memory metrics - must set up as early as possible self._memory_tracker.start() eval_dataloader = self.get_eval_dataloader(eval_dataset) start_time = time.time() eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop output = eval_loop( eval_dataloader, description="Evaluation", # No point gathering the predictions if there are no metrics, otherwise we defer to # self.args.prediction_loss_only prediction_loss_only=True if self.compute_metrics is None else None, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix, ) output.metrics.update(speed_metrics(metric_key_prefix, start_time, output.num_samples)) if len(self.evaluation_metrics) != 0: selected_metrics = [output.metrics[metric_key_prefix+"_"+k] for k in self.evaluation_metrics if metric_key_prefix+"_"+k in output.metrics] assert len(selected_metrics) >= 1, "at least one metric should be selected to compute the average_metrics." output.metrics.update({metric_key_prefix+'_average_metrics': np.mean(selected_metrics)}) self.log(output.metrics) if self.args.tpu_metrics_debug or self.args.debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) self._memory_tracker.stop_and_update_metrics(output.metrics) return output.metrics def evaluation_loop( self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", ) -> EvalLoopOutput: """ Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. Works both with or without labels. """ prediction_loss_only = ( prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only ) # if eval is called w/o train init deepspeed here if self.args.deepspeed and not self.deepspeed: # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) self.model = deepspeed_engine.module self.model_wrapped = deepspeed_engine self.deepspeed = deepspeed_engine # XXX: we don't need optim/sched for inference, but this needs to be sorted out, since # for example the Z3-optimizer is a must for zero3 to work even for inference - what we # don't need is the deepspeed basic optimizer which is self.optimizer.optimizer deepspeed_engine.optimizer.optimizer = None deepspeed_engine.lr_scheduler = None model = self._wrap_model(self.model, training=False) # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while # ``train`` is running, halve it first and then put on device if not self.is_in_train and self.args.fp16_full_eval: model = model.half().to(self.args.device) batch_size = dataloader.batch_size logger.info(f"***** Running {description} *****") if isinstance(dataloader.dataset, collections.abc.Sized): logger.info(f" Num examples = {self.num_examples(dataloader)}") else: logger.info(" Num examples: Unknown") logger.info(f" Batch size = {batch_size}") model.eval() self.callback_handler.eval_dataloader = dataloader # Do this before wrapping. 
        eval_dataset = dataloader.dataset

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

        if self.args.past_index >= 0:
            self._past = None

        # Initialize containers
        # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps)
        losses_host = None
        preds_host = None
        labels_host = None
        # losses/preds/labels on CPU (final containers)
        all_losses = None
        all_preds = None
        all_labels = None
        # Will be useful when we have an iterable dataset so don't know its length.
        observed_num_examples = 0

        # Main evaluation loop
        for step, inputs in enumerate(dataloader):
            # Update the observed num examples
            observed_batch_size = find_batch_size(inputs)
            if observed_batch_size is not None:
                observed_num_examples += observed_batch_size

            # Prediction step
            loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)

            # Update containers on host
            if loss is not None:
                losses = self._nested_gather(loss.repeat(batch_size))
                losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
            if logits is not None:
                logits = self._pad_across_processes(logits)
                logits = self._nested_gather(logits)
                preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
            if labels is not None:
                labels = self._pad_across_processes(labels)
                labels = self._nested_gather(labels)
                labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
            self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control)

            # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
            if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0:
                if losses_host is not None:
                    losses = nested_numpify(losses_host)
                    all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
                if preds_host is not None:
                    logits = nested_numpify(preds_host)
                    all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
                if labels_host is not None:
                    labels = nested_numpify(labels_host)
                    all_labels = (
                        labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
                    )

                # Set back to None to begin a new accumulation
                losses_host, preds_host, labels_host = None, None, None

        if self.args.past_index and hasattr(self, "_past"):
            # Clean the state at the end of the evaluation loop
            delattr(self, "_past")

        # Gather all remaining tensors and put them back on the CPU
        if losses_host is not None:
            losses = nested_numpify(losses_host)
            all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
        if preds_host is not None:
            logits = nested_numpify(preds_host)
            all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
        if labels_host is not None:
            labels = nested_numpify(labels_host)
            all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)

        # Number of samples
        if not isinstance(eval_dataset, IterableDataset):
            num_samples = len(eval_dataset)
        elif isinstance(eval_dataset, IterableDatasetShard):
            num_samples = eval_dataset.num_examples
        else:
            num_samples = observed_num_examples

        # The number of losses has been rounded to a multiple of batch_size and, in distributed training, the number
        # of samples has been rounded to a multiple of batch_size, so we truncate.
        if all_losses is not None:
            all_losses = all_losses[:num_samples]
        if all_preds is not None:
            all_preds = nested_truncate(all_preds, num_samples)
        if all_labels is not None:
            all_labels = nested_truncate(all_labels, num_samples)

        # Metrics!
        if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
            metrics = self.compute_metrics(
                EvalPrediction(
                    predictions=all_preds,
                    label_ids=all_labels,
                    data_info=self.get_data_info(metric_key_prefix),
                )
            )
        else:
            metrics = {}

        # To be JSON-serializable, we need to remove numpy types or zero-d tensors
        metrics = denumpify_detensorize(metrics)

        if all_losses is not None:
            metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

        return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
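
# Usage sketch (illustrative only, not part of this module): the names `model`, `training_args`,
# `train_ds`, `eval_ds`, `compute_metrics`, `eval_data_info`, and the metric names "accuracy"/"f1"
# below are hypothetical placeholders. The sketch shows how the extra `evaluation_metrics` and
# `data_info` constructor arguments feed into `evaluate()`, which then adds "eval_average_metrics"
# (the mean of the selected per-metric scores) to the returned metrics dictionary.
#
#     trainer = BaseTrainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_ds,
#         eval_dataset=eval_ds,
#         compute_metrics=compute_metrics,
#         evaluation_metrics=["accuracy", "f1"],
#         data_info={"eval": eval_data_info},
#     )
#     metrics = trainer.evaluate()
#     # metrics["eval_average_metrics"] is the mean of metrics["eval_accuracy"] and metrics["eval_f1"]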