import argparse
import json

import torch
import torch.nn as nn

from datasets import build_dataset, get_coco_api_from_dataset
from datasets.coco_eval import CocoEvaluator
from util import box_ops


def get_args_parser():
    parser = argparse.ArgumentParser('Deformable DETR Detector', add_help=False)
    parser.add_argument('--num_ref_frames', default=3, type=int, help='number of reference frames')
    parser.add_argument('--sgd', action='store_true')
    parser.add_argument('--interval1', default=20, type=int)
    parser.add_argument('--interval2', default=60, type=int)
    parser.add_argument('--masks', action='store_true',
                        help='Train segmentation head if the flag is provided')

    # dataset parameters
    parser.add_argument('--vid_path', default='./data/vid', type=str)
    parser.add_argument('--input_result_path', default='./test_save.json', type=str)
    parser.add_argument('--output_dir', default='',
                        help='path where to save, empty for no saving')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--cache_mode', default=False, action='store_true',
                        help='whether to cache images on memory')
    return parser


class PostProcess(nn.Module):
    """This module converts the model's output into the format expected by the COCO API."""

    @torch.no_grad()
    def forward(self, outputs, target_sizes):
        """Perform the computation.

        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each
                          image of the batch. For evaluation, this must be the original image
                          size (before any data augmentation). For visualization, this should
                          be the image size after data augmentation, but before padding.
        """
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        # Select the top-100 scoring (query, class) pairs across the flattened
        # query-class dimension.
        prob = out_logits.sigmoid()
        topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
        scores = topk_values
        # Recover the query index and class label from each flattened index.
        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode='floor')
        labels = topk_indexes % out_logits.shape[2]
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        results = [{'scores': s, 'labels': l, 'boxes': b}
                   for s, l, b in zip(scores, labels, boxes)]

        return results


def evaluate(path, args):
    with open(path, 'r') as f:
        dataset = json.load(f)

    postprocessors = {'bbox': PostProcess()}
    dataset_val = build_dataset(image_set='test', args=args)
    base_ds = get_coco_api_from_dataset(dataset_val)
    iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys())
    coco_evaluator = CocoEvaluator(base_ds, iou_types)

    # Each entry maps image ids to per-image predictions; convert the stored
    # lists back to tensors before feeding them to the COCO evaluator.
    for data in dataset:
        res = {}
        for image_id in data.keys():
            image_id_int = int(image_id)
            res[image_id_int] = {k: torch.tensor(v) for k, v in data[image_id].items()}
        coco_evaluator.update(res)

    if coco_evaluator is not None:
        coco_evaluator.synchronize_between_processes()
        # accumulate predictions from all images
        coco_evaluator.accumulate()
        coco_evaluator.summarize()


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Deformable DETR training and evaluation script',
                                     parents=[get_args_parser()])
    args = parser.parse_args()
    evaluate(args.input_result_path, args)
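

# Example invocation and input layout (a sketch: the script name is hypothetical,
# and the JSON schema below is an assumption inferred from how evaluate() consumes
# the file; the ids and values are made up for illustration):
#
#   python eval_results.py --input_result_path ./test_save.json
#
# where test_save.json is expected to hold a list of dicts, each mapping image-id
# strings to per-image prediction fields, e.g.:
#
#   [
#       {"1001": {"scores": [0.91, 0.34],
#                 "labels": [1, 5],
#                 "boxes": [[10.0, 20.0, 50.0, 80.0],
#                           [5.0, 5.0, 30.0, 40.0]]}}
#   ]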
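
# Minimal smoke test for PostProcess on dummy tensors (a sketch; the batch size,
# query count, and class count are arbitrary and not tied to any checkpoint):
#
#   post = PostProcess()
#   outputs = {
#       'pred_logits': torch.randn(2, 300, 31),  # [batch, queries, classes]
#       'pred_boxes': torch.rand(2, 300, 4),     # normalized cxcywh in [0, 1]
#   }
#   target_sizes = torch.tensor([[480, 640], [360, 480]])  # original (h, w) per image
#   results = post(outputs, target_sizes)  # list of per-image dicts
#   assert results[0]['boxes'].shape == (100, 4)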