# ------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ import torch # def to_cuda(samples, targets, device): # samples = samples.to(device, non_blocking=True) # targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] # return samples, targets def to_cuda(samples, ref_samples, targets, device): ref_samples = [ref_sample.to(device, non_blocking=True) for ref_sample in ref_samples] samples = samples.to(device, non_blocking=True) targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] return samples, ref_samples, targets class data_prefetcher(): def __init__(self, loader, device, prefetch=True): self.loader = iter(loader) self.prefetch = prefetch self.device = device if prefetch: self.stream = torch.cuda.Stream() self.preload() def preload(self): try: self.next_samples, self.next_ref_samples, self.next_targets = next(self.loader) except StopIteration: self.next_samples = None self.next_targets = None self.next_ref_samples =None return # if record_stream() doesn't work, another option is to make sure device inputs are created # on the main stream. # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') # Need to make sure the memory allocated for next_* is not still in use by the main stream # at the time we start copying to next_*: # self.stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(self.stream): self.next_samples, self.next_ref_samples, self.next_targets = to_cuda(self.next_samples, self.next_ref_samples, self.next_targets, self.device) # more code for the alternative if record_stream() doesn't work: # copy_ will record the use of the pinned source tensor in this side stream. # self.next_input_gpu.copy_(self.next_input, non_blocking=True) # self.next_target_gpu.copy_(self.next_target, non_blocking=True) # self.next_input = self.next_input_gpu # self.next_target = self.next_target_gpu # With Amp, it isn't necessary to manually convert data to half. # if args.fp16: # self.next_input = self.next_input.half() # else: def next(self): if self.prefetch: torch.cuda.current_stream().wait_stream(self.stream) samples = self.next_samples targets = self.next_targets if samples is not None: samples.record_stream(torch.cuda.current_stream()) if targets is not None: for t in targets: for k, v in t.items(): v.record_stream(torch.cuda.current_stream()) self.preload() else: try: # nested tensor, list[tensor] samples, ref_samples, targets = next(self.loader) assert ref_samples is None, [type(ref_samples[0]), type(ref_samples), type(samples), len(ref_samples), "ref_samples", ref_samples] samples, ref_samples, targets = to_cuda(samples, ref_samples, targets, self.device) except StopIteration: samples = None targets = None ref_samples = None assert True, "wwww" return samples, targets