Add RTDETR Trainer (#2745)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
Co-authored-by: Kayzwer <68285002+Kayzwer@users.noreply.github.com>
Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com>
Ayush Chaurasia 2023-06-17 17:16:18 +05:30 committed by GitHub
parent 03bce07848
commit a0ba8ef5f0
23 changed files with 989 additions and 314 deletions

ultralytics/vit/rtdetr/val.py

@@ -2,10 +2,12 @@
 from pathlib import Path
 
+import cv2
+import numpy as np
 import torch
 
 from ultralytics.yolo.data import YOLODataset
-from ultralytics.yolo.data.augment import Compose, Format, LetterBox
+from ultralytics.yolo.data.augment import Compose, Format, v8_transforms
 from ultralytics.yolo.utils import colorstr, ops
 from ultralytics.yolo.v8.detect import DetectionValidator
@@ -18,9 +20,41 @@ class RTDETRDataset(YOLODataset):
     def __init__(self, *args, data=None, **kwargs):
         super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs)
 
+    # NOTE: add stretched version of load_image for RT-DETR mosaic
+    def load_image(self, i):
+        """Loads 1 image from dataset index 'i', returns (im, original hw, resized hw)."""
+        im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i]
+        if im is None:  # not cached in RAM
+            if fn.exists():  # load npy
+                im = np.load(fn)
+            else:  # read image
+                im = cv2.imread(f)  # BGR
+                if im is None:
+                    raise FileNotFoundError(f'Image Not Found {f}')
+            h0, w0 = im.shape[:2]  # orig hw
+            im = cv2.resize(im, (self.imgsz, self.imgsz), interpolation=cv2.INTER_LINEAR)
+
+            # Add to buffer if training with augmentations
+            if self.augment:
+                self.ims[i], self.im_hw0[i], self.im_hw[i] = im, (h0, w0), im.shape[:2]  # im, hw_original, hw_resized
+                self.buffer.append(i)
+                if len(self.buffer) >= self.max_buffer_length:
+                    j = self.buffer.pop(0)
+                    self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None
+
+            return im, (h0, w0), im.shape[:2]
+
+        return self.ims[i], self.im_hw0[i], self.im_hw[i]
+
     def build_transforms(self, hyp=None):
         """Temporarily, only for evaluation."""
-        transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), auto=False, scaleFill=True)])
+        if self.augment:
+            hyp.mosaic = hyp.mosaic if self.augment and not self.rect else 0.0
+            hyp.mixup = hyp.mixup if self.augment and not self.rect else 0.0
+            transforms = v8_transforms(self, self.imgsz, hyp, stretch=True)
+        else:
+            # transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), auto=False, scaleFill=True)])
+            transforms = Compose([])
         transforms.append(
             Format(bbox_format='xywh',
                    normalize=True,
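
Both the new load_image and v8_transforms(..., stretch=True) deliberately distort aspect ratio instead of letterboxing: RT-DETR consumes plain imgsz x imgsz inputs, so no padding or scale-ratio bookkeeping is carried downstream. A minimal sketch (not part of this commit; dummy shapes assumed) contrasting the two resize strategies:

# Stretch resize (as in load_image above) vs. YOLO-style letterboxing
import cv2
import numpy as np

imgsz = 640
im = np.zeros((480, 960, 3), dtype=np.uint8)  # dummy 480x960 BGR image

# Stretch resize: aspect ratio is distorted, output is always square
stretched = cv2.resize(im, (imgsz, imgsz), interpolation=cv2.INTER_LINEAR)
assert stretched.shape == (640, 640, 3)

# Letterbox: preserve aspect ratio, then pad the short side to square
r = min(imgsz / im.shape[0], imgsz / im.shape[1])  # scale ratio
nh, nw = round(im.shape[0] * r), round(im.shape[1] * r)  # unpadded size
resized = cv2.resize(im, (nw, nh), interpolation=cv2.INTER_LINEAR)
dh, dw = imgsz - nh, imgsz - nw  # total padding
boxed = cv2.copyMakeBorder(resized, dh // 2, dh - dh // 2, dw // 2, dw - dw // 2,
                           cv2.BORDER_CONSTANT, value=(114, 114, 114))
assert boxed.shape == (640, 640, 3)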
@@ -65,6 +99,8 @@ class RTDETRValidator(DetectionValidator):
             # No threshold needed for evaluation as there are only 300 boxes here
             # idx = score > self.args.conf
             pred = torch.cat([bbox, score[..., None], cls[..., None]], dim=-1)  # filter
+            # Sort by confidence to correctly compute internal metrics
+            pred = pred[score.argsort(descending=True)]
             outputs[i] = pred  # [idx]
 
         return outputs
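
For context on the new sort: RT-DETR emits a fixed set of 300 queries rather than NMS output, and the downstream metric code assumes rows arrive ordered by descending confidence. A tiny sketch (illustrative values, not from this commit) of the argsort step:

import torch

# Three fake predictions in (x1, y1, x2, y2, conf, cls) layout, unordered by conf
pred = torch.tensor([[0., 0., 1., 1., 0.2, 0.],
                     [0., 0., 2., 2., 0.9, 1.],
                     [0., 0., 3., 3., 0.5, 2.]])
score = pred[:, 4]

pred = pred[score.argsort(descending=True)]  # highest-confidence rows first
assert torch.equal(pred[:, 4], torch.tensor([0.9, 0.5, 0.2]))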
@@ -100,7 +136,8 @@ class RTDETRValidator(DetectionValidator):
                 tbox[..., [0, 2]] *= shape[1]  # native-space pred
                 tbox[..., [1, 3]] *= shape[0]  # native-space pred
                 labelsn = torch.cat((cls, tbox), 1)  # native-space labels
-                correct_bboxes = self._process_batch(predn, labelsn)
+                # NOTE: To get correct metrics, the inputs of `_process_batch` should always be float32
+                correct_bboxes = self._process_batch(predn.float(), labelsn)
             # TODO: maybe remove these `self.` arguments as they are already member variables
             if self.args.plots:
                 self.confusion_matrix.process_batch(predn, labelsn)
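
On the float32 NOTE above: with a half-precision model, predn can arrive as float16, and IoU arithmetic in fp16 overflows or loses precision (its maximum value is 65504, less than the area of a 256x256 box), silently corrupting TP matching. A self-contained sketch of the dtype hazard (the real _process_batch uses ultralytics' own IoU helper; this stand-in function is only for illustration):

import torch

def iou_xyxy(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """IoU of two (4,) xyxy boxes, computed in whatever dtype they arrive in."""
    lt = torch.maximum(a[:2], b[:2])  # intersection top-left
    rb = torch.minimum(a[2:], b[2:])  # intersection bottom-right
    wh = (rb - lt).clamp(min=0)       # intersection width/height
    inter = wh[0] * wh[1]
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union

gt = torch.tensor([0., 0., 640., 640.])  # native-space label
pr = torch.tensor([0., 0., 640., 640.])  # perfect native-space prediction

print(iou_xyxy(pr, gt))                # tensor(1.) in float32
print(iou_xyxy(pr.half(), gt.half()))  # tensor(nan): 640*640 overflows float16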