ultralytics 8.0.235 YOLOv8 OBB train, val, predict and export (#4499)

Co-authored-by: Yash Khurana <ykhurana6@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Swamita Gupta <swamita2001@gmail.com> Co-authored-by: Ayush Chaurasia <ayush.chaurarsia@gmail.com> Co-authored-by: Laughing-q <1185102784@qq.com> Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com> Co-authored-by: Laughing-q <1182102784@qq.com>
2024-01-05 03:00:26 +01:00 · 2024-01-05 03:00:26 +01:00 · 072291bc78
commit 072291bc78
parent f702b34a50
52 changed files with 2090 additions and 524 deletions
--- a/ultralytics/models/rtdetr/val.py
+++ b/ultralytics/models/rtdetr/val.py
@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license

-from pathlib import Path
-
 import torch

 from ultralytics.data import YOLODataset
@ -22,7 +20,7 @@ class RTDETRDataset(YOLODataset):

    def __init__(self, *args, data=None, **kwargs):
        """Initialize the RTDETRDataset class by inheriting from the YOLODataset class."""
-        super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs)
+        super().__init__(*args, data=data, **kwargs)

    # NOTE: add stretch version load_image for RTDETR mosaic
    def load_image(self, i, rect_mode=False):
@ -108,47 +106,22 @@ class RTDETRValidator(DetectionValidator):

        return outputs

-    def update_metrics(self, preds, batch):
-        """Metrics."""
-        for si, pred in enumerate(preds):
-            idx = batch['batch_idx'] == si
-            cls = batch['cls'][idx]
-            bbox = batch['bboxes'][idx]
-            nl, npr = cls.shape[0], pred.shape[0]  # number of labels, predictions
-            shape = batch['ori_shape'][si]
-            correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
-            self.seen += 1
+    def _prepare_batch(self, si, batch):
+        idx = batch['batch_idx'] == si
+        cls = batch['cls'][idx].squeeze(-1)
+        bbox = batch['bboxes'][idx]
+        ori_shape = batch['ori_shape'][si]
+        imgsz = batch['img'].shape[2:]
+        ratio_pad = batch['ratio_pad'][si]
+        if len(cls):
+            bbox = ops.xywh2xyxy(bbox)  # target boxes
+            bbox[..., [0, 2]] *= ori_shape[1]  # native-space labels
+            bbox[..., [1, 3]] *= ori_shape[0]  # native-space labels
+        prepared_batch = dict(cls=cls, bbox=bbox, ori_shape=ori_shape, imgsz=imgsz, ratio_pad=ratio_pad)
+        return prepared_batch

-            if npr == 0:
-                if nl:
-                    self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1)))
-                    if self.args.plots:
-                        self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
-                continue
-
-            # Predictions
-            if self.args.single_cls:
-                pred[:, 5] = 0
-            predn = pred.clone()
-            predn[..., [0, 2]] *= shape[1] / self.args.imgsz  # native-space pred
-            predn[..., [1, 3]] *= shape[0] / self.args.imgsz  # native-space pred
-
-            # Evaluate
-            if nl:
-                tbox = ops.xywh2xyxy(bbox)  # target boxes
-                tbox[..., [0, 2]] *= shape[1]  # native-space pred
-                tbox[..., [1, 3]] *= shape[0]  # native-space pred
-                labelsn = torch.cat((cls, tbox), 1)  # native-space labels
-                # NOTE: To get correct metrics, the inputs of `_process_batch` should always be float32 type.
-                correct_bboxes = self._process_batch(predn.float(), labelsn)
-                # TODO: maybe remove these `self.` arguments as they already are member variable
-                if self.args.plots:
-                    self.confusion_matrix.process_batch(predn, labelsn)
-            self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1)))  # (conf, pcls, tcls)
-
-            # Save
-            if self.args.save_json:
-                self.pred_to_json(predn, batch['im_file'][si])
-            if self.args.save_txt:
-                file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
-                self.save_one_txt(predn, self.args.save_conf, shape, file)
+    def _prepare_pred(self, pred, pbatch):
+        predn = pred.clone()
+        predn[..., [0, 2]] *= pbatch['ori_shape'][1] / self.args.imgsz  # native-space pred
+        predn[..., [1, 3]] *= pbatch['ori_shape'][0] / self.args.imgsz  # native-space pred
+        return predn.float()
--- a/ultralytics/models/yolo/init.py
+++ b/ultralytics/models/yolo/init.py
@ -1,7 +1,7 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license

-from ultralytics.models.yolo import classify, detect, pose, segment
+from ultralytics.models.yolo import classify, detect, obb, pose, segment

 from .model import YOLO

-__all__ = 'classify', 'segment', 'detect', 'pose', 'YOLO'
+__all__ = 'classify', 'segment', 'detect', 'pose', 'obb', 'YOLO'
--- a/ultralytics/models/yolo/detect/train.py
+++ b/ultralytics/models/yolo/detect/train.py
@ -1,8 +1,11 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license

+import math
+import random
 from copy import copy

 import numpy as np
+import torch.nn as nn

 from ultralytics.data import build_dataloader, build_yolo_dataset
 from ultralytics.engine.trainer import BaseTrainer
@ -54,6 +57,16 @@ class DetectionTrainer(BaseTrainer):
    def preprocess_batch(self, batch):
        """Preprocesses a batch of images by scaling and converting to float."""
        batch['img'] = batch['img'].to(self.device, non_blocking=True).float() / 255
+        if self.args.multi_scale:
+            imgs = batch['img']
+            sz = (random.randrange(self.args.imgsz * 0.5, self.args.imgsz * 1.5 + self.stride) // self.stride *
+                  self.stride)  # size
+            sf = sz / max(imgs.shape[2:])  # scale factor
+            if sf != 1:
+                ns = [math.ceil(x * sf / self.stride) * self.stride
+                      for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
+                imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
+            batch['img'] = imgs
        return batch

    def set_model_attributes(self):
--- a/ultralytics/models/yolo/detect/val.py
+++ b/ultralytics/models/yolo/detect/val.py
@ -70,7 +70,7 @@ class DetectionValidator(BaseValidator):
        self.confusion_matrix = ConfusionMatrix(nc=self.nc, conf=self.args.conf)
        self.seen = 0
        self.jdict = []
-        self.stats = []
+        self.stats = dict(tp=[], conf=[], pred_cls=[], target_cls=[])

    def get_desc(self):
        """Return a formatted string summarizing class metrics of YOLO model."""
@ -86,51 +86,68 @@ class DetectionValidator(BaseValidator):
                                       agnostic=self.args.single_cls,
                                       max_det=self.args.max_det)

+    def _prepare_batch(self, si, batch):
+        idx = batch['batch_idx'] == si
+        cls = batch['cls'][idx].squeeze(-1)
+        bbox = batch['bboxes'][idx]
+        ori_shape = batch['ori_shape'][si]
+        imgsz = batch['img'].shape[2:]
+        ratio_pad = batch['ratio_pad'][si]
+        if len(cls):
+            bbox = ops.xywh2xyxy(bbox) * torch.tensor(imgsz, device=self.device)[[1, 0, 1, 0]]  # target boxes
+            ops.scale_boxes(imgsz, bbox, ori_shape, ratio_pad=ratio_pad)  # native-space labels
+        prepared_batch = dict(cls=cls, bbox=bbox, ori_shape=ori_shape, imgsz=imgsz, ratio_pad=ratio_pad)
+        return prepared_batch
+
+    def _prepare_pred(self, pred, pbatch):
+        predn = pred.clone()
+        ops.scale_boxes(pbatch['imgsz'], predn[:, :4], pbatch['ori_shape'],
+                        ratio_pad=pbatch['ratio_pad'])  # native-space pred
+        return predn
+
    def update_metrics(self, preds, batch):
        """Metrics."""
        for si, pred in enumerate(preds):
-            idx = batch['batch_idx'] == si
-            cls = batch['cls'][idx]
-            bbox = batch['bboxes'][idx]
-            nl, npr = cls.shape[0], pred.shape[0]  # number of labels, predictions
-            shape = batch['ori_shape'][si]
-            correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
            self.seen += 1
-
+            npr = len(pred)
+            stat = dict(conf=torch.zeros(0, device=self.device),
+                        pred_cls=torch.zeros(0, device=self.device),
+                        tp=torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device))
+            pbatch = self._prepare_batch(si, batch)
+            cls, bbox = pbatch.pop('cls'), pbatch.pop('bbox')
+            nl = len(cls)
+            stat['target_cls'] = cls
            if npr == 0:
                if nl:
-                    self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1)))
-                    if self.args.plots:
-                        self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
+                    for k in self.stats.keys():
+                        self.stats[k].append(stat[k])
+                    # TODO: obb has not supported confusion_matrix yet.
+                    if self.args.plots and self.args.task != 'obb':
+                        self.confusion_matrix.process_batch(detections=None, gt_bboxes=bbox, gt_cls=cls)
                continue

            # Predictions
            if self.args.single_cls:
                pred[:, 5] = 0
-            predn = pred.clone()
-            ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape,
-                            ratio_pad=batch['ratio_pad'][si])  # native-space pred
+            predn = self._prepare_pred(pred, pbatch)
+            stat['conf'] = predn[:, 4]
+            stat['pred_cls'] = predn[:, 5]

            # Evaluate
            if nl:
-                height, width = batch['img'].shape[2:]
-                tbox = ops.xywh2xyxy(bbox) * torch.tensor(
-                    (width, height, width, height), device=self.device)  # target boxes
-                ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape,
-                                ratio_pad=batch['ratio_pad'][si])  # native-space labels
-                labelsn = torch.cat((cls, tbox), 1)  # native-space labels
-                correct_bboxes = self._process_batch(predn, labelsn)
-                # TODO: maybe remove these `self.` arguments as they already are member variable
-                if self.args.plots:
-                    self.confusion_matrix.process_batch(predn, labelsn)
-            self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1)))  # (conf, pcls, tcls)
+                stat['tp'] = self._process_batch(predn, bbox, cls)
+                # TODO: obb has not supported confusion_matrix yet.
+                if self.args.plots and self.args.task != 'obb':
+                    self.confusion_matrix.process_batch(predn, bbox, cls)
+            for k in self.stats.keys():
+                self.stats[k].append(stat[k])

            # Save
            if self.args.save_json:
                self.pred_to_json(predn, batch['im_file'][si])
            if self.args.save_txt:
                file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
-                self.save_one_txt(predn, self.args.save_conf, shape, file)
+                self.save_one_txt(predn, self.args.save_conf, pbatch['ori_shape'], file)

    def finalize_metrics(self, *args, **kwargs):
        """Set final values for metrics speed and confusion matrix."""
@ -139,10 +156,11 @@ class DetectionValidator(BaseValidator):

    def get_stats(self):
        """Returns metrics statistics and results dictionary."""
-        stats = [torch.cat(x, 0).cpu().numpy() for x in zip(*self.stats)]  # to numpy
-        if len(stats) and stats[0].any():
-            self.metrics.process(*stats)
-        self.nt_per_class = np.bincount(stats[-1].astype(int), minlength=self.nc)  # number of targets per class
+        stats = {k: torch.cat(v, 0).cpu().numpy() for k, v in self.stats.items()}  # to numpy
+        if len(stats) and stats['tp'].any():
+            self.metrics.process(**stats)
+        self.nt_per_class = np.bincount(stats['target_cls'].astype(int),
+                                        minlength=self.nc)  # number of targets per class
        return self.metrics.results_dict

    def print_results(self):
@ -165,7 +183,7 @@ class DetectionValidator(BaseValidator):
                                           normalize=normalize,
                                           on_plot=self.on_plot)

-    def _process_batch(self, detections, labels):
+    def _process_batch(self, detections, gt_bboxes, gt_cls):
        """
        Return correct prediction matrix.

@ -178,8 +196,8 @@ class DetectionValidator(BaseValidator):
        Returns:
            (torch.Tensor): Correct prediction matrix of shape [N, 10] for 10 IoU levels.
        """
-        iou = box_iou(labels[:, 1:], detections[:, :4])
-        return self.match_predictions(detections[:, 5], labels[:, 0], iou)
+        iou = box_iou(gt_bboxes, detections[:, :4])
+        return self.match_predictions(detections[:, 5], gt_cls, iou)

    def build_dataset(self, img_path, mode='val', batch=None):
        """
--- a/ultralytics/models/yolo/model.py
+++ b/ultralytics/models/yolo/model.py
@ -1,8 +1,8 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license

 from ultralytics.engine.model import Model
-from ultralytics.models import yolo  # noqa
-from ultralytics.nn.tasks import ClassificationModel, DetectionModel, PoseModel, SegmentationModel
+from ultralytics.models import yolo
+from ultralytics.nn.tasks import ClassificationModel, DetectionModel, OBBModel, PoseModel, SegmentationModel


 class YOLO(Model):
@ -31,4 +31,9 @@ class YOLO(Model):
                'model': PoseModel,
                'trainer': yolo.pose.PoseTrainer,
                'validator': yolo.pose.PoseValidator,
-                'predictor': yolo.pose.PosePredictor, }, }
+                'predictor': yolo.pose.PosePredictor, },
+            'obb': {
+                'model': OBBModel,
+                'trainer': yolo.obb.OBBTrainer,
+                'validator': yolo.obb.OBBValidator,
+                'predictor': yolo.obb.OBBPredictor, }, }
--- a/ultralytics/models/yolo/obb/init.py
+++ b/ultralytics/models/yolo/obb/init.py
@ -0,0 +1,7 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+from .predict import OBBPredictor
+from .train import OBBTrainer
+from .val import OBBValidator
+
+__all__ = 'OBBPredictor', 'OBBTrainer', 'OBBValidator'
--- a/ultralytics/models/yolo/obb/predict.py
+++ b/ultralytics/models/yolo/obb/predict.py
@ -0,0 +1,51 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import torch
+
+from ultralytics.engine.results import Results
+from ultralytics.models.yolo.detect.predict import DetectionPredictor
+from ultralytics.utils import DEFAULT_CFG, ops
+
+
+class OBBPredictor(DetectionPredictor):
+    """
+    A class extending the DetectionPredictor class for prediction based on an Oriented Bounding Box (OBB) model.
+
+    Example:
+        ```python
+        from ultralytics.utils import ASSETS
+        from ultralytics.models.yolo.obb import OBBPredictor
+
+        args = dict(model='yolov8n-obb.pt', source=ASSETS)
+        predictor = OBBPredictor(overrides=args)
+        predictor.predict_cli()
+        ```
+    """
+
+    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
+        super().__init__(cfg, overrides, _callbacks)
+        self.args.task = 'obb'
+
+    def postprocess(self, preds, img, orig_imgs):
+        """Post-processes predictions and returns a list of Results objects."""
+        preds = ops.non_max_suppression(preds,
+                                        self.args.conf,
+                                        self.args.iou,
+                                        agnostic=self.args.agnostic_nms,
+                                        max_det=self.args.max_det,
+                                        nc=len(self.model.names),
+                                        classes=self.args.classes,
+                                        rotated=True)
+
+        if not isinstance(orig_imgs, list):  # input images are a torch.Tensor, not a list
+            orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
+
+        results = []
+        for i, pred in enumerate(preds):
+            orig_img = orig_imgs[i]
+            pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape, xywh=True)
+            img_path = self.batch[0][i]
+            # xywh, r, conf, cls
+            obb = torch.cat([pred[:, :4], pred[:, -1:], pred[:, 4:6]], dim=-1)
+            results.append(Results(orig_img, path=img_path, names=self.model.names, obb=obb))
+        return results
--- a/ultralytics/models/yolo/obb/train.py
+++ b/ultralytics/models/yolo/obb/train.py
@ -0,0 +1,42 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+from copy import copy
+
+from ultralytics.models import yolo
+from ultralytics.nn.tasks import OBBModel
+from ultralytics.utils import DEFAULT_CFG, RANK
+
+
+class OBBTrainer(yolo.detect.DetectionTrainer):
+    """
+    A class extending the DetectionTrainer class for training based on an Oriented Bounding Box (OBB) model.
+
+    Example:
+        ```python
+        from ultralytics.models.yolo.obb import OBBTrainer
+
+        args = dict(model='yolov8n-obb.pt', data='coco8-seg.yaml', epochs=3)
+        trainer = OBBTrainer(overrides=args)
+        trainer.train()
+        ```
+    """
+
+    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
+        """Initialize an OBBTrainer object with given arguments."""
+        if overrides is None:
+            overrides = {}
+        overrides['task'] = 'obb'
+        super().__init__(cfg, overrides, _callbacks)
+
+    def get_model(self, cfg=None, weights=None, verbose=True):
+        """Return OBBModel initialized with specified config and weights."""
+        model = OBBModel(cfg, ch=3, nc=self.data['nc'], verbose=verbose and RANK == -1)
+        if weights:
+            model.load(weights)
+
+        return model
+
+    def get_validator(self):
+        """Return an instance of OBBValidator for validation of YOLO model."""
+        self.loss_names = 'box_loss', 'cls_loss', 'dfl_loss'
+        return yolo.obb.OBBValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
--- a/ultralytics/models/yolo/obb/val.py
+++ b/ultralytics/models/yolo/obb/val.py
@ -0,0 +1,187 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+from pathlib import Path
+
+import torch
+
+from ultralytics.models.yolo.detect import DetectionValidator
+from ultralytics.utils import LOGGER, ops
+from ultralytics.utils.metrics import OBBMetrics, batch_probiou
+from ultralytics.utils.plotting import output_to_rotated_target, plot_images
+
+
+class OBBValidator(DetectionValidator):
+    """
+    A class extending the DetectionValidator class for validation based on an Oriented Bounding Box (OBB) model.
+
+    Example:
+        ```python
+        from ultralytics.models.yolo.obb import OBBValidator
+
+        args = dict(model='yolov8n-obb.pt', data='coco8-seg.yaml')
+        validator = OBBValidator(args=args)
+        validator(model=args['model'])
+        ```
+    """
+
+    def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
+        """Initialize OBBValidator and set task to 'obb', metrics to OBBMetrics."""
+        super().__init__(dataloader, save_dir, pbar, args, _callbacks)
+        self.args.task = 'obb'
+        self.metrics = OBBMetrics(save_dir=self.save_dir, plot=True, on_plot=self.on_plot)
+
+    def init_metrics(self, model):
+        """Initialize evaluation metrics for YOLO."""
+        super().init_metrics(model)
+        val = self.data.get(self.args.split, '')  # validation path
+        self.is_dota = isinstance(val, str) and 'DOTA' in val  # is DOTA
+
+    def postprocess(self, preds):
+        """Apply Non-maximum suppression to prediction outputs."""
+        return ops.non_max_suppression(preds,
+                                       self.args.conf,
+                                       self.args.iou,
+                                       labels=self.lb,
+                                       nc=self.nc,
+                                       multi_label=True,
+                                       agnostic=self.args.single_cls,
+                                       max_det=self.args.max_det,
+                                       rotated=True)
+
+    def _process_batch(self, detections, gt_bboxes, gt_cls):
+        """
+        Return correct prediction matrix for rotated boxes.
+
+        Args:
+            detections (torch.Tensor): Detections, one row per prediction. Columns 0-3 hold the box, column 5 the
+                class and the last column the rotation angle (the same layout `pred_to_json` reads).
+            gt_bboxes (torch.Tensor): Ground-truth rotated boxes, passed unchanged to `batch_probiou`
+                (presumably xywhr, one row per label - confirm against the dataset format).
+            gt_cls (torch.Tensor): Ground-truth class of each label.
+
+        Returns:
+            (torch.Tensor): Correct prediction matrix of shape [N, 10] for 10 IoU levels.
+        """
+        # The angle is the LAST column. With the [box, conf, cls, angle] layout used elsewhere in this task,
+        # `-2:-1` would pick the class column and feed class ids to probiou as angles.
+        iou = batch_probiou(gt_bboxes, torch.cat([detections[:, :4], detections[:, -1:]], dim=-1))
+        return self.match_predictions(detections[:, 5], gt_cls, iou)
+
+    def _prepare_batch(self, si, batch):
+        idx = batch['batch_idx'] == si
+        cls = batch['cls'][idx].squeeze(-1)
+        bbox = batch['bboxes'][idx]
+        ori_shape = batch['ori_shape'][si]
+        imgsz = batch['img'].shape[2:]
+        ratio_pad = batch['ratio_pad'][si]
+        if len(cls):
+            bbox[..., :4].mul_(torch.tensor(imgsz, device=self.device)[[1, 0, 1, 0]])  # target boxes
+            ops.scale_boxes(imgsz, bbox, ori_shape, ratio_pad=ratio_pad, xywh=True)  # native-space labels
+        prepared_batch = dict(cls=cls, bbox=bbox, ori_shape=ori_shape, imgsz=imgsz, ratio_pad=ratio_pad)
+        return prepared_batch
+
+    def _prepare_pred(self, pred, pbatch):
+        predn = pred.clone()
+        ops.scale_boxes(pbatch['imgsz'], predn[:, :4], pbatch['ori_shape'], ratio_pad=pbatch['ratio_pad'],
+                        xywh=True)  # native-space pred
+        return predn
+
+    def plot_predictions(self, batch, preds, ni):
+        """Plots predicted bounding boxes on input images and saves the result."""
+        plot_images(batch['img'],
+                    *output_to_rotated_target(preds, max_det=self.args.max_det),
+                    paths=batch['im_file'],
+                    fname=self.save_dir / f'val_batch{ni}_pred.jpg',
+                    names=self.names,
+                    on_plot=self.on_plot)  # pred
+
+    def pred_to_json(self, predn, filename):
+        """Serialize YOLO predictions to COCO json format."""
+        stem = Path(filename).stem
+        image_id = int(stem) if stem.isnumeric() else stem
+        rbox = torch.cat([predn[:, :4], predn[:, -1:]], dim=-1)
+        poly = ops.xywhr2xyxyxyxy(rbox).view(-1, 8)
+        for i, (r, b) in enumerate(zip(rbox.tolist(), poly.tolist())):
+            self.jdict.append({
+                'image_id': image_id,
+                'category_id': self.class_map[int(predn[i, 5].item())],
+                'score': round(predn[i, 4].item(), 5),
+                'rbox': [round(x, 3) for x in r],
+                'poly': [round(x, 3) for x in b]})
+
+    def eval_json(self, stats):
+        """
+        Export the saved JSON predictions as DOTA-format text files and return `stats` unchanged.
+
+        Runs only when `save_json` is set, the validation split was detected as DOTA and `self.jdict` is not empty.
+        Reads `predictions.json` from `self.save_dir`, writes the split results to `predictions_txt` and the
+        results merged per source image (de-duplicated with rotated NMS) to `predictions_merged_txt`.
+
+        Args:
+            stats: Validation statistics; passed through untouched.
+
+        Returns:
+            The same `stats` object that was passed in.
+        """
+        if self.args.save_json and self.is_dota and len(self.jdict):
+            import json
+            import re
+            from collections import defaultdict
+            pred_json = self.save_dir / 'predictions.json'  # predictions
+            pred_txt = self.save_dir / 'predictions_txt'  # predictions
+            pred_txt.mkdir(parents=True, exist_ok=True)
+            with open(pred_json) as f:  # close the handle deterministically instead of leaking it
+                data = json.load(f)
+            # Save split results
+            LOGGER.info(f'Saving predictions with DOTA format to {str(pred_txt)}...')
+            for d in data:
+                image_id = d['image_id']
+                score = d['score']
+                classname = self.names[d['category_id']].replace(' ', '-')
+
+                # One line per detection: image id, score, then the 8 polygon coordinates
+                line = '{} {} {} {} {} {} {} {} {} {}\n'.format(image_id, score, *d['poly'][:8])
+                with open(str(pred_txt / f'Task1_{classname}') + '.txt', 'a') as f:
+                    f.write(line)
+            # Save merged results. This may give a slightly lower mAP than the official merging script
+            # because of the probiou calculation.
+            pred_merged_txt = self.save_dir / 'predictions_merged_txt'  # predictions
+            pred_merged_txt.mkdir(parents=True, exist_ok=True)
+            merged_results = defaultdict(list)
+            LOGGER.info(f'Saving merged predictions with DOTA format to {str(pred_merged_txt)}...')
+            pattern = re.compile(r'\d+___\d+')  # loop-invariant, so compile once
+            for d in data:
+                image_id = d['image_id'].split('__')[0]
+                # The '<x>___<y>' part of the image id gives the offsets added to the box centre below
+                x, y = (int(c) for c in pattern.findall(d['image_id'])[0].split('___'))
+                bbox, score, cls = d['rbox'], d['score'], d['category_id']
+                bbox[0] += x
+                bbox[1] += y
+                bbox.extend([score, cls])
+                merged_results[image_id].append(bbox)
+            for image_id, bbox in merged_results.items():
+                bbox = torch.tensor(bbox)
+                max_wh = torch.max(bbox[:, :2]).item() * 2
+                c = bbox[:, 6:7] * max_wh  # classes
+                scores = bbox[:, 5]  # scores
+                b = bbox[:, :5].clone()
+                b[:, :2] += c  # class-dependent centre shift so NMS only compares boxes of the same class
+                # 0.3 gives results close to the official merging script, even slightly better.
+                keep = ops.nms_rotated(b, scores, 0.3)
+                bbox = bbox[keep]
+
+                b = ops.xywhr2xyxyxyxy(bbox[:, :5]).view(-1, 8)
+                for x in torch.cat([b, bbox[:, 5:7]], dim=-1).tolist():
+                    classname = self.names[int(x[-1])].replace(' ', '-')
+                    poly = [round(v, 3) for v in x[:-2]]
+                    score = round(x[-2], 3)
+
+                    line = '{} {} {} {} {} {} {} {} {} {}\n'.format(image_id, score, *poly)
+                    with open(str(pred_merged_txt / f'Task1_{classname}') + '.txt', 'a') as f:
+                        f.write(line)
+
+        return stats
--- a/ultralytics/models/yolo/pose/val.py
+++ b/ultralytics/models/yolo/pose/val.py
@ -66,57 +66,63 @@ class PoseValidator(DetectionValidator):
        is_pose = self.kpt_shape == [17, 3]
        nkpt = self.kpt_shape[0]
        self.sigma = OKS_SIGMA if is_pose else np.ones(nkpt) / nkpt
+        self.stats = dict(tp_p=[], tp=[], conf=[], pred_cls=[], target_cls=[])
+
+    def _prepare_batch(self, si, batch):
+        pbatch = super()._prepare_batch(si, batch)
+        kpts = batch['keypoints'][batch['batch_idx'] == si]
+        h, w = pbatch['imgsz']
+        kpts = kpts.clone()
+        kpts[..., 0] *= w
+        kpts[..., 1] *= h
+        kpts = ops.scale_coords(pbatch['imgsz'], kpts, pbatch['ori_shape'], ratio_pad=pbatch['ratio_pad'])
+        pbatch['kpts'] = kpts
+        return pbatch
+
+    def _prepare_pred(self, pred, pbatch):
+        predn = super()._prepare_pred(pred, pbatch)
+        nk = pbatch['kpts'].shape[1]
+        pred_kpts = predn[:, 6:].view(len(predn), nk, -1)
+        ops.scale_coords(pbatch['imgsz'], pred_kpts, pbatch['ori_shape'], ratio_pad=pbatch['ratio_pad'])
+        return predn, pred_kpts

    def update_metrics(self, preds, batch):
        """Metrics."""
        for si, pred in enumerate(preds):
-            idx = batch['batch_idx'] == si
-            cls = batch['cls'][idx]
-            bbox = batch['bboxes'][idx]
-            kpts = batch['keypoints'][idx]
-            nl, npr = cls.shape[0], pred.shape[0]  # number of labels, predictions
-            nk = kpts.shape[1]  # number of keypoints
-            shape = batch['ori_shape'][si]
-            correct_kpts = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
-            correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
            self.seen += 1
-
+            npr = len(pred)
+            stat = dict(conf=torch.zeros(0, device=self.device),
+                        pred_cls=torch.zeros(0, device=self.device),
+                        tp=torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device),
+                        tp_p=torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device))
+            pbatch = self._prepare_batch(si, batch)
+            cls, bbox = pbatch.pop('cls'), pbatch.pop('bbox')
+            nl = len(cls)
+            stat['target_cls'] = cls
            if npr == 0:
                if nl:
-                    self.stats.append((correct_bboxes, correct_kpts, *torch.zeros(
-                        (2, 0), device=self.device), cls.squeeze(-1)))
+                    for k in self.stats.keys():
+                        self.stats[k].append(stat[k])
                    if self.args.plots:
-                        self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
+                        self.confusion_matrix.process_batch(detections=None, gt_bboxes=bbox, gt_cls=cls)
                continue

            # Predictions
            if self.args.single_cls:
                pred[:, 5] = 0
-            predn = pred.clone()
-            ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape,
-                            ratio_pad=batch['ratio_pad'][si])  # native-space pred
-            pred_kpts = predn[:, 6:].view(npr, nk, -1)
-            ops.scale_coords(batch['img'][si].shape[1:], pred_kpts, shape, ratio_pad=batch['ratio_pad'][si])
+            predn, pred_kpts = self._prepare_pred(pred, pbatch)
+            stat['conf'] = predn[:, 4]
+            stat['pred_cls'] = predn[:, 5]

            # Evaluate
            if nl:
-                height, width = batch['img'].shape[2:]
-                tbox = ops.xywh2xyxy(bbox) * torch.tensor(
-                    (width, height, width, height), device=self.device)  # target boxes
-                ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape,
-                                ratio_pad=batch['ratio_pad'][si])  # native-space labels
-                tkpts = kpts.clone()
-                tkpts[..., 0] *= width
-                tkpts[..., 1] *= height
-                tkpts = ops.scale_coords(batch['img'][si].shape[1:], tkpts, shape, ratio_pad=batch['ratio_pad'][si])
-                labelsn = torch.cat((cls, tbox), 1)  # native-space labels
-                correct_bboxes = self._process_batch(predn[:, :6], labelsn)
-                correct_kpts = self._process_batch(predn[:, :6], labelsn, pred_kpts, tkpts)
+                stat['tp'] = self._process_batch(predn, bbox, cls)
+                stat['tp_p'] = self._process_batch(predn, bbox, cls, pred_kpts, pbatch['kpts'])
                if self.args.plots:
-                    self.confusion_matrix.process_batch(predn, labelsn)
+                    self.confusion_matrix.process_batch(predn, bbox, cls)

-            # Append correct_masks, correct_boxes, pconf, pcls, tcls
-            self.stats.append((correct_bboxes, correct_kpts, pred[:, 4], pred[:, 5], cls.squeeze(-1)))
+            for k in self.stats.keys():
+                self.stats[k].append(stat[k])

            # Save
            if self.args.save_json:
@ -124,7 +130,7 @@ class PoseValidator(DetectionValidator):
            # if self.args.save_txt:
            #    save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / f'{path.stem}.txt')

-    def _process_batch(self, detections, labels, pred_kpts=None, gt_kpts=None):
+    def _process_batch(self, detections, gt_bboxes, gt_cls, pred_kpts=None, gt_kpts=None):
        """
        Return correct prediction matrix.

@ -142,12 +148,12 @@ class PoseValidator(DetectionValidator):
        """
        if pred_kpts is not None and gt_kpts is not None:
            # `0.53` is from https://github.com/jin-s13/xtcocoapi/blob/master/xtcocotools/cocoeval.py#L384
-            area = ops.xyxy2xywh(labels[:, 1:])[:, 2:].prod(1) * 0.53
+            area = ops.xyxy2xywh(gt_bboxes)[:, 2:].prod(1) * 0.53
            iou = kpt_iou(gt_kpts, pred_kpts, sigma=self.sigma, area=area)
        else:  # boxes
-            iou = box_iou(labels[:, 1:], detections[:, :4])
+            iou = box_iou(gt_bboxes, detections[:, :4])

-        return self.match_predictions(detections[:, 5], labels[:, 0], iou)
+        return self.match_predictions(detections[:, 5], gt_cls, iou)

    def plot_val_samples(self, batch, ni):
        """Plots and saves validation set samples with predicted bounding boxes and keypoints."""
--- a/ultralytics/models/yolo/segment/val.py
+++ b/ultralytics/models/yolo/segment/val.py
@ -51,6 +51,7 @@ class SegmentationValidator(DetectionValidator):
            self.process = ops.process_mask_upsample  # more accurate
        else:
            self.process = ops.process_mask  # faster
+        self.stats = dict(tp_m=[], tp=[], conf=[], pred_cls=[], target_cls=[])

    def get_desc(self):
        """Return a formatted description of evaluation metrics."""
@ -70,59 +71,62 @@ class SegmentationValidator(DetectionValidator):
        proto = preds[1][-1] if len(preds[1]) == 3 else preds[1]  # second output is len 3 if pt, but only 1 if exported
        return p, proto

+    def _prepare_batch(self, si, batch):  # extend the parent's prepared batch for image si with its ground-truth masks
+        prepared_batch = super()._prepare_batch(si, batch)
+        midx = [si] if self.args.overlap_mask else batch['batch_idx'] == si
+        prepared_batch['masks'] = batch['masks'][midx]  # midx: [si] under overlap_mask, else boolean batch_idx == si
+        return prepared_batch
+
+    def _prepare_pred(self, pred, pbatch, proto):  # returns (predn from the parent class, pred_masks from self.process)
+        predn = super()._prepare_pred(pred, pbatch)
+        pred_masks = self.process(proto, pred[:, 6:], pred[:, :4], shape=pbatch['imgsz'])  # built from raw pred, not predn
+        return predn, pred_masks
+
    def update_metrics(self, preds, batch):
        """Metrics."""
        for si, (pred, proto) in enumerate(zip(preds[0], preds[1])):
-            idx = batch['batch_idx'] == si
-            cls = batch['cls'][idx]
-            bbox = batch['bboxes'][idx]
-            nl, npr = cls.shape[0], pred.shape[0]  # number of labels, predictions
-            shape = batch['ori_shape'][si]
-            correct_masks = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
-            correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
            self.seen += 1
-
+            npr = len(pred)
+            stat = dict(conf=torch.zeros(0, device=self.device),
+                        pred_cls=torch.zeros(0, device=self.device),
+                        tp=torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device),
+                        tp_m=torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device))
+            pbatch = self._prepare_batch(si, batch)
+            cls, bbox = pbatch.pop('cls'), pbatch.pop('bbox')
+            nl = len(cls)
+            stat['target_cls'] = cls
            if npr == 0:
                if nl:
-                    self.stats.append((correct_bboxes, correct_masks, *torch.zeros(
-                        (2, 0), device=self.device), cls.squeeze(-1)))
+                    for k in self.stats.keys():
+                        self.stats[k].append(stat[k])
                    if self.args.plots:
-                        self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
+                        self.confusion_matrix.process_batch(detections=None, gt_bboxes=bbox, gt_cls=cls)
                continue

            # Masks
-            midx = [si] if self.args.overlap_mask else idx
-            gt_masks = batch['masks'][midx]
-            pred_masks = self.process(proto, pred[:, 6:], pred[:, :4], shape=batch['img'][si].shape[1:])
-
+            gt_masks = pbatch.pop('masks')
            # Predictions
            if self.args.single_cls:
                pred[:, 5] = 0
-            predn = pred.clone()
-            ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape,
-                            ratio_pad=batch['ratio_pad'][si])  # native-space pred
+            predn, pred_masks = self._prepare_pred(pred, pbatch, proto)
+            stat['conf'] = predn[:, 4]
+            stat['pred_cls'] = predn[:, 5]

            # Evaluate
            if nl:
-                height, width = batch['img'].shape[2:]
-                tbox = ops.xywh2xyxy(bbox) * torch.tensor(
-                    (width, height, width, height), device=self.device)  # target boxes
-                ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape,
-                                ratio_pad=batch['ratio_pad'][si])  # native-space labels
-                labelsn = torch.cat((cls, tbox), 1)  # native-space labels
-                correct_bboxes = self._process_batch(predn, labelsn)
-                # TODO: maybe remove these `self.` arguments as they already are member variable
-                correct_masks = self._process_batch(predn,
-                                                    labelsn,
-                                                    pred_masks,
-                                                    gt_masks,
-                                                    overlap=self.args.overlap_mask,
-                                                    masks=True)
+                stat['tp'] = self._process_batch(predn, bbox, cls)
+                stat['tp_m'] = self._process_batch(predn,
+                                                   bbox,
+                                                   cls,
+                                                   pred_masks,
+                                                   gt_masks,
+                                                   self.args.overlap_mask,
+                                                   masks=True)
                if self.args.plots:
-                    self.confusion_matrix.process_batch(predn, labelsn)
+                    self.confusion_matrix.process_batch(predn, bbox, cls)

-            # Append correct_masks, correct_boxes, pconf, pcls, tcls
-            self.stats.append((correct_bboxes, correct_masks, pred[:, 4], pred[:, 5], cls.squeeze(-1)))
+            for k in self.stats.keys():
+                self.stats[k].append(stat[k])

            pred_masks = torch.as_tensor(pred_masks, dtype=torch.uint8)
            if self.args.plots and self.batch_i < 3:
@ -131,7 +135,7 @@ class SegmentationValidator(DetectionValidator):
            # Save
            if self.args.save_json:
                pred_masks = ops.scale_image(pred_masks.permute(1, 2, 0).contiguous().cpu().numpy(),
-                                             shape,
+                                             pbatch['ori_shape'],
                                             ratio_pad=batch['ratio_pad'][si])
                self.pred_to_json(predn, batch['im_file'][si], pred_masks)
            # if self.args.save_txt:
@ -142,7 +146,7 @@ class SegmentationValidator(DetectionValidator):
        self.metrics.speed = self.speed
        self.metrics.confusion_matrix = self.confusion_matrix

-    def _process_batch(self, detections, labels, pred_masks=None, gt_masks=None, overlap=False, masks=False):
+    def _process_batch(self, detections, gt_bboxes, gt_cls, pred_masks=None, gt_masks=None, overlap=False, masks=False):
        """
        Return correct prediction matrix.

@ -155,7 +159,7 @@ class SegmentationValidator(DetectionValidator):
        """
        if masks:
            if overlap:
-                nl = len(labels)
+                nl = len(gt_cls)
                index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1
                gt_masks = gt_masks.repeat(nl, 1, 1)  # shape(1,640,640) -> (n,640,640)
                gt_masks = torch.where(gt_masks == index, 1.0, 0.0)
@ -164,9 +168,9 @@ class SegmentationValidator(DetectionValidator):
                gt_masks = gt_masks.gt_(0.5)
            iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1), pred_masks.view(pred_masks.shape[0], -1))
        else:  # boxes
-            iou = box_iou(labels[:, 1:], detections[:, :4])
+            iou = box_iou(gt_bboxes, detections[:, :4])

-        return self.match_predictions(detections[:, 5], labels[:, 0], iou)
+        return self.match_predictions(detections[:, 5], gt_cls, iou)

    def plot_val_samples(self, batch, ni):
        """Plots validation samples with bounding box labels."""
@ -174,7 +178,7 @@ class SegmentationValidator(DetectionValidator):
                    batch['batch_idx'],
                    batch['cls'].squeeze(-1),
                    batch['bboxes'],
-                    batch['masks'],
+                    masks=batch['masks'],
                    paths=batch['im_file'],
                    fname=self.save_dir / f'val_batch{ni}_labels.jpg',
                    names=self.names,