ultralytics 8.0.235 YOLOv8 OBB train, val, predict and export (#4499)

Co-authored-by: Yash Khurana <ykhurana6@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Swamita Gupta <swamita2001@gmail.com> Co-authored-by: Ayush Chaurasia <ayush.chaurarsia@gmail.com> Co-authored-by: Laughing-q <1185102784@qq.com> Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com> Co-authored-by: Laughing-q <1182102784@qq.com>
2024-01-05 03:00:26 +01:00 · 2024-01-05 03:00:26 +01:00 · 072291bc78
commit 072291bc78
parent f702b34a50
52 changed files with 2090 additions and 524 deletions
--- a/ultralytics/data/augment.py
+++ b/ultralytics/data/augment.py
@ -13,7 +13,7 @@ from ultralytics.utils import LOGGER, colorstr
 from ultralytics.utils.checks import check_version
 from ultralytics.utils.instance import Instances
 from ultralytics.utils.metrics import bbox_ioa
-from ultralytics.utils.ops import segment2box
+from ultralytics.utils.ops import segment2box, xyxyxyxy2xywhr
 from ultralytics.utils.torch_utils import TORCHVISION_0_10, TORCHVISION_0_11, TORCHVISION_0_13

 from .utils import polygons2masks, polygons2masks_overlap
@ -485,6 +485,8 @@ class RandomPerspective:
        xy = xy[:, :2] / xy[:, 2:3]
        segments = xy.reshape(n, -1, 2)
        bboxes = np.stack([segment2box(xy, self.size[0], self.size[1]) for xy in segments], 0)
+        segments[..., 0] = segments[..., 0].clip(bboxes[:, 0:1], bboxes[:, 2:3])
+        segments[..., 1] = segments[..., 1].clip(bboxes[:, 1:2], bboxes[:, 3:4])
        return bboxes, segments

    def apply_keypoints(self, keypoints, M):
@ -891,6 +893,7 @@ class Format:
                 normalize=True,
                 return_mask=False,
                 return_keypoint=False,
+                 return_obb=False,
                 mask_ratio=4,
                 mask_overlap=True,
                 batch_idx=True):
@ -899,6 +902,7 @@ class Format:
        self.normalize = normalize
        self.return_mask = return_mask  # set False when training detection only
        self.return_keypoint = return_keypoint
+        self.return_obb = return_obb
        self.mask_ratio = mask_ratio
        self.mask_overlap = mask_overlap
        self.batch_idx = batch_idx  # keep the batch indexes
@ -928,6 +932,9 @@ class Format:
        labels['bboxes'] = torch.from_numpy(instances.bboxes) if nl else torch.zeros((nl, 4))
        if self.return_keypoint:
            labels['keypoints'] = torch.from_numpy(instances.keypoints)
+        if self.return_obb:
+            labels['bboxes'] = xyxyxyxy2xywhr(torch.from_numpy(instances.segments)) if len(
+                instances.segments) else torch.zeros((0, 5))
        # Then we can use collate_fn
        if self.batch_idx:
            labels['batch_idx'] = torch.zeros(nl)
--- a/ultralytics/data/build.py
+++ b/ultralytics/data/build.py
@ -89,8 +89,7 @@ def build_yolo_dataset(cfg, img_path, batch, data, mode='train', rect=False, str
        stride=int(stride),
        pad=0.0 if mode == 'train' else 0.5,
        prefix=colorstr(f'{mode}: '),
-        use_segments=cfg.task == 'segment',
-        use_keypoints=cfg.task == 'pose',
+        task=cfg.task,
        classes=cfg.classes,
        data=data,
        fraction=cfg.fraction if mode == 'train' else 1.0)
--- a/ultralytics/data/dataset.py
+++ b/ultralytics/data/dataset.py
@ -11,6 +11,7 @@ import torchvision
 from PIL import Image

 from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM, colorstr, is_dir_writeable
+from ultralytics.utils.ops import resample_segments

 from .augment import Compose, Format, Instances, LetterBox, classify_augmentations, classify_transforms, v8_transforms
 from .base import BaseDataset
@ -26,17 +27,17 @@ class YOLODataset(BaseDataset):

    Args:
        data (dict, optional): A dataset YAML dictionary. Defaults to None.
-        use_segments (bool, optional): If True, segmentation masks are used as labels. Defaults to False.
-        use_keypoints (bool, optional): If True, keypoints are used as labels. Defaults to False.
+        task (str): An explicit arg to point current task, Defaults to 'detect'.

    Returns:
        (torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
    """

-    def __init__(self, *args, data=None, use_segments=False, use_keypoints=False, **kwargs):
+    def __init__(self, *args, data=None, task='detect', **kwargs):
        """Initializes the YOLODataset with optional configurations for segments and keypoints."""
-        self.use_segments = use_segments
-        self.use_keypoints = use_keypoints
+        self.use_segments = task == 'segment'
+        self.use_keypoints = task == 'pose'
+        self.use_obb = task == 'obb'
        self.data = data
        assert not (self.use_segments and self.use_keypoints), 'Can not use both segments and keypoints.'
        super().__init__(*args, **kwargs)
@ -148,6 +149,7 @@ class YOLODataset(BaseDataset):
                   normalize=True,
                   return_mask=self.use_segments,
                   return_keypoint=self.use_keypoints,
+                   return_obb=self.use_obb,
                   batch_idx=True,
                   mask_ratio=hyp.mask_ratio,
                   mask_overlap=hyp.overlap_mask))
@ -165,10 +167,19 @@ class YOLODataset(BaseDataset):
        # NOTE: cls is not with bboxes now, classification and semantic segmentation need an independent cls label
        # We can make it also support classification and semantic segmentation by add or remove some dict keys there.
        bboxes = label.pop('bboxes')
-        segments = label.pop('segments')
+        segments = label.pop('segments', [])
        keypoints = label.pop('keypoints', None)
        bbox_format = label.pop('bbox_format')
        normalized = label.pop('normalized')
+
+        # NOTE: do NOT resample oriented boxes
+        segment_resamples = 100 if self.use_obb else 1000
+        if len(segments) > 0:
+            # list[np.array(1000, 2)] * num_samples
+            # (N, 1000, 2)
+            segments = np.stack(resample_segments(segments, n=segment_resamples), axis=0)
+        else:
+            segments = np.zeros((0, segment_resamples, 2), dtype=np.float32)
        label['instances'] = Instances(bboxes, segments, keypoints, bbox_format=bbox_format, normalized=normalized)
        return label

@ -182,7 +193,7 @@ class YOLODataset(BaseDataset):
            value = values[i]
            if k == 'img':
                value = torch.stack(value, 0)
-            if k in ['masks', 'keypoints', 'bboxes', 'cls']:
+            if k in ['masks', 'keypoints', 'bboxes', 'cls', 'segments', 'obb']:
                value = torch.cat(value, 0)
            new_batch[k] = value
        new_batch['batch_idx'] = list(new_batch['batch_idx'])
--- a/ultralytics/data/split_dota.py
+++ b/ultralytics/data/split_dota.py
@ -0,0 +1,288 @@
+import itertools
+import os
+from glob import glob
+from math import ceil
+from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+
+from ultralytics.data.utils import exif_size, img2label_paths
+from ultralytics.utils.checks import check_requirements
+
+check_requirements('shapely')
+from shapely.geometry import Polygon
+
+
+def bbox_iof(polygon1, bbox2, eps=1e-6):
+    """
+    Calculate iofs between bbox1 and bbox2.
+
+    Args:
+        polygon1 (np.ndarray): Polygon coordinates, (n, 8).
+        bbox2 (np.ndarray): Bounding boxes, (n ,4).
+    """
+    polygon1 = polygon1.reshape(-1, 4, 2)
+    lt_point = np.min(polygon1, axis=-2)
+    rb_point = np.max(polygon1, axis=-2)
+    bbox1 = np.concatenate([lt_point, rb_point], axis=-1)
+
+    lt = np.maximum(bbox1[:, None, :2], bbox2[..., :2])
+    rb = np.minimum(bbox1[:, None, 2:], bbox2[..., 2:])
+    wh = np.clip(rb - lt, 0, np.inf)
+    h_overlaps = wh[..., 0] * wh[..., 1]
+
+    l, t, r, b = (bbox2[..., i] for i in range(4))
+    polygon2 = np.stack([l, t, r, t, r, b, l, b], axis=-1).reshape(-1, 4, 2)
+
+    sg_polys1 = [Polygon(p) for p in polygon1]
+    sg_polys2 = [Polygon(p) for p in polygon2]
+    overlaps = np.zeros(h_overlaps.shape)
+    for p in zip(*np.nonzero(h_overlaps)):
+        overlaps[p] = sg_polys1[p[0]].intersection(sg_polys2[p[-1]]).area
+    unions = np.array([p.area for p in sg_polys1], dtype=np.float32)
+    unions = unions[..., None]
+
+    unions = np.clip(unions, eps, np.inf)
+    outputs = overlaps / unions
+    if outputs.ndim == 1:
+        outputs = outputs[..., None]
+    return outputs
+
+
+def load_yolo_dota(data_root, split='train'):
+    """Load DOTA dataset.
+    Args:
+        data_root (str): Data root.
+        split (str): The split data set, could be train or val.
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+    """
+    assert split in ['train', 'val']
+    im_dir = os.path.join(data_root, f'images/{split}')
+    assert Path(im_dir).exists(), f"Can't find {im_dir}, please check your data root."
+    im_files = glob(os.path.join(data_root, f'images/{split}/*'))
+    lb_files = img2label_paths(im_files)
+    annos = []
+    for im_file, lb_file in zip(im_files, lb_files):
+        w, h = exif_size(Image.open(im_file))
+        with open(lb_file) as f:
+            lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
+            lb = np.array(lb, dtype=np.float32)
+        annos.append(dict(ori_size=(h, w), label=lb, filepath=im_file))
+    return annos
+
+
+def get_windows(im_size, crop_sizes=[1024], gaps=[200], im_rate_thr=0.6, eps=0.01):
+    """
+    Get the coordinates of windows.
+
+    Args:
+        im_size (tuple): Original image size, (h, w).
+        crop_sizes (List(int)): Crop size of windows.
+        gaps (List(int)): Gap between each crops.
+        im_rate_thr (float): Threshold of windows areas divided by image ares.
+    """
+    h, w = im_size
+    windows = []
+    for crop_size, gap in zip(crop_sizes, gaps):
+        assert crop_size > gap, f'invaild crop_size gap pair [{crop_size} {gap}]'
+        step = crop_size - gap
+
+        xn = 1 if w <= crop_size else ceil((w - crop_size) / step + 1)
+        xs = [step * i for i in range(xn)]
+        if len(xs) > 1 and xs[-1] + crop_size > w:
+            xs[-1] = w - crop_size
+
+        yn = 1 if h <= crop_size else ceil((h - crop_size) / step + 1)
+        ys = [step * i for i in range(yn)]
+        if len(ys) > 1 and ys[-1] + crop_size > h:
+            ys[-1] = h - crop_size
+
+        start = np.array(list(itertools.product(xs, ys)), dtype=np.int64)
+        stop = start + crop_size
+        windows.append(np.concatenate([start, stop], axis=1))
+    windows = np.concatenate(windows, axis=0)
+
+    im_in_wins = windows.copy()
+    im_in_wins[:, 0::2] = np.clip(im_in_wins[:, 0::2], 0, w)
+    im_in_wins[:, 1::2] = np.clip(im_in_wins[:, 1::2], 0, h)
+    im_areas = (im_in_wins[:, 2] - im_in_wins[:, 0]) * (im_in_wins[:, 3] - im_in_wins[:, 1])
+    win_areas = (windows[:, 2] - windows[:, 0]) * (windows[:, 3] - windows[:, 1])
+    im_rates = im_areas / win_areas
+    if not (im_rates > im_rate_thr).any():
+        max_rate = im_rates.max()
+        im_rates[abs(im_rates - max_rate) < eps] = 1
+    return windows[im_rates > im_rate_thr]
+
+
+def get_window_obj(anno, windows, iof_thr=0.7):
+    """Get objects for each window."""
+    h, w = anno['ori_size']
+    label = anno['label']
+    if len(label):
+        label[:, 1::2] *= w
+        label[:, 2::2] *= h
+        iofs = bbox_iof(label[:, 1:], windows)
+        # unnormalized and misaligned coordinates
+        window_anns = [(label[iofs[:, i] >= iof_thr]) for i in range(len(windows))]
+    else:
+        window_anns = [np.zeros((0, 9), dtype=np.float32) for _ in range(len(windows))]
+    return window_anns
+
+
+def crop_and_save(anno, windows, window_objs, im_dir, lb_dir):
+    """Crop images and save new labels.
+    Args:
+        anno (dict): Annotation dict, including `filepath`, `label`, `ori_size` as its keys.
+        windows (list): A list of windows coordinates.
+        window_objs (list): A list of labels inside each window.
+        im_dir (str): The output directory path of images.
+        lb_dir (str): The output directory path of labels.
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+    """
+    im = cv2.imread(anno['filepath'])
+    name = Path(anno['filepath']).stem
+    for i, window in enumerate(windows):
+        x_start, y_start, x_stop, y_stop = window.tolist()
+        new_name = name + '__' + str(x_stop - x_start) + '__' + str(x_start) + '___' + str(y_start)
+        patch_im = im[y_start:y_stop, x_start:x_stop]
+        ph, pw = patch_im.shape[:2]
+
+        cv2.imwrite(os.path.join(im_dir, f'{new_name}.jpg'), patch_im)
+        label = window_objs[i]
+        if len(label) == 0:
+            continue
+        label[:, 1::2] -= x_start
+        label[:, 2::2] -= y_start
+        label[:, 1::2] /= pw
+        label[:, 2::2] /= ph
+
+        with open(os.path.join(lb_dir, f'{new_name}.txt'), 'w') as f:
+            for lb in label:
+                formatted_coords = ['{:.6g}'.format(coord) for coord in lb[1:]]
+                f.write(f"{int(lb[0])} {' '.join(formatted_coords)}\n")
+
+
+def split_images_and_labels(data_root, save_dir, split='train', crop_sizes=[1024], gaps=[200]):
+    """
+    Split both images and labels.
+
+    NOTES:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - split
+                - labels
+                    - split
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - split
+                - labels
+                    - split
+    """
+    im_dir = Path(save_dir) / 'images' / split
+    im_dir.mkdir(parents=True, exist_ok=True)
+    lb_dir = Path(save_dir) / 'labels' / split
+    lb_dir.mkdir(parents=True, exist_ok=True)
+
+    annos = load_yolo_dota(data_root, split=split)
+    for anno in tqdm(annos, total=len(annos), desc=split):
+        windows = get_windows(anno['ori_size'], crop_sizes, gaps)
+        window_objs = get_window_obj(anno, windows)
+        crop_and_save(anno, windows, window_objs, str(im_dir), str(lb_dir))
+
+
+def split_trainval(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
+    """
+    Split train and val set of DOTA.
+
+    NOTES:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+    """
+    crop_sizes, gaps = [], []
+    for r in rates:
+        crop_sizes.append(int(crop_size / r))
+        gaps.append(int(gap / r))
+    for split in ['train', 'val']:
+        split_images_and_labels(data_root, save_dir, split, crop_sizes, gaps)
+
+
+def split_test(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
+    """
+    Split test set of DOTA, labels are not included within this set.
+
+    NOTES:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - test
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - test
+    """
+    crop_sizes, gaps = [], []
+    for r in rates:
+        crop_sizes.append(int(crop_size / r))
+        gaps.append(int(gap / r))
+    save_dir = Path(save_dir) / 'images' / 'test'
+    save_dir.mkdir(parents=True, exist_ok=True)
+
+    im_dir = Path(os.path.join(data_root, 'images/test'))
+    assert im_dir.exists(), f"Can't find {str(im_dir)}, please check your data root."
+    im_files = glob(str(im_dir / '*'))
+    for im_file in tqdm(im_files, total=len(im_files), desc='test'):
+        w, h = exif_size(Image.open(im_file))
+        windows = get_windows((h, w), crop_sizes=crop_sizes, gaps=gaps)
+        im = cv2.imread(im_file)
+        name = Path(im_file).stem
+        for window in windows:
+            x_start, y_start, x_stop, y_stop = window.tolist()
+            new_name = (name + '__' + str(x_stop - x_start) + '__' + str(x_start) + '___' + str(y_start))
+            patch_im = im[y_start:y_stop, x_start:x_stop]
+            cv2.imwrite(os.path.join(str(save_dir), f'{new_name}.jpg'), patch_im)
+
+
+if __name__ == '__main__':
+    split_trainval(
+        data_root='DOTAv2',
+        save_dir='DOTAv2-split',
+    )
+    split_test(
+        data_root='DOTAv2',
+        save_dir='DOTAv2-split',
+    )
--- a/ultralytics/data/utils.py
+++ b/ultralytics/data/utils.py
@ -516,10 +516,7 @@ class HUBDatasetStats:
            else:
                from ultralytics.data import YOLODataset

-                dataset = YOLODataset(img_path=self.data[split],
-                                      data=self.data,
-                                      use_segments=self.task == 'segment',
-                                      use_keypoints=self.task == 'pose')
+                dataset = YOLODataset(img_path=self.data[split], data=self.data, task=self.task)
                x = np.array([
                    np.bincount(label['cls'].astype(int).flatten(), minlength=self.data['nc'])
                    for label in TQDM(dataset.labels, total=len(dataset), desc='Statistics')])  # shape(128x80)