Model coverage cleanup (#4585)
parent c635418a27
commit deac7575b1
12 changed files with 132 additions and 175 deletions
@@ -51,32 +51,16 @@ class FastSAMPrompt:
         n = len(result.masks.data)
         for i in range(n):
             mask = result.masks.data[i] == 1.0
-
-            if torch.sum(mask) < filter:
-                continue
-            annotation = {
-                'id': i,
-                'segmentation': mask.cpu().numpy(),
-                'bbox': result.boxes.data[i],
-                'score': result.boxes.conf[i]}
-            annotation['area'] = annotation['segmentation'].sum()
-            annotations.append(annotation)
+            if torch.sum(mask) >= filter:
+                annotation = {
+                    'id': i,
+                    'segmentation': mask.cpu().numpy(),
+                    'bbox': result.boxes.data[i],
+                    'score': result.boxes.conf[i]}
+                annotation['area'] = annotation['segmentation'].sum()
+                annotations.append(annotation)
         return annotations
 
-    @staticmethod
-    def filter_masks(annotations):  # filter the overlap mask
-        annotations.sort(key=lambda x: x['area'], reverse=True)
-        to_remove = set()
-        for i in range(len(annotations)):
-            a = annotations[i]
-            for j in range(i + 1, len(annotations)):
-                b = annotations[j]
-                if i != j and j not in to_remove and b['area'] < a['area'] and \
-                        (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
-                    to_remove.add(j)
-
-        return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove
-
     @staticmethod
     def _get_bbox_from_mask(mask):
         mask = mask.astype(np.uint8)
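Review note: the refactor folds the `continue` guard into a positive condition, which reads more directly. A minimal, self-contained sketch of the resulting pattern, with random masks standing in for real FastSAM output and `min_area` as a hypothetical name for the `filter` threshold:

```python
import torch

def masks_to_annotations(masks: torch.Tensor, min_area: int = 0) -> list:
    """Convert a stack of binary masks [n, h, w] into annotation dicts,
    keeping only masks whose pixel count reaches min_area."""
    annotations = []
    for i, mask in enumerate(masks.bool()):
        if torch.sum(mask) >= min_area:  # positive condition replaces `continue`
            annotations.append({
                'id': i,
                'segmentation': mask.cpu().numpy(),
                'area': int(torch.sum(mask))})
    return annotations

anns = masks_to_annotations(torch.rand(4, 8, 8) > 0.5, min_area=10)
print(len(anns))
```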
@@ -242,15 +226,12 @@ class FastSAMPrompt:
         cropped_images = []
         not_crop = []
         filter_id = []
-        # annotations, _ = filter_masks(annotations)
-        # filter_id = list(_)
         for _, mask in enumerate(annotations):
             if np.sum(mask['segmentation']) <= 100:
                 filter_id.append(_)
                 continue
             bbox = self._get_bbox_from_mask(mask['segmentation'])  # bbox of the mask
             cropped_boxes.append(self._segment_image(image, bbox))  # save the cropped image
-            # cropped_boxes.append(segment_image(image,mask["segmentation"]))
             cropped_images.append(bbox)  # save the bbox of the cropped image
 
         return cropped_boxes, cropped_images, not_crop, filter_id, annotations
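`_get_bbox_from_mask` is cut off by the hunk boundary above. For orientation, a plausible numpy sketch of that kind of helper (assuming an xyxy box convention; not the exact upstream code):

```python
import numpy as np

def bbox_from_mask(mask: np.ndarray):
    """Return (x1, y1, x2, y2) around the nonzero region of a binary mask."""
    ys, xs = np.nonzero(mask)
    if len(xs) == 0:  # an empty mask has no box
        return None
    return int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())

mask = np.zeros((8, 8), dtype=np.uint8)
mask[2:5, 3:7] = 1
print(bbox_from_mask(mask))  # (3, 2, 6, 4)
```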
@@ -267,10 +267,11 @@ class PositionEmbeddingRandom(nn.Module):
         super().__init__()
         if scale is None or scale <= 0.0:
             scale = 1.0
-        self.register_buffer(
-            'positional_encoding_gaussian_matrix',
-            scale * torch.randn((2, num_pos_feats)),
-        )
+        self.register_buffer('positional_encoding_gaussian_matrix', scale * torch.randn((2, num_pos_feats)))
+
+        # Set non-deterministic for forward() error 'cumsum_cuda_kernel does not have a deterministic implementation'
+        torch.use_deterministic_algorithms(False)
+        torch.backends.cudnn.deterministic = False
 
     def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
         """Positionally encode points that are normalized to [0,1]."""
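For context on the buffer registered here: `PositionEmbeddingRandom` implements SAM-style random Fourier features. A self-contained sketch of the encoding `_pe_encoding` computes from this Gaussian matrix (inferred from the surrounding signatures, not a verbatim copy):

```python
import math
import torch

def pe_encoding(coords: torch.Tensor, gaussian_matrix: torch.Tensor) -> torch.Tensor:
    """Random Fourier features for points normalized to [0, 1].
    coords: [..., 2]; gaussian_matrix: [2, num_pos_feats]."""
    coords = 2 * coords - 1                     # map [0, 1] -> [-1, 1]
    coords = coords @ gaussian_matrix           # project onto random directions
    coords = 2 * math.pi * coords
    return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

g = torch.randn(2, 64)                          # the registered buffer above
print(pe_encoding(torch.rand(10, 2), g).shape)  # torch.Size([10, 128])
```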
@@ -20,12 +20,14 @@ class Sam(nn.Module):
     mask_threshold: float = 0.0
     image_format: str = 'RGB'
 
-    def __init__(self,
-                 image_encoder: ImageEncoderViT,
-                 prompt_encoder: PromptEncoder,
-                 mask_decoder: MaskDecoder,
-                 pixel_mean: List[float] = None,
-                 pixel_std: List[float] = None) -> None:
+    def __init__(
+            self,
+            image_encoder: ImageEncoderViT,
+            prompt_encoder: PromptEncoder,
+            mask_decoder: MaskDecoder,
+            pixel_mean: List[float] = (123.675, 116.28, 103.53),
+            pixel_std: List[float] = (58.395, 57.12, 57.375)
+    ) -> None:
         """
         SAM predicts object masks from an image and input prompts.
 
@@ -37,10 +39,6 @@ class Sam(nn.Module):
             pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
             pixel_std (list(float)): Std values for normalizing pixels in the input image.
         """
-        if pixel_mean is None:
-            pixel_mean = [123.675, 116.28, 103.53]
-        if pixel_std is None:
-            pixel_std = [58.395, 57.12, 57.375]
         super().__init__()
         self.image_encoder = image_encoder
         self.prompt_encoder = prompt_encoder
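Together with the previous hunk, the in-body `None` checks move into immutable tuple defaults. A small sketch of the pattern using a hypothetical `Normalizer` module; tuples sidestep both the `if arg is None` boilerplate and Python's mutable-default pitfall:

```python
from typing import List

import torch
from torch import nn

class Normalizer(nn.Module):
    """Per-channel (x - mean) / std normalization with immutable defaults."""

    def __init__(self,
                 pixel_mean: List[float] = (123.675, 116.28, 103.53),
                 pixel_std: List[float] = (58.395, 57.12, 57.375)) -> None:
        super().__init__()
        # Buffers move with the module across devices but are not trained
        self.register_buffer('pixel_mean', torch.tensor(pixel_mean).view(-1, 1, 1))
        self.register_buffer('pixel_std', torch.tensor(pixel_std).view(-1, 1, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return (x - self.pixel_mean) / self.pixel_std

print(Normalizer()(torch.rand(3, 4, 4)).shape)  # torch.Size([3, 4, 4])
```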
@@ -30,40 +30,6 @@ class Conv2d_BN(torch.nn.Sequential):
         torch.nn.init.constant_(bn.bias, 0)
         self.add_module('bn', bn)
 
-    @torch.no_grad()
-    def fuse(self):
-        c, bn = self._modules.values()
-        w = bn.weight / (bn.running_var + bn.eps) ** 0.5
-        w = c.weight * w[:, None, None, None]
-        b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5
-        m = torch.nn.Conv2d(w.size(1) * self.c.groups,
-                            w.size(0),
-                            w.shape[2:],
-                            stride=self.c.stride,
-                            padding=self.c.padding,
-                            dilation=self.c.dilation,
-                            groups=self.c.groups)
-        m.weight.data.copy_(w)
-        m.bias.data.copy_(b)
-        return m
-
-
-# NOTE: This module and timm package is needed only for training.
-# from ultralytics.utils.checks import check_requirements
-# check_requirements('timm')
-# from timm.models.layers import DropPath as TimmDropPath
-# from timm.models.layers import trunc_normal_
-# class DropPath(TimmDropPath):
-#
-#     def __init__(self, drop_prob=None):
-#         super().__init__(drop_prob=drop_prob)
-#         self.drop_prob = drop_prob
-#
-#     def __repr__(self):
-#         msg = super().__repr__()
-#         msg += f'(drop_prob={self.drop_prob})'
-#         return msg
-
 
 class PatchEmbed(nn.Module):
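The deleted `fuse()` folded BatchNorm statistics into the preceding conv for inference. For reference, a standalone sketch of that fusion with a numerical check (`fuse_conv_bn` is a hypothetical helper applying the same math as the removed method):

```python
import torch
from torch import nn

def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    """Fold BatchNorm statistics into the conv weights (inference only)."""
    w = bn.weight / (bn.running_var + bn.eps) ** 0.5  # per-channel scale
    fused = nn.Conv2d(conv.in_channels, conv.out_channels, conv.kernel_size,
                      stride=conv.stride, padding=conv.padding,
                      dilation=conv.dilation, groups=conv.groups)
    fused.weight.data.copy_(conv.weight * w[:, None, None, None])
    fused.bias.data.copy_(bn.bias - bn.running_mean * w)
    return fused

conv, bn = nn.Conv2d(3, 8, 3, bias=False), nn.BatchNorm2d(8).eval()
bn.running_mean.uniform_(-1, 1)   # randomize stats so the check is non-trivial
bn.running_var.uniform_(0.5, 1.5)
x = torch.rand(1, 3, 16, 16)
print(torch.allclose(fuse_conv_bn(conv, bn)(x), bn(conv(x)), atol=1e-5))  # True
```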
@@ -153,8 +153,7 @@ class Predictor(BasePredictor):
             bboxes = bboxes[None] if bboxes.ndim == 1 else bboxes
             bboxes *= r
         if masks is not None:
-            masks = torch.as_tensor(masks, dtype=torch.float32, device=self.device)
-            masks = masks[:, None, :, :]
+            masks = torch.as_tensor(masks, dtype=torch.float32, device=self.device).unsqueeze(1)
 
         points = (points, labels) if points is not None else None
         # Embed prompts
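`unsqueeze(1)` is exactly the `[:, None, :, :]` indexing it replaces:

```python
import torch

m = torch.rand(4, 32, 32)
assert torch.equal(m.unsqueeze(1), m[:, None, :, :])  # both give [4, 1, 32, 32]
```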
@@ -257,9 +256,7 @@ class Predictor(BasePredictor):
                 pred_bbox = batched_mask_to_box(pred_mask).float()
                 keep_mask = ~is_box_near_crop_edge(pred_bbox, crop_region, [0, 0, iw, ih])
                 if not torch.all(keep_mask):
-                    pred_bbox = pred_bbox[keep_mask]
-                    pred_mask = pred_mask[keep_mask]
-                    pred_score = pred_score[keep_mask]
+                    pred_bbox, pred_mask, pred_score = pred_bbox[keep_mask], pred_mask[keep_mask], pred_score[keep_mask]
 
                 crop_masks.append(pred_mask)
                 crop_bboxes.append(pred_bbox)
@@ -288,9 +285,7 @@ class Predictor(BasePredictor):
         if len(crop_regions) > 1:
             scores = 1 / region_areas
             keep = torchvision.ops.nms(pred_bboxes, scores, crop_nms_thresh)
-            pred_masks = pred_masks[keep]
-            pred_bboxes = pred_bboxes[keep]
-            pred_scores = pred_scores[keep]
+            pred_masks, pred_bboxes, pred_scores = pred_masks[keep], pred_bboxes[keep], pred_scores[keep]
 
         return pred_masks, pred_scores, pred_bboxes
 
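On the NMS above: scoring each box as `1 / region_area` makes NMS keep the duplicate that came from the smaller (higher-resolution) crop region. A toy demonstration with hypothetical numbers:

```python
import torch
import torchvision

# Two near-duplicate boxes; the second comes from a smaller crop region,
# so its 1/area "score" is higher and NMS keeps it.
boxes = torch.tensor([[0., 0., 100., 100.], [5., 5., 100., 100.]])
region_areas = torch.tensor([10000., 9025.])
keep = torchvision.ops.nms(boxes, 1 / region_areas, iou_threshold=0.7)
print(keep)  # tensor([1]): IoU ~0.9 > 0.7, so the lower-scored duplicate is dropped
```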
@@ -82,8 +82,7 @@ class DETRLoss(nn.Module):
             loss[name_giou] = 1.0 - bbox_iou(pred_bboxes, gt_bboxes, xywh=True, GIoU=True)
             loss[name_giou] = loss[name_giou].sum() / len(gt_bboxes)
             loss[name_giou] = self.loss_gain['giou'] * loss[name_giou]
-        loss = {k: v.squeeze() for k, v in loss.items()}
-        return loss
+        return {k: v.squeeze() for k, v in loss.items()}
 
     def _get_loss_mask(self, masks, gt_mask, match_indices, postfix=''):
         # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
@@ -105,7 +104,8 @@ class DETRLoss(nn.Module):
         loss[name_dice] = self.loss_gain['dice'] * self._dice_loss(src_masks, target_masks, num_gts)
         return loss
 
-    def _dice_loss(self, inputs, targets, num_gts):
+    @staticmethod
+    def _dice_loss(inputs, targets, num_gts):
         inputs = F.sigmoid(inputs)
         inputs = inputs.flatten(1)
         targets = targets.flatten(1)
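Only the head of `_dice_loss` is visible in this hunk. A runnable sketch that completes it with the standard DETR-style Dice formulation (the tail is an assumption, not a quote of the upstream body):

```python
import torch

def dice_loss(inputs: torch.Tensor, targets: torch.Tensor, num_gts: int) -> torch.Tensor:
    """Dice loss over flattened masks, averaged by the number of ground truths."""
    inputs = inputs.sigmoid().flatten(1)   # [n, h*w] mask probabilities
    targets = targets.flatten(1)           # [n, h*w] binary ground truth
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(1) + targets.sum(1)
    return (1 - (numerator + 1) / (denominator + 1)).sum() / num_gts

print(dice_loss(torch.randn(2, 4, 4), (torch.rand(2, 4, 4) > 0.5).float(), 2))
```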
@@ -163,7 +163,8 @@ class DETRLoss(nn.Module):
         # loss[f'loss_dice_aux{postfix}'] = loss[4]
         return loss
 
-    def _get_index(self, match_indices):
+    @staticmethod
+    def _get_index(match_indices):
         batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(match_indices)])
         src_idx = torch.cat([src for (src, _) in match_indices])
         dst_idx = torch.cat([dst for (_, dst) in match_indices])
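How the now-static `_get_index` flattens per-image Hungarian matches into batch-wide gather indices; `pred[batch_idx, src_idx]` then selects the matched predictions in one shot:

```python
import torch

# One (src, dst) pair of index tensors per image in the batch
match_indices = [(torch.tensor([0, 2]), torch.tensor([1, 0])),
                 (torch.tensor([1]), torch.tensor([0]))]

batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(match_indices)])
src_idx = torch.cat([src for (src, _) in match_indices])
dst_idx = torch.cat([dst for (_, dst) in match_indices])
print(batch_idx, src_idx, dst_idx)  # tensor([0, 0, 1]) tensor([0, 2, 1]) tensor([1, 0, 0])
```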
@@ -257,10 +258,10 @@ class RTDETRDetectionLoss(DETRLoss):
             dn_pos_idx, dn_num_group = dn_meta['dn_pos_idx'], dn_meta['dn_num_group']
             assert len(batch['gt_groups']) == len(dn_pos_idx)
 
-            # denoising match indices
+            # Denoising match indices
             match_indices = self.get_dn_match_indices(dn_pos_idx, dn_num_group, batch['gt_groups'])
 
-            # compute denoising training loss
+            # Compute denoising training loss
             dn_loss = super().forward(dn_bboxes, dn_scores, batch, postfix='_dn', match_indices=match_indices)
             total_loss.update(dn_loss)
         else:
@@ -270,7 +271,8 @@ class RTDETRDetectionLoss(DETRLoss):
 
     @staticmethod
     def get_dn_match_indices(dn_pos_idx, dn_num_group, gt_groups):
-        """Get the match indices for denoising.
+        """
+        Get the match indices for denoising.
 
         Args:
             dn_pos_idx (List[torch.Tensor]): A list includes positive indices of denoising.
@@ -279,7 +281,6 @@ class RTDETRDetectionLoss(DETRLoss):
 
         Returns:
             dn_match_indices (List(tuple)): Matched indices.
-
         """
         dn_match_indices = []
         idx_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)
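The hunk ends right after `idx_groups`; a plausible completion of `get_dn_match_indices` as a sketch (assumption: ground-truth indices are offset by the cumulative per-image counts and repeated once per denoising group):

```python
import torch

def dn_match_indices(dn_pos_idx, dn_num_group, gt_groups):
    """Pair each image's denoising-query indices with repeated GT indices."""
    indices = []
    idx_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)  # per-image GT offsets
    for i, num_gt in enumerate(gt_groups):
        gt_idx = (torch.arange(num_gt) + idx_groups[i]).repeat(dn_num_group)
        indices.append((dn_pos_idx[i], gt_idx))
    return indices

out = dn_match_indices([torch.tensor([0, 1, 2, 3])], dn_num_group=2, gt_groups=[2])
print(out)  # [(tensor([0, 1, 2, 3]), tensor([0, 1, 0, 1]))]
```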