ultralytics 8.2.84 new SAM flexible imgsz inference (#15882)

Co-authored-by: UltralyticsAssistant <web@ultralytics.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
2024-08-30 20:59:08 +08:00 · 2024-08-30 20:59:08 +08:00 · 7053169fd0
commit 7053169fd0
parent 5d66140ce1
6 changed files with 70 additions and 7 deletions
--- a/ultralytics/models/sam/modules/encoders.py
+++ b/ultralytics/models/sam/modules/encoders.py
@ -151,7 +151,12 @@ class ImageEncoderViT(nn.Module):
        """Processes input through patch embedding, positional embedding, transformer blocks, and neck module."""
        x = self.patch_embed(x)
        if self.pos_embed is not None:
-            x = x + self.pos_embed
+            pos_embed = (
+                F.interpolate(self.pos_embed.permute(0, 3, 1, 2), scale_factor=self.img_size / 1024).permute(0, 2, 3, 1)
+                if self.img_size != 1024
+                else self.pos_embed
+            )
+            x = x + pos_embed
        for blk in self.blocks:
            x = blk(x)
        return self.neck(x.permute(0, 3, 1, 2))
--- a/ultralytics/models/sam/modules/sam.py
+++ b/ultralytics/models/sam/modules/sam.py
@ -90,6 +90,19 @@ class SAMModel(nn.Module):
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

+    def set_imgsz(self, imgsz):
+        """
+        Propagate a new input image size to the image encoder and the prompt encoder.
+
+        Args:
+            imgsz (Tuple[int, int]): The size of the input image.
+        """
+        # Image encoders that expose their own `set_imgsz` are reconfigured through it first.
+        if hasattr(self.image_encoder, "set_imgsz"):
+            self.image_encoder.set_imgsz(imgsz)
+        prompt_encoder = self.prompt_encoder
+        prompt_encoder.input_image_size = imgsz
+        # 16 is fixed as patch size of ViT model, so the embedding grid is imgsz // 16 per dimension.
+        prompt_encoder.image_embedding_size = [side // 16 for side in imgsz]
+        # Only the first entry of imgsz is stored on the image encoder.
+        self.image_encoder.img_size = imgsz[0]
+

 class SAM2Model(torch.nn.Module):
    """
@ -940,3 +953,14 @@ class SAM2Model(torch.nn.Module):
        # don't overlap (here sigmoid(-10.0)=4.5398e-05)
        pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
        return pred_masks
+
+    def set_imgsz(self, imgsz):
+        """
+        Record a new input image size on the model and on its SAM prompt encoder.
+
+        Args:
+            imgsz (Tuple[int, int]): The size of the input image.
+        """
+        # Only the first entry of imgsz is kept as the model-level image size.
+        self.image_size = imgsz[0]
+        prompt_encoder = self.sam_prompt_encoder
+        prompt_encoder.input_image_size = imgsz
+        # fixed ViT patch size of 16, so the embedding grid is imgsz // 16 per dimension
+        prompt_encoder.image_embedding_size = [side // 16 for side in imgsz]
--- a/ultralytics/models/sam/modules/tiny_encoder.py
+++ b/ultralytics/models/sam/modules/tiny_encoder.py
@ -982,10 +982,31 @@ class TinyViT(nn.Module):
            layer = self.layers[i]
            x = layer(x)
        batch, _, channel = x.shape
-        x = x.view(batch, 64, 64, channel)
+        x = x.view(batch, self.patches_resolution[0] // 4, self.patches_resolution[1] // 4, channel)
        x = x.permute(0, 3, 1, 2)
        return self.neck(x)

    def forward(self, x):
        """Performs the forward pass through the TinyViT model, extracting features from the input image."""
        return self.forward_features(x)
+
+    def set_imgsz(self, imgsz=(1024, 1024)):
+        """
+        Update the stored patch resolution and every layer's input resolution for a new input image size.
+
+        The default is an immutable tuple rather than a list, so no mutable object is shared between calls.
+
+        Args:
+            imgsz (Tuple[int, int]): The size of the input image; each entry is floor-divided by 4 before use.
+        """
+        patches = [s // 4 for s in imgsz]
+        self.patches_resolution = patches
+        for i, layer in enumerate(self.layers):
+            exponent = i - 1 if i == 3 else i  # index 3 reuses the divisor of index 2
+            input_resolution = (patches[0] // 2**exponent, patches[1] // 2**exponent)
+            layer.input_resolution = input_resolution
+            if layer.downsample is not None:
+                layer.downsample.input_resolution = input_resolution
+            if isinstance(layer, BasicLayer):
+                # Each block of a BasicLayer is given the same resolution tuple as its layer.
+                for b in layer.blocks:
+                    b.input_resolution = input_resolution