ultralytics 8.2.73 Meta SAM2 Refactor (#14867)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
Laughing 2024-08-05 08:53:45 +08:00 committed by GitHub
parent bea4c93278
commit 5d9046abda
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
44 changed files with 4542 additions and 3624 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from typing import List, Tuple, Type
from typing import List, Optional, Tuple, Type
import torch
from torch import nn
@ -10,12 +10,14 @@ from ultralytics.nn.modules import MLP, LayerNorm2d
class MaskDecoder(nn.Module):
"""
Decoder module for generating masks and their associated quality scores, using a transformer architecture to predict
masks given image and prompt embeddings.
Decoder module for generating masks and their associated quality scores using a transformer architecture.
This class predicts masks given image and prompt embeddings, utilizing a transformer to process the inputs and
generate mask predictions along with their quality scores.
Attributes:
transformer_dim (int): Channel dimension for the transformer module.
transformer (nn.Module): The transformer module used for mask prediction.
transformer (nn.Module): Transformer module used for mask prediction.
num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
iou_token (nn.Embedding): Embedding for the IoU token.
num_mask_tokens (int): Number of mask tokens.
@ -23,6 +25,16 @@ class MaskDecoder(nn.Module):
output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
iou_prediction_head (nn.Module): MLP for predicting mask quality.
Methods:
forward: Predicts masks given image and prompt embeddings.
predict_masks: Internal method for mask prediction.
Examples:
>>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
>>> masks, iou_pred = decoder(image_embeddings, image_pe, sparse_prompt_embeddings,
... dense_prompt_embeddings, multimask_output=True)
>>> print(f"Predicted masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
"""
def __init__(
@ -35,15 +47,20 @@ class MaskDecoder(nn.Module):
iou_head_hidden_dim: int = 256,
) -> None:
"""
Predicts masks given an image and prompt embeddings, using a transformer architecture.
Initializes the MaskDecoder module for generating masks and their quality scores.
Args:
transformer_dim (int): the channel dimension of the transformer module
transformer (nn.Module): the transformer used to predict masks
num_multimask_outputs (int): the number of masks to predict when disambiguating masks
activation (nn.Module): the type of activation to use when upscaling masks
iou_head_depth (int): the depth of the MLP used to predict mask quality
iou_head_hidden_dim (int): the hidden dimension of the MLP used to predict mask quality
transformer_dim (int): Channel dimension for the transformer module.
transformer (nn.Module): Transformer module used for mask prediction.
num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
activation (Type[nn.Module]): Type of activation to use when upscaling masks.
iou_head_depth (int): Depth of the MLP used to predict mask quality.
iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
Examples:
>>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
>>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer)
>>> print(decoder)
"""
super().__init__()
self.transformer_dim = transformer_dim
@ -77,18 +94,28 @@ class MaskDecoder(nn.Module):
multimask_output: bool,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Predict masks given image and prompt embeddings.
Predicts masks given image and prompt embeddings.
Args:
image_embeddings (torch.Tensor): the embeddings from the image encoder
image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
image_embeddings (torch.Tensor): Embeddings from the image encoder.
image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings.
sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes.
dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs.
multimask_output (bool): Whether to return multiple masks or a single mask.
Returns:
torch.Tensor: batched predicted masks
torch.Tensor: batched predictions of mask quality
(Tuple[torch.Tensor, torch.Tensor]): A tuple containing:
- masks (torch.Tensor): Batched predicted masks.
- iou_pred (torch.Tensor): Batched predictions of mask quality.
Examples:
>>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
>>> image_emb = torch.rand(1, 256, 64, 64)
>>> image_pe = torch.rand(1, 256, 64, 64)
>>> sparse_emb = torch.rand(1, 2, 256)
>>> dense_emb = torch.rand(1, 256, 64, 64)
>>> masks, iou_pred = decoder(image_emb, image_pe, sparse_emb, dense_emb, multimask_output=True)
>>> print(f"Masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
"""
masks, iou_pred = self.predict_masks(
image_embeddings=image_embeddings,
@ -112,11 +139,7 @@ class MaskDecoder(nn.Module):
sparse_prompt_embeddings: torch.Tensor,
dense_prompt_embeddings: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Predicts masks.
See 'forward' for more details.
"""
"""Predicts masks and quality scores using image and prompt embeddings via transformer architecture."""
# Concatenate output tokens
output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.shape[0], -1, -1)
@ -147,3 +170,347 @@ class MaskDecoder(nn.Module):
iou_pred = self.iou_prediction_head(iou_token_out)
return masks, iou_pred
class SAM2MaskDecoder(nn.Module):
"""
Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.
This class extends the functionality of the MaskDecoder, incorporating additional features such as
high-resolution feature processing, dynamic multimask output, and object score prediction.
Attributes:
transformer_dim (int): Channel dimension of the transformer.
transformer (nn.Module): Transformer used to predict masks.
num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
iou_token (nn.Embedding): Embedding for IOU token.
num_mask_tokens (int): Total number of mask tokens.
mask_tokens (nn.Embedding): Embedding for mask tokens.
pred_obj_scores (bool): Whether to predict object scores.
obj_score_token (nn.Embedding): Embedding for object score token.
use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
output_upscaling (nn.Sequential): Upscaling layers for output.
use_high_res_features (bool): Whether to use high-resolution features.
conv_s0 (nn.Conv2d): Convolutional layer for high-resolution features (s0).
conv_s1 (nn.Conv2d): Convolutional layer for high-resolution features (s1).
output_hypernetworks_mlps (nn.ModuleList): List of MLPs for output hypernetworks.
iou_prediction_head (MLP): MLP for IOU prediction.
pred_obj_score_head (nn.Linear | MLP): Linear layer or MLP for object score prediction.
dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.
Methods:
forward: Predicts masks given image and prompt embeddings.
predict_masks: Predicts instance segmentation masks from image and prompt embeddings.
_get_stability_scores: Computes mask stability scores based on IoU between thresholds.
_dynamic_multimask_via_stability: Dynamically selects the most stable mask output.
Examples:
>>> image_embeddings = torch.rand(1, 256, 64, 64)
>>> image_pe = torch.rand(1, 256, 64, 64)
>>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
>>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
>>> decoder = SAM2MaskDecoder(256, transformer)
>>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
... image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False)
"""
def __init__(
self,
transformer_dim: int,
transformer: nn.Module,
num_multimask_outputs: int = 3,
activation: Type[nn.Module] = nn.GELU,
iou_head_depth: int = 3,
iou_head_hidden_dim: int = 256,
use_high_res_features: bool = False,
iou_prediction_use_sigmoid=False,
dynamic_multimask_via_stability=False,
dynamic_multimask_stability_delta=0.05,
dynamic_multimask_stability_thresh=0.98,
pred_obj_scores: bool = False,
pred_obj_scores_mlp: bool = False,
use_multimask_token_for_obj_ptr: bool = False,
) -> None:
"""
Initializes the SAM2MaskDecoder module for predicting instance segmentation masks.
This decoder extends the functionality of MaskDecoder, incorporating additional features such as
high-resolution feature processing, dynamic multimask output, and object score prediction.
Args:
transformer_dim (int): Channel dimension of the transformer.
transformer (nn.Module): Transformer used to predict masks.
num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
activation (Type[nn.Module]): Type of activation to use when upscaling masks.
iou_head_depth (int): Depth of the MLP used to predict mask quality.
iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
use_high_res_features (bool): Whether to use high-resolution features.
iou_prediction_use_sigmoid (bool): Whether to use sigmoid for IOU prediction.
dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.
pred_obj_scores (bool): Whether to predict object scores.
pred_obj_scores_mlp (bool): Whether to use MLP for object score prediction.
use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
Examples:
>>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
>>> decoder = SAM2MaskDecoder(transformer_dim=256, transformer=transformer)
>>> print(decoder)
"""
super().__init__()
self.transformer_dim = transformer_dim
self.transformer = transformer
self.num_multimask_outputs = num_multimask_outputs
self.iou_token = nn.Embedding(1, transformer_dim)
self.num_mask_tokens = num_multimask_outputs + 1
self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
self.pred_obj_scores = pred_obj_scores
if self.pred_obj_scores:
self.obj_score_token = nn.Embedding(1, transformer_dim)
self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
self.output_upscaling = nn.Sequential(
nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
LayerNorm2d(transformer_dim // 4),
activation(),
nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
activation(),
)
self.use_high_res_features = use_high_res_features
if use_high_res_features:
self.conv_s0 = nn.Conv2d(transformer_dim, transformer_dim // 8, kernel_size=1, stride=1)
self.conv_s1 = nn.Conv2d(transformer_dim, transformer_dim // 4, kernel_size=1, stride=1)
self.output_hypernetworks_mlps = nn.ModuleList(
[MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)]
)
self.iou_prediction_head = MLP(
transformer_dim,
iou_head_hidden_dim,
self.num_mask_tokens,
iou_head_depth,
sigmoid=iou_prediction_use_sigmoid,
)
if self.pred_obj_scores:
self.pred_obj_score_head = nn.Linear(transformer_dim, 1)
if pred_obj_scores_mlp:
self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3)
# When outputting a single mask, optionally we can dynamically fall back to the best
# multimask output token if the single mask output token gives low stability scores.
self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh
def forward(
self,
image_embeddings: torch.Tensor,
image_pe: torch.Tensor,
sparse_prompt_embeddings: torch.Tensor,
dense_prompt_embeddings: torch.Tensor,
multimask_output: bool,
repeat_image: bool,
high_res_features: Optional[List[torch.Tensor]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Predicts masks given image and prompt embeddings.
Args:
image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings (B, C, H, W).
sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes with shape (B, N, C).
dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs with shape (B, C, H, W).
multimask_output (bool): Whether to return multiple masks or a single mask.
repeat_image (bool): Flag to repeat the image embeddings.
high_res_features (List[torch.Tensor] | None): Optional high-resolution features.
Returns:
(Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]): A tuple containing:
- masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
- iou_pred (torch.Tensor): Batched predictions of mask quality with shape (B, N).
- sam_tokens_out (torch.Tensor): Batched SAM token for mask output with shape (B, N, C).
- object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).
Examples:
>>> image_embeddings = torch.rand(1, 256, 64, 64)
>>> image_pe = torch.rand(1, 256, 64, 64)
>>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
>>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
>>> decoder = SAM2MaskDecoder(256, transformer)
>>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
... image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False)
"""
masks, iou_pred, mask_tokens_out, object_score_logits = self.predict_masks(
image_embeddings=image_embeddings,
image_pe=image_pe,
sparse_prompt_embeddings=sparse_prompt_embeddings,
dense_prompt_embeddings=dense_prompt_embeddings,
repeat_image=repeat_image,
high_res_features=high_res_features,
)
# Select the correct mask or masks for output
if multimask_output:
masks = masks[:, 1:, :, :]
iou_pred = iou_pred[:, 1:]
elif self.dynamic_multimask_via_stability and not self.training:
masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred)
else:
masks = masks[:, 0:1, :, :]
iou_pred = iou_pred[:, 0:1]
if multimask_output and self.use_multimask_token_for_obj_ptr:
sam_tokens_out = mask_tokens_out[:, 1:] # [b, 3, c] shape
else:
# Take the mask output token. Here we *always* use the token for single mask output.
# At test time, even if we track after 1-click (and using multimask_output=True),
# we still take the single mask token here. The rationale is that we always track
# after multiple clicks during training, so the past tokens seen during training
# are always the single mask token (and we'll let it be the object-memory token).
sam_tokens_out = mask_tokens_out[:, 0:1] # [b, 1, c] shape
# Prepare output
return masks, iou_pred, sam_tokens_out, object_score_logits
def predict_masks(
self,
image_embeddings: torch.Tensor,
image_pe: torch.Tensor,
sparse_prompt_embeddings: torch.Tensor,
dense_prompt_embeddings: torch.Tensor,
repeat_image: bool,
high_res_features: Optional[List[torch.Tensor]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Predicts instance segmentation masks from image and prompt embeddings using a transformer."""
# Concatenate output tokens
s = 0
if self.pred_obj_scores:
output_tokens = torch.cat(
[
self.obj_score_token.weight,
self.iou_token.weight,
self.mask_tokens.weight,
],
dim=0,
)
s = 1
else:
output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1)
tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
# Expand per-image data in batch direction to be per-mask
if repeat_image:
src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
else:
assert image_embeddings.shape[0] == tokens.shape[0]
src = image_embeddings
src = src + dense_prompt_embeddings
assert image_pe.size(0) == 1, "image_pe should have size 1 in batch dim (from `get_dense_pe()`)"
pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
b, c, h, w = src.shape
# Run the transformer
hs, src = self.transformer(src, pos_src, tokens)
iou_token_out = hs[:, s, :]
mask_tokens_out = hs[:, s + 1 : (s + 1 + self.num_mask_tokens), :]
# Upscale mask embeddings and predict masks using the mask tokens
src = src.transpose(1, 2).view(b, c, h, w)
if not self.use_high_res_features:
upscaled_embedding = self.output_upscaling(src)
else:
dc1, ln1, act1, dc2, act2 = self.output_upscaling
feat_s0, feat_s1 = high_res_features
upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)
hyper_in_list: List[torch.Tensor] = []
for i in range(self.num_mask_tokens):
hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]))
hyper_in = torch.stack(hyper_in_list, dim=1)
b, c, h, w = upscaled_embedding.shape
masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
# Generate mask quality predictions
iou_pred = self.iou_prediction_head(iou_token_out)
if self.pred_obj_scores:
assert s == 1
object_score_logits = self.pred_obj_score_head(hs[:, 0, :])
else:
# Obj scores logits - default to 10.0, i.e. assuming the object is present, sigmoid(10)=1
object_score_logits = 10.0 * iou_pred.new_ones(iou_pred.shape[0], 1)
return masks, iou_pred, mask_tokens_out, object_score_logits
def _get_stability_scores(self, mask_logits):
"""Computes mask stability scores based on IoU between upper and lower thresholds."""
mask_logits = mask_logits.flatten(-2)
stability_delta = self.dynamic_multimask_stability_delta
area_i = torch.sum(mask_logits > stability_delta, dim=-1).float()
area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float()
stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0)
return stability_scores
def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
"""
Dynamically selects the most stable mask output based on stability scores and IoU predictions.
This method is used when outputting a single mask. If the stability score from the current single-mask
output (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs
(based on output tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask
for both clicking and tracking scenarios.
Args:
all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is
batch size, N is number of masks (typically 4), and H, W are mask dimensions.
all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).
Returns:
(Tuple[torch.Tensor, torch.Tensor]):
- mask_logits_out (torch.Tensor): Selected mask logits, shape (B, 1, H, W).
- iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).
Examples:
>>> decoder = SAM2MaskDecoder(...)
>>> all_mask_logits = torch.rand(2, 4, 256, 256) # 2 images, 4 masks each
>>> all_iou_scores = torch.rand(2, 4)
>>> mask_logits, iou_scores = decoder._dynamic_multimask_via_stability(all_mask_logits, all_iou_scores)
>>> print(mask_logits.shape, iou_scores.shape)
torch.Size([2, 1, 256, 256]) torch.Size([2, 1])
"""
# The best mask from multimask output tokens (1~3)
multimask_logits = all_mask_logits[:, 1:, :, :]
multimask_iou_scores = all_iou_scores[:, 1:]
best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)
batch_inds = torch.arange(multimask_iou_scores.size(0), device=all_iou_scores.device)
best_multimask_logits = multimask_logits[batch_inds, best_scores_inds]
best_multimask_logits = best_multimask_logits.unsqueeze(1)
best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds]
best_multimask_iou_scores = best_multimask_iou_scores.unsqueeze(1)
# The mask from singlemask output token 0 and its stability score
singlemask_logits = all_mask_logits[:, 0:1, :, :]
singlemask_iou_scores = all_iou_scores[:, 0:1]
stability_scores = self._get_stability_scores(singlemask_logits)
is_stable = stability_scores >= self.dynamic_multimask_stability_thresh
# Dynamically fall back to best multimask output upon low stability scores.
mask_logits_out = torch.where(
is_stable[..., None, None].expand_as(singlemask_logits),
singlemask_logits,
best_multimask_logits,
)
iou_scores_out = torch.where(
is_stable.expand_as(singlemask_iou_scores),
singlemask_iou_scores,
best_multimask_iou_scores,
)
return mask_logits_out, iou_scores_out

View file

@ -1,30 +1,48 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from typing import Any, Optional, Tuple, Type
from typing import List, Optional, Tuple, Type
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from ultralytics.nn.modules import LayerNorm2d, MLPBlock
from ultralytics.nn.modules import LayerNorm2d
from .blocks import (
Block,
CXBlock,
Fuser,
MaskDownSampler,
MultiScaleBlock,
PatchEmbed,
PositionEmbeddingRandom,
PositionEmbeddingSine,
)
class ImageEncoderViT(nn.Module):
"""
An image encoder using Vision Transformer (ViT) architecture for encoding an image into a compact latent space. The
encoder takes an image, splits it into patches, and processes these patches through a series of transformer blocks.
The encoded patches are then processed through a neck to generate the final encoded representation.
An image encoder using Vision Transformer (ViT) architecture for encoding images into a compact latent space.
This class and its supporting functions below lightly adapted from the ViTDet backbone available at
https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py.
This class processes images by splitting them into patches, applying transformer blocks, and generating a final
encoded representation through a neck module.
Attributes:
img_size (int): Dimension of input images, assumed to be square.
patch_embed (PatchEmbed): Module for patch embedding.
pos_embed (nn.Parameter, optional): Absolute positional embedding for patches.
pos_embed (nn.Parameter | None): Absolute positional embedding for patches.
blocks (nn.ModuleList): List of transformer blocks for processing patch embeddings.
neck (nn.Sequential): Neck module to further process the output.
Methods:
forward: Processes input through patch embedding, positional embedding, blocks, and neck.
Examples:
>>> import torch
>>> encoder = ImageEncoderViT(img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12)
>>> input_image = torch.randn(1, 3, 224, 224)
>>> output = encoder(input_image)
>>> print(output.shape)
"""
def __init__(
@ -47,22 +65,38 @@ class ImageEncoderViT(nn.Module):
global_attn_indexes: Tuple[int, ...] = (),
) -> None:
"""
Initializes an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
Args:
img_size (int): Input image size.
patch_size (int): Patch size.
img_size (int): Input image size, assumed to be square.
patch_size (int): Size of image patches.
in_chans (int): Number of input image channels.
embed_dim (int): Patch embedding dimension.
depth (int): Depth of ViT.
num_heads (int): Number of attention heads in each ViT block.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
norm_layer (nn.Module): Normalization layer.
act_layer (nn.Module): Activation layer.
use_abs_pos (bool): If True, use absolute positional embeddings.
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
window_size (int): Window size for window attention blocks.
global_attn_indexes (list): Indexes for blocks using global attention.
embed_dim (int): Dimension of patch embeddings.
depth (int): Number of transformer blocks.
num_heads (int): Number of attention heads in each block.
mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
out_chans (int): Number of output channels from the neck module.
qkv_bias (bool): If True, adds learnable bias to query, key, value projections.
norm_layer (Type[nn.Module]): Type of normalization layer to use.
act_layer (Type[nn.Module]): Type of activation layer to use.
use_abs_pos (bool): If True, uses absolute positional embeddings.
use_rel_pos (bool): If True, adds relative positional embeddings to attention maps.
rel_pos_zero_init (bool): If True, initializes relative positional parameters to zero.
window_size (int): Size of attention window for windowed attention blocks.
global_attn_indexes (Tuple[int, ...]): Indices of blocks that use global attention.
Attributes:
img_size (int): Dimension of input images.
patch_embed (PatchEmbed): Module for patch embedding.
pos_embed (nn.Parameter | None): Absolute positional embedding for patches.
blocks (nn.ModuleList): List of transformer blocks.
neck (nn.Sequential): Neck module for final processing.
Examples:
>>> encoder = ImageEncoderViT(img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12)
>>> input_image = torch.randn(1, 3, 224, 224)
>>> output = encoder(input_image)
>>> print(output.shape)
"""
super().__init__()
self.img_size = img_size
@ -114,9 +148,7 @@ class ImageEncoderViT(nn.Module):
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Processes input through patch embedding, applies positional embedding if present, and passes through blocks
and neck.
"""
"""Processes input through patch embedding, positional embedding, transformer blocks, and neck module."""
x = self.patch_embed(x)
if self.pos_embed is not None:
x = x + self.pos_embed
@ -127,8 +159,7 @@ class ImageEncoderViT(nn.Module):
class PromptEncoder(nn.Module):
"""
Encodes different types of prompts, including points, boxes, and masks, for input to SAM's mask decoder. The encoder
produces both sparse and dense embeddings for the input prompts.
Encodes different types of prompts for input to SAM's mask decoder, producing sparse and dense embeddings.
Attributes:
embed_dim (int): Dimension of the embeddings.
@ -137,10 +168,23 @@ class PromptEncoder(nn.Module):
pe_layer (PositionEmbeddingRandom): Module for random position embedding.
num_point_embeddings (int): Number of point embeddings for different types of points.
point_embeddings (nn.ModuleList): List of point embeddings.
not_a_point_embed (nn.Embedding): Embedding for points that are not a part of any label.
not_a_point_embed (nn.Embedding): Embedding for points that are not part of any label.
mask_input_size (Tuple[int, int]): Size of the input mask.
mask_downscaling (nn.Sequential): Neural network for downscaling the mask.
no_mask_embed (nn.Embedding): Embedding for cases where no mask is provided.
Methods:
get_dense_pe: Returns the positional encoding used to encode point prompts.
forward: Embeds different types of prompts, returning both sparse and dense embeddings.
Examples:
>>> prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
>>> points = (torch.rand(1, 5, 2), torch.randint(0, 4, (1, 5)))
>>> boxes = torch.rand(1, 2, 2)
>>> masks = torch.rand(1, 1, 256, 256)
>>> sparse_embeddings, dense_embeddings = prompt_encoder(points, boxes, masks)
>>> print(sparse_embeddings.shape, dense_embeddings.shape)
torch.Size([1, 7, 256]) torch.Size([1, 256, 64, 64])
"""
def __init__(
@ -152,18 +196,37 @@ class PromptEncoder(nn.Module):
activation: Type[nn.Module] = nn.GELU,
) -> None:
"""
Encodes prompts for input to SAM's mask decoder.
Initializes the PromptEncoder module for encoding various types of prompts.
This module encodes different types of prompts (points, boxes, masks) for input to SAM's mask decoder,
producing both sparse and dense embeddings.
Args:
embed_dim (int): The prompts' embedding dimension
image_embedding_size (tuple(int, int)): The spatial size of the
image embedding, as (H, W).
input_image_size (int): The padded size of the image as input
to the image encoder, as (H, W).
mask_in_chans (int): The number of hidden channels used for
encoding input masks.
activation (nn.Module): The activation to use when encoding
input masks.
embed_dim (int): The dimension of the embeddings.
image_embedding_size (Tuple[int, int]): The spatial size of the image embedding as (H, W).
input_image_size (Tuple[int, int]): The padded size of the input image as (H, W).
mask_in_chans (int): The number of hidden channels used for encoding input masks.
activation (Type[nn.Module]): The activation function to use when encoding input masks.
Attributes:
embed_dim (int): Dimension of the embeddings.
input_image_size (Tuple[int, int]): Size of the input image as (H, W).
image_embedding_size (Tuple[int, int]): Spatial size of the image embedding as (H, W).
pe_layer (PositionEmbeddingRandom): Module for random position embedding.
num_point_embeddings (int): Number of point embeddings for different types of points.
point_embeddings (nn.ModuleList): List of point embeddings.
not_a_point_embed (nn.Embedding): Embedding for points that are not part of any label.
mask_input_size (Tuple[int, int]): Size of the input mask.
mask_downscaling (nn.Sequential): Neural network for downscaling the mask.
Examples:
>>> prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
>>> points = (torch.rand(1, 5, 2), torch.randint(0, 4, (1, 5)))
>>> boxes = torch.rand(1, 2, 2)
>>> masks = torch.rand(1, 1, 256, 256)
>>> sparse_embeddings, dense_embeddings = prompt_encoder(points, boxes, masks)
>>> print(sparse_embeddings.shape, dense_embeddings.shape)
torch.Size([1, 7, 256]) torch.Size([1, 256, 64, 64])
"""
super().__init__()
self.embed_dim = embed_dim
@ -190,16 +253,25 @@ class PromptEncoder(nn.Module):
def get_dense_pe(self) -> torch.Tensor:
"""
Returns the positional encoding used to encode point prompts, applied to a dense set of points the shape of the
image encoding.
Returns the dense positional encoding used for encoding point prompts.
This method generates a positional encoding for a dense set of points matching the shape of the image
encoding. The encoding is used to provide spatial information to the model when processing point prompts.
Returns:
torch.Tensor: Positional encoding with shape 1x(embed_dim)x(embedding_h)x(embedding_w)
(torch.Tensor): Positional encoding tensor with shape (1, embed_dim, H, W), where H and W are the
height and width of the image embedding size, respectively.
Examples:
>>> prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
>>> dense_pe = prompt_encoder.get_dense_pe()
>>> print(dense_pe.shape)
torch.Size([1, 256, 64, 64])
"""
return self.pe_layer(self.image_embedding_size).unsqueeze(0)
def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor:
"""Embeds point prompts."""
"""Embeds point prompts by applying positional encoding and label-specific embeddings."""
points = points + 0.5 # Shift to center of pixel
if pad:
padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
@ -216,7 +288,7 @@ class PromptEncoder(nn.Module):
return point_embedding
def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
"""Embeds box prompts."""
"""Embeds box prompts by applying positional encoding and adding corner embeddings."""
boxes = boxes + 0.5 # Shift to center of pixel
coords = boxes.reshape(-1, 2, 2)
corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size)
@ -225,7 +297,7 @@ class PromptEncoder(nn.Module):
return corner_embedding
def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
"""Embeds mask inputs."""
"""Embeds mask inputs by downscaling and processing through convolutional layers."""
return self.mask_downscaling(masks)
@staticmethod
@ -258,14 +330,25 @@ class PromptEncoder(nn.Module):
Embeds different types of prompts, returning both sparse and dense embeddings.
Args:
points (tuple(torch.Tensor, torch.Tensor), None): point coordinates and labels to embed.
boxes (torch.Tensor, None): boxes to embed
masks (torch.Tensor, None): masks to embed
points (Tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first
tensor contains coordinates with shape (B, N, 2), and the second tensor contains labels with
shape (B, N).
boxes (torch.Tensor | None): Boxes to embed with shape (B, M, 2, 2), where M is the number of boxes.
masks (torch.Tensor | None): Masks to embed with shape (B, 1, H, W).
Returns:
torch.Tensor: sparse embeddings for the points and boxes, with shape BxNx(embed_dim), where N is determined
by the number of input points and boxes.
torch.Tensor: dense embeddings for the masks, in the shape Bx(embed_dim)x(embed_H)x(embed_W)
(Tuple[torch.Tensor, torch.Tensor]): A tuple containing:
- sparse_embeddings (torch.Tensor): Sparse embeddings for points and boxes with shape (B, N, embed_dim).
- dense_embeddings (torch.Tensor): Dense embeddings for masks of shape (B, embed_dim, embed_H, embed_W).
Examples:
>>> encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
>>> points = (torch.rand(1, 5, 2), torch.randint(0, 4, (1, 5)))
>>> boxes = torch.rand(1, 2, 2, 2)
>>> masks = torch.rand(1, 1, 256, 256)
>>> sparse_emb, dense_emb = encoder(points, boxes, masks)
>>> print(sparse_emb.shape, dense_emb.shape)
torch.Size([1, 7, 256]) torch.Size([1, 256, 64, 64])
"""
bs = self._get_batch_size(points, boxes, masks)
sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
@ -287,319 +370,421 @@ class PromptEncoder(nn.Module):
return sparse_embeddings, dense_embeddings
class PositionEmbeddingRandom(nn.Module):
"""Positional encoding using random spatial frequencies."""
class MemoryEncoder(nn.Module):
"""
Encodes pixel features and masks into a memory representation for efficient image segmentation.
def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
"""Initializes a position embedding using random spatial frequencies."""
super().__init__()
if scale is None or scale <= 0.0:
scale = 1.0
self.register_buffer("positional_encoding_gaussian_matrix", scale * torch.randn((2, num_pos_feats)))
This class processes pixel-level features and masks, fusing them to generate encoded memory representations
suitable for downstream tasks in image segmentation models like SAM (Segment Anything Model).
# Set non-deterministic for forward() error 'cumsum_cuda_kernel does not have a deterministic implementation'
torch.use_deterministic_algorithms(False)
torch.backends.cudnn.deterministic = False
Attributes:
mask_downsampler (MaskDownSampler): Module for downsampling input masks.
pix_feat_proj (nn.Conv2d): Convolutional layer for projecting pixel features.
fuser (Fuser): Module for fusing pixel features and masks.
position_encoding (PositionEmbeddingSine): Module for adding positional encoding to features.
out_proj (nn.Module): Output projection layer, either nn.Identity or nn.Conv2d.
def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
"""Positionally encode points that are normalized to [0,1]."""
# Assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
coords = 2 * coords - 1
coords = coords @ self.positional_encoding_gaussian_matrix
coords = 2 * np.pi * coords
# Outputs d_1 x ... x d_n x C shape
return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
Methods:
forward: Processes input pixel features and masks to generate encoded memory representations.
def forward(self, size: Tuple[int, int]) -> torch.Tensor:
"""Generate positional encoding for a grid of the specified size."""
h, w = size
device: Any = self.positional_encoding_gaussian_matrix.device
grid = torch.ones((h, w), device=device, dtype=torch.float32)
y_embed = grid.cumsum(dim=0) - 0.5
x_embed = grid.cumsum(dim=1) - 0.5
y_embed = y_embed / h
x_embed = x_embed / w
pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
return pe.permute(2, 0, 1) # C x H x W
def forward_with_coords(self, coords_input: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor:
"""Positionally encode points that are not normalized to [0,1]."""
coords = coords_input.clone()
coords[:, :, 0] = coords[:, :, 0] / image_size[1]
coords[:, :, 1] = coords[:, :, 1] / image_size[0]
return self._pe_encoding(coords.to(torch.float)) # B x N x C
class Block(nn.Module):
"""Transformer blocks with support of window attention and residual propagation blocks."""
Examples:
>>> import torch
>>> encoder = MemoryEncoder(out_dim=256, in_dim=256)
>>> pix_feat = torch.randn(1, 256, 64, 64)
>>> masks = torch.randn(1, 1, 64, 64)
>>> encoded_feat, pos = encoder(pix_feat, masks)
>>> print(encoded_feat.shape, pos.shape)
torch.Size([1, 256, 64, 64]) torch.Size([1, 128, 64, 64])
"""
def __init__(
self,
dim: int,
num_heads: int,
mlp_ratio: float = 4.0,
qkv_bias: bool = True,
norm_layer: Type[nn.Module] = nn.LayerNorm,
act_layer: Type[nn.Module] = nn.GELU,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
window_size: int = 0,
input_size: Optional[Tuple[int, int]] = None,
) -> None:
out_dim,
in_dim=256, # in_dim of pix_feats
):
"""Initializes the MemoryEncoder for encoding pixel features and masks into memory representations."""
super().__init__()
self.mask_downsampler = MaskDownSampler(kernel_size=3, stride=2, padding=1)
self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
self.fuser = Fuser(CXBlock(dim=256), num_layers=2)
self.position_encoding = PositionEmbeddingSine(num_pos_feats=64)
self.out_proj = nn.Identity()
if out_dim != in_dim:
self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)
def forward(
self,
pix_feat: torch.Tensor,
masks: torch.Tensor,
skip_mask_sigmoid: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Processes pixel features and masks to generate encoded memory representations for segmentation."""
if not skip_mask_sigmoid:
masks = F.sigmoid(masks)
masks = self.mask_downsampler(masks)
# Fuse pix_feats and downsampled masks, in case the visual features are on CPU, cast them to CUDA
pix_feat = pix_feat.to(masks.device)
x = self.pix_feat_proj(pix_feat)
x = x + masks
x = self.fuser(x)
x = self.out_proj(x)
pos = self.position_encoding(x).to(x.dtype)
return {"vision_features": x, "vision_pos_enc": [pos]}
class ImageEncoder(nn.Module):
"""
Encodes images using a trunk-neck architecture, producing multiscale features and positional encodings.
This class combines a trunk network for feature extraction with a neck network for feature refinement
and positional encoding generation. It can optionally discard the lowest resolution features.
Attributes:
trunk (nn.Module): The trunk network for initial feature extraction.
neck (nn.Module): The neck network for feature refinement and positional encoding generation.
scalp (int): Number of lowest resolution feature levels to discard.
Methods:
forward: Processes the input image through the trunk and neck networks.
Examples:
>>> trunk = SomeTrunkNetwork()
>>> neck = SomeNeckNetwork()
>>> encoder = ImageEncoder(trunk, neck, scalp=1)
>>> image = torch.randn(1, 3, 224, 224)
>>> output = encoder(image)
>>> print(output.keys())
dict_keys(['vision_features', 'vision_pos_enc', 'backbone_fpn'])
"""
def __init__(
self,
trunk: nn.Module,
neck: nn.Module,
scalp: int = 0,
):
"""Initializes the ImageEncoder with trunk and neck networks for feature extraction and refinement."""
super().__init__()
self.trunk = trunk
self.neck = neck
self.scalp = scalp
assert (
self.trunk.channel_list == self.neck.backbone_channel_list
), f"Channel dims of trunk {self.trunk.channel_list} and neck {self.neck.backbone_channel_list} do not match."
def forward(self, sample: torch.Tensor):
"""Encodes input through patch embedding, positional embedding, transformer blocks, and neck module."""
features, pos = self.neck(self.trunk(sample))
if self.scalp > 0:
# Discard the lowest resolution features
features, pos = features[: -self.scalp], pos[: -self.scalp]
src = features[-1]
output = {
"vision_features": src,
"vision_pos_enc": pos,
"backbone_fpn": features,
}
return output
class FpnNeck(nn.Module):
"""
A Feature Pyramid Network (FPN) neck variant for multiscale feature fusion in object detection models.
This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing,
similar to ViT positional embedding interpolation.
Attributes:
position_encoding (PositionEmbeddingSine): Sinusoidal positional encoding module.
convs (nn.ModuleList): List of convolutional layers for each backbone level.
backbone_channel_list (List[int]): List of channel dimensions from the backbone.
fpn_interp_model (str): Interpolation mode for FPN feature resizing.
fuse_type (str): Type of feature fusion, either 'sum' or 'avg'.
fpn_top_down_levels (List[int]): Levels to have top-down features in outputs.
Methods:
forward: Performs forward pass through the FPN neck.
Examples:
>>> backbone_channels = [64, 128, 256, 512]
>>> fpn_neck = FpnNeck(256, backbone_channels)
>>> inputs = [torch.rand(1, c, 32, 32) for c in backbone_channels]
>>> outputs, positions = fpn_neck(inputs)
>>> print(len(outputs), len(positions))
4 4
"""
def __init__(
self,
d_model: int,
backbone_channel_list: List[int],
kernel_size: int = 1,
stride: int = 1,
padding: int = 0,
fpn_interp_model: str = "bilinear",
fuse_type: str = "sum",
fpn_top_down_levels: Optional[List[int]] = None,
):
"""
Initializes a modified Feature Pyramid Network (FPN) neck.
This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing,
similar to ViT positional embedding interpolation.
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads in each ViT block.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
norm_layer (nn.Module): Normalization layer.
act_layer (nn.Module): Activation layer.
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
window_size (int): Window size for window attention blocks. If it equals 0, then
use global attention.
input_size (tuple(int, int), None): Input resolution for calculating the relative
positional parameter size.
d_model (int): Dimension of the model.
backbone_channel_list (List[int]): List of channel dimensions from the backbone.
kernel_size (int): Kernel size for the convolutional layers.
stride (int): Stride for the convolutional layers.
padding (int): Padding for the convolutional layers.
fpn_interp_model (str): Interpolation mode for FPN feature resizing.
fuse_type (str): Type of feature fusion, either 'sum' or 'avg'.
fpn_top_down_levels (Optional[List[int]]): Levels to have top-down features in outputs.
Examples:
>>> backbone_channels = [64, 128, 256, 512]
>>> fpn_neck = FpnNeck(256, backbone_channels)
>>> print(fpn_neck)
"""
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
input_size=input_size if window_size == 0 else (window_size, window_size),
self.position_encoding = PositionEmbeddingSine(num_pos_feats=256)
self.convs = nn.ModuleList()
self.backbone_channel_list = backbone_channel_list
for dim in backbone_channel_list:
current = nn.Sequential()
current.add_module(
"conv",
nn.Conv2d(
in_channels=dim,
out_channels=d_model,
kernel_size=kernel_size,
stride=stride,
padding=padding,
),
)
self.convs.append(current)
self.fpn_interp_model = fpn_interp_model
assert fuse_type in ["sum", "avg"]
self.fuse_type = fuse_type
# levels to have top-down features in its outputs
# e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3
# have top-down propagation, while outputs of level 0 and level 1 have only
# lateral features from the same backbone level.
if fpn_top_down_levels is None:
# default is to have top-down features on all levels
fpn_top_down_levels = range(len(self.convs))
self.fpn_top_down_levels = list(fpn_top_down_levels)
def forward(self, xs: List[torch.Tensor]):
"""
Performs forward pass through the Feature Pyramid Network (FPN) neck.
This method processes a list of input tensors from the backbone through the FPN, applying lateral connections
and top-down feature fusion. It generates output feature maps and corresponding positional encodings.
Args:
xs (List[torch.Tensor]): List of input tensors from the backbone, each with shape (B, C, H, W).
Returns:
(Tuple[List[torch.Tensor], List[torch.Tensor]]): A tuple containing:
- out (List[torch.Tensor]): List of output feature maps after FPN processing, each with shape
(B, d_model, H, W).
- pos (List[torch.Tensor]): List of positional encodings corresponding to each output feature map.
Examples:
>>> fpn_neck = FpnNeck(d_model=256, backbone_channel_list=[64, 128, 256, 512])
>>> inputs = [torch.rand(1, c, 32, 32) for c in [64, 128, 256, 512]]
>>> outputs, positions = fpn_neck(inputs)
>>> print(len(outputs), len(positions))
4 4
"""
out = [None] * len(self.convs)
pos = [None] * len(self.convs)
assert len(xs) == len(self.convs)
# fpn forward pass
# see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py
prev_features = None
# forward in top-down order (from low to high resolution)
n = len(self.convs) - 1
for i in range(n, -1, -1):
x = xs[i]
lateral_features = self.convs[n - i](x)
if i in self.fpn_top_down_levels and prev_features is not None:
top_down_features = F.interpolate(
prev_features.to(dtype=torch.float32),
scale_factor=2.0,
mode=self.fpn_interp_model,
align_corners=(None if self.fpn_interp_model == "nearest" else False),
antialias=False,
)
prev_features = lateral_features + top_down_features
if self.fuse_type == "avg":
prev_features /= 2
else:
prev_features = lateral_features
x_out = prev_features
out[i] = x_out
pos[i] = self.position_encoding(x_out).to(x_out.dtype)
return out, pos
class Hiera(nn.Module):
"""
Hierarchical vision transformer for efficient multiscale feature extraction in image processing tasks.
This class implements a Hiera model, which is a hierarchical vision transformer architecture designed for
efficient multiscale feature extraction. It uses a series of transformer blocks organized into stages,
with optional pooling and global attention mechanisms.
Attributes:
window_spec (Tuple[int, ...]): Window sizes for each stage.
q_stride (Tuple[int, int]): Downsampling stride between stages.
stage_ends (List[int]): Indices of the last block in each stage.
q_pool_blocks (List[int]): Indices of blocks where pooling is applied.
return_interm_layers (bool): Whether to return intermediate layer outputs.
patch_embed (PatchEmbed): Module for patch embedding.
global_att_blocks (Tuple[int, ...]): Indices of blocks with global attention.
window_pos_embed_bkg_spatial_size (Tuple[int, int]): Spatial size for window positional embedding background.
pos_embed (nn.Parameter): Positional embedding for the background.
pos_embed_window (nn.Parameter): Positional embedding for the window.
blocks (nn.ModuleList): List of MultiScaleBlock modules.
channel_list (List[int]): List of output channel dimensions for each stage.
Methods:
_get_pos_embed: Generates positional embeddings by interpolating and combining window and background embeddings.
forward: Performs the forward pass through the Hiera model.
Examples:
>>> model = Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3))
>>> input_tensor = torch.randn(1, 3, 224, 224)
>>> output_features = model(input_tensor)
>>> for feat in output_features:
... print(feat.shape)
"""
def __init__(
self,
embed_dim: int = 96, # initial embed dim
num_heads: int = 1, # initial number of heads
drop_path_rate: float = 0.0, # stochastic depth
q_pool: int = 3, # number of q_pool stages
q_stride: Tuple[int, int] = (2, 2), # downsample stride bet. stages
stages: Tuple[int, ...] = (2, 3, 16, 3), # blocks per stage
dim_mul: float = 2.0, # dim_mul factor at stage shift
head_mul: float = 2.0, # head_mul factor at stage shift
window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
# window size per stage, when not using global att.
window_spec: Tuple[int, ...] = (
8,
4,
14,
7,
),
# global attn in these blocks
global_att_blocks: Tuple[int, ...] = (
12,
16,
20,
),
return_interm_layers=True, # return feats from every stage
):
"""Initializes the Hiera model, configuring its hierarchical vision transformer architecture."""
super().__init__()
assert len(stages) == len(window_spec)
self.window_spec = window_spec
depth = sum(stages)
self.q_stride = q_stride
self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
assert 0 <= q_pool <= len(self.stage_ends[:-1])
self.q_pool_blocks = [x + 1 for x in self.stage_ends[:-1]][:q_pool]
self.return_interm_layers = return_interm_layers
self.patch_embed = PatchEmbed(
embed_dim=embed_dim,
kernel_size=(7, 7),
stride=(4, 4),
padding=(3, 3),
)
# Which blocks have global att?
self.global_att_blocks = global_att_blocks
# Windowed positional embedding (https://arxiv.org/abs/2311.05613)
self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size
self.pos_embed = nn.Parameter(torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size))
self.pos_embed_window = nn.Parameter(torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0]))
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
cur_stage = 1
self.blocks = nn.ModuleList()
for i in range(depth):
dim_out = embed_dim
# lags by a block, so first block of
# next stage uses an initial window size
# of previous stage and final window size of current stage
window_size = self.window_spec[cur_stage - 1]
if self.global_att_blocks is not None:
window_size = 0 if i in self.global_att_blocks else window_size
if i - 1 in self.stage_ends:
dim_out = int(embed_dim * dim_mul)
num_heads = int(num_heads * head_mul)
cur_stage += 1
block = MultiScaleBlock(
dim=embed_dim,
dim_out=dim_out,
num_heads=num_heads,
drop_path=dpr[i],
q_stride=self.q_stride if i in self.q_pool_blocks else None,
window_size=window_size,
)
embed_dim = dim_out
self.blocks.append(block)
self.channel_list = (
[self.blocks[i].dim_out for i in self.stage_ends[::-1]]
if return_interm_layers
else [self.blocks[-1].dim_out]
)
self.norm2 = norm_layer(dim)
self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
"""Generates positional embeddings by interpolating and combining window and background embeddings."""
h, w = hw
window_embed = self.pos_embed_window
pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic")
pos_embed = pos_embed + window_embed.tile([x // y for x, y in zip(pos_embed.shape, window_embed.shape)])
pos_embed = pos_embed.permute(0, 2, 3, 1)
return pos_embed
self.window_size = window_size
def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
"""Performs forward pass through Hiera model, extracting multiscale features from input images."""
x = self.patch_embed(x)
# x: (B, H, W, C)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Executes a forward pass through the transformer block with window attention and non-overlapping windows."""
shortcut = x
x = self.norm1(x)
# Window partition
if self.window_size > 0:
H, W = x.shape[1], x.shape[2]
x, pad_hw = window_partition(x, self.window_size)
# Add pos embed
x = x + self._get_pos_embed(x.shape[1:3])
x = self.attn(x)
# Reverse window partition
if self.window_size > 0:
x = window_unpartition(x, self.window_size, pad_hw, (H, W))
outputs = []
for i, blk in enumerate(self.blocks):
x = blk(x)
if (i == self.stage_ends[-1]) or (i in self.stage_ends and self.return_interm_layers):
feats = x.permute(0, 3, 1, 2)
outputs.append(feats)
x = shortcut + x
return x + self.mlp(self.norm2(x))
class Attention(nn.Module):
"""Multi-head Attention block with relative position embeddings."""
def __init__(
self,
dim: int,
num_heads: int = 8,
qkv_bias: bool = True,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
input_size: Optional[Tuple[int, int]] = None,
) -> None:
"""
Initialize Attention module.
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
input_size (tuple(int, int), None): Input resolution for calculating the relative
positional parameter size.
"""
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
self.use_rel_pos = use_rel_pos
if self.use_rel_pos:
assert input_size is not None, "Input size must be provided if using relative positional encoding."
# Initialize relative positional embeddings
self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Applies the forward operation including attention, normalization, MLP, and indexing within window limits."""
B, H, W, _ = x.shape
# qkv with shape (3, B, nHead, H * W, C)
qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
# q, k, v with shape (B * nHead, H * W, C)
q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
attn = (q * self.scale) @ k.transpose(-2, -1)
if self.use_rel_pos:
attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
attn = attn.softmax(dim=-1)
x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
return self.proj(x)
def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
"""
Partition into non-overlapping windows with padding if needed.
Args:
x (tensor): input tokens with [B, H, W, C].
window_size (int): window size.
Returns:
windows: windows after partition with [B * num_windows, window_size, window_size, C].
(Hp, Wp): padded height and width before partition
"""
B, H, W, C = x.shape
pad_h = (window_size - H % window_size) % window_size
pad_w = (window_size - W % window_size) % window_size
if pad_h > 0 or pad_w > 0:
x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
Hp, Wp = H + pad_h, W + pad_w
x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows, (Hp, Wp)
def window_unpartition(
windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
) -> torch.Tensor:
"""
Window unpartition into original sequences and removing padding.
Args:
windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
window_size (int): window size.
pad_hw (Tuple): padded height and width (Hp, Wp).
hw (Tuple): original height and width (H, W) before padding.
Returns:
x: unpartitioned sequences with [B, H, W, C].
"""
Hp, Wp = pad_hw
H, W = hw
B = windows.shape[0] // (Hp * Wp // window_size // window_size)
x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
if Hp > H or Wp > W:
x = x[:, :H, :W, :].contiguous()
return x
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
"""
Get relative positional embeddings according to the relative positions of query and key sizes.
Args:
q_size (int): size of query q.
k_size (int): size of key k.
rel_pos (Tensor): relative position embeddings (L, C).
Returns:
Extracted positional embeddings according to relative positions.
"""
max_rel_dist = int(2 * max(q_size, k_size) - 1)
# Interpolate rel pos if needed.
if rel_pos.shape[0] != max_rel_dist:
# Interpolate rel pos.
rel_pos_resized = F.interpolate(
rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
size=max_rel_dist,
mode="linear",
)
rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
else:
rel_pos_resized = rel_pos
# Scale the coords with short length if shapes for q and k are different.
q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
return rel_pos_resized[relative_coords.long()]
def add_decomposed_rel_pos(
attn: torch.Tensor,
q: torch.Tensor,
rel_pos_h: torch.Tensor,
rel_pos_w: torch.Tensor,
q_size: Tuple[int, int],
k_size: Tuple[int, int],
) -> torch.Tensor:
"""
Calculate decomposed Relative Positional Embeddings from mvitv2 paper at
https://github.com/facebookresearch/mvit/blob/main/mvit/models/attention.py.
Args:
attn (Tensor): attention map.
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
Returns:
attn (Tensor): attention map with added relative positional embeddings.
"""
q_h, q_w = q_size
k_h, k_w = k_size
Rh = get_rel_pos(q_h, k_h, rel_pos_h)
Rw = get_rel_pos(q_w, k_w, rel_pos_w)
B, _, dim = q.shape
r_q = q.reshape(B, q_h, q_w, dim)
rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).view(
B, q_h * q_w, k_h * k_w
)
return attn
class PatchEmbed(nn.Module):
"""Image to Patch Embedding."""
def __init__(
self,
kernel_size: Tuple[int, int] = (16, 16),
stride: Tuple[int, int] = (16, 16),
padding: Tuple[int, int] = (0, 0),
in_chans: int = 3,
embed_dim: int = 768,
) -> None:
"""
Initialize PatchEmbed module.
Args:
kernel_size (Tuple): kernel size of the projection layer.
stride (Tuple): stride of the projection layer.
padding (Tuple): padding size of the projection layer.
in_chans (int): Number of input image channels.
embed_dim (int): Patch embedding dimension.
"""
super().__init__()
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Computes patch embedding by applying convolution and transposing resulting tensor."""
return self.proj(x).permute(0, 2, 3, 1) # B C H W -> B H W C
return outputs

View file

@ -0,0 +1,237 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import copy
from typing import Optional
import torch
from torch import Tensor, nn
from .blocks import RoPEAttention
class MemoryAttentionLayer(nn.Module):
"""
Implements a memory attention layer with self-attention and cross-attention mechanisms for neural networks.
This class combines self-attention, cross-attention, and feedforward components to process input tensors and
generate memory-based attention outputs.
Attributes:
d_model (int): Dimensionality of the model.
dim_feedforward (int): Dimensionality of the feedforward network.
dropout_value (float): Dropout rate for regularization.
self_attn (RoPEAttention): Self-attention mechanism using RoPE (Rotary Position Embedding).
cross_attn_image (RoPEAttention): Cross-attention mechanism for image processing.
linear1 (nn.Linear): First linear layer of the feedforward network.
linear2 (nn.Linear): Second linear layer of the feedforward network.
norm1 (nn.LayerNorm): Layer normalization for self-attention output.
norm2 (nn.LayerNorm): Layer normalization for cross-attention output.
norm3 (nn.LayerNorm): Layer normalization for feedforward network output.
dropout1 (nn.Dropout): Dropout layer after self-attention.
dropout2 (nn.Dropout): Dropout layer after cross-attention.
dropout3 (nn.Dropout): Dropout layer after feedforward network.
activation (nn.ReLU): Activation function for the feedforward network.
pos_enc_at_attn (bool): Flag to add positional encoding at attention.
pos_enc_at_cross_attn_queries (bool): Flag to add positional encoding to cross-attention queries.
pos_enc_at_cross_attn_keys (bool): Flag to add positional encoding to cross-attention keys.
Methods:
forward: Performs the full memory attention operation on input tensors.
_forward_sa: Performs self-attention on input tensor.
_forward_ca: Performs cross-attention between target and memory tensors.
Examples:
>>> layer = MemoryAttentionLayer(d_model=256, dim_feedforward=2048, dropout=0.1)
>>> tgt = torch.randn(1, 100, 256)
>>> memory = torch.randn(1, 100, 64)
>>> pos = torch.randn(1, 100, 256)
>>> query_pos = torch.randn(1, 100, 256)
>>> output = layer(tgt, memory, pos, query_pos)
>>> print(output.shape)
torch.Size([1, 100, 256])
"""
def __init__(
self,
d_model: int = 256,
dim_feedforward: int = 2048,
dropout: float = 0.1,
pos_enc_at_attn: bool = False,
pos_enc_at_cross_attn_keys: bool = True,
pos_enc_at_cross_attn_queries: bool = False,
):
"""Initializes a memory attention layer with self-attention, cross-attention, and feedforward components."""
super().__init__()
self.d_model = d_model
self.dim_feedforward = dim_feedforward
self.dropout_value = dropout
self.self_attn = RoPEAttention(embedding_dim=256, num_heads=1, downsample_rate=1)
self.cross_attn_image = RoPEAttention(
rope_k_repeat=True,
embedding_dim=256,
num_heads=1,
downsample_rate=1,
kv_in_dim=64,
)
# Implementation of Feedforward model
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
self.activation = nn.ReLU()
# Where to add pos enc
self.pos_enc_at_attn = pos_enc_at_attn
self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys
def _forward_sa(self, tgt, query_pos):
"""Performs self-attention on input tensor using positional encoding and RoPE attention mechanism."""
tgt2 = self.norm1(tgt)
q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
tgt2 = self.self_attn(q, k, v=tgt2)
tgt = tgt + self.dropout1(tgt2)
return tgt
def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
"""Performs cross-attention between target and memory tensors using RoPEAttention mechanism."""
kwds = {}
if num_k_exclude_rope > 0:
assert isinstance(self.cross_attn_image, RoPEAttention)
kwds = {"num_k_exclude_rope": num_k_exclude_rope}
# Cross-Attention
tgt2 = self.norm2(tgt)
tgt2 = self.cross_attn_image(
q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
k=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
v=memory,
**kwds,
)
tgt = tgt + self.dropout2(tgt2)
return tgt
def forward(
self,
tgt,
memory,
pos: Optional[Tensor] = None,
query_pos: Optional[Tensor] = None,
num_k_exclude_rope: int = 0,
) -> torch.Tensor:
"""Processes input tensors using self-attention, cross-attention, and MLP for memory-based attention."""
tgt = self._forward_sa(tgt, query_pos)
tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
# MLP
tgt2 = self.norm3(tgt)
tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
tgt = tgt + self.dropout3(tgt2)
return tgt
class MemoryAttention(nn.Module):
"""
Memory attention module for processing sequential data with self and cross-attention mechanisms.
This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
for processing sequential data, particularly useful in transformer-like architectures.
Attributes:
d_model (int): The dimension of the model's hidden state.
layers (nn.ModuleList): A list of MemoryAttentionLayer modules.
num_layers (int): The number of attention layers.
norm (nn.LayerNorm): Layer normalization applied to the output.
pos_enc_at_input (bool): Whether to apply positional encoding at the input.
batch_first (bool): Whether the input tensors are in batch-first format.
Methods:
forward: Processes input tensors through the attention layers.
Examples:
>>> d_model = 256
>>> layer = MemoryAttentionLayer(d_model)
>>> attention = MemoryAttention(d_model, pos_enc_at_input=True, layer=layer, num_layers=3)
>>> curr = torch.randn(10, 32, d_model) # (seq_len, batch_size, d_model)
>>> memory = torch.randn(20, 32, d_model) # (mem_len, batch_size, d_model)
>>> curr_pos = torch.randn(10, 32, d_model)
>>> memory_pos = torch.randn(20, 32, d_model)
>>> output = attention(curr, memory, curr_pos, memory_pos)
>>> print(output.shape)
torch.Size([10, 32, 256])
"""
def __init__(
self,
d_model: int,
pos_enc_at_input: bool,
layer: nn.Module,
num_layers: int,
batch_first: bool = True, # Do layers expect batch first input?
):
"""Initializes MemoryAttention module with layers and normalization for attention processing."""
super().__init__()
self.d_model = d_model
self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)])
self.num_layers = num_layers
self.norm = nn.LayerNorm(d_model)
self.pos_enc_at_input = pos_enc_at_input
self.batch_first = batch_first
def forward(
self,
curr: torch.Tensor, # self-attention inputs
memory: torch.Tensor, # cross-attention inputs
curr_pos: Optional[Tensor] = None, # pos_enc for self-attention inputs
memory_pos: Optional[Tensor] = None, # pos_enc for cross-attention inputs
num_obj_ptr_tokens: int = 0, # number of object pointer *tokens*
):
"""Processes input tensors through multiple attention layers, applying self and cross-attention mechanisms."""
if isinstance(curr, list):
assert isinstance(curr_pos, list)
assert len(curr) == len(curr_pos) == 1
curr, curr_pos = (
curr[0],
curr_pos[0],
)
assert curr.shape[1] == memory.shape[1], "Batch size must be the same for curr and memory"
output = curr
if self.pos_enc_at_input and curr_pos is not None:
output = output + 0.1 * curr_pos
if self.batch_first:
# Convert to batch first
output = output.transpose(0, 1)
curr_pos = curr_pos.transpose(0, 1)
memory = memory.transpose(0, 1)
memory_pos = memory_pos.transpose(0, 1)
for layer in self.layers:
kwds = {}
if isinstance(layer.cross_attn_image, RoPEAttention):
kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}
output = layer(
tgt=output,
memory=memory,
pos=memory_pos,
query_pos=curr_pos,
**kwds,
)
normed_output = self.norm(output)
if self.batch_first:
# Convert back to seq first
normed_output = normed_output.transpose(0, 1)
curr_pos = curr_pos.transpose(0, 1)
return normed_output

View file

@ -9,25 +9,48 @@
from typing import List
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import trunc_normal_
from .decoders import MaskDecoder
from ultralytics.nn.modules import MLP
from .blocks import SAM2TwoWayTransformer
from .decoders import MaskDecoder, SAM2MaskDecoder
from .encoders import ImageEncoderViT, PromptEncoder
from .utils import get_1d_sine_pe, select_closest_cond_frames
# a large negative value as a placeholder score for missing objects
NO_OBJ_SCORE = -1024.0
class SAMModel(nn.Module):
"""
SAMModel (Segment Anything Model) is designed for object segmentation tasks. It uses image encoders to generate
image embeddings, and prompt encoders to encode various types of input prompts. These embeddings are then used by
the mask decoder to predict object masks.
Segment Anything Model (SAM) for object segmentation tasks.
This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images
and input prompts.
Attributes:
mask_threshold (float): Threshold value for mask prediction.
image_encoder (ImageEncoderViT): The backbone used to encode the image into embeddings.
prompt_encoder (PromptEncoder): Encodes various types of input prompts.
mask_decoder (MaskDecoder): Predicts object masks from the image and prompt embeddings.
pixel_mean (List[float]): Mean pixel values for image normalization.
pixel_std (List[float]): Standard deviation values for image normalization.
image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
pixel_mean (torch.Tensor): Mean pixel values for image normalization, shape (3, 1, 1).
pixel_std (torch.Tensor): Standard deviation values for image normalization, shape (3, 1, 1).
Methods:
__init__: Initializes the SAMModel with encoders, decoder, and normalization parameters.
Examples:
>>> image_encoder = ImageEncoderViT(...)
>>> prompt_encoder = PromptEncoder(...)
>>> mask_decoder = MaskDecoder(...)
>>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
>>> # Further usage depends on SAMPredictor class
Notes:
All forward() operations are implemented in the SAMPredictor class.
"""
mask_threshold: float = 0.0
@ -43,17 +66,22 @@ class SAMModel(nn.Module):
"""
Initialize the SAMModel class to predict object masks from an image and input prompts.
Note:
All forward() operations moved to SAMPredictor.
Args:
image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
prompt_encoder (PromptEncoder): Encodes various types of input prompts.
mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
pixel_mean (List[float], optional): Mean values for normalizing pixels in the input image. Defaults to
(123.675, 116.28, 103.53).
pixel_std (List[float], optional): Std values for normalizing pixels in the input image. Defaults to
(58.395, 57.12, 57.375).
pixel_mean (List[float]): Mean values for normalizing pixels in the input image.
pixel_std (List[float]): Std values for normalizing pixels in the input image.
Examples:
>>> image_encoder = ImageEncoderViT(...)
>>> prompt_encoder = PromptEncoder(...)
>>> mask_decoder = MaskDecoder(...)
>>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
>>> # Further usage depends on SAMPredictor class
Notes:
All forward() operations moved to SAMPredictor.
"""
super().__init__()
self.image_encoder = image_encoder
@ -61,3 +89,846 @@ class SAMModel(nn.Module):
self.mask_decoder = mask_decoder
self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
class SAM2Model(torch.nn.Module):
"""
SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.
This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms
for temporal consistency and efficient tracking of objects across frames.
Attributes:
mask_threshold (float): Threshold value for mask prediction.
image_encoder (ImageEncoderViT): Visual encoder for extracting image features.
memory_attention (nn.Module): Module for attending to memory features.
memory_encoder (nn.Module): Encoder for generating memory representations.
num_maskmem (int): Number of accessible memory frames.
image_size (int): Size of input images.
backbone_stride (int): Stride of the backbone network output.
sam_prompt_embed_dim (int): Dimension of SAM prompt embeddings.
sam_image_embedding_size (int): Size of SAM image embeddings.
sam_prompt_encoder (PromptEncoder): Encoder for processing input prompts.
sam_mask_decoder (SAM2MaskDecoder): Decoder for generating object masks.
obj_ptr_proj (nn.Module): Projection layer for object pointers.
obj_ptr_tpos_proj (nn.Module): Projection for temporal positional encoding in object pointers.
Methods:
forward_image: Processes image batch through encoder to extract multi-level features.
track_step: Performs a single tracking step, updating object masks and memory features.
Examples:
>>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
>>> image_batch = torch.rand(1, 3, 512, 512)
>>> features = model.forward_image(image_batch)
>>> track_results = model.track_step(0, True, features, None, None, None, {})
"""
mask_threshold: float = 0.0
def __init__(
self,
image_encoder,
memory_attention,
memory_encoder,
num_maskmem=7,
image_size=512,
backbone_stride=16,
sigmoid_scale_for_mem_enc=1.0,
sigmoid_bias_for_mem_enc=0.0,
binarize_mask_from_pts_for_mem_enc=False,
use_mask_input_as_output_without_sam=False,
max_cond_frames_in_attn=-1,
directly_add_no_mem_embed=False,
use_high_res_features_in_sam=False,
multimask_output_in_sam=False,
multimask_min_pt_num=1,
multimask_max_pt_num=1,
multimask_output_for_tracking=False,
use_multimask_token_for_obj_ptr: bool = False,
iou_prediction_use_sigmoid=False,
memory_temporal_stride_for_eval=1,
add_all_frames_to_correct_as_cond=False,
non_overlap_masks_for_mem_enc=False,
use_obj_ptrs_in_encoder=False,
max_obj_ptrs_in_encoder=16,
add_tpos_enc_to_obj_ptrs=True,
proj_tpos_enc_in_obj_ptrs=False,
only_obj_ptrs_in_the_past_for_eval=False,
pred_obj_scores: bool = False,
pred_obj_scores_mlp: bool = False,
fixed_no_obj_ptr: bool = False,
soft_no_obj_ptr: bool = False,
use_mlp_for_obj_ptr_proj: bool = False,
sam_mask_decoder_extra_args=None,
compile_image_encoder: bool = False,
):
"""
Initializes the SAM2Model for video object segmentation with memory-based tracking.
Args:
image_encoder (nn.Module): Visual encoder for extracting image features.
memory_attention (nn.Module): Module for attending to memory features.
memory_encoder (nn.Module): Encoder for generating memory representations.
num_maskmem (int): Number of accessible memory frames. Default is 7 (1 input frame + 6 previous frames).
image_size (int): Size of input images.
backbone_stride (int): Stride of the image backbone output.
sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
with clicks during evaluation.
use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
prompt encoder and mask decoder on frames with mask input.
max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
-1 means no limit.
directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
first frame.
use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
multimask_output_in_sam (bool): Whether to output multiple (3) masks for the first click on initial
conditioning frames.
multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
add_all_frames_to_correct_as_cond (bool): Whether to append frames with correction clicks to conditioning
frame list.
non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
memory encoder during evaluation.
use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
cross-attention.
add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in
the encoder.
proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
encoding in object pointers.
only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
during evaluation.
pred_obj_scores (bool): Whether to predict if there is an object in the frame.
pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
sam_mask_decoder_extra_args (Dict | None): Extra arguments for constructing the SAM mask decoder.
compile_image_encoder (bool): Whether to compile the image encoder for faster inference.
Examples:
>>> image_encoder = ImageEncoderViT(...)
>>> memory_attention = SAM2TwoWayTransformer(...)
>>> memory_encoder = nn.Sequential(...)
>>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
>>> image_batch = torch.rand(1, 3, 512, 512)
>>> features = model.forward_image(image_batch)
>>> track_results = model.track_step(0, True, features, None, None, None, {})
"""
super().__init__()
# Part 1: the image backbone
self.image_encoder = image_encoder
# Use level 0, 1, 2 for high-res setting, or just level 2 for the default setting
self.use_high_res_features_in_sam = use_high_res_features_in_sam
self.num_feature_levels = 3 if use_high_res_features_in_sam else 1
self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder
self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder
if use_obj_ptrs_in_encoder:
# A conv layer to downsample the mask prompt to stride 4 (the same stride as
# low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale,
# so that it can be fed into the SAM mask decoder to generate a pointer.
self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4)
self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs
if proj_tpos_enc_in_obj_ptrs:
assert add_tpos_enc_to_obj_ptrs # these options need to be used together
self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs
self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval
# Part 2: memory attention to condition current frame's visual features
# with memories (and obj ptrs) from past frames
self.memory_attention = memory_attention
self.hidden_dim = memory_attention.d_model
# Part 3: memory encoder for the previous frame's outputs
self.memory_encoder = memory_encoder
self.mem_dim = self.hidden_dim
if hasattr(self.memory_encoder, "out_proj") and hasattr(self.memory_encoder.out_proj, "weight"):
# if there is compression of memories along channel dim
self.mem_dim = self.memory_encoder.out_proj.weight.shape[0]
self.num_maskmem = num_maskmem # Number of memories accessible
# Temporal encoding of the memories
self.maskmem_tpos_enc = torch.nn.Parameter(torch.zeros(num_maskmem, 1, 1, self.mem_dim))
trunc_normal_(self.maskmem_tpos_enc, std=0.02)
# a single token to indicate no memory embedding from previous frames
self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
trunc_normal_(self.no_mem_embed, std=0.02)
trunc_normal_(self.no_mem_pos_enc, std=0.02)
self.directly_add_no_mem_embed = directly_add_no_mem_embed
# Apply sigmoid to the output raw mask logits (to turn them from
# range (-inf, +inf) to range (0, 1)) before feeding them into the memory encoder
self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc
self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc
self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc
self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc
self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval
# On frames with mask input, whether to directly output the input mask without
# using a SAM prompt encoder + mask decoder
self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam
self.multimask_output_in_sam = multimask_output_in_sam
self.multimask_min_pt_num = multimask_min_pt_num
self.multimask_max_pt_num = multimask_max_pt_num
self.multimask_output_for_tracking = multimask_output_for_tracking
self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid
# Part 4: SAM-style prompt encoder (for both mask and point inputs)
# and SAM-style mask decoder for the final mask output
self.image_size = image_size
self.backbone_stride = backbone_stride
self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args
self.pred_obj_scores = pred_obj_scores
self.pred_obj_scores_mlp = pred_obj_scores_mlp
self.fixed_no_obj_ptr = fixed_no_obj_ptr
self.soft_no_obj_ptr = soft_no_obj_ptr
if self.fixed_no_obj_ptr:
assert self.pred_obj_scores
assert self.use_obj_ptrs_in_encoder
if self.pred_obj_scores and self.use_obj_ptrs_in_encoder:
self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim))
trunc_normal_(self.no_obj_ptr, std=0.02)
self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj
self._build_sam_heads()
self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
self.max_cond_frames_in_attn = max_cond_frames_in_attn
# Model compilation
if compile_image_encoder:
# Compile the forward function (not the full module) to allow loading checkpoints.
print("Image encoder compilation is enabled. First forward pass will be slow.")
self.image_encoder.forward = torch.compile(
self.image_encoder.forward,
mode="max-autotune",
fullgraph=True,
dynamic=False,
)
@property
def device(self):
"""Returns the device on which the model's parameters are stored."""
return next(self.parameters()).device
def forward(self, *args, **kwargs):
"""Processes image and prompt inputs to generate object masks and scores in video sequences."""
raise NotImplementedError(
"Please use the corresponding methods in SAM2VideoPredictor for inference."
"See notebooks/video_predictor_example.ipynb for an example."
)
def _build_sam_heads(self):
"""Builds SAM-style prompt encoder and mask decoder for image segmentation tasks."""
self.sam_prompt_embed_dim = self.hidden_dim
self.sam_image_embedding_size = self.image_size // self.backbone_stride
# build PromptEncoder and MaskDecoder from SAM
# (their hyperparameters like `mask_in_chans=16` are from SAM code)
self.sam_prompt_encoder = PromptEncoder(
embed_dim=self.sam_prompt_embed_dim,
image_embedding_size=(
self.sam_image_embedding_size,
self.sam_image_embedding_size,
),
input_image_size=(self.image_size, self.image_size),
mask_in_chans=16,
)
self.sam_mask_decoder = SAM2MaskDecoder(
num_multimask_outputs=3,
transformer=SAM2TwoWayTransformer(
depth=2,
embedding_dim=self.sam_prompt_embed_dim,
mlp_dim=2048,
num_heads=8,
),
transformer_dim=self.sam_prompt_embed_dim,
iou_head_depth=3,
iou_head_hidden_dim=256,
use_high_res_features=self.use_high_res_features_in_sam,
iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
pred_obj_scores=self.pred_obj_scores,
pred_obj_scores_mlp=self.pred_obj_scores_mlp,
use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
**(self.sam_mask_decoder_extra_args or {}),
)
if self.use_obj_ptrs_in_encoder:
# a linear projection on SAM output tokens to turn them into object pointers
self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
if self.use_mlp_for_obj_ptr_proj:
self.obj_ptr_proj = MLP(self.hidden_dim, self.hidden_dim, self.hidden_dim, 3)
else:
self.obj_ptr_proj = torch.nn.Identity()
if self.proj_tpos_enc_in_obj_ptrs:
# a linear projection on temporal positional encoding in object pointers to
# avoid potential interference with spatial positional encoding
self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)
else:
self.obj_ptr_tpos_proj = torch.nn.Identity()
def _forward_sam_heads(
self,
backbone_features,
point_inputs=None,
mask_inputs=None,
high_res_features=None,
multimask_output=False,
):
"""
Forward pass through SAM prompt encoders and mask heads.
This method processes image features and optional point/mask inputs to generate object masks and scores.
Args:
backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
point_inputs (Dict[str, torch.Tensor] | None): Dictionary containing point prompts.
'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute
pixel-unit coordinates in (x, y) format for P input points.
'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
0 means negative clicks, and -1 means padding.
mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
same spatial size as the image.
high_res_features (List[torch.Tensor] | None): List of two feature maps with shapes
(B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
for SAM decoder.
multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
output only 1 mask and its IoU estimate.
Returns:
(Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
low_res_multimasks: Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
high_res_multimasks: Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
ious: Tensor of shape (B, M) with estimated IoU for each output mask.
low_res_masks: Tensor of shape (B, 1, H*4, W*4) with best low-resolution mask.
high_res_masks: Tensor of shape (B, 1, H*16, W*16) with best high-resolution mask.
obj_ptr: Tensor of shape (B, C) with object pointer vector for the output mask.
object_score_logits: Tensor of shape (B,) with object score logits.
Where M is 3 if multimask_output=True, and 1 if multimask_output=False.
Examples:
>>> backbone_features = torch.rand(1, 256, 32, 32)
>>> point_inputs = {"point_coords": torch.rand(1, 2, 2), "point_labels": torch.tensor([[1, 0]])}
>>> mask_inputs = torch.rand(1, 1, 512, 512)
>>> results = model._forward_sam_heads(backbone_features, point_inputs, mask_inputs)
>>> low_res_multimasks, high_res_multimasks, ious, low_res_masks, high_res_masks, obj_ptr, object_score_logits = results
"""
B = backbone_features.size(0)
device = backbone_features.device
assert backbone_features.size(1) == self.sam_prompt_embed_dim
assert backbone_features.size(2) == self.sam_image_embedding_size
assert backbone_features.size(3) == self.sam_image_embedding_size
# a) Handle point prompts
if point_inputs is not None:
sam_point_coords = point_inputs["point_coords"]
sam_point_labels = point_inputs["point_labels"]
assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
else:
# If no points are provide, pad with an empty point (with label -1)
sam_point_coords = torch.zeros(B, 1, 2, device=device)
sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
# b) Handle mask prompts
if mask_inputs is not None:
# If mask_inputs is provided, downsize it into low-res mask input if needed
# and feed it as a dense mask prompt into the SAM mask encoder
assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
sam_mask_prompt = F.interpolate(
mask_inputs.float(),
size=self.sam_prompt_encoder.mask_input_size,
align_corners=False,
mode="bilinear",
antialias=True, # use antialias for downsampling
)
else:
sam_mask_prompt = mask_inputs
else:
# Otherwise, simply feed None (and SAM's prompt encoder will add
# a learned `no_mask_embed` to indicate no mask input in this case).
sam_mask_prompt = None
sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
points=(sam_point_coords, sam_point_labels),
boxes=None,
masks=sam_mask_prompt,
)
(
low_res_multimasks,
ious,
sam_output_tokens,
object_score_logits,
) = self.sam_mask_decoder(
image_embeddings=backbone_features,
image_pe=self.sam_prompt_encoder.get_dense_pe(),
sparse_prompt_embeddings=sparse_embeddings,
dense_prompt_embeddings=dense_embeddings,
multimask_output=multimask_output,
repeat_image=False, # the image is already batched
high_res_features=high_res_features,
)
if self.pred_obj_scores:
is_obj_appearing = object_score_logits > 0
# Mask used for spatial memories is always a *hard* choice between obj and no obj,
# consistent with the actual mask prediction
low_res_multimasks = torch.where(
is_obj_appearing[:, None, None],
low_res_multimasks,
NO_OBJ_SCORE,
)
# convert masks from possibly bfloat16 (or float16) to float32
# (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
low_res_multimasks = low_res_multimasks.float()
high_res_multimasks = F.interpolate(
low_res_multimasks,
size=(self.image_size, self.image_size),
mode="bilinear",
align_corners=False,
)
sam_output_token = sam_output_tokens[:, 0]
if multimask_output:
# take the best mask prediction (with the highest IoU estimation)
best_iou_inds = torch.argmax(ious, dim=-1)
batch_inds = torch.arange(B, device=device)
low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
if sam_output_tokens.size(1) > 1:
sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
else:
low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
# Extract object pointer from the SAM output token (with occlusion handling)
obj_ptr = self.obj_ptr_proj(sam_output_token)
if self.pred_obj_scores:
# Allow *soft* no obj ptr, unlike for masks
if self.soft_no_obj_ptr:
# Only hard possible with gt
assert not self.teacher_force_obj_scores_for_mem
lambda_is_obj_appearing = object_score_logits.sigmoid()
else:
lambda_is_obj_appearing = is_obj_appearing.float()
if self.fixed_no_obj_ptr:
obj_ptr = lambda_is_obj_appearing * obj_ptr
obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
return (
low_res_multimasks,
high_res_multimasks,
ious,
low_res_masks,
high_res_masks,
obj_ptr,
object_score_logits,
)
def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
"""Processes mask inputs directly as output, bypassing SAM encoder/decoder."""
# Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
out_scale, out_bias = 20.0, -10.0 # sigmoid(-10.0)=4.5398e-05
mask_inputs_float = mask_inputs.float()
high_res_masks = mask_inputs_float * out_scale + out_bias
low_res_masks = F.interpolate(
high_res_masks,
size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
align_corners=False,
mode="bilinear",
antialias=True, # use antialias for downsampling
)
# a dummy IoU prediction of all 1's under mask input
ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
if not self.use_obj_ptrs_in_encoder:
# all zeros as a dummy object pointer (of shape [B, C])
obj_ptr = torch.zeros(mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device)
else:
# produce an object pointer using the SAM decoder from the mask input
_, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
backbone_features=backbone_features,
mask_inputs=self.mask_downsample(mask_inputs_float),
high_res_features=high_res_features,
)
# In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem;
# Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying
# on the object_scores from the SAM decoder.
is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
is_obj_appearing = is_obj_appearing[..., None]
lambda_is_obj_appearing = is_obj_appearing.float()
object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
if self.pred_obj_scores:
if self.fixed_no_obj_ptr:
obj_ptr = lambda_is_obj_appearing * obj_ptr
obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
return (
low_res_masks,
high_res_masks,
ious,
low_res_masks,
high_res_masks,
obj_ptr,
object_score_logits,
)
def forward_image(self, img_batch: torch.Tensor):
"""Processes image batch through encoder to extract multi-level features for SAM model."""
backbone_out = self.image_encoder(img_batch)
if self.use_high_res_features_in_sam:
# precompute projected level 0 and level 1 features in SAM decoder
# to avoid running it again on every SAM click
backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(backbone_out["backbone_fpn"][0])
backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
return backbone_out
def _prepare_backbone_features(self, backbone_out):
"""Prepares and flattens visual features from the image backbone output for further processing."""
backbone_out = backbone_out.copy()
assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
# flatten NxCxHxW to HWxNxC
vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
def _prepare_memory_conditioned_features(
self,
frame_idx,
is_init_cond_frame,
current_vision_feats,
current_vision_pos_embeds,
feat_sizes,
output_dict,
num_frames,
track_in_reverse=False, # tracking in reverse time order (for demo usage)
):
"""Prepares memory-conditioned features by fusing current frame's visual features with previous memories."""
B = current_vision_feats[-1].size(1) # batch size on this frame
C = self.hidden_dim
H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
device = current_vision_feats[-1].device
# The case of `self.num_maskmem == 0` below is primarily used for reproducing SAM on images.
# In this case, we skip the fusion with any memory.
if self.num_maskmem == 0: # Disable memory and skip fusion
pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
return pix_feat
num_obj_ptr_tokens = 0
# Step 1: condition the visual features of the current frame on previous memories
if not is_init_cond_frame:
# Retrieve the memories encoded with the maskmem backbone
to_cat_memory, to_cat_memory_pos_embed = [], []
# Add conditioning frames's output first (all cond frames have t_pos=0 for
# when getting temporal positional embedding below)
assert len(output_dict["cond_frame_outputs"]) > 0
# Select a maximum number of temporally closest cond frames for cross attention
cond_outputs = output_dict["cond_frame_outputs"]
selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames(
frame_idx, cond_outputs, self.max_cond_frames_in_attn
)
t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()]
# Add last (self.num_maskmem - 1) frames before current frame for non-conditioning memory
# the earliest one has t_pos=1 and the latest one has t_pos=self.num_maskmem-1
# We also allow taking the memory frame non-consecutively (with r>1), in which case
# we take (self.num_maskmem - 2) frames among every r-th frames plus the last frame.
r = self.memory_temporal_stride_for_eval
for t_pos in range(1, self.num_maskmem):
t_rel = self.num_maskmem - t_pos # how many frames before current frame
if t_rel == 1:
# for t_rel == 1, we take the last frame (regardless of r)
if not track_in_reverse:
# the frame immediately before this frame (i.e. frame_idx - 1)
prev_frame_idx = frame_idx - t_rel
else:
# the frame immediately after this frame (i.e. frame_idx + 1)
prev_frame_idx = frame_idx + t_rel
else:
# for t_rel >= 2, we take the memory frame from every r-th frames
if not track_in_reverse:
# first find the nearest frame among every r-th frames before this frame
# for r=1, this would be (frame_idx - 2)
prev_frame_idx = ((frame_idx - 2) // r) * r
# then seek further among every r-th frames
prev_frame_idx = prev_frame_idx - (t_rel - 2) * r
else:
# first find the nearest frame among every r-th frames after this frame
# for r=1, this would be (frame_idx + 2)
prev_frame_idx = -(-(frame_idx + 2) // r) * r
# then seek further among every r-th frames
prev_frame_idx = prev_frame_idx + (t_rel - 2) * r
out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None)
if out is None:
# If an unselected conditioning frame is among the last (self.num_maskmem - 1)
# frames, we still attend to it as if it's a non-conditioning frame.
out = unselected_cond_outputs.get(prev_frame_idx, None)
t_pos_and_prevs.append((t_pos, out))
for t_pos, prev in t_pos_and_prevs:
if prev is None:
continue # skip padding frames
# "maskmem_features" might have been offloaded to CPU in demo use cases,
# so we load it back to GPU (it's a no-op if it's already on GPU).
feats = prev["maskmem_features"].cuda(non_blocking=True)
to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
# Spatial positional encoding (it might have been offloaded to CPU in eval)
maskmem_enc = prev["maskmem_pos_enc"][-1].cuda()
maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1)
# Temporal positional encoding
maskmem_enc = maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1]
to_cat_memory_pos_embed.append(maskmem_enc)
# Construct the list of past object pointers
if self.use_obj_ptrs_in_encoder:
max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder)
# First add those object pointers from selected conditioning frames
# (optionally, only include object pointers in the past during evaluation)
if not self.training and self.only_obj_ptrs_in_the_past_for_eval:
ptr_cond_outputs = {
t: out
for t, out in selected_cond_outputs.items()
if (t >= frame_idx if track_in_reverse else t <= frame_idx)
}
else:
ptr_cond_outputs = selected_cond_outputs
pos_and_ptrs = [
# Temporal pos encoding contains how far away each pointer is from current frame
(abs(frame_idx - t), out["obj_ptr"])
for t, out in ptr_cond_outputs.items()
]
# Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before current frame
for t_diff in range(1, max_obj_ptrs_in_encoder):
t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff
if t < 0 or (num_frames is not None and t >= num_frames):
break
out = output_dict["non_cond_frame_outputs"].get(t, unselected_cond_outputs.get(t, None))
if out is not None:
pos_and_ptrs.append((t_diff, out["obj_ptr"]))
# If we have at least one object pointer, add them to the across attention
if len(pos_and_ptrs) > 0:
pos_list, ptrs_list = zip(*pos_and_ptrs)
# stack object pointers along dim=0 into [ptr_seq_len, B, C] shape
obj_ptrs = torch.stack(ptrs_list, dim=0)
# a temporal positional embedding based on how far each object pointer is from
# the current frame (sine embedding normalized by the max pointer num).
if self.add_tpos_enc_to_obj_ptrs:
t_diff_max = max_obj_ptrs_in_encoder - 1
tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
obj_pos = torch.tensor(pos_list, device=device)
obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
obj_pos = self.obj_ptr_tpos_proj(obj_pos)
obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
else:
obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim)
if self.mem_dim < C:
# split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C
obj_ptrs = obj_ptrs.reshape(-1, B, C // self.mem_dim, self.mem_dim)
obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1)
obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0)
to_cat_memory.append(obj_ptrs)
to_cat_memory_pos_embed.append(obj_pos)
num_obj_ptr_tokens = obj_ptrs.shape[0]
else:
num_obj_ptr_tokens = 0
else:
# for initial conditioning frames, encode them without using any previous memory
if self.directly_add_no_mem_embed:
# directly add no-mem embedding (instead of using the transformer encoder)
pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed
pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
return pix_feat_with_mem
# Use a dummy token on the first frame (to avoid empty memory input to transformer encoder)
to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]
# Step 2: Concatenate the memories and forward through the transformer encoder
memory = torch.cat(to_cat_memory, dim=0)
memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
pix_feat_with_mem = self.memory_attention(
curr=current_vision_feats,
curr_pos=current_vision_pos_embeds,
memory=memory,
memory_pos=memory_pos_embed,
num_obj_ptr_tokens=num_obj_ptr_tokens,
)
# reshape the output (HW)BC => BCHW
pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
return pix_feat_with_mem
def _encode_new_memory(
self,
current_vision_feats,
feat_sizes,
pred_masks_high_res,
is_mask_from_pts,
):
"""Encodes frame features and masks into a new memory representation for video segmentation."""
B = current_vision_feats[-1].size(1) # batch size on this frame
C = self.hidden_dim
H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
# top-level feature, (HW)BC => BCHW
pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
if self.non_overlap_masks_for_mem_enc and not self.training:
# optionally, apply non-overlapping constraints to the masks (it's applied
# in the batch dimension and should only be used during eval, where all
# the objects come from the same video under batch size 1).
pred_masks_high_res = self._apply_non_overlapping_constraints(pred_masks_high_res)
# scale the raw mask logits with a temperature before applying sigmoid
binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
if binarize and not self.training:
mask_for_mem = (pred_masks_high_res > 0).float()
else:
# apply sigmoid on the raw mask logits to turn them into range (0, 1)
mask_for_mem = torch.sigmoid(pred_masks_high_res)
# apply scale and bias terms to the sigmoid probabilities
if self.sigmoid_scale_for_mem_enc != 1.0:
mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
if self.sigmoid_bias_for_mem_enc != 0.0:
mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
maskmem_out = self.memory_encoder(
pix_feat,
mask_for_mem,
skip_mask_sigmoid=True, # sigmoid already applied
)
maskmem_features = maskmem_out["vision_features"]
maskmem_pos_enc = maskmem_out["vision_pos_enc"]
return maskmem_features, maskmem_pos_enc
def track_step(
self,
frame_idx,
is_init_cond_frame,
current_vision_feats,
current_vision_pos_embeds,
feat_sizes,
point_inputs,
mask_inputs,
output_dict,
num_frames,
track_in_reverse=False, # tracking in reverse time order (for demo usage)
# Whether to run the memory encoder on the predicted masks. Sometimes we might want
# to skip the memory encoder with `run_mem_encoder=False`. For example,
# in demo we might call `track_step` multiple times for each user click,
# and only encode the memory when the user finalizes their clicks. And in ablation
# settings like SAM training on static images, we don't need the memory encoder.
run_mem_encoder=True,
# The previously predicted SAM mask logits (which can be fed together with new clicks in demo).
prev_sam_mask_logits=None,
):
"""Performs a single tracking step, updating object masks and memory features based on current frame inputs."""
current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
# High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
if len(current_vision_feats) > 1:
high_res_features = [
x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
]
else:
high_res_features = None
if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
# When use_mask_input_as_output_without_sam=True, we directly output the mask input
# (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
pix_feat = current_vision_feats[-1].permute(1, 2, 0)
pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
sam_outputs = self._use_mask_as_output(pix_feat, high_res_features, mask_inputs)
else:
# fused the visual feature with previous memory features in the memory bank
pix_feat_with_mem = self._prepare_memory_conditioned_features(
frame_idx=frame_idx,
is_init_cond_frame=is_init_cond_frame,
current_vision_feats=current_vision_feats[-1:],
current_vision_pos_embeds=current_vision_pos_embeds[-1:],
feat_sizes=feat_sizes[-1:],
output_dict=output_dict,
num_frames=num_frames,
track_in_reverse=track_in_reverse,
)
# apply SAM-style segmentation head
# here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder,
# e.g. in demo where such logits come from earlier interaction instead of correction sampling
# (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead)
if prev_sam_mask_logits is not None:
assert point_inputs is not None and mask_inputs is None
mask_inputs = prev_sam_mask_logits
multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
sam_outputs = self._forward_sam_heads(
backbone_features=pix_feat_with_mem,
point_inputs=point_inputs,
mask_inputs=mask_inputs,
high_res_features=high_res_features,
multimask_output=multimask_output,
)
(
_,
_,
_,
low_res_masks,
high_res_masks,
obj_ptr,
_,
) = sam_outputs
current_out["pred_masks"] = low_res_masks
current_out["pred_masks_high_res"] = high_res_masks
current_out["obj_ptr"] = obj_ptr
# Finally run the memory encoder on the predicted mask to encode
# it into a new memory feature (that can be used in future frames)
if run_mem_encoder and self.num_maskmem > 0:
high_res_masks_for_mem_enc = high_res_masks
maskmem_features, maskmem_pos_enc = self._encode_new_memory(
current_vision_feats=current_vision_feats,
feat_sizes=feat_sizes,
pred_masks_high_res=high_res_masks_for_mem_enc,
is_mask_from_pts=(point_inputs is not None),
)
current_out["maskmem_features"] = maskmem_features
current_out["maskmem_pos_enc"] = maskmem_pos_enc
else:
current_out["maskmem_features"] = None
current_out["maskmem_pos_enc"] = None
return current_out
def _use_multimask(self, is_init_cond_frame, point_inputs):
"""Determines whether to use multiple mask outputs in the SAM head based on configuration and inputs."""
num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
multimask_output = (
self.multimask_output_in_sam
and (is_init_cond_frame or self.multimask_output_for_tracking)
and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
)
return multimask_output
def _apply_non_overlapping_constraints(self, pred_masks):
"""Applies non-overlapping constraints to masks, keeping highest scoring object per location."""
batch_size = pred_masks.size(0)
if batch_size == 1:
return pred_masks
device = pred_masks.device
# "max_obj_inds": object index of the object with the highest score at each location
max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
# "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks`
batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None]
keep = max_obj_inds == batch_obj_inds
# suppress overlapping regions' scores below -10.0 so that the foreground regions
# don't overlap (here sigmoid(-10.0)=4.5398e-05)
pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
return pred_masks

View file

@ -17,16 +17,40 @@ import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from ultralytics.nn.modules import LayerNorm2d
from ultralytics.utils.instance import to_2tuple
class Conv2d_BN(torch.nn.Sequential):
"""A sequential container that performs 2D convolution followed by batch normalization."""
"""
A sequential container that performs 2D convolution followed by batch normalization.
Attributes:
c (torch.nn.Conv2d): 2D convolution layer.
1 (torch.nn.BatchNorm2d): Batch normalization layer.
Methods:
__init__: Initializes the Conv2d_BN with specified parameters.
Args:
a (int): Number of input channels.
b (int): Number of output channels.
ks (int): Kernel size for the convolution. Defaults to 1.
stride (int): Stride for the convolution. Defaults to 1.
pad (int): Padding for the convolution. Defaults to 0.
dilation (int): Dilation factor for the convolution. Defaults to 1.
groups (int): Number of groups for the convolution. Defaults to 1.
bn_weight_init (float): Initial value for batch normalization weight. Defaults to 1.
Examples:
>>> conv_bn = Conv2d_BN(3, 64, ks=3, stride=1, pad=1)
>>> input_tensor = torch.randn(1, 3, 224, 224)
>>> output = conv_bn(input_tensor)
>>> print(output.shape)
"""
def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1):
"""Initializes the MBConv model with given input channels, output channels, expansion ratio, activation, and
drop path.
"""
"""Initializes a sequential container with 2D convolution followed by batch normalization."""
super().__init__()
self.add_module("c", torch.nn.Conv2d(a, b, ks, stride, pad, dilation, groups, bias=False))
bn = torch.nn.BatchNorm2d(b)
@ -36,12 +60,29 @@ class Conv2d_BN(torch.nn.Sequential):
class PatchEmbed(nn.Module):
"""Embeds images into patches and projects them into a specified embedding dimension."""
"""
Embeds images into patches and projects them into a specified embedding dimension.
Attributes:
patches_resolution (Tuple[int, int]): Resolution of the patches after embedding.
num_patches (int): Total number of patches.
in_chans (int): Number of input channels.
embed_dim (int): Dimension of the embedding.
seq (nn.Sequential): Sequence of convolutional and activation layers for patch embedding.
Methods:
forward: Processes the input tensor through the patch embedding sequence.
Examples:
>>> import torch
>>> patch_embed = PatchEmbed(in_chans=3, embed_dim=96, resolution=224, activation=nn.GELU)
>>> x = torch.randn(1, 3, 224, 224)
>>> output = patch_embed(x)
>>> print(output.shape)
"""
def __init__(self, in_chans, embed_dim, resolution, activation):
"""Initialize the PatchMerging class with specified input, output dimensions, resolution and activation
function.
"""
"""Initializes patch embedding with convolutional layers for image-to-patch conversion and projection."""
super().__init__()
img_size: Tuple[int, int] = to_2tuple(resolution)
self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
@ -56,17 +97,40 @@ class PatchEmbed(nn.Module):
)
def forward(self, x):
"""Runs input tensor 'x' through the PatchMerging model's sequence of operations."""
"""Processes input tensor through patch embedding sequence, converting images to patch embeddings."""
return self.seq(x)
class MBConv(nn.Module):
"""Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture."""
"""
Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture.
Attributes:
in_chans (int): Number of input channels.
hidden_chans (int): Number of hidden channels.
out_chans (int): Number of output channels.
conv1 (Conv2d_BN): First convolutional layer.
act1 (nn.Module): First activation function.
conv2 (Conv2d_BN): Depthwise convolutional layer.
act2 (nn.Module): Second activation function.
conv3 (Conv2d_BN): Final convolutional layer.
act3 (nn.Module): Third activation function.
drop_path (nn.Module): Drop path layer (Identity for inference).
Methods:
forward: Performs the forward pass through the MBConv layer.
Examples:
>>> in_chans, out_chans = 32, 64
>>> mbconv = MBConv(in_chans, out_chans, expand_ratio=4, activation=nn.ReLU, drop_path=0.1)
>>> x = torch.randn(1, in_chans, 56, 56)
>>> output = mbconv(x)
>>> print(output.shape)
torch.Size([1, 64, 56, 56])
"""
def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path):
"""Initializes a convolutional layer with specified dimensions, input resolution, depth, and activation
function.
"""
"""Initializes the MBConv layer with specified input/output channels, expansion ratio, and activation."""
super().__init__()
self.in_chans = in_chans
self.hidden_chans = int(in_chans * expand_ratio)
@ -86,7 +150,7 @@ class MBConv(nn.Module):
self.drop_path = nn.Identity()
def forward(self, x):
"""Implements the forward pass for the model architecture."""
"""Implements the forward pass of MBConv, applying convolutions and skip connection."""
shortcut = x
x = self.conv1(x)
x = self.act1(x)
@ -99,12 +163,34 @@ class MBConv(nn.Module):
class PatchMerging(nn.Module):
"""Merges neighboring patches in the feature map and projects to a new dimension."""
"""
Merges neighboring patches in the feature map and projects to a new dimension.
This class implements a patch merging operation that combines spatial information and adjusts the feature
dimension. It uses a series of convolutional layers with batch normalization to achieve this.
Attributes:
input_resolution (Tuple[int, int]): The input resolution (height, width) of the feature map.
dim (int): The input dimension of the feature map.
out_dim (int): The output dimension after merging and projection.
act (nn.Module): The activation function used between convolutions.
conv1 (Conv2d_BN): The first convolutional layer for dimension projection.
conv2 (Conv2d_BN): The second convolutional layer for spatial merging.
conv3 (Conv2d_BN): The third convolutional layer for final projection.
Methods:
forward: Applies the patch merging operation to the input tensor.
Examples:
>>> input_resolution = (56, 56)
>>> patch_merging = PatchMerging(input_resolution, dim=64, out_dim=128, activation=nn.ReLU)
>>> x = torch.randn(4, 64, 56, 56)
>>> output = patch_merging(x)
>>> print(output.shape)
"""
def __init__(self, input_resolution, dim, out_dim, activation):
"""Initializes the ConvLayer with specific dimension, input resolution, depth, activation, drop path, and other
optional parameters.
"""
"""Initializes the PatchMerging module for merging and projecting neighboring patches in feature maps."""
super().__init__()
self.input_resolution = input_resolution
@ -117,7 +203,7 @@ class PatchMerging(nn.Module):
self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
def forward(self, x):
"""Applies forward pass on the input utilizing convolution and activation layers, and returns the result."""
"""Applies patch merging and dimension projection to the input feature map."""
if x.ndim == 3:
H, W = self.input_resolution
B = len(x)
@ -137,7 +223,24 @@ class ConvLayer(nn.Module):
"""
Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).
Optionally applies downsample operations to the output, and provides support for gradient checkpointing.
This layer optionally applies downsample operations to the output and supports gradient checkpointing.
Attributes:
dim (int): Dimensionality of the input and output.
input_resolution (Tuple[int, int]): Resolution of the input image.
depth (int): Number of MBConv layers in the block.
use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
blocks (nn.ModuleList): List of MBConv layers.
downsample (Optional[Callable]): Function for downsampling the output.
Methods:
forward: Processes the input through the convolutional layers.
Examples:
>>> input_tensor = torch.randn(1, 64, 56, 56)
>>> conv_layer = ConvLayer(64, (56, 56), depth=3, activation=nn.ReLU)
>>> output = conv_layer(input_tensor)
>>> print(output.shape)
"""
def __init__(
@ -155,16 +258,25 @@ class ConvLayer(nn.Module):
"""
Initializes the ConvLayer with the given dimensions and settings.
This layer consists of multiple MobileNetV3-style inverted bottleneck convolutions (MBConv) and
optionally applies downsampling to the output.
Args:
dim (int): The dimensionality of the input and output.
input_resolution (Tuple[int, int]): The resolution of the input image.
depth (int): The number of MBConv layers in the block.
activation (Callable): Activation function applied after each convolution.
drop_path (Union[float, List[float]]): Drop path rate. Single float or a list of floats for each MBConv.
drop_path (float | List[float]): Drop path rate. Single float or a list of floats for each MBConv.
downsample (Optional[Callable]): Function for downsampling the output. None to skip downsampling.
use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`.
conv_expand_ratio (float): Expansion ratio for the MBConv layers.
Examples:
>>> input_tensor = torch.randn(1, 64, 56, 56)
>>> conv_layer = ConvLayer(64, (56, 56), depth=3, activation=nn.ReLU)
>>> output = conv_layer(input_tensor)
>>> print(output.shape)
"""
super().__init__()
self.dim = dim
@ -194,7 +306,7 @@ class ConvLayer(nn.Module):
)
def forward(self, x):
"""Processes the input through a series of convolutional layers and returns the activated output."""
"""Processes input through convolutional layers, applying MBConv blocks and optional downsampling."""
for blk in self.blocks:
x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
return x if self.downsample is None else self.downsample(x)
@ -202,13 +314,33 @@ class ConvLayer(nn.Module):
class Mlp(nn.Module):
"""
Multi-layer Perceptron (MLP) for transformer architectures.
Multi-layer Perceptron (MLP) module for transformer architectures.
This layer takes an input with in_features, applies layer normalization and two fully-connected layers.
This module applies layer normalization, two fully-connected layers with an activation function in between,
and dropout. It is commonly used in transformer-based architectures.
Attributes:
norm (nn.LayerNorm): Layer normalization applied to the input.
fc1 (nn.Linear): First fully-connected layer.
fc2 (nn.Linear): Second fully-connected layer.
act (nn.Module): Activation function applied after the first fully-connected layer.
drop (nn.Dropout): Dropout layer applied after the activation function.
Methods:
forward: Applies the MLP operations on the input tensor.
Examples:
>>> import torch
>>> from torch import nn
>>> mlp = Mlp(in_features=256, hidden_features=512, out_features=256, act_layer=nn.GELU, drop=0.1)
>>> x = torch.randn(32, 100, 256)
>>> output = mlp(x)
>>> print(output.shape)
torch.Size([32, 100, 256])
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
"""Initializes Attention module with the given parameters including dimension, key_dim, number of heads, etc."""
"""Initializes a multi-layer perceptron with configurable input, hidden, and output dimensions."""
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
@ -219,7 +351,7 @@ class Mlp(nn.Module):
self.drop = nn.Dropout(drop)
def forward(self, x):
"""Applies operations on input x and returns modified x, runs downsample if not None."""
"""Applies MLP operations: layer norm, FC layers, activation, and dropout to the input tensor."""
x = self.norm(x)
x = self.fc1(x)
x = self.act(x)
@ -230,12 +362,37 @@ class Mlp(nn.Module):
class Attention(torch.nn.Module):
"""
Multi-head attention module with support for spatial awareness, applying attention biases based on spatial
resolution. Implements trainable attention biases for each unique offset between spatial positions in the resolution
grid.
Multi-head attention module with spatial awareness and trainable attention biases.
This module implements a multi-head attention mechanism with support for spatial awareness, applying
attention biases based on spatial resolution. It includes trainable attention biases for each unique
offset between spatial positions in the resolution grid.
Attributes:
ab (Tensor, optional): Cached attention biases for inference, deleted during training.
num_heads (int): Number of attention heads.
scale (float): Scaling factor for attention scores.
key_dim (int): Dimensionality of the keys and queries.
nh_kd (int): Product of num_heads and key_dim.
d (int): Dimensionality of the value vectors.
dh (int): Product of d and num_heads.
attn_ratio (float): Attention ratio affecting the dimensions of the value vectors.
norm (nn.LayerNorm): Layer normalization applied to input.
qkv (nn.Linear): Linear layer for computing query, key, and value projections.
proj (nn.Linear): Linear layer for final projection.
attention_biases (nn.Parameter): Learnable attention biases.
attention_bias_idxs (Tensor): Indices for attention biases.
ab (Tensor): Cached attention biases for inference, deleted during training.
Methods:
train: Sets the module in training mode and handles the 'ab' attribute.
forward: Performs the forward pass of the attention mechanism.
Examples:
>>> attn = Attention(dim=256, key_dim=64, num_heads=8, resolution=(14, 14))
>>> x = torch.randn(1, 196, 256)
>>> output = attn(x)
>>> print(output.shape)
torch.Size([1, 196, 256])
"""
def __init__(
@ -247,17 +404,28 @@ class Attention(torch.nn.Module):
resolution=(14, 14),
):
"""
Initializes the Attention module.
Initializes the Attention module for multi-head attention with spatial awareness.
This module implements a multi-head attention mechanism with support for spatial awareness, applying
attention biases based on spatial resolution. It includes trainable attention biases for each unique
offset between spatial positions in the resolution grid.
Args:
dim (int): The dimensionality of the input and output.
key_dim (int): The dimensionality of the keys and queries.
num_heads (int, optional): Number of attention heads. Default is 8.
attn_ratio (float, optional): Attention ratio, affecting the dimensions of the value vectors. Default is 4.
resolution (Tuple[int, int], optional): Spatial resolution of the input feature map. Default is (14, 14).
num_heads (int): Number of attention heads. Default is 8.
attn_ratio (float): Attention ratio, affecting the dimensions of the value vectors. Default is 4.
resolution (Tuple[int, int]): Spatial resolution of the input feature map. Default is (14, 14).
Raises:
AssertionError: If `resolution` is not a tuple of length 2.
AssertionError: If 'resolution' is not a tuple of length 2.
Examples:
>>> attn = Attention(dim=256, key_dim=64, num_heads=8, resolution=(14, 14))
>>> x = torch.randn(1, 196, 256)
>>> output = attn(x)
>>> print(output.shape)
torch.Size([1, 196, 256])
"""
super().__init__()
@ -290,7 +458,7 @@ class Attention(torch.nn.Module):
@torch.no_grad()
def train(self, mode=True):
"""Sets the module in training mode and handles attribute 'ab' based on the mode."""
"""Performs multi-head attention with spatial awareness and trainable attention biases."""
super().train(mode)
if mode and hasattr(self, "ab"):
del self.ab
@ -298,7 +466,7 @@ class Attention(torch.nn.Module):
self.ab = self.attention_biases[:, self.attention_bias_idxs]
def forward(self, x): # x
"""Performs forward pass over the input tensor 'x' by applying normalization and querying keys/values."""
"""Applies multi-head attention with spatial awareness and trainable attention biases."""
B, N, _ = x.shape # B, N, C
# Normalization
@ -322,7 +490,34 @@ class Attention(torch.nn.Module):
class TinyViTBlock(nn.Module):
"""TinyViT Block that applies self-attention and a local convolution to the input."""
"""
TinyViT Block that applies self-attention and a local convolution to the input.
This block is a key component of the TinyViT architecture, combining self-attention mechanisms with
local convolutions to process input features efficiently.
Attributes:
dim (int): The dimensionality of the input and output.
input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
num_heads (int): Number of attention heads.
window_size (int): Size of the attention window.
mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
drop_path (nn.Module): Stochastic depth layer, identity function during inference.
attn (Attention): Self-attention module.
mlp (Mlp): Multi-layer perceptron module.
local_conv (Conv2d_BN): Depth-wise local convolution layer.
Methods:
forward: Processes the input through the TinyViT block.
extra_repr: Returns a string with extra information about the block's parameters.
Examples:
>>> input_tensor = torch.randn(1, 196, 192)
>>> block = TinyViTBlock(dim=192, input_resolution=(14, 14), num_heads=3)
>>> output = block(input_tensor)
>>> print(output.shape)
torch.Size([1, 196, 192])
"""
def __init__(
self,
@ -337,22 +532,32 @@ class TinyViTBlock(nn.Module):
activation=nn.GELU,
):
"""
Initializes the TinyViTBlock.
Initializes a TinyViT block with self-attention and local convolution.
This block is a key component of the TinyViT architecture, combining self-attention mechanisms with
local convolutions to process input features efficiently.
Args:
dim (int): The dimensionality of the input and output.
input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
dim (int): Dimensionality of the input and output features.
input_resolution (Tuple[int, int]): Spatial resolution of the input feature map (height, width).
num_heads (int): Number of attention heads.
window_size (int, optional): Window size for attention. Default is 7.
mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
drop (float, optional): Dropout rate. Default is 0.
drop_path (float, optional): Stochastic depth rate. Default is 0.
local_conv_size (int, optional): The kernel size of the local convolution. Default is 3.
activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
window_size (int): Size of the attention window. Must be greater than 0.
mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
drop (float): Dropout rate.
drop_path (float): Stochastic depth rate.
local_conv_size (int): Kernel size of the local convolution.
activation (torch.nn.Module): Activation function for MLP.
Raises:
AssertionError: If `window_size` is not greater than 0.
AssertionError: If `dim` is not divisible by `num_heads`.
AssertionError: If window_size is not greater than 0.
AssertionError: If dim is not divisible by num_heads.
Examples:
>>> block = TinyViTBlock(dim=192, input_resolution=(14, 14), num_heads=3)
>>> input_tensor = torch.randn(1, 196, 192)
>>> output = block(input_tensor)
>>> print(output.shape)
torch.Size([1, 196, 192])
"""
super().__init__()
self.dim = dim
@ -380,9 +585,7 @@ class TinyViTBlock(nn.Module):
self.local_conv = Conv2d_BN(dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim)
def forward(self, x):
"""Applies attention-based transformation or padding to input 'x' before passing it through a local
convolution.
"""
"""Applies self-attention, local convolution, and MLP operations to the input tensor."""
h, w = self.input_resolution
b, hw, c = x.shape # batch, height*width, channels
assert hw == h * w, "input feature has wrong size"
@ -424,8 +627,19 @@ class TinyViTBlock(nn.Module):
return x + self.drop_path(self.mlp(x))
def extra_repr(self) -> str:
"""Returns a formatted string representing the TinyViTBlock's parameters: dimension, input resolution, number of
attentions heads, window size, and MLP ratio.
"""
Returns a string representation of the TinyViTBlock's parameters.
This method provides a formatted string containing key information about the TinyViTBlock, including its
dimension, input resolution, number of attention heads, window size, and MLP ratio.
Returns:
(str): A formatted string containing the block's parameters.
Examples:
>>> block = TinyViTBlock(dim=192, input_resolution=(14, 14), num_heads=3, window_size=7, mlp_ratio=4.0)
>>> print(block.extra_repr())
dim=192, input_resolution=(14, 14), num_heads=3, window_size=7, mlp_ratio=4.0
"""
return (
f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, "
@ -434,7 +648,31 @@ class TinyViTBlock(nn.Module):
class BasicLayer(nn.Module):
"""A basic TinyViT layer for one stage in a TinyViT architecture."""
"""
A basic TinyViT layer for one stage in a TinyViT architecture.
This class represents a single layer in the TinyViT model, consisting of multiple TinyViT blocks
and an optional downsampling operation.
Attributes:
dim (int): The dimensionality of the input and output features.
input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
depth (int): Number of TinyViT blocks in this layer.
use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
blocks (nn.ModuleList): List of TinyViT blocks that make up this layer.
downsample (nn.Module | None): Downsample layer at the end of the layer, if specified.
Methods:
forward: Processes the input through the layer's blocks and optional downsampling.
extra_repr: Returns a string with the layer's parameters for printing.
Examples:
>>> input_tensor = torch.randn(1, 3136, 192)
>>> layer = BasicLayer(dim=192, input_resolution=(56, 56), depth=2, num_heads=3, window_size=7)
>>> output = layer(input_tensor)
>>> print(output.shape)
torch.Size([1, 784, 384])
"""
def __init__(
self,
@ -453,25 +691,34 @@ class BasicLayer(nn.Module):
out_dim=None,
):
"""
Initializes the BasicLayer.
Initializes a BasicLayer in the TinyViT architecture.
This layer consists of multiple TinyViT blocks and an optional downsampling operation. It is designed to
process feature maps at a specific resolution and dimensionality within the TinyViT model.
Args:
dim (int): The dimensionality of the input and output.
input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
depth (int): Number of TinyViT blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
drop (float, optional): Dropout rate. Default is 0.
drop_path (float | tuple[float], optional): Stochastic depth rate. Default is 0.
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default is None.
use_checkpoint (bool, optional): Whether to use checkpointing to save memory. Default is False.
local_conv_size (int, optional): Kernel size of the local convolution. Default is 3.
activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
out_dim (int | None, optional): The output dimension of the layer. Default is None.
dim (int): Dimensionality of the input and output features.
input_resolution (Tuple[int, int]): Spatial resolution of the input feature map (height, width).
depth (int): Number of TinyViT blocks in this layer.
num_heads (int): Number of attention heads in each TinyViT block.
window_size (int): Size of the local window for attention computation.
mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
drop (float): Dropout rate.
drop_path (float | List[float]): Stochastic depth rate. Can be a float or a list of floats for each block.
downsample (nn.Module | None): Downsampling layer at the end of the layer. None to skip downsampling.
use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
local_conv_size (int): Kernel size for the local convolution in each TinyViT block.
activation (nn.Module): Activation function used in the MLP.
out_dim (int | None): Output dimension after downsampling. None means it will be the same as `dim`.
Raises:
ValueError: If `drop_path` is a list of float but its length doesn't match `depth`.
ValueError: If `drop_path` is a list and its length doesn't match `depth`.
Examples:
>>> layer = BasicLayer(dim=96, input_resolution=(56, 56), depth=2, num_heads=3, window_size=7)
>>> x = torch.randn(1, 56*56, 96)
>>> output = layer(x)
>>> print(output.shape)
"""
super().__init__()
self.dim = dim
@ -505,58 +752,49 @@ class BasicLayer(nn.Module):
)
def forward(self, x):
"""Performs forward propagation on the input tensor and returns a normalized tensor."""
"""Processes input through TinyViT blocks and optional downsampling."""
for blk in self.blocks:
x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
return x if self.downsample is None else self.downsample(x)
def extra_repr(self) -> str:
"""Returns a string representation of the extra_repr function with the layer's parameters."""
"""Returns a string with the layer's parameters for printing."""
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
class LayerNorm2d(nn.Module):
"""A PyTorch implementation of Layer Normalization in 2D."""
def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
"""Initialize LayerNorm2d with the number of channels and an optional epsilon."""
super().__init__()
self.weight = nn.Parameter(torch.ones(num_channels))
self.bias = nn.Parameter(torch.zeros(num_channels))
self.eps = eps
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Perform a forward pass, normalizing the input tensor."""
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
return self.weight[:, None, None] * x + self.bias[:, None, None]
class TinyViT(nn.Module):
"""
The TinyViT architecture for vision tasks.
TinyViT: A compact vision transformer architecture for efficient image classification and feature extraction.
This class implements the TinyViT model, which combines elements of vision transformers and convolutional
neural networks for improved efficiency and performance on vision tasks.
Attributes:
img_size (int): Input image size.
in_chans (int): Number of input channels.
num_classes (int): Number of classification classes.
embed_dims (List[int]): List of embedding dimensions for each layer.
depths (List[int]): List of depths for each layer.
num_heads (List[int]): List of number of attention heads for each layer.
window_sizes (List[int]): List of window sizes for each layer.
depths (List[int]): Number of blocks in each stage.
num_layers (int): Total number of layers in the network.
mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
drop_rate (float): Dropout rate for drop layers.
drop_path_rate (float): Drop path rate for stochastic depth.
use_checkpoint (bool): Use checkpointing for efficient memory usage.
mbconv_expand_ratio (float): Expansion ratio for MBConv layer.
local_conv_size (int): Local convolution kernel size.
layer_lr_decay (float): Layer-wise learning rate decay.
patch_embed (PatchEmbed): Module for patch embedding.
patches_resolution (Tuple[int, int]): Resolution of embedded patches.
layers (nn.ModuleList): List of network layers.
norm_head (nn.LayerNorm): Layer normalization for the classifier head.
head (nn.Linear): Linear layer for final classification.
neck (nn.Sequential): Neck module for feature refinement.
Note:
This implementation is generalized to accept a list of depths, attention heads,
embedding dimensions and window sizes, which allows you to create a
"stack" of TinyViT models of varying configurations.
Methods:
set_layer_lr_decay: Sets layer-wise learning rate decay.
_init_weights: Initializes weights for linear and normalization layers.
no_weight_decay_keywords: Returns keywords for parameters that should not use weight decay.
forward_features: Processes input through the feature extraction layers.
forward: Performs a forward pass through the entire network.
Examples:
>>> model = TinyViT(img_size=224, num_classes=1000)
>>> x = torch.randn(1, 3, 224, 224)
>>> features = model.forward_features(x)
>>> print(features.shape)
torch.Size([1, 256, 64, 64])
"""
def __init__(
@ -579,21 +817,33 @@ class TinyViT(nn.Module):
"""
Initializes the TinyViT model.
This constructor sets up the TinyViT architecture, including patch embedding, multiple layers of
attention and convolution blocks, and a classification head.
Args:
img_size (int, optional): The input image size. Defaults to 224.
in_chans (int, optional): Number of input channels. Defaults to 3.
num_classes (int, optional): Number of classification classes. Defaults to 1000.
embed_dims (List[int], optional): List of embedding dimensions per layer. Defaults to [96, 192, 384, 768].
depths (List[int], optional): List of depths for each layer. Defaults to [2, 2, 6, 2].
num_heads (List[int], optional): List of number of attention heads per layer. Defaults to [3, 6, 12, 24].
window_sizes (List[int], optional): List of window sizes for each layer. Defaults to [7, 7, 14, 7].
mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension. Defaults to 4.
drop_rate (float, optional): Dropout rate. Defaults to 0.
drop_path_rate (float, optional): Drop path rate for stochastic depth. Defaults to 0.1.
use_checkpoint (bool, optional): Whether to use checkpointing for efficient memory usage. Defaults to False.
mbconv_expand_ratio (float, optional): Expansion ratio for MBConv layer. Defaults to 4.0.
local_conv_size (int, optional): Local convolution kernel size. Defaults to 3.
layer_lr_decay (float, optional): Layer-wise learning rate decay. Defaults to 1.0.
img_size (int): Size of the input image. Default is 224.
in_chans (int): Number of input channels. Default is 3.
num_classes (int): Number of classes for classification. Default is 1000.
embed_dims (Tuple[int, int, int, int]): Embedding dimensions for each stage.
Default is (96, 192, 384, 768).
depths (Tuple[int, int, int, int]): Number of blocks in each stage. Default is (2, 2, 6, 2).
num_heads (Tuple[int, int, int, int]): Number of attention heads in each stage.
Default is (3, 6, 12, 24).
window_sizes (Tuple[int, int, int, int]): Window sizes for each stage. Default is (7, 7, 14, 7).
mlp_ratio (float): Ratio of MLP hidden dim to embedding dim. Default is 4.0.
drop_rate (float): Dropout rate. Default is 0.0.
drop_path_rate (float): Stochastic depth rate. Default is 0.1.
use_checkpoint (bool): Whether to use checkpointing to save memory. Default is False.
mbconv_expand_ratio (float): Expansion ratio for MBConv layer. Default is 4.0.
local_conv_size (int): Kernel size for local convolutions. Default is 3.
layer_lr_decay (float): Layer-wise learning rate decay factor. Default is 1.0.
Examples:
>>> model = TinyViT(img_size=224, num_classes=1000)
>>> x = torch.randn(1, 3, 224, 224)
>>> output = model(x)
>>> print(output.shape)
torch.Size([1, 1000])
"""
super().__init__()
self.img_size = img_size
@ -671,7 +921,7 @@ class TinyViT(nn.Module):
)
def set_layer_lr_decay(self, layer_lr_decay):
"""Sets the learning rate decay for each layer in the TinyViT model."""
"""Sets layer-wise learning rate decay for the TinyViT model based on depth."""
decay_rate = layer_lr_decay
# Layers -> blocks (depth)
@ -706,7 +956,7 @@ class TinyViT(nn.Module):
self.apply(_check_lr_scale)
def _init_weights(self, m):
"""Initializes weights for linear layers and layer normalization in the given module."""
"""Initializes weights for linear and normalization layers in the TinyViT model."""
if isinstance(m, nn.Linear):
# NOTE: This initialization is needed only for training.
# trunc_normal_(m.weight, std=.02)
@ -718,11 +968,11 @@ class TinyViT(nn.Module):
@torch.jit.ignore
def no_weight_decay_keywords(self):
"""Returns a dictionary of parameter names where weight decay should not be applied."""
"""Returns a set of keywords for parameters that should not use weight decay."""
return {"attention_biases"}
def forward_features(self, x):
"""Runs the input through the model layers and returns the transformed output."""
"""Processes input through feature extraction layers, returning spatial features."""
x = self.patch_embed(x) # x input is (N, C, H, W)
x = self.layers[0](x)
@ -737,5 +987,5 @@ class TinyViT(nn.Module):
return self.neck(x)
def forward(self, x):
"""Executes a forward pass on the input tensor through the constructed model layers."""
"""Performs the forward pass through the TinyViT model, extracting features from the input image."""
return self.forward_features(x)

View file

@ -11,19 +11,31 @@ from ultralytics.nn.modules import MLPBlock
class TwoWayTransformer(nn.Module):
"""
A Two-Way Transformer module that enables the simultaneous attention to both image and query points. This class
serves as a specialized transformer decoder that attends to an input image using queries whose positional embedding
is supplied. This is particularly useful for tasks like object detection, image segmentation, and point cloud
processing.
A Two-Way Transformer module for simultaneous attention to image and query points.
This class implements a specialized transformer decoder that attends to an input image using queries with
supplied positional embeddings. It's useful for tasks like object detection, image segmentation, and point
cloud processing.
Attributes:
depth (int): The number of layers in the transformer.
embedding_dim (int): The channel dimension for the input embeddings.
num_heads (int): The number of heads for multihead attention.
mlp_dim (int): The internal channel dimension for the MLP block.
layers (nn.ModuleList): The list of TwoWayAttentionBlock layers that make up the transformer.
final_attn_token_to_image (Attention): The final attention layer applied from the queries to the image.
norm_final_attn (nn.LayerNorm): The layer normalization applied to the final queries.
depth (int): Number of layers in the transformer.
embedding_dim (int): Channel dimension for input embeddings.
num_heads (int): Number of heads for multihead attention.
mlp_dim (int): Internal channel dimension for the MLP block.
layers (nn.ModuleList): List of TwoWayAttentionBlock layers composing the transformer.
final_attn_token_to_image (Attention): Final attention layer from queries to image.
norm_final_attn (nn.LayerNorm): Layer normalization applied to final queries.
Methods:
forward: Processes image and point embeddings through the transformer.
Examples:
>>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
>>> image_embedding = torch.randn(1, 256, 32, 32)
>>> image_pe = torch.randn(1, 256, 32, 32)
>>> point_embedding = torch.randn(1, 100, 256)
>>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
>>> print(output_queries.shape, output_image.shape)
"""
def __init__(
@ -36,15 +48,33 @@ class TwoWayTransformer(nn.Module):
attention_downsample_rate: int = 2,
) -> None:
"""
A transformer decoder that attends to an input image using queries whose positional embedding is supplied.
Initialize a Two-Way Transformer for simultaneous attention to image and query points.
Args:
depth (int): number of layers in the transformer
embedding_dim (int): the channel dimension for the input embeddings
num_heads (int): the number of heads for multihead attention. Must
divide embedding_dim
mlp_dim (int): the channel dimension internal to the MLP block
activation (nn.Module): the activation to use in the MLP block
depth (int): Number of layers in the transformer.
embedding_dim (int): Channel dimension for input embeddings.
num_heads (int): Number of heads for multihead attention. Must divide embedding_dim.
mlp_dim (int): Internal channel dimension for the MLP block.
activation (Type[nn.Module]): Activation function to use in the MLP block.
attention_downsample_rate (int): Downsampling rate for attention mechanism.
Attributes:
depth (int): Number of layers in the transformer.
embedding_dim (int): Channel dimension for input embeddings.
embedding_dim (int): Channel dimension for input embeddings.
num_heads (int): Number of heads for multihead attention.
mlp_dim (int): Internal channel dimension for the MLP block.
layers (nn.ModuleList): List of TwoWayAttentionBlock layers.
final_attn_token_to_image (Attention): Final attention layer from queries to image.
norm_final_attn (nn.LayerNorm): Layer normalization applied to final queries.
Examples:
>>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
>>> image_embedding = torch.randn(1, 256, 32, 32)
>>> image_pe = torch.randn(1, 256, 32, 32)
>>> point_embedding = torch.randn(1, 100, 256)
>>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
>>> print(output_queries.shape, output_image.shape)
"""
super().__init__()
self.depth = depth
@ -75,15 +105,23 @@ class TwoWayTransformer(nn.Module):
point_embedding: Tensor,
) -> Tuple[Tensor, Tensor]:
"""
Processes image and point embeddings through the Two-Way Transformer.
Args:
image_embedding (torch.Tensor): image to attend to. Should be shape B x embedding_dim x h x w for any h and w.
image_pe (torch.Tensor): the positional encoding to add to the image. Must have same shape as image_embedding.
point_embedding (torch.Tensor): the embedding to add to the query points.
Must have shape B x N_points x embedding_dim for any N_points.
image_embedding (torch.Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
image_pe (torch.Tensor): Positional encoding to add to the image, with same shape as image_embedding.
point_embedding (torch.Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).
Returns:
(torch.Tensor): the processed point_embedding
(torch.Tensor): the processed image_embedding
(Tuple[torch.Tensor, torch.Tensor]): Processed point_embedding and image_embedding.
Examples:
>>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
>>> image_embedding = torch.randn(1, 256, 32, 32)
>>> image_pe = torch.randn(1, 256, 32, 32)
>>> point_embedding = torch.randn(1, 100, 256)
>>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
>>> print(output_queries.shape, output_image.shape)
"""
# BxCxHxW -> BxHWxC == B x N_image_tokens x C
image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
@ -114,21 +152,34 @@ class TwoWayTransformer(nn.Module):
class TwoWayAttentionBlock(nn.Module):
"""
An attention block that performs both self-attention and cross-attention in two directions: queries to keys and
keys to queries. This block consists of four main layers: (1) self-attention on sparse inputs, (2) cross-attention
of sparse inputs to dense inputs, (3) an MLP block on sparse inputs, and (4) cross-attention of dense inputs to
sparse inputs.
A two-way attention block for simultaneous attention to image and query points.
This class implements a specialized transformer block with four main layers: self-attention on sparse inputs,
cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense
inputs to sparse inputs.
Attributes:
self_attn (Attention): The self-attention layer for the queries.
norm1 (nn.LayerNorm): Layer normalization following the first attention block.
self_attn (Attention): Self-attention layer for queries.
norm1 (nn.LayerNorm): Layer normalization after self-attention.
cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
norm2 (nn.LayerNorm): Layer normalization following the second attention block.
mlp (MLPBlock): MLP block that transforms the query embeddings.
norm3 (nn.LayerNorm): Layer normalization following the MLP block.
norm4 (nn.LayerNorm): Layer normalization following the third attention block.
norm2 (nn.LayerNorm): Layer normalization after token-to-image attention.
mlp (MLPBlock): MLP block for transforming query embeddings.
norm3 (nn.LayerNorm): Layer normalization after MLP block.
norm4 (nn.LayerNorm): Layer normalization after image-to-token attention.
cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
skip_first_layer_pe (bool): Whether to skip the positional encoding in the first layer.
skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.
Methods:
forward: Applies self-attention and cross-attention to queries and keys.
Examples:
>>> embedding_dim, num_heads = 256, 8
>>> block = TwoWayAttentionBlock(embedding_dim, num_heads)
>>> queries = torch.randn(1, 100, embedding_dim)
>>> keys = torch.randn(1, 1000, embedding_dim)
>>> query_pe = torch.randn(1, 100, embedding_dim)
>>> key_pe = torch.randn(1, 1000, embedding_dim)
>>> processed_queries, processed_keys = block(queries, keys, query_pe, key_pe)
"""
def __init__(
@ -141,16 +192,28 @@ class TwoWayAttentionBlock(nn.Module):
skip_first_layer_pe: bool = False,
) -> None:
"""
A transformer block with four layers: (1) self-attention of sparse inputs, (2) cross attention of sparse
inputs to dense inputs, (3) mlp block on sparse inputs, and (4) cross attention of dense inputs to sparse
inputs.
Initializes a TwoWayAttentionBlock for simultaneous attention to image and query points.
This block implements a specialized transformer layer with four main components: self-attention on sparse
inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention
of dense inputs to sparse inputs.
Args:
embedding_dim (int): the channel dimension of the embeddings
num_heads (int): the number of heads in the attention layers
mlp_dim (int): the hidden dimension of the mlp block
activation (nn.Module): the activation of the mlp block
skip_first_layer_pe (bool): skip the PE on the first layer
embedding_dim (int): Channel dimension of the embeddings.
num_heads (int): Number of attention heads in the attention layers.
mlp_dim (int): Hidden dimension of the MLP block.
activation (Type[nn.Module]): Activation function for the MLP block.
attention_downsample_rate (int): Downsampling rate for the attention mechanism.
skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.
Examples:
>>> embedding_dim, num_heads = 256, 8
>>> block = TwoWayAttentionBlock(embedding_dim, num_heads)
>>> queries = torch.randn(1, 100, embedding_dim)
>>> keys = torch.randn(1, 1000, embedding_dim)
>>> query_pe = torch.randn(1, 100, embedding_dim)
>>> key_pe = torch.randn(1, 1000, embedding_dim)
>>> processed_queries, processed_keys = block(queries, keys, query_pe, key_pe)
"""
super().__init__()
self.self_attn = Attention(embedding_dim, num_heads)
@ -168,7 +231,7 @@ class TwoWayAttentionBlock(nn.Module):
self.skip_first_layer_pe = skip_first_layer_pe
def forward(self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor) -> Tuple[Tensor, Tensor]:
"""Apply self-attention and cross-attention to queries and keys and return the processed embeddings."""
"""Applies two-way attention to process query and key embeddings in a transformer block."""
# Self attention block
if self.skip_first_layer_pe:
@ -202,8 +265,34 @@ class TwoWayAttentionBlock(nn.Module):
class Attention(nn.Module):
"""An attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
values.
"""
An attention layer with downscaling capability for embedding size after projection.
This class implements a multi-head attention mechanism with the option to downsample the internal
dimension of queries, keys, and values.
Attributes:
embedding_dim (int): Dimensionality of input embeddings.
kv_in_dim (int): Dimensionality of key and value inputs.
internal_dim (int): Internal dimension after downsampling.
num_heads (int): Number of attention heads.
q_proj (nn.Linear): Linear projection for queries.
k_proj (nn.Linear): Linear projection for keys.
v_proj (nn.Linear): Linear projection for values.
out_proj (nn.Linear): Linear projection for output.
Methods:
_separate_heads: Separates input tensor into attention heads.
_recombine_heads: Recombines separated attention heads.
forward: Computes attention output for given query, key, and value tensors.
Examples:
>>> attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
>>> q = torch.randn(1, 100, 256)
>>> k = v = torch.randn(1, 50, 256)
>>> output = attn(q, k, v)
>>> print(output.shape)
torch.Size([1, 100, 256])
"""
def __init__(
@ -214,15 +303,27 @@ class Attention(nn.Module):
kv_in_dim: int = None,
) -> None:
"""
Initializes the Attention model with the given dimensions and settings.
Initializes the Attention module with specified dimensions and settings.
This class implements a multi-head attention mechanism with optional downsampling of the internal
dimension for queries, keys, and values.
Args:
embedding_dim (int): The dimensionality of the input embeddings.
num_heads (int): The number of attention heads.
downsample_rate (int, optional): The factor by which the internal dimensions are downsampled. Defaults to 1.
embedding_dim (int): Dimensionality of input embeddings.
num_heads (int): Number of attention heads.
downsample_rate (int): Factor by which internal dimensions are downsampled. Defaults to 1.
kv_in_dim (int | None): Dimensionality of key and value inputs. If None, uses embedding_dim.
Raises:
AssertionError: If 'num_heads' does not evenly divide the internal dim (embedding_dim / downsample_rate).
AssertionError: If num_heads does not evenly divide the internal dim (embedding_dim / downsample_rate).
Examples:
>>> attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
>>> q = torch.randn(1, 100, 256)
>>> k = v = torch.randn(1, 50, 256)
>>> output = attn(q, k, v)
>>> print(output.shape)
torch.Size([1, 100, 256])
"""
super().__init__()
self.embedding_dim = embedding_dim
@ -238,20 +339,20 @@ class Attention(nn.Module):
@staticmethod
def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
"""Separate the input tensor into the specified number of attention heads."""
"""Separates the input tensor into the specified number of attention heads."""
b, n, c = x.shape
x = x.reshape(b, n, num_heads, c // num_heads)
return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head
@staticmethod
def _recombine_heads(x: Tensor) -> Tensor:
"""Recombine the separated attention heads into a single tensor."""
"""Recombines separated attention heads into a single tensor."""
b, n_heads, n_tokens, c_per_head = x.shape
x = x.transpose(1, 2)
return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C
def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
"""Compute the attention output given the input query, key, and value tensors."""
"""Applies multi-head attention to query, key, and value tensors with optional downsampling."""
# Input projections
q = self.q_proj(q)

View file

@ -0,0 +1,293 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from typing import Tuple
import torch
import torch.nn.functional as F
def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num):
"""
Selects the closest conditioning frames to a given frame index.
Args:
frame_idx (int): Current frame index.
cond_frame_outputs (Dict[int, Any]): Dictionary of conditioning frame outputs keyed by frame indices.
max_cond_frame_num (int): Maximum number of conditioning frames to select.
Returns:
(Tuple[Dict[int, Any], Dict[int, Any]]): A tuple containing two dictionaries:
- selected_outputs: Selected items from cond_frame_outputs.
- unselected_outputs: Items not selected from cond_frame_outputs.
Examples:
>>> frame_idx = 5
>>> cond_frame_outputs = {1: 'a', 3: 'b', 7: 'c', 9: 'd'}
>>> max_cond_frame_num = 2
>>> selected, unselected = select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num)
>>> print(selected)
{3: 'b', 7: 'c'}
>>> print(unselected)
{1: 'a', 9: 'd'}
"""
if max_cond_frame_num == -1 or len(cond_frame_outputs) <= max_cond_frame_num:
selected_outputs = cond_frame_outputs
unselected_outputs = {}
else:
assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames"
selected_outputs = {}
# the closest conditioning frame before `frame_idx` (if any)
idx_before = max((t for t in cond_frame_outputs if t < frame_idx), default=None)
if idx_before is not None:
selected_outputs[idx_before] = cond_frame_outputs[idx_before]
# the closest conditioning frame after `frame_idx` (if any)
idx_after = min((t for t in cond_frame_outputs if t >= frame_idx), default=None)
if idx_after is not None:
selected_outputs[idx_after] = cond_frame_outputs[idx_after]
# add other temporally closest conditioning frames until reaching a total
# of `max_cond_frame_num` conditioning frames.
num_remain = max_cond_frame_num - len(selected_outputs)
inds_remain = sorted(
(t for t in cond_frame_outputs if t not in selected_outputs),
key=lambda x: abs(x - frame_idx),
)[:num_remain]
selected_outputs.update((t, cond_frame_outputs[t]) for t in inds_remain)
unselected_outputs = {t: v for t, v in cond_frame_outputs.items() if t not in selected_outputs}
return selected_outputs, unselected_outputs
def get_1d_sine_pe(pos_inds, dim, temperature=10000):
"""Generates 1D sinusoidal positional embeddings for given positions and dimensions."""
pe_dim = dim // 2
dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device)
dim_t = temperature ** (2 * (dim_t // 2) / pe_dim)
pos_embed = pos_inds.unsqueeze(-1) / dim_t
pos_embed = torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1)
return pos_embed
def init_t_xy(end_x: int, end_y: int):
"""Initializes 1D and 2D coordinate tensors for a grid of specified dimensions."""
t = torch.arange(end_x * end_y, dtype=torch.float32)
t_x = (t % end_x).float()
t_y = torch.div(t, end_x, rounding_mode="floor").float()
return t_x, t_y
def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
"""Computes axial complex exponential positional encodings for 2D spatial positions in a grid."""
freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
t_x, t_y = init_t_xy(end_x, end_y)
freqs_x = torch.outer(t_x, freqs_x)
freqs_y = torch.outer(t_y, freqs_y)
freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
"""Reshapes frequency tensor for broadcasting with input tensor, ensuring dimensional compatibility."""
ndim = x.ndim
assert 0 <= 1 < ndim
assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def apply_rotary_enc(
xq: torch.Tensor,
xk: torch.Tensor,
freqs_cis: torch.Tensor,
repeat_freqs_k: bool = False,
):
"""Applies rotary positional encoding to query and key tensors using complex-valued frequency components."""
xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) if xk.shape[-2] != 0 else None
freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
if xk_ is None:
# no keys to rotate, due to dropout
return xq_out.type_as(xq).to(xq.device), xk
# repeat freqs along seq_len dim to match k seq_len
if repeat_freqs_k:
r = xk_.shape[-2] // xq_.shape[-2]
freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)
def window_partition(x, window_size):
"""
Partitions input tensor into non-overlapping windows with padding if needed.
Args:
x (torch.Tensor): Input tensor with shape (B, H, W, C).
window_size (int): Size of each window.
Returns:
(Tuple[torch.Tensor, Tuple[int, int]]): A tuple containing:
- windows (torch.Tensor): Partitioned windows with shape (B * num_windows, window_size, window_size, C).
- (Hp, Wp) (Tuple[int, int]): Padded height and width before partition.
Examples:
>>> x = torch.randn(1, 16, 16, 3)
>>> windows, (Hp, Wp) = window_partition(x, window_size=4)
>>> print(windows.shape, Hp, Wp)
torch.Size([16, 4, 4, 3]) 16 16
"""
B, H, W, C = x.shape
pad_h = (window_size - H % window_size) % window_size
pad_w = (window_size - W % window_size) % window_size
if pad_h > 0 or pad_w > 0:
x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
Hp, Wp = H + pad_h, W + pad_w
x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows, (Hp, Wp)
def window_unpartition(windows, window_size, pad_hw, hw):
"""
Unpartitions windowed sequences into original sequences and removes padding.
This function reverses the windowing process, reconstructing the original input from windowed segments
and removing any padding that was added during the windowing process.
Args:
windows (torch.Tensor): Input tensor of windowed sequences with shape (B * num_windows, window_size,
window_size, C), where B is the batch size, num_windows is the number of windows, window_size is
the size of each window, and C is the number of channels.
window_size (int): Size of each window.
pad_hw (Tuple[int, int]): Padded height and width (Hp, Wp) of the input before windowing.
hw (Tuple[int, int]): Original height and width (H, W) of the input before padding and windowing.
Returns:
(torch.Tensor): Unpartitioned sequences with shape (B, H, W, C), where B is the batch size, H and W
are the original height and width, and C is the number of channels.
Examples:
>>> windows = torch.rand(32, 8, 8, 64) # 32 windows of size 8x8 with 64 channels
>>> pad_hw = (16, 16) # Padded height and width
>>> hw = (15, 14) # Original height and width
>>> x = window_unpartition(windows, window_size=8, pad_hw=pad_hw, hw=hw)
>>> print(x.shape)
torch.Size([1, 15, 14, 64])
"""
Hp, Wp = pad_hw
H, W = hw
B = windows.shape[0] // (Hp * Wp // window_size // window_size)
x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
if Hp > H or Wp > W:
x = x[:, :H, :W, :].contiguous()
return x
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
"""
Extracts relative positional embeddings based on query and key sizes.
Args:
q_size (int): Size of the query.
k_size (int): Size of the key.
rel_pos (torch.Tensor): Relative position embeddings with shape (L, C), where L is the maximum relative
distance and C is the embedding dimension.
Returns:
(torch.Tensor): Extracted positional embeddings according to relative positions, with shape (q_size,
k_size, C).
Examples:
>>> q_size, k_size = 8, 16
>>> rel_pos = torch.randn(31, 64) # 31 = 2 * max(8, 16) - 1
>>> extracted_pos = get_rel_pos(q_size, k_size, rel_pos)
>>> print(extracted_pos.shape)
torch.Size([8, 16, 64])
"""
max_rel_dist = int(2 * max(q_size, k_size) - 1)
# Interpolate rel pos if needed.
if rel_pos.shape[0] != max_rel_dist:
# Interpolate rel pos.
rel_pos_resized = F.interpolate(
rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
size=max_rel_dist,
mode="linear",
)
rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
else:
rel_pos_resized = rel_pos
# Scale the coords with short length if shapes for q and k are different.
q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
return rel_pos_resized[relative_coords.long()]
def add_decomposed_rel_pos(
attn: torch.Tensor,
q: torch.Tensor,
rel_pos_h: torch.Tensor,
rel_pos_w: torch.Tensor,
q_size: Tuple[int, int],
k_size: Tuple[int, int],
) -> torch.Tensor:
"""
Adds decomposed Relative Positional Embeddings to the attention map.
This function calculates and applies decomposed Relative Positional Embeddings as described in the MVITv2
paper. It enhances the attention mechanism by incorporating spatial relationships between query and key
positions.
Args:
attn (torch.Tensor): Attention map with shape (B, q_h * q_w, k_h * k_w).
q (torch.Tensor): Query tensor in the attention layer with shape (B, q_h * q_w, C).
rel_pos_h (torch.Tensor): Relative position embeddings for height axis with shape (Lh, C).
rel_pos_w (torch.Tensor): Relative position embeddings for width axis with shape (Lw, C).
q_size (Tuple[int, int]): Spatial sequence size of query q as (q_h, q_w).
k_size (Tuple[int, int]): Spatial sequence size of key k as (k_h, k_w).
Returns:
(torch.Tensor): Updated attention map with added relative positional embeddings, shape
(B, q_h * q_w, k_h * k_w).
Examples:
>>> B, C, q_h, q_w, k_h, k_w = 1, 64, 8, 8, 8, 8
>>> attn = torch.rand(B, q_h * q_w, k_h * k_w)
>>> q = torch.rand(B, q_h * q_w, C)
>>> rel_pos_h = torch.rand(2 * max(q_h, k_h) - 1, C)
>>> rel_pos_w = torch.rand(2 * max(q_w, k_w) - 1, C)
>>> q_size, k_size = (q_h, q_w), (k_h, k_w)
>>> updated_attn = add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size)
>>> print(updated_attn.shape)
torch.Size([1, 64, 64])
References:
https://github.com/facebookresearch/mvit/blob/main/mvit/models/attention.py
"""
q_h, q_w = q_size
k_h, k_w = k_size
Rh = get_rel_pos(q_h, k_h, rel_pos_h)
Rw = get_rel_pos(q_w, k_w, rel_pos_w)
B, _, dim = q.shape
r_q = q.reshape(B, q_h, q_w, dim)
rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).view(
B, q_h * q_w, k_h * k_w
)
return attn