Implement all missing docstrings (#5298)

Co-authored-by: snyk-bot <snyk-bot@snyk.io>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2023-10-10 20:07:13 +02:00 · 2023-10-10 20:07:13 +02:00 · 7fd5dcbd86
commit 7fd5dcbd86
parent e7f0658744
26 changed files with 649 additions and 79 deletions
--- a/ultralytics/models/sam/modules/transformer.py
+++ b/ultralytics/models/sam/modules/transformer.py
@@ -10,6 +10,21 @@ from ultralytics.nn.modules import MLPBlock


 class TwoWayTransformer(nn.Module):
+    """
+    A Two-Way Transformer module that attends simultaneously to both image and query points. This class
+    serves as a specialized transformer decoder that attends to an input image using queries whose positional embedding
+    is supplied. This is particularly useful for tasks like object detection, image segmentation, and point cloud
+    processing.
+
+    Attributes:
+        depth (int): The number of layers in the transformer.
+        embedding_dim (int): The channel dimension for the input embeddings.
+        num_heads (int): The number of heads for multihead attention.
+        mlp_dim (int): The internal channel dimension for the MLP block.
+        layers (nn.ModuleList): The list of TwoWayAttentionBlock layers that make up the transformer.
+        final_attn_token_to_image (Attention): The final attention layer applied from the queries to the image.
+        norm_final_attn (nn.LayerNorm): The layer normalization applied to the final queries.
+    """

    def __init__(
        self,
@@ -98,6 +113,23 @@ class TwoWayTransformer(nn.Module):


 class TwoWayAttentionBlock(nn.Module):
+    """
+    An attention block that performs both self-attention and cross-attention in two directions: queries to keys and
+    keys to queries. This block consists of four main layers: (1) self-attention on sparse inputs, (2) cross-attention
+    of sparse inputs to dense inputs, (3) an MLP block on sparse inputs, and (4) cross-attention of dense inputs to
+    sparse inputs.
+
+    Attributes:
+        self_attn (Attention): The self-attention layer for the queries.
+        norm1 (nn.LayerNorm): Layer normalization following the first attention block.
+        cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
+        norm2 (nn.LayerNorm): Layer normalization following the second attention block.
+        mlp (MLPBlock): MLP block that transforms the query embeddings.
+        norm3 (nn.LayerNorm): Layer normalization following the MLP block.
+        norm4 (nn.LayerNorm): Layer normalization following the third attention block.
+        cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
+        skip_first_layer_pe (bool): Whether to skip the positional encoding in the first layer.
+    """

    def __init__(
        self,
@@ -180,6 +212,17 @@ class Attention(nn.Module):
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
+        """
+        Initializes the Attention model with the given dimensions and settings.
+
+        Args:
+            embedding_dim (int): The dimensionality of the input embeddings.
+            num_heads (int): The number of attention heads.
+            downsample_rate (int, optional): The factor by which the internal dimensions are downsampled. Defaults to 1.
+
+        Raises:
+            AssertionError: If 'num_heads' does not evenly divide the internal dimension (embedding_dim // downsample_rate).
+        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
@@ -191,13 +234,15 @@ class Attention(nn.Module):
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

-    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+    @staticmethod
+    def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
        """Separate the input tensor into the specified number of attention heads."""
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head

-    def _recombine_heads(self, x: Tensor) -> Tensor:
+    @staticmethod
+    def _recombine_heads(x: Tensor) -> Tensor:
        """Recombine the separated attention heads into a single tensor."""
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)