Add Ultralytics docs: zh/index.md (#5871)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Glenn Jocher 2023-10-21 20:31:45 +02:00 committed by GitHub
parent b9b0fd8bf4
commit 0f9f857449
15 changed files with 181 additions and 80 deletions

View file

@ -267,20 +267,28 @@ class BaseDataset(Dataset):
return label
def build_transforms(self, hyp=None):
"""Users can custom augmentations here
like:
"""
Users can customize augmentations here.
Example:
```python
if self.augment:
# Training transforms
return Compose([])
else:
# Val transforms
return Compose([])
```
"""
raise NotImplementedError
def get_labels(self):
"""Users can custom their own format here.
Make sure your output is a list with each element like below:
"""
Users can customize their own format here.
Note:
Ensure output is a dictionary with the following keys:
```python
dict(
im_file=im_file,
shape=shape, # format: (height, width)
@ -291,5 +299,6 @@ class BaseDataset(Dataset):
normalized=True, # or False
bbox_format="xyxy", # or xywh, ltwh
)
```
"""
raise NotImplementedError

View file

@ -154,28 +154,40 @@ def verify_image_label(args):
def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
"""
Convert a list of polygons to a binary mask of the specified image size.
Args:
imgsz (tuple): The size of the image as (height, width).
polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
N is the number of polygons, and M is the number of points such that M % 2 = 0.
color (int, optional): The color value to fill in the polygons on the mask. Defaults to 1.
downsample_ratio (int, optional): Factor by which to downsample the mask. Defaults to 1.
Returns:
(np.ndarray): A binary mask of the specified image size with the polygons filled in.
"""
mask = np.zeros(imgsz, dtype=np.uint8)
polygons = np.asarray(polygons, dtype=np.int32)
polygons = polygons.reshape((polygons.shape[0], -1, 2))
cv2.fillPoly(mask, polygons, color=color)
nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
# Note: fillPoly first, then resize, to keep the loss calculation consistent with the mask-ratio=1 case
return cv2.resize(mask, (nw, nh))
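A minimal usage sketch for `polygon2mask` (the coordinates below are illustrative only, assuming `numpy` and `cv2` are available as in this module):

```python
import numpy as np

# One triangle given as a flat (x1, y1, x2, y2, ...) coordinate array
triangle = np.array([10, 10, 90, 10, 50, 80], dtype=np.float32)
mask = polygon2mask((100, 100), [triangle], color=1, downsample_ratio=1)
print(mask.shape, mask.max())  # (100, 100) 1
```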
def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
"""
Convert a list of polygons to a set of binary masks of the specified image size.
Args:
imgsz (tuple): The size of the image as (height, width).
polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
N is the number of polygons, and M is the number of points such that M % 2 = 0.
color (int): The color value to fill in the polygons on the masks.
downsample_ratio (int, optional): Factor by which to downsample each mask. Defaults to 1.
Returns:
(np.ndarray): A set of binary masks of the specified image size with the polygons filled in.
"""
return np.array([polygon2mask(imgsz, [x.reshape(-1)], color, downsample_ratio) for x in polygons])
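And the batched variant, continuing the illustrative example above: one binary mask is produced per polygon, each downsampled by the given ratio.

```python
import numpy as np

polys = [
    np.array([10, 10, 90, 10, 50, 80], dtype=np.float32),         # triangle
    np.array([20, 60, 80, 60, 80, 95, 20, 95], dtype=np.float32)  # rectangle
]
masks = polygons2masks((100, 100), polys, color=1, downsample_ratio=2)
print(masks.shape)  # (2, 50, 50) -- one downsampled mask per polygon
```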
@ -205,7 +217,7 @@ def find_dataset_yaml(path: Path) -> Path:
Find and return the YAML file associated with a Detect, Segment or Pose dataset.
This function searches for a YAML file at the root level of the provided directory first, and if not found, it
performs a recursive search. It prefers YAML files that have the same stem as the provided path. An AssertionError
is raised if no YAML file is found or if multiple YAML files are found.
Args:
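The search behavior described above could be sketched roughly as follows (a hypothetical reimplementation for illustration, not the code in this file):

```python
from pathlib import Path

def find_dataset_yaml_sketch(path: Path) -> Path:
    # Look for YAML files at the root level first, then fall back to a recursive search
    files = list(path.glob("*.yaml")) or list(path.rglob("*.yaml"))
    assert files, f"No YAML file found in '{path.resolve()}'"
    if len(files) > 1:
        files = [f for f in files if f.stem == path.stem]  # prefer files whose stem matches the provided path
    assert len(files) == 1, f"Expected 1 YAML file in '{path.resolve()}', found {len(files)}: {files}"
    return files[0]
```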
@ -438,7 +450,8 @@ class HUBDatasetStats:
self.stats = {'nc': len(data['names']), 'names': list(data['names'].values())} # statistics dictionary
self.data = data
@staticmethod
def _unzip(path):
"""Unzip data.zip."""
if not str(path).endswith('.zip'): # path is data.yaml
return False, None, path

View file

@ -171,7 +171,7 @@ class MLP(nn.Module):
hidden_dim (int): The dimensionality of the hidden layers.
output_dim (int): The dimensionality of the output layer.
num_layers (int): The number of hidden layers.
sigmoid_output (bool, optional): Apply a sigmoid activation to the output layer. Defaults to False.
"""
super().__init__()
self.num_layers = num_layers
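The arguments above suggest a fairly standard multilayer perceptron; a hedged sketch (hypothetical, not the upstream class) might look like:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLPSketch(nn.Module):
    """Hypothetical MLP with num_layers Linear layers and an optional sigmoid on the output."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, sigmoid_output=False):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return torch.sigmoid(x) if self.sigmoid_output else x
```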

View file

@ -10,13 +10,15 @@ import torch.nn.functional as F
from ultralytics.nn.modules import LayerNorm2d, MLPBlock
class ImageEncoderViT(nn.Module):
"""
An image encoder using Vision Transformer (ViT) architecture for encoding an image into a compact latent space. The
encoder takes an image, splits it into patches, and processes these patches through a series of transformer blocks.
The encoded patches are then processed through a neck to generate the final encoded representation.
This class and its supporting functions below are lightly adapted from the ViTDet backbone available at
https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py.
Attributes:
img_size (int): Dimension of input images, assumed to be square.
patch_embed (PatchEmbed): Module for patch embedding.
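The description above (patch embedding, transformer blocks, then a neck) corresponds roughly to a forward pass like this hedged sketch; attribute names other than `patch_embed` are assumptions, not taken from this file:

```python
def forward(self, x):
    """Sketch: embed patches, run transformer blocks, then project through the neck."""
    x = self.patch_embed(x)                  # (B, H', W', C) patch tokens
    if self.pos_embed is not None:
        x = x + self.pos_embed               # add learned positional embeddings
    for blk in self.blocks:
        x = blk(x)                           # transformer blocks
    return self.neck(x.permute(0, 3, 1, 2))  # neck produces the final compact latent
```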
@ -410,6 +412,8 @@ class Attention(nn.Module):
input_size: Optional[Tuple[int, int]] = None,
) -> None:
"""
Initialize Attention module.
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
@ -502,8 +506,8 @@ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[in
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
"""
Get relative positional embeddings according to the relative positions of query and key sizes.
Args:
q_size (int): size of query q.
k_size (int): size of key k.
@ -542,8 +546,9 @@ def add_decomposed_rel_pos(
k_size: Tuple[int, int],
) -> torch.Tensor:
"""
Calculate decomposed Relative Positional Embeddings from the mvitv2 paper at
https://github.com/facebookresearch/mvit/blob/main/mvit/models/attention.py.
Args:
attn (Tensor): attention map.
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
@ -583,6 +588,8 @@ class PatchEmbed(nn.Module):
embed_dim: int = 768,
) -> None:
"""
Initialize PatchEmbed module.
Args:
kernel_size (Tuple): kernel size of the projection layer.
stride (Tuple): stride of the projection layer.
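A hedged sketch of such a patch-embedding module, where the projection layer is a strided convolution (hypothetical, not the upstream code):

```python
import torch.nn as nn

class PatchEmbedSketch(nn.Module):
    """Hypothetical patch embedding: project image patches with a strided convolution."""

    def __init__(self, kernel_size=(16, 16), stride=(16, 16), in_chans=3, embed_dim=768):
        super().__init__()
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride)

    def forward(self, x):
        return self.proj(x).permute(0, 2, 3, 1)  # (B, C, H, W) -> (B, H', W', embed_dim)
```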

View file

@ -39,7 +39,8 @@ class TransformerEncoderLayer(nn.Module):
self.act = act
self.normalize_before = normalize_before
@staticmethod
def with_pos_embed(tensor, pos=None):
"""Add position embeddings to the tensor if provided."""
return tensor if pos is None else tensor + pos
@ -180,9 +181,10 @@ class LayerNorm2d(nn.Module):
"""
2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
Original implementations in
https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
and
https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
"""
def __init__(self, num_channels, eps=1e-6):
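For reference, channel-wise layer normalization in the style of those repositories can be sketched as follows (a hedged reimplementation, not the code in this file):

```python
import torch
import torch.nn as nn

class LayerNorm2dSketch(nn.Module):
    """Hypothetical LayerNorm over the channel dimension of (B, C, H, W) tensors."""

    def __init__(self, num_channels, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        u = x.mean(1, keepdim=True)               # per-position mean over channels
        s = (x - u).pow(2).mean(1, keepdim=True)  # per-position variance over channels
        x = (x - u) / torch.sqrt(s + self.eps)
        return self.weight[:, None, None] * x + self.bias[:, None, None]
```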
@ -250,7 +252,7 @@ class MSDeformAttn(nn.Module):
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
"""
Perform forward pass for multiscale deformable attention.
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

View file

@ -48,8 +48,7 @@ def bbox_ioa(box1, box2, iou=False, eps=1e-7):
def box_iou(box1, box2, eps=1e-7):
"""
Calculate intersection-over-union (IoU) of boxes. Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
Based on https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
Args:
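The pairwise IoU described above can be sketched like this (hedged, torchvision-style formulation; not necessarily the exact code in this file):

```python
import torch

def box_iou_sketch(box1, box2, eps=1e-7):
    """Hypothetical (N, M) pairwise IoU for boxes in (x1, y1, x2, y2) format."""
    # Broadcast (N, 1, 2) corners against (1, M, 2) corners to get all pairwise intersections
    (a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2)
    inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp_(0).prod(2)
    return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps)
```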

View file

@ -61,8 +61,8 @@ class TaskAlignedAssigner(nn.Module):
"""
A task-aligned assigner for object detection.
This class assigns ground-truth (gt) objects to anchors based on the task-aligned metric, which combines both
classification and localization information.
Attributes:
topk (int): The number of top candidates to consider.
@ -85,8 +85,8 @@ class TaskAlignedAssigner(nn.Module):
@torch.no_grad()
def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
"""
Compute the task-aligned assignment. Reference code is available at
https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py.
Args:
pd_scores (Tensor): shape(bs, num_total_anchors, num_classes)
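The task-aligned metric behind this assignment is typically a product of classification confidence and localization quality; a hedged sketch (the exponents are illustrative, not necessarily the defaults used here):

```python
import torch

def task_aligned_metric(cls_scores, ious, alpha=1.0, beta=6.0):
    """Hypothetical alignment score: high only when an anchor both classifies and localizes well."""
    return cls_scores.pow(alpha) * ious.pow(beta)
```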