Add Ultralytics docs: zh/index.md (#5871)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Glenn Jocher 2023-10-21 20:31:45 +02:00 committed by GitHub
parent b9b0fd8bf4
commit 0f9f857449
15 changed files with 181 additions and 80 deletions

View file

@ -267,20 +267,28 @@ class BaseDataset(Dataset):
return label
def build_transforms(self, hyp=None):
"""Users can custom augmentations here
like:
"""
Users can customize augmentations here.
Example:
```python
if self.augment:
# Training transforms
return Compose([])
else:
# Val transforms
return Compose([])
```
"""
raise NotImplementedError
def get_labels(self):
"""Users can custom their own format here.
Make sure your output is a list with each element like below:
"""
Users can customize their own format here.
Note:
Ensure output is a dictionary with the following keys:
```python
dict(
im_file=im_file,
shape=shape, # format: (height, width)
@ -291,5 +299,6 @@ class BaseDataset(Dataset):
normalized=True, # or False
bbox_format="xyxy", # or xywh, ltwh
)
```
"""
raise NotImplementedError

View file

@ -154,28 +154,40 @@ def verify_image_label(args):
def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
"""
Convert a list of polygons to a binary mask of the specified image size.
Args:
imgsz (tuple): The size of the image as (height, width).
polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
N is the number of polygons, and M is the number of points such that M % 2 = 0.
color (int, optional): The color value to fill in the polygons on the mask. Defaults to 1.
downsample_ratio (int, optional): Factor by which to downsample the mask. Defaults to 1.
Returns:
(np.ndarray): A binary mask of the specified image size with the polygons filled in.
"""
mask = np.zeros(imgsz, dtype=np.uint8)
polygons = np.asarray(polygons, dtype=np.int32)
polygons = polygons.reshape((polygons.shape[0], -1, 2))
cv2.fillPoly(mask, polygons, color=color)
nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
# Note: fillPoly first, then resize, to keep the loss calculation consistent with the mask-ratio=1 case
return cv2.resize(mask, (nw, nh))
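A minimal usage sketch for `polygon2mask` (the coordinates below are illustrative only, assuming `numpy` and `cv2` are available as in this module):

```python
import numpy as np

# One triangle given as a flat (x1, y1, x2, y2, ...) coordinate array
triangle = np.array([10, 10, 90, 10, 50, 80], dtype=np.float32)
mask = polygon2mask((100, 100), [triangle], color=1, downsample_ratio=1)
print(mask.shape, mask.max())  # (100, 100) 1
```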
def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
"""
Convert a list of polygons to a set of binary masks of the specified image size.
Args:
imgsz (tuple): The size of the image as (height, width).
polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
N is the number of polygons, and M is the number of points such that M % 2 = 0.
color (int): The color value to fill in the polygons on the masks.
downsample_ratio (int, optional): Factor by which to downsample each mask. Defaults to 1.
Returns:
(np.ndarray): A set of binary masks of the specified image size with the polygons filled in.
"""
return np.array([polygon2mask(imgsz, [x.reshape(-1)], color, downsample_ratio) for x in polygons])
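And the batched variant, continuing the illustrative example above: one binary mask is produced per polygon, each downsampled by the given ratio.

```python
import numpy as np

polys = [
    np.array([10, 10, 90, 10, 50, 80], dtype=np.float32),         # triangle
    np.array([20, 60, 80, 60, 80, 95, 20, 95], dtype=np.float32)  # rectangle
]
masks = polygons2masks((100, 100), polys, color=1, downsample_ratio=2)
print(masks.shape)  # (2, 50, 50) -- one downsampled mask per polygon
```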
@ -205,7 +217,7 @@ def find_dataset_yaml(path: Path) -> Path:
Find and return the YAML file associated with a Detect, Segment or Pose dataset.
This function searches for a YAML file at the root level of the provided directory first, and if not found, it
performs a recursive search. It prefers YAML files that have the same stem as the provided path. An AssertionError
is raised if no YAML file is found or if multiple YAML files are found.
Args:
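The search behavior described above could be sketched roughly as follows (a hypothetical reimplementation for illustration, not the code in this file):

```python
from pathlib import Path

def find_dataset_yaml_sketch(path: Path) -> Path:
    # Look for YAML files at the root level first, then fall back to a recursive search
    files = list(path.glob("*.yaml")) or list(path.rglob("*.yaml"))
    assert files, f"No YAML file found in '{path.resolve()}'"
    if len(files) > 1:
        files = [f for f in files if f.stem == path.stem]  # prefer files whose stem matches the provided path
    assert len(files) == 1, f"Expected 1 YAML file in '{path.resolve()}', found {len(files)}: {files}"
    return files[0]
```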
@ -438,7 +450,8 @@ class HUBDatasetStats:
self.stats = {'nc': len(data['names']), 'names': list(data['names'].values())} # statistics dictionary
self.data = data
@staticmethod
def _unzip(path):
"""Unzip data.zip."""
if not str(path).endswith('.zip'): # path is data.yaml
return False, None, path

View file

@ -171,7 +171,7 @@ class MLP(nn.Module):
hidden_dim (int): The dimensionality of the hidden layers.
output_dim (int): The dimensionality of the output layer.
num_layers (int): The number of hidden layers.
sigmoid_output (bool, optional): Apply a sigmoid activation to the output layer. Defaults to False.
"""
super().__init__()
self.num_layers = num_layers
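The arguments above suggest a fairly standard multilayer perceptron; a hedged sketch (hypothetical, not the upstream class) might look like:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLPSketch(nn.Module):
    """Hypothetical MLP with num_layers Linear layers and an optional sigmoid on the output."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, sigmoid_output=False):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return torch.sigmoid(x) if self.sigmoid_output else x
```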

View file

@ -10,13 +10,15 @@ import torch.nn.functional as F
from ultralytics.nn.modules import LayerNorm2d, MLPBlock
class ImageEncoderViT(nn.Module):
"""
An image encoder using Vision Transformer (ViT) architecture for encoding an image into a compact latent space. The
encoder takes an image, splits it into patches, and processes these patches through a series of transformer blocks.
The encoded patches are then processed through a neck to generate the final encoded representation.
This class and its supporting functions below are lightly adapted from the ViTDet backbone available at
https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py.
Attributes:
img_size (int): Dimension of input images, assumed to be square.
patch_embed (PatchEmbed): Module for patch embedding.
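The description above (patch embedding, transformer blocks, then a neck) corresponds roughly to a forward pass like this hedged sketch; attribute names other than `patch_embed` are assumptions, not taken from this file:

```python
def forward(self, x):
    """Sketch: embed patches, run transformer blocks, then project through the neck."""
    x = self.patch_embed(x)                  # (B, H', W', C) patch tokens
    if self.pos_embed is not None:
        x = x + self.pos_embed               # add learned positional embeddings
    for blk in self.blocks:
        x = blk(x)                           # transformer blocks
    return self.neck(x.permute(0, 3, 1, 2))  # neck produces the final compact latent
```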
@ -410,6 +412,8 @@ class Attention(nn.Module):
input_size: Optional[Tuple[int, int]] = None,
) -> None:
"""
Initialize Attention module.
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
@ -502,8 +506,8 @@ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[in
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
"""
Get relative positional embeddings according to the relative positions of query and key sizes.
Args:
q_size (int): size of query q.
k_size (int): size of key k.
@ -542,8 +546,9 @@ def add_decomposed_rel_pos(
k_size: Tuple[int, int],
) -> torch.Tensor:
"""
Calculate decomposed Relative Positional Embeddings from the mvitv2 paper at
https://github.com/facebookresearch/mvit/blob/main/mvit/models/attention.py.
Args:
attn (Tensor): attention map.
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
@ -583,6 +588,8 @@ class PatchEmbed(nn.Module):
embed_dim: int = 768,
) -> None:
"""
Initialize PatchEmbed module.
Args:
kernel_size (Tuple): kernel size of the projection layer.
stride (Tuple): stride of the projection layer.
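A hedged sketch of such a patch-embedding module, where the projection layer is a strided convolution (hypothetical, not the upstream code):

```python
import torch.nn as nn

class PatchEmbedSketch(nn.Module):
    """Hypothetical patch embedding: project image patches with a strided convolution."""

    def __init__(self, kernel_size=(16, 16), stride=(16, 16), in_chans=3, embed_dim=768):
        super().__init__()
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride)

    def forward(self, x):
        return self.proj(x).permute(0, 2, 3, 1)  # (B, C, H, W) -> (B, H', W', embed_dim)
```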

View file

@ -39,7 +39,8 @@ class TransformerEncoderLayer(nn.Module):
self.act = act
self.normalize_before = normalize_before
@staticmethod
def with_pos_embed(tensor, pos=None):
"""Add position embeddings to the tensor if provided."""
return tensor if pos is None else tensor + pos
@ -180,9 +181,10 @@ class LayerNorm2d(nn.Module):
"""
2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
Original implementations in
https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
and
https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
"""
def __init__(self, num_channels, eps=1e-6):
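For reference, channel-wise layer normalization in the style of those repositories can be sketched as follows (a hedged reimplementation, not the code in this file):

```python
import torch
import torch.nn as nn

class LayerNorm2dSketch(nn.Module):
    """Hypothetical LayerNorm over the channel dimension of (B, C, H, W) tensors."""

    def __init__(self, num_channels, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        u = x.mean(1, keepdim=True)               # per-position mean over channels
        s = (x - u).pow(2).mean(1, keepdim=True)  # per-position variance over channels
        x = (x - u) / torch.sqrt(s + self.eps)
        return self.weight[:, None, None] * x + self.bias[:, None, None]
```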
@ -250,7 +252,7 @@ class MSDeformAttn(nn.Module):
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
"""
Perform forward pass for multiscale deformable attention.
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

View file

@ -48,8 +48,7 @@ def bbox_ioa(box1, box2, iou=False, eps=1e-7):
def box_iou(box1, box2, eps=1e-7):
"""
Calculate intersection-over-union (IoU) of boxes. Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
Based on https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
Args:
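The pairwise IoU described above can be sketched like this (hedged, torchvision-style formulation; not necessarily the exact code in this file):

```python
import torch

def box_iou_sketch(box1, box2, eps=1e-7):
    """Hypothetical (N, M) pairwise IoU for boxes in (x1, y1, x2, y2) format."""
    # Broadcast (N, 1, 2) corners against (1, M, 2) corners to get all pairwise intersections
    (a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2)
    inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp_(0).prod(2)
    return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps)
```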

View file

@ -61,8 +61,8 @@ class TaskAlignedAssigner(nn.Module):
"""
A task-aligned assigner for object detection.
This class assigns ground-truth (gt) objects to anchors based on the task-aligned metric, which combines both
classification and localization information.
Attributes:
topk (int): The number of top candidates to consider.
@ -85,8 +85,8 @@ class TaskAlignedAssigner(nn.Module):
@torch.no_grad()
def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
"""
Compute the task-aligned assignment. Reference code is available at
https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py.
Args:
pd_scores (Tensor): shape(bs, num_total_anchors, num_classes)
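The task-aligned metric behind this assignment is typically a product of classification confidence and localization quality; a hedged sketch (the exponents are illustrative, not necessarily the defaults used here):

```python
import torch

def task_aligned_metric(cls_scores, ious, alpha=1.0, beta=6.0):
    """Hypothetical alignment score: high only when an anchor both classifies and localizes well."""
    return cls_scores.pow(alpha) * ious.pow(beta)
```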