Add docs: Ultralytics documentation - zh/index.md (#5871)
Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent b9b0fd8bf4
commit 0f9f857449

15 changed files with 181 additions and 80 deletions
@@ -267,20 +267,28 @@ class BaseDataset(Dataset):
        return label

    def build_transforms(self, hyp=None):
        """Users can custom augmentations here
        like:
        """
        Users can customize augmentations here.

        Example:
            ```python
            if self.augment:
                # Training transforms
                return Compose([])
            else:
                # Val transforms
                return Compose([])
            ```
        """
        raise NotImplementedError

    def get_labels(self):
        """Users can custom their own format here.
        Make sure your output is a list with each element like below:
        """
        Users can customize their own format here.

        Note:
            Ensure output is a dictionary with the following keys:
            ```python
            dict(
                im_file=im_file,
                shape=shape,  # format: (height, width)
@@ -291,5 +299,6 @@ class BaseDataset(Dataset):
                normalized=True,  # or False
                bbox_format="xyxy",  # or xywh, ltwh
            )
            ```
        """
        raise NotImplementedError
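To illustrate the two override points the revised docstrings describe, here is a minimal sketch of a custom subclass. The `MiniDataset` name and the literal label values are hypothetical, the import paths are assumed rather than taken from this diff, and only the label keys visible in the hunk above are shown (the hunk elides the remaining keys).

```python
import numpy as np

from ultralytics.data.augment import Compose  # assumed import path
from ultralytics.data.base import BaseDataset  # assumed import path


class MiniDataset(BaseDataset):
    """Hypothetical dataset showing the documented customization hooks."""

    def build_transforms(self, hyp=None):
        # Identity pipeline for simplicity; real code would return different
        # Compose([...]) pipelines depending on self.augment.
        return Compose([])

    def get_labels(self):
        # One dictionary per image, using only the keys visible in the hunk above.
        return [
            dict(
                im_file="images/0001.jpg",  # hypothetical path
                shape=(480, 640),  # (height, width)
                normalized=True,
                bbox_format="xywh",
            )
        ]
```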
@@ -154,28 +154,40 @@ def verify_image_label(args):

def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
    """
    Convert a list of polygons to a binary mask of the specified image size.

    Args:
        imgsz (tuple): The image size.
        polygons (list[np.ndarray]): [N, M], N is the number of polygons, M is the number of points(Be divided by 2).
        color (int): color
        downsample_ratio (int): downsample ratio
        imgsz (tuple): The size of the image as (height, width).
        polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
            N is the number of polygons, and M is the number of points such that M % 2 = 0.
        color (int, optional): The color value to fill in the polygons on the mask. Defaults to 1.
        downsample_ratio (int, optional): Factor by which to downsample the mask. Defaults to 1.

    Returns:
        (np.ndarray): A binary mask of the specified image size with the polygons filled in.
    """
    mask = np.zeros(imgsz, dtype=np.uint8)
    polygons = np.asarray(polygons, dtype=np.int32)
    polygons = polygons.reshape((polygons.shape[0], -1, 2))
    cv2.fillPoly(mask, polygons, color=color)
    nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
    # NOTE: fillPoly first then resize is trying to keep the same way of loss calculation when mask-ratio=1.
    # Note: fillPoly first then resize is trying to keep the same loss calculation method when mask-ratio=1
    return cv2.resize(mask, (nw, nh))


def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
    """
    Convert a list of polygons to a set of binary masks of the specified image size.

    Args:
        imgsz (tuple): The image size.
        polygons (list[np.ndarray]): each polygon is [N, M], N is number of polygons, M is number of points (M % 2 = 0)
        color (int): color
        downsample_ratio (int): downsample ratio
        imgsz (tuple): The size of the image as (height, width).
        polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
            N is the number of polygons, and M is the number of points such that M % 2 = 0.
        color (int): The color value to fill in the polygons on the masks.
        downsample_ratio (int, optional): Factor by which to downsample each mask. Defaults to 1.

    Returns:
        (np.ndarray): A set of binary masks of the specified image size with the polygons filled in.
    """
    return np.array([polygon2mask(imgsz, [x.reshape(-1)], color, downsample_ratio) for x in polygons])
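A quick usage sketch for the two helpers above, assuming they are importable from `ultralytics.data.utils` (the module that also holds `verify_image_label`); the triangle coordinates are made up for illustration.

```python
import numpy as np

from ultralytics.data.utils import polygon2mask, polygons2masks  # assumed import path

# One triangle as a flat (1, 6) array of x, y pairs.
triangle = np.array([[10, 10, 100, 10, 55, 90]], dtype=np.float32)

mask = polygon2mask((128, 128), triangle, color=1, downsample_ratio=1)
print(mask.shape, mask.dtype, int(mask.max()))  # (128, 128) uint8 1

# polygons2masks stacks one mask per polygon; downsample_ratio=2 halves each side.
masks = polygons2masks((128, 128), [triangle.reshape(-1)], color=1, downsample_ratio=2)
print(masks.shape)  # (1, 64, 64)
```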
@@ -205,7 +217,7 @@ def find_dataset_yaml(path: Path) -> Path:
    Find and return the YAML file associated with a Detect, Segment or Pose dataset.

    This function searches for a YAML file at the root level of the provided directory first, and if not found, it
    performs a recursive search. It prefers YAML files that have the samestem as the provided path. An AssertionError
    performs a recursive search. It prefers YAML files that have the same stem as the provided path. An AssertionError
    is raised if no YAML file is found or if multiple YAML files are found.

    Args:
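A minimal sketch of the search order this docstring describes (root-level glob, then recursive fallback, then a same-stem preference before asserting on the count); the function name is hypothetical and this is not the library implementation itself.

```python
from pathlib import Path


def find_yaml_sketch(path: Path) -> Path:
    # Look at the root level first, then fall back to a recursive search.
    files = list(path.glob("*.yaml")) or list(path.rglob("*.yaml"))
    assert files, f"No YAML file found in '{path.resolve()}'"
    if len(files) > 1:
        # Prefer YAML files whose stem matches the provided path.
        files = [f for f in files if f.stem == path.stem]
    assert len(files) == 1, f"Expected 1 YAML file in '{path.resolve()}', found {len(files)}"
    return files[0]
```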
@@ -438,7 +450,8 @@ class HUBDatasetStats:
        self.stats = {'nc': len(data['names']), 'names': list(data['names'].values())}  # statistics dictionary
        self.data = data

    def _unzip(self, path):
    @staticmethod
    def _unzip(path):
        """Unzip data.zip."""
        if not str(path).endswith('.zip'):  # path is data.yaml
            return False, None, path
@@ -171,7 +171,7 @@ class MLP(nn.Module):
            hidden_dim (int): The dimensionality of the hidden layers.
            output_dim (int): The dimensionality of the output layer.
            num_layers (int): The number of hidden layers.
            sigmoid_output (bool, optional): Whether to apply a sigmoid activation to the output layer. Defaults to False.
            sigmoid_output (bool, optional): Apply a sigmoid activation to the output layer. Defaults to False.
        """
        super().__init__()
        self.num_layers = num_layers
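For context on the arguments documented above, here is a hedged, self-contained re-creation of the structure they describe (a stack of `nn.Linear` layers with ReLU between them and an optional sigmoid on the output); it is a sketch, not the library's `MLP` class.

```python
import torch
import torch.nn as nn


class MLPSketch(nn.Module):
    """Sketch of an MLP with the documented constructor arguments."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, sigmoid_output=False):
        super().__init__()
        self.num_layers = num_layers
        self.sigmoid_output = sigmoid_output
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(i, o) for i, o in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = torch.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return torch.sigmoid(x) if self.sigmoid_output else x


print(MLPSketch(256, 256, 4, num_layers=3)(torch.zeros(2, 256)).shape)  # torch.Size([2, 4])
```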
@@ -10,13 +10,15 @@ import torch.nn.functional as F
from ultralytics.nn.modules import LayerNorm2d, MLPBlock


# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Module):
    """
    An image encoder using Vision Transformer (ViT) architecture for encoding an image into a compact latent space. The
    encoder takes an image, splits it into patches, and processes these patches through a series of transformer blocks.
    The encoded patches are then processed through a neck to generate the final encoded representation.

    This class and its supporting functions below lightly adapted from the ViTDet backbone available at
    https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py.

    Attributes:
        img_size (int): Dimension of input images, assumed to be square.
        patch_embed (PatchEmbed): Module for patch embedding.
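A shape-only sketch of the flow the new docstring describes (patchify, transformer blocks, then a neck), using SAM-style sizes as an assumption; `nn.Identity()` stands in for the transformer blocks and this is not the `ImageEncoderViT` class itself.

```python
import torch
import torch.nn as nn

img_size, patch, embed_dim, out_chans = 1024, 16, 768, 256  # assumed SAM-style sizes

patch_embed = nn.Conv2d(3, embed_dim, kernel_size=patch, stride=patch)  # split image into patches
blocks = nn.Identity()  # stand-in for the series of transformer blocks
neck = nn.Conv2d(embed_dim, out_chans, kernel_size=1, bias=False)  # compress to the latent space

x = torch.zeros(1, 3, img_size, img_size)
tokens = patch_embed(x)  # (1, 768, 64, 64): one token per 16x16 patch
encoded = neck(blocks(tokens))  # (1, 256, 64, 64): final encoded representation
print(tokens.shape, encoded.shape)
```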
@@ -410,6 +412,8 @@ class Attention(nn.Module):
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Initialize Attention module.

        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
@@ -502,8 +506,8 @@ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[in

def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Get relative positional embeddings according to the relative positions of
    query and key sizes.
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
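For intuition on what the rewritten summary line means, here is a small sketch of the relative-coordinate lookup involved when query and key sizes are equal; the real `get_rel_pos` additionally rescales coordinates and interpolates `rel_pos` when the sizes differ.

```python
import torch

q_size, k_size = 4, 4
rel_pos = torch.randn(2 * max(q_size, k_size) - 1, 8)  # (2*size - 1, head_dim) embedding table

q_coords = torch.arange(q_size)[:, None]
k_coords = torch.arange(k_size)[None, :]
relative_coords = q_coords - k_coords + (k_size - 1)  # shift offsets into [0, 2*size - 2]

print(rel_pos[relative_coords].shape)  # torch.Size([4, 4, 8]): one embedding per (q, k) pair
```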
@@ -542,8 +546,9 @@ def add_decomposed_rel_pos(
    k_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
    Calculate decomposed Relative Positional Embeddings from mvitv2 paper at
    https://github.com/facebookresearch/mvit/blob/main/mvit/models/attention.py.

    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
@@ -583,6 +588,8 @@ class PatchEmbed(nn.Module):
        embed_dim: int = 768,
    ) -> None:
        """
        Initialize PatchEmbed module.

        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
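For reference, patch embedding with the documented kernel_size and stride arguments boils down to a strided convolution followed by a channels-last permute; this is a hedged sketch, not the `PatchEmbed` class itself.

```python
import torch
import torch.nn as nn

proj = nn.Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))  # kernel_size == stride == patch size

x = torch.zeros(2, 3, 1024, 1024)
patches = proj(x).permute(0, 2, 3, 1)  # (B, H/16, W/16, C) = (2, 64, 64, 768)
print(patches.shape)
```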
@@ -39,7 +39,8 @@ class TransformerEncoderLayer(nn.Module):
        self.act = act
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos=None):
    @staticmethod
    def with_pos_embed(tensor, pos=None):
        """Add position embeddings to the tensor if provided."""
        return tensor if pos is None else tensor + pos
@@ -180,9 +181,10 @@ class LayerNorm2d(nn.Module):
    """
    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.

    Original implementation at
    Original implementations in
    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    and
    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
    """

    def __init__(self, num_channels, eps=1e-6):
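A self-contained sketch of the Detectron2/ConvNeXt-style recipe the docstring points to, normalizing each NCHW tensor over its channel dimension; it is an illustration, not the library's `LayerNorm2d`.

```python
import torch
import torch.nn as nn


class LayerNorm2dSketch(nn.Module):
    """Channel-wise LayerNorm over (B, C, H, W) tensors, per the referenced implementations."""

    def __init__(self, num_channels, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        u = x.mean(1, keepdim=True)  # mean over channels
        s = (x - u).pow(2).mean(1, keepdim=True)  # variance over channels
        x = (x - u) / torch.sqrt(s + self.eps)
        return self.weight[:, None, None] * x + self.bias[:, None, None]


print(LayerNorm2dSketch(64)(torch.randn(2, 64, 8, 8)).shape)  # torch.Size([2, 64, 8, 8])
```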
@@ -250,7 +252,7 @@ class MSDeformAttn(nn.Module):

    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
        """
        Perform forward pass for multi-scale deformable attention.
        Perform forward pass for multiscale deformable attention.

        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
@@ -48,8 +48,7 @@ def bbox_ioa(box1, box2, iou=False, eps=1e-7):

def box_iou(box1, box2, eps=1e-7):
    """
    Calculate intersection-over-union (IoU) of boxes.
    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
    Calculate intersection-over-union (IoU) of boxes. Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
    Based on https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py

    Args:
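As a worked companion to the merged summary line, here is a hedged pairwise IoU sketch in the torchvision style the docstring cites; the helper name and sample boxes are made up for illustration.

```python
import torch


def box_iou_sketch(box1, box2, eps=1e-7):
    # box1: (N, 4), box2: (M, 4), both in (x1, y1, x2, y2) format; returns an (N, M) IoU matrix.
    (a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2)
    inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp_(0).prod(2)
    area1 = (box1[:, 2:] - box1[:, :2]).prod(1)
    area2 = (box2[:, 2:] - box2[:, :2]).prod(1)
    return inter / (area1[:, None] + area2 - inter + eps)


b1 = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
b2 = torch.tensor([[5.0, 5.0, 15.0, 15.0], [0.0, 0.0, 10.0, 10.0]])
print(box_iou_sketch(b1, b2))  # roughly tensor([[0.1429, 1.0000]])
```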
@@ -61,8 +61,8 @@ class TaskAlignedAssigner(nn.Module):
    """
    A task-aligned assigner for object detection.

    This class assigns ground-truth (gt) objects to anchors based on the task-aligned metric,
    which combines both classification and localization information.
    This class assigns ground-truth (gt) objects to anchors based on the task-aligned metric, which combines both
    classification and localization information.

    Attributes:
        topk (int): The number of top candidates to consider.
@@ -85,8 +85,8 @@ class TaskAlignedAssigner(nn.Module):
    @torch.no_grad()
    def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
        """
        Compute the task-aligned assignment.
        Reference https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py
        Compute the task-aligned assignment. Reference code is available at
        https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py.

        Args:
            pd_scores (Tensor): shape(bs, num_total_anchors, num_classes)
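The task-aligned metric mentioned above combines classification score and localization quality; a hedged toy computation is shown below. The exponents are illustrative and not necessarily the class defaults.

```python
import torch

alpha, beta = 1.0, 6.0  # illustrative exponents

scores = torch.tensor([0.90, 0.60, 0.20])  # classification score per candidate anchor
ious = torch.tensor([0.80, 0.90, 0.95])  # IoU of each anchor's predicted box with the gt box

align_metric = scores.pow(alpha) * ious.pow(beta)  # task-aligned metric per anchor
print(align_metric, align_metric.topk(2).indices)  # top-k anchors kept for this gt object
```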