ultralytics 8.3.38 SAM 2 video inference (#14851)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Signed-off-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: Ultralytics Assistant <135830346+UltralyticsAssistant@users.noreply.github.com>
Laughing 2024-11-26 19:38:23 +08:00 committed by GitHub
parent 407815cf9e
commit dcc9bd536f
16 changed files with 917 additions and 124 deletions
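
The headline change is SAM 2 video inference via a new video predictor. A minimal usage sketch following the SAM2VideoPredictor pattern in the Ultralytics docs (weights and video path are placeholders):

from ultralytics.models.sam import SAM2VideoPredictor

# Build the video predictor; sam2_b.pt is fetched automatically if not present
overrides = dict(conf=0.25, task="segment", mode="predict", imgsz=1024, model="sam2_b.pt")
predictor = SAM2VideoPredictor(overrides=overrides)

# Prompt a single foreground point on the first frame; the mask is tracked through the video
results = predictor(source="test.mp4", points=[920, 470], labels=[1])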

ultralytics/utils/loss.py

@@ -552,9 +552,8 @@ class v8PoseLoss(v8DetectionLoss):
pred_kpts (torch.Tensor): Predicted keypoints, shape (BS, N_anchors, N_kpts_per_object, kpts_dim).
Returns:
- (tuple): Returns a tuple containing:
- - kpts_loss (torch.Tensor): The keypoints loss.
- - kpts_obj_loss (torch.Tensor): The keypoints object loss.
+ kpts_loss (torch.Tensor): The keypoints loss.
+ kpts_obj_loss (torch.Tensor): The keypoints object loss.
"""
batch_idx = batch_idx.flatten()
batch_size = len(masks)
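
calculate_keypoints_loss is internal to the pose training criterion rather than a user-facing call; the documented tuple is produced once per batch while a pose model trains. A minimal sketch of where it fires, assuming the standard Ultralytics pose assets:

from ultralytics import YOLO

# v8PoseLoss is built and invoked internally during pose training;
# each batch produces the (kpts_loss, kpts_obj_loss) pair documented above
model = YOLO("yolo11n-pose.pt")
model.train(data="coco8-pose.yaml", epochs=1, imgsz=640)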

ultralytics/utils/metrics.py

@@ -549,19 +549,18 @@ def ap_per_class(
prefix (str, optional): A prefix string for saving the plot files. Defaults to an empty string.
Returns:
- (tuple): A tuple of six arrays and one array of unique classes, where:
- tp (np.ndarray): True positive counts at threshold given by max F1 metric for each class.Shape: (nc,).
- fp (np.ndarray): False positive counts at threshold given by max F1 metric for each class. Shape: (nc,).
- p (np.ndarray): Precision values at threshold given by max F1 metric for each class. Shape: (nc,).
- r (np.ndarray): Recall values at threshold given by max F1 metric for each class. Shape: (nc,).
- f1 (np.ndarray): F1-score values at threshold given by max F1 metric for each class. Shape: (nc,).
- ap (np.ndarray): Average precision for each class at different IoU thresholds. Shape: (nc, 10).
- unique_classes (np.ndarray): An array of unique classes that have data. Shape: (nc,).
- p_curve (np.ndarray): Precision curves for each class. Shape: (nc, 1000).
- r_curve (np.ndarray): Recall curves for each class. Shape: (nc, 1000).
- f1_curve (np.ndarray): F1-score curves for each class. Shape: (nc, 1000).
- x (np.ndarray): X-axis values for the curves. Shape: (1000,).
- prec_values: Precision values at mAP@0.5 for each class. Shape: (nc, 1000).
+ tp (np.ndarray): True positive counts at threshold given by max F1 metric for each class. Shape: (nc,).
+ fp (np.ndarray): False positive counts at threshold given by max F1 metric for each class. Shape: (nc,).
+ p (np.ndarray): Precision values at threshold given by max F1 metric for each class. Shape: (nc,).
+ r (np.ndarray): Recall values at threshold given by max F1 metric for each class. Shape: (nc,).
+ f1 (np.ndarray): F1-score values at threshold given by max F1 metric for each class. Shape: (nc,).
+ ap (np.ndarray): Average precision for each class at different IoU thresholds. Shape: (nc, 10).
+ unique_classes (np.ndarray): An array of unique classes that have data. Shape: (nc,).
+ p_curve (np.ndarray): Precision curves for each class. Shape: (nc, 1000).
+ r_curve (np.ndarray): Recall curves for each class. Shape: (nc, 1000).
+ f1_curve (np.ndarray): F1-score curves for each class. Shape: (nc, 1000).
+ x (np.ndarray): X-axis values for the curves. Shape: (1000,).
+ prec_values (np.ndarray): Precision values at mAP@0.5 for each class. Shape: (nc, 1000).
"""
# Sort by objectness
i = np.argsort(-conf)
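
ap_per_class can be exercised directly with synthetic inputs; a minimal sketch, with shapes assumed from the docstring above (n predictions scored at 10 IoU thresholds, unpacking the 12 documented return values):

import numpy as np

from ultralytics.utils.metrics import ap_per_class

n = 100
tp = np.random.rand(n, 10) > 0.5          # true-positive matrix at 10 IoU thresholds (bool)
conf = np.random.rand(n)                  # prediction confidences
pred_cls = np.random.randint(0, 3, n)     # predicted class indices
target_cls = np.random.randint(0, 3, 80)  # ground-truth class indices

tp_c, fp_c, p, r, f1, ap, classes, p_curve, r_curve, f1_curve, x, prec_values = ap_per_class(
    tp, conf, pred_cls, target_cls
)
print(ap.shape)  # (nc, 10): per-class AP at each IoU threshold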

ultralytics/utils/ops.py

@@ -317,11 +317,11 @@ def clip_boxes(boxes, shape):
Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
Args:
- boxes (torch.Tensor): the bounding boxes to clip
- shape (tuple): the shape of the image
+ boxes (torch.Tensor): The bounding boxes to clip.
+ shape (tuple): The shape of the image.
Returns:
- (torch.Tensor | numpy.ndarray): Clipped boxes
+ (torch.Tensor | numpy.ndarray): The clipped boxes.
"""
if isinstance(boxes, torch.Tensor): # faster individually (WARNING: inplace .clamp_() Apple MPS bug)
boxes[..., 0] = boxes[..., 0].clamp(0, shape[1]) # x1
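
A quick sketch of the documented behavior; note that shape is (height, width), so x coordinates clamp to shape[1] and y coordinates to shape[0]:

import torch

from ultralytics.utils.ops import clip_boxes

boxes = torch.tensor([[-5.0, 10.0, 700.0, 300.0]])  # x1, y1, x2, y2
clipped = clip_boxes(boxes, (480, 640))  # image shape (height, width)
print(clipped)  # tensor([[  0.,  10., 640., 300.]])
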
@@ -359,9 +359,9 @@ def scale_image(masks, im0_shape, ratio_pad=None):
Takes a mask, and resizes it to the original image size.
Args:
- masks (np.ndarray): resized and padded masks/images, [h, w, num]/[h, w, 3].
- im0_shape (tuple): the original image shape
- ratio_pad (tuple): the ratio of the padding to the original image.
+ masks (np.ndarray): Resized and padded masks/images, [h, w, num]/[h, w, 3].
+ im0_shape (tuple): The original image shape.
+ ratio_pad (tuple): The ratio of the padding to the original image.
Returns:
masks (np.ndarray): The masks that are being returned with shape [h, w, num].
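
scale_image reverses the letterbox step: it trims the padding from network-size masks and resizes them back to the original image shape. A minimal sketch with assumed shapes:

import numpy as np

from ultralytics.utils.ops import scale_image

masks = np.zeros((640, 640, 2), dtype=np.uint8)  # padded masks at network size, [h, w, num]
rescaled = scale_image(masks, (480, 720, 3))  # original image shape
print(rescaled.shape)  # (480, 720, 2)
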
@@ -692,12 +692,12 @@ def process_mask_native(protos, masks_in, bboxes, shape):
Args:
protos (torch.Tensor): [mask_dim, mask_h, mask_w]
- masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
- bboxes (torch.Tensor): [n, 4], n is number of masks after nms
- shape (tuple): the size of the input image (h,w)
+ masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms.
+ bboxes (torch.Tensor): [n, 4], n is number of masks after nms.
+ shape (tuple): The size of the input image (h,w).
Returns:
- masks (torch.Tensor): The returned masks with dimensions [h, w, n]
+ masks (torch.Tensor): The returned masks with dimensions [h, w, n].
"""
c, mh, mw = protos.shape # CHW
masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw)
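
process_mask_native projects the per-detection mask coefficients onto the prototype masks (the matrix product visible above), then upsamples and crops to each box at input-image resolution. A sketch with random tensors, dimensions taken from the docstring:

import torch

from ultralytics.utils.ops import process_mask_native

protos = torch.rand(32, 160, 160)  # [mask_dim, mask_h, mask_w]
masks_in = torch.rand(5, 32)  # [n, mask_dim], coefficients for 5 detections kept after NMS
bboxes = torch.tensor([[10.0, 10.0, 200.0, 200.0]]).repeat(5, 1)  # [n, 4] in image coordinates
masks = process_mask_native(protos, masks_in, bboxes, (640, 640))
print(masks.shape)  # one binary mask per detection at the input image size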

ultralytics/utils/plotting.py

@@ -584,8 +584,8 @@ class Annotator:
Displays queue counts on an image centered at the points with customizable font size and colors.
Args:
- label (str): queue counts label
- points (tuple): region points for center point calculation to display text
+ label (str): Queue counts label.
+ points (tuple): Region points for center point calculation to display text.
region_color (tuple): RGB queue region color.
txt_color (tuple): RGB text display color.
"""
@@ -624,13 +624,13 @@ class Annotator:
Display the bounding boxes labels in parking management app.
Args:
- im0 (ndarray): inference image
- text (str): object/class name
- txt_color (tuple): display color for text foreground
- bg_color (tuple): display color for text background
- x_center (float): x position center point for bounding box
- y_center (float): y position center point for bounding box
- margin (int): gap between text and rectangle for better display
+ im0 (ndarray): Inference image.
+ text (str): Object/class name.
+ txt_color (tuple): Display color for text foreground.
+ bg_color (tuple): Display color for text background.
+ x_center (float): The x position center point for bounding box.
+ y_center (float): The y position center point for bounding box.
+ margin (int): The gap between text and rectangle for better display.
"""
text_size = cv2.getTextSize(text, 0, fontScale=self.sf, thickness=self.tf)[0]
text_x = x_center - text_size[0] // 2
@@ -648,11 +648,11 @@ class Annotator:
Display the overall statistics for parking lots.
Args:
- im0 (ndarray): inference image
- text (dict): labels dictionary
- txt_color (tuple): display color for text foreground
- bg_color (tuple): display color for text background
- margin (int): gap between text and rectangle for better display
+ im0 (ndarray): Inference image.
+ text (dict): Labels dictionary.
+ txt_color (tuple): Display color for text foreground.
+ bg_color (tuple): Display color for text background.
+ margin (int): Gap between text and rectangle for better display.
"""
horizontal_gap = int(im0.shape[1] * 0.02)
vertical_gap = int(im0.shape[0] * 0.01)
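
A combined sketch of the two parking-management overlays, with hypothetical colors, positions, and counts (cv2 Annotator backend assumed; both methods draw directly on im0):

import cv2
import numpy as np

from ultralytics.utils.plotting import Annotator

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a video frame
annotator = Annotator(frame)

# One label at a hypothetical box center, then the overall lot statistics
annotator.display_objects_labels(frame, "car", (255, 255, 255), (0, 0, 255), 320.0, 240.0, 10)
annotator.display_analytics(frame, {"Occupied": "5", "Available": "7"}, (255, 255, 255), (0, 0, 0), 10)
cv2.imwrite("parking.jpg", frame)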