diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 0d6d8959..87ee2bd4 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -35,7 +35,7 @@ on:
 
 jobs:
   HUB:
-    if: github.repository == 'ultralytics/ultralytics' && (github.event_name == 'schedule' || github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && github.event.inputs.hub == 'true'))
+    if: github.repository == 'ultralytics/ultralytics' && (github.event_name == 'schedule-disabled' || github.event_name == 'push-disabled' || (github.event_name == 'workflow_dispatch' && github.event.inputs.hub == 'true'))
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 547e7185..e2b477da 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
       - id: detect-private-key
 
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.10.1
+    rev: v3.14.0
     hooks:
       - id: pyupgrade
        name: Upgrade code
@@ -34,7 +34,7 @@ repos:
         name: Sort imports
 
   - repo: https://github.com/google/yapf
-    rev: v0.40.0
+    rev: v0.40.2
     hooks:
       - id: yapf
        name: YAPF formatting
@@ -56,7 +56,7 @@ repos:
         name: PEP8
 
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.5
+    rev: v2.2.6
     hooks:
       - id: codespell
        args:
diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index d3fdb693..c7285acd 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = '8.0.191'
+__version__ = '8.0.192'
 
 from ultralytics.models import RTDETR, SAM, YOLO
 from ultralytics.models.fastsam import FastSAM
diff --git a/ultralytics/utils/loss.py b/ultralytics/utils/loss.py
index 69f08dba..dacf326f 100644
--- a/ultralytics/utils/loss.py
+++ b/ultralytics/utils/loss.py
@@ -99,10 +99,10 @@ class KeypointLoss(nn.Module):
     def forward(self, pred_kpts, gt_kpts, kpt_mask, area):
         """Calculates keypoint loss factor and Euclidean distance loss for predicted and actual keypoints."""
         d = (pred_kpts[..., 0] - gt_kpts[..., 0]) ** 2 + (pred_kpts[..., 1] - gt_kpts[..., 1]) ** 2
-        kpt_loss_factor = (torch.sum(kpt_mask != 0) + torch.sum(kpt_mask == 0)) / (torch.sum(kpt_mask != 0) + 1e-9)
+        kpt_loss_factor = kpt_mask.shape[1] / (torch.sum(kpt_mask != 0, dim=1) + 1e-9)
         # e = d / (2 * (area * self.sigmas) ** 2 + 1e-9)  # from formula
         e = d / (2 * self.sigmas) ** 2 / (area + 1e-9) / 2  # from cocoeval
-        return kpt_loss_factor * ((1 - torch.exp(-e)) * kpt_mask).mean()
+        return (kpt_loss_factor.view(-1, 1) * ((1 - torch.exp(-e)) * kpt_mask)).mean()
 
 
 class v8DetectionLoss:
@@ -354,23 +354,13 @@ class v8PoseLoss(v8DetectionLoss):
             keypoints = batch['keypoints'].to(self.device).float().clone()
             keypoints[..., 0] *= imgsz[1]
             keypoints[..., 1] *= imgsz[0]
-            for i in range(batch_size):
-                if fg_mask[i].sum():
-                    idx = target_gt_idx[i][fg_mask[i]]
-                    gt_kpt = keypoints[batch_idx.view(-1) == i][idx]  # (n, 51)
-                    gt_kpt[..., 0] /= stride_tensor[fg_mask[i]]
-                    gt_kpt[..., 1] /= stride_tensor[fg_mask[i]]
-                    area = xyxy2xywh(target_bboxes[i][fg_mask[i]])[:, 2:].prod(1, keepdim=True)
-                    pred_kpt = pred_kpts[i][fg_mask[i]]
-                    kpt_mask = gt_kpt[..., 2] != 0
-                    loss[1] += self.keypoint_loss(pred_kpt, gt_kpt, kpt_mask, area)  # pose loss
-                    # kpt_score loss
-                    if pred_kpt.shape[-1] == 3:
-                        loss[2] += self.bce_pose(pred_kpt[..., 2], kpt_mask.float())  # keypoint obj loss
+
+            loss[1], loss[2] = self.calculate_keypoints_loss(fg_mask, target_gt_idx, keypoints, batch_idx,
+                                                             stride_tensor, target_bboxes, pred_kpts)
 
         loss[0] *= self.hyp.box  # box gain
-        loss[1] *= self.hyp.pose / batch_size  # pose gain
-        loss[2] *= self.hyp.kobj / batch_size  # kobj gain
+        loss[1] *= self.hyp.pose  # pose gain
+        loss[2] *= self.hyp.kobj  # kobj gain
         loss[3] *= self.hyp.cls  # cls gain
         loss[4] *= self.hyp.dfl  # dfl gain
 
@@ -385,6 +375,70 @@ class v8PoseLoss(v8DetectionLoss):
         y[..., 1] += anchor_points[:, [1]] - 0.5
         return y
 
+    def calculate_keypoints_loss(self, masks, target_gt_idx, keypoints, batch_idx, stride_tensor, target_bboxes,
+                                 pred_kpts):
+        """
+        Calculate the keypoints loss for the model.
+
+        This function calculates the keypoints loss and keypoints object loss for a given batch. The keypoints loss is
+        based on the difference between the predicted keypoints and ground truth keypoints. The keypoints object loss is
+        a binary classification loss that classifies whether a keypoint is present or not.
+
+        Args:
+            masks (torch.Tensor): Binary mask tensor indicating object presence, shape (BS, N_anchors).
+            target_gt_idx (torch.Tensor): Index tensor mapping anchors to ground truth objects, shape (BS, N_anchors).
+            keypoints (torch.Tensor): Ground truth keypoints, shape (N_kpts_in_batch, N_kpts_per_object, kpts_dim).
+            batch_idx (torch.Tensor): Batch index tensor for keypoints, shape (N_kpts_in_batch, 1).
+            stride_tensor (torch.Tensor): Stride tensor for anchors, shape (N_anchors, 1).
+            target_bboxes (torch.Tensor): Ground truth boxes in (x1, y1, x2, y2) format, shape (BS, N_anchors, 4).
+            pred_kpts (torch.Tensor): Predicted keypoints, shape (BS, N_anchors, N_kpts_per_object, kpts_dim).
+
+        Returns:
+            (tuple): Returns a tuple containing:
+                - kpts_loss (torch.Tensor): The keypoints loss.
+                - kpts_obj_loss (torch.Tensor): The keypoints object loss.
+        """
+        batch_idx = batch_idx.flatten()
+        batch_size = len(masks)
+
+        # Find the maximum number of keypoints in a single image
+        max_kpts = torch.unique(batch_idx, return_counts=True)[1].max()
+
+        # Create a tensor to hold batched keypoints
+        batched_keypoints = torch.zeros((batch_size, max_kpts, keypoints.shape[1], keypoints.shape[2]),
+                                        device=keypoints.device)
+
+        # TODO: any idea how to vectorize this?
+        # Fill batched_keypoints with keypoints based on batch_idx
+        for i in range(batch_size):
+            keypoints_i = keypoints[batch_idx == i]
+            batched_keypoints[i, :keypoints_i.shape[0]] = keypoints_i
+
+        # Expand dimensions of target_gt_idx to match the shape of batched_keypoints
+        target_gt_idx_expanded = target_gt_idx.unsqueeze(-1).unsqueeze(-1)
+
+        # Use target_gt_idx_expanded to select keypoints from batched_keypoints
+        selected_keypoints = batched_keypoints.gather(
+            1, target_gt_idx_expanded.expand(-1, -1, keypoints.shape[1], keypoints.shape[2]))
+
+        # Divide coordinates by stride
+        selected_keypoints /= stride_tensor.view(1, -1, 1, 1)
+
+        kpts_loss = 0
+        kpts_obj_loss = 0
+
+        if masks.any():
+            gt_kpt = selected_keypoints[masks]
+            area = xyxy2xywh(target_bboxes[masks])[:, 2:].prod(1, keepdim=True)
+            pred_kpt = pred_kpts[masks]
+            kpt_mask = gt_kpt[..., 2] != 0 if gt_kpt.shape[-1] == 3 else torch.full_like(gt_kpt[..., 0], True)
+            kpts_loss = self.keypoint_loss(pred_kpt, gt_kpt, kpt_mask, area)  # pose loss
+
+            if pred_kpt.shape[-1] == 3:
+                kpts_obj_loss = self.bce_pose(pred_kpt[..., 2], kpt_mask.float())  # keypoint obj loss
+
+        return kpts_loss, kpts_obj_loss
+
 
 class v8ClassificationLoss:
     """Criterion class for computing training losses."""
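
Note on the KeypointLoss change above: the old kpt_loss_factor was a single scalar computed over the whole set of matched instances, while the new one is computed per instance along dim=1 of kpt_mask. Below is a minimal standalone sketch (not part of the patch; toy values chosen for illustration) of the difference:

# Toy illustration of the old (batch-global) vs. new (per-instance) kpt_loss_factor.
# Not from the patch; kpt_mask values are made up to show the two normalizations.
import torch

kpt_mask = torch.tensor([[1., 1., 1., 1.],   # instance 0: all 4 keypoints labeled
                         [1., 0., 0., 0.]])  # instance 1: only 1 of 4 labeled

# Old: one scalar shared by all instances -> total elements / total labeled
old_factor = (torch.sum(kpt_mask != 0) + torch.sum(kpt_mask == 0)) / (torch.sum(kpt_mask != 0) + 1e-9)
print(old_factor)  # tensor(1.6000)  -> 8 / 5

# New: one factor per instance, shape (n_instances,)
new_factor = kpt_mask.shape[1] / (torch.sum(kpt_mask != 0, dim=1) + 1e-9)
print(new_factor)  # tensor([1.0000, 4.0000])  -> 4/4 and 4/1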
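
The new calculate_keypoints_loss replaces the removed per-image Python loop with a zero-padded (batch_size, max_kpts, ...) tensor and a single gather over the assigned ground-truth indices. A small self-contained sketch (illustrative shapes and variable names only, not from the patch) showing that the gather reproduces the loop's indexing:

# Toy check: gathering from zero-padded per-image keypoints matches per-image indexing.
# Shapes and names (n_anchors, batched, selected, idx) are illustrative, not from the patch.
import torch

batch_size, n_anchors, n_kpts, kpt_dim = 2, 5, 3, 3
keypoints = torch.rand(7, n_kpts, kpt_dim)                    # 7 GT objects across the whole batch
batch_idx = torch.tensor([0, 0, 0, 0, 1, 1, 1])               # image index of each GT object
target_gt_idx = torch.randint(0, 3, (batch_size, n_anchors))  # per-image GT index assigned to each anchor

# Pad keypoints into a (batch_size, max_kpts, n_kpts, kpt_dim) tensor
max_kpts = torch.unique(batch_idx, return_counts=True)[1].max()
batched = torch.zeros(batch_size, max_kpts, n_kpts, kpt_dim)
for i in range(batch_size):
    kpts_i = keypoints[batch_idx == i]
    batched[i, :kpts_i.shape[0]] = kpts_i

# Vectorized selection: one ground-truth keypoint set per anchor
idx = target_gt_idx.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, n_kpts, kpt_dim)
selected = batched.gather(1, idx)                             # (batch_size, n_anchors, n_kpts, kpt_dim)

# Reference that mirrors the removed loop's indexing
for i in range(batch_size):
    assert torch.equal(selected[i], keypoints[batch_idx == i][target_gt_idx[i]])

Anchors without an assigned object may gather padded zeros, but in the patched method those rows are discarded by the masks indexing before the loss is computed.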