ultralytics 8.3.0 YOLO11 Models Release (#16539)

Signed-off-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Glenn Jocher authored 2024-09-30 02:59:20 +02:00, committed by GitHub
parent efb0c17881
commit 6e43d1e1e5
50 changed files with 1154 additions and 407 deletions


@@ -1,7 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
__version__ = "8.2.103"
__version__ = "8.3.0"
import os


@@ -115,6 +115,7 @@ bgr: 0.0 # (float) image channel BGR (probability)
mosaic: 1.0 # (float) image mosaic (probability)
mixup: 0.0 # (float) image mixup (probability)
copy_paste: 0.0 # (float) segment copy-paste (probability)
copy_paste_mode: "flip" # (str) the method used for copy_paste augmentation (flip, mixup)
auto_augment: randaugment # (str) auto augmentation policy for classification (randaugment, autoaugment, augmix)
erasing: 0.4 # (float) probability of random erasing during classification training (0-0.9), 0 means no erasing, must be less than 1.0.
crop_fraction: 1.0 # (float) image crop fraction for classification (0.1-1), 1.0 means no crop, must be greater than 0.
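
The new copy_paste_mode key selects between the existing flip-based Copy-Paste and the new mixup-style variant that pastes objects from a second image. A minimal training sketch, assuming this ultralytics release is installed, that the bundled coco8-seg sample dataset can be downloaded, and that copy_paste_mode is accepted as a train-time override like the other augmentation hyperparameters:

from ultralytics import YOLO

model = YOLO("yolo11n-seg.pt")  # Copy-Paste requires segment labels
model.train(data="coco8-seg.yaml", epochs=3, copy_paste=0.3, copy_paste_mode="mixup")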


@@ -0,0 +1,30 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11-cls image classification model. For Usage examples see https://docs.ultralytics.com/tasks/classify
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n-cls.yaml' will call yolo11-cls.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.50, 0.25, 1024] # summary: 151 layers, 1633584 parameters, 1633584 gradients, 3.3 GFLOPs
s: [0.50, 0.50, 1024] # summary: 151 layers, 5545488 parameters, 5545488 gradients, 12.2 GFLOPs
m: [0.50, 1.00, 512] # summary: 187 layers, 10455696 parameters, 10455696 gradients, 39.7 GFLOPs
l: [1.00, 1.00, 512] # summary: 309 layers, 12937104 parameters, 12937104 gradients, 49.9 GFLOPs
x: [1.00, 1.50, 512] # summary: 309 layers, 28458544 parameters, 28458544 gradients, 111.1 GFLOPs
# YOLO11n backbone
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 2, C3k2, [256, False, 0.25]]
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 2, C3k2, [512, False, 0.25]]
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 2, C3k2, [512, True]]
- [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
- [-1, 2, C3k2, [1024, True]]
- [-1, 2, C2PSA, [1024]] # 9
# YOLO11n head
head:
- [-1, 1, Classify, [nc]] # Classify
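
A minimal sketch of building and training this classification config, assuming this ultralytics release is installed and the small mnist160 sample dataset can be downloaded:

from ultralytics import YOLO

model = YOLO("yolo11n-cls.yaml")  # builds yolo11-cls.yaml at scale 'n'
model.train(data="mnist160", epochs=1, imgsz=64)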


@@ -0,0 +1,47 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11 Oriented Bounding Boxes (OBB) model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/obb
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n-obb.yaml' will call yolo11-obb.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.50, 0.25, 1024] # summary: 344 layers, 2695747 parameters, 2695731 gradients, 6.9 GFLOPs
s: [0.50, 0.50, 1024] # summary: 344 layers, 9744931 parameters, 9744915 gradients, 22.7 GFLOPs
m: [0.50, 1.00, 512] # summary: 434 layers, 20963523 parameters, 20963507 gradients, 72.2 GFLOPs
l: [1.00, 1.00, 512] # summary: 656 layers, 26220995 parameters, 26220979 gradients, 91.3 GFLOPs
x: [1.00, 1.50, 512] # summary: 656 layers, 58875331 parameters, 58875315 gradients, 204.3 GFLOPs
# YOLO11n backbone
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 2, C3k2, [256, False, 0.25]]
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 2, C3k2, [512, False, 0.25]]
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 2, C3k2, [512, True]]
- [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
- [-1, 2, C3k2, [1024, True]]
- [-1, 1, SPPF, [1024, 5]] # 9
- [-1, 2, C2PSA, [1024]] # 10
# YOLO11n head
head:
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 6], 1, Concat, [1]] # cat backbone P4
- [-1, 2, C3k2, [512, False]] # 13
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 4], 1, Concat, [1]] # cat backbone P3
- [-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
- [-1, 1, Conv, [256, 3, 2]]
- [[-1, 13], 1, Concat, [1]] # cat head P4
- [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
- [-1, 1, Conv, [512, 3, 2]]
- [[-1, 10], 1, Concat, [1]] # cat head P5
- [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
- [[16, 19, 22], 1, OBB, [nc, 1]] # Detect(P3, P4, P5)
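
A minimal sketch that builds the OBB config and prints its summary, assuming this ultralytics release is installed; the reported layer and parameter counts should match the scale comments above:

from ultralytics import YOLO

model = YOLO("yolo11n-obb.yaml")  # builds yolo11-obb.yaml at scale 'n'
model.info()  # prints layers, parameters, and GFLOPs for the 'n' scale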


@@ -0,0 +1,48 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11-pose keypoints/pose estimation model. For Usage examples see https://docs.ultralytics.com/tasks/pose
# Parameters
nc: 80 # number of classes
kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
scales: # model compound scaling constants, i.e. 'model=yolo11n-pose.yaml' will call yolo11-pose.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.50, 0.25, 1024] # summary: 344 layers, 2908507 parameters, 2908491 gradients, 7.7 GFLOPs
s: [0.50, 0.50, 1024] # summary: 344 layers, 9948811 parameters, 9948795 gradients, 23.5 GFLOPs
m: [0.50, 1.00, 512] # summary: 434 layers, 20973273 parameters, 20973257 gradients, 72.3 GFLOPs
l: [1.00, 1.00, 512] # summary: 656 layers, 26230745 parameters, 26230729 gradients, 91.4 GFLOPs
x: [1.00, 1.50, 512] # summary: 656 layers, 58889881 parameters, 58889865 gradients, 204.3 GFLOPs
# YOLO11n backbone
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 2, C3k2, [256, False, 0.25]]
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 2, C3k2, [512, False, 0.25]]
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 2, C3k2, [512, True]]
- [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
- [-1, 2, C3k2, [1024, True]]
- [-1, 1, SPPF, [1024, 5]] # 9
- [-1, 2, C2PSA, [1024]] # 10
# YOLO11n head
head:
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 6], 1, Concat, [1]] # cat backbone P4
- [-1, 2, C3k2, [512, False]] # 13
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 4], 1, Concat, [1]] # cat backbone P3
- [-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
- [-1, 1, Conv, [256, 3, 2]]
- [[-1, 13], 1, Concat, [1]] # cat head P4
- [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
- [-1, 1, Conv, [512, 3, 2]]
- [[-1, 10], 1, Concat, [1]] # cat head P5
- [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
- [[16, 19, 22], 1, Pose, [nc, kpt_shape]] # Detect(P3, P4, P5)
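
A minimal sketch for the pose config, assuming this ultralytics release is installed and the bundled coco8-pose sample dataset can be downloaded; the kpt_shape defined above is what reaches the Pose head:

from ultralytics import YOLO

model = YOLO("yolo11n-pose.yaml")  # builds yolo11-pose.yaml at scale 'n'
model.train(data="coco8-pose.yaml", epochs=1, imgsz=640)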


@@ -0,0 +1,47 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11-seg instance segmentation model. For Usage examples see https://docs.ultralytics.com/tasks/segment
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n-seg.yaml' will call yolo11-seg.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.50, 0.25, 1024] # summary: 355 layers, 2876848 parameters, 2876832 gradients, 10.5 GFLOPs
s: [0.50, 0.50, 1024] # summary: 355 layers, 10113248 parameters, 10113232 gradients, 35.8 GFLOPs
m: [0.50, 1.00, 512] # summary: 445 layers, 22420896 parameters, 22420880 gradients, 123.9 GFLOPs
l: [1.00, 1.00, 512] # summary: 667 layers, 27678368 parameters, 27678352 gradients, 143.0 GFLOPs
x: [1.00, 1.50, 512] # summary: 667 layers, 62142656 parameters, 62142640 gradients, 320.2 GFLOPs
# YOLO11n backbone
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 2, C3k2, [256, False, 0.25]]
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 2, C3k2, [512, False, 0.25]]
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 2, C3k2, [512, True]]
- [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
- [-1, 2, C3k2, [1024, True]]
- [-1, 1, SPPF, [1024, 5]] # 9
- [-1, 2, C2PSA, [1024]] # 10
# YOLO11n head
head:
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 6], 1, Concat, [1]] # cat backbone P4
- [-1, 2, C3k2, [512, False]] # 13
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 4], 1, Concat, [1]] # cat backbone P3
- [-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
- [-1, 1, Conv, [256, 3, 2]]
- [[-1, 13], 1, Concat, [1]] # cat head P4
- [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
- [-1, 1, Conv, [512, 3, 2]]
- [[-1, 10], 1, Concat, [1]] # cat head P5
- [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
- [[16, 19, 22], 1, Segment, [nc, 32, 256]] # Detect(P3, P4, P5)
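
A minimal inference sketch for the segmentation config, assuming this ultralytics release is installed and the yolo11n-seg.pt weights plus sample image can be downloaded:

from ultralytics import YOLO

model = YOLO("yolo11n-seg.yaml").load("yolo11n-seg.pt")  # build from YAML, then transfer weights
results = model("https://ultralytics.com/images/bus.jpg")
for r in results:
    print(r.boxes.cls, r.masks is not None)  # per-instance classes and whether masks were returned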


@@ -0,0 +1,47 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.50, 0.25, 1024] # summary: 319 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
s: [0.50, 0.50, 1024] # summary: 319 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
m: [0.50, 1.00, 512] # summary: 409 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
l: [1.00, 1.00, 512] # summary: 631 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
x: [1.00, 1.50, 512] # summary: 631 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs
# YOLO11n backbone
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 2, C3k2, [256, False, 0.25]]
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 2, C3k2, [512, False, 0.25]]
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 2, C3k2, [512, True]]
- [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
- [-1, 2, C3k2, [1024, True]]
- [-1, 1, SPPF, [1024, 5]] # 9
- [-1, 2, C2PSA, [1024]] # 10
# YOLO11n head
head:
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 6], 1, Concat, [1]] # cat backbone P4
- [-1, 2, C3k2, [512, False]] # 13
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 4], 1, Concat, [1]] # cat backbone P3
- [-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
- [-1, 1, Conv, [256, 3, 2]]
- [[-1, 13], 1, Concat, [1]] # cat head P4
- [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
- [-1, 1, Conv, [512, 3, 2]]
- [[-1, 10], 1, Concat, [1]] # cat head P5
- [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
- [[16, 19, 22], 1, Detect, [nc]] # Detect(P3, P4, P5)
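
A minimal end-to-end sketch for the detection config, assuming this ultralytics release is installed and the bundled coco8 sample dataset can be downloaded:

from ultralytics import YOLO

model = YOLO("yolo11n.yaml")  # builds yolo11.yaml at scale 'n'
model.train(data="coco8.yaml", epochs=3, imgsz=640)
metrics = model.val()  # evaluate on the coco8 val split
print(metrics.box.map50)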


@@ -1628,92 +1628,105 @@ class LetterBox:
return labels
class CopyPaste:
class CopyPaste(BaseMixTransform):
"""
Implements Copy-Paste augmentation as described in https://arxiv.org/abs/2012.07177.
CopyPaste class for applying Copy-Paste augmentation to image datasets.
This class applies Copy-Paste augmentation on images and their corresponding instances.
This class implements the Copy-Paste augmentation technique as described in the paper "Simple Copy-Paste is a Strong
Data Augmentation Method for Instance Segmentation" (https://arxiv.org/abs/2012.07177). It combines objects from
different images to create new training samples.
Attributes:
p (float): Probability of applying the Copy-Paste augmentation. Must be between 0 and 1.
dataset (Any): The dataset to which Copy-Paste augmentation will be applied.
pre_transform (Callable | None): Optional transform to apply before Copy-Paste.
p (float): Probability of applying Copy-Paste augmentation.
Methods:
__call__: Applies Copy-Paste augmentation to given image and instances.
get_indexes: Returns a random index from the dataset.
_mix_transform: Applies Copy-Paste augmentation to the input labels.
__call__: Applies the Copy-Paste transformation to images and annotations.
Examples:
>>> copypaste = CopyPaste(p=0.5)
>>> augmented_labels = copypaste(labels)
>>> augmented_image = augmented_labels["img"]
>>> from ultralytics.data.augment import CopyPaste
>>> dataset = YourDataset(...) # Your image dataset
>>> copypaste = CopyPaste(dataset, p=0.5)
>>> augmented_labels = copypaste(original_labels)
"""
def __init__(self, p=0.5) -> None:
"""
Initializes the CopyPaste augmentation object.
def __init__(self, dataset=None, pre_transform=None, p=0.5, mode="flip") -> None:
"""Initializes CopyPaste object with dataset, pre_transform, and probability of applying MixUp."""
super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
assert mode in {"flip", "mixup"}, f"Expected `mode` to be `flip` or `mixup`, but got {mode}."
self.mode = mode
This class implements the Copy-Paste augmentation as described in the paper "Simple Copy-Paste is a Strong Data
Augmentation Method for Instance Segmentation" (https://arxiv.org/abs/2012.07177). It applies the Copy-Paste
augmentation on images and their corresponding instances with a given probability.
def get_indexes(self):
"""Returns a list of random indexes from the dataset for CopyPaste augmentation."""
return random.randint(0, len(self.dataset) - 1)
Args:
p (float): The probability of applying the Copy-Paste augmentation. Must be between 0 and 1.
Attributes:
p (float): Stores the probability of applying the augmentation.
Examples:
>>> augment = CopyPaste(p=0.7)
>>> augmented_data = augment(original_data)
"""
self.p = p
def _mix_transform(self, labels):
"""Applies Copy-Paste augmentation to combine objects from another image into the current image."""
labels2 = labels["mix_labels"][0]
return self._transform(labels, labels2)
def __call__(self, labels):
"""
Applies Copy-Paste augmentation to an image and its instances.
"""Applies Copy-Paste augmentation to an image and its labels."""
if len(labels["instances"].segments) == 0 or self.p == 0:
return labels
if self.mode == "flip":
return self._transform(labels)
Args:
labels (Dict): A dictionary containing:
- 'img' (np.ndarray): The image to augment.
- 'cls' (np.ndarray): Class labels for the instances.
- 'instances' (ultralytics.engine.results.Instances): Object containing bounding boxes, segments, etc.
# Get index of one or three other images
indexes = self.get_indexes()
if isinstance(indexes, int):
indexes = [indexes]
Returns:
(Dict): Dictionary with augmented image and updated instances under 'img', 'cls', and 'instances' keys.
# Get images information will be used for Mosaic or MixUp
mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
Examples:
>>> labels = {"img": np.random.rand(640, 640, 3), "cls": np.array([0, 1, 2]), "instances": Instances(...)}
>>> augmenter = CopyPaste(p=0.5)
>>> augmented_labels = augmenter(labels)
"""
im = labels["img"]
cls = labels["cls"]
if self.pre_transform is not None:
for i, data in enumerate(mix_labels):
mix_labels[i] = self.pre_transform(data)
labels["mix_labels"] = mix_labels
# Update cls and texts
labels = self._update_label_text(labels)
# Mosaic or MixUp
labels = self._mix_transform(labels)
labels.pop("mix_labels", None)
return labels
def _transform(self, labels1, labels2={}):
"""Applies Copy-Paste augmentation to combine objects from another image into the current image."""
im = labels1["img"]
cls = labels1["cls"]
h, w = im.shape[:2]
instances = labels.pop("instances")
instances = labels1.pop("instances")
instances.convert_bbox(format="xyxy")
instances.denormalize(w, h)
if self.p and len(instances.segments):
_, w, _ = im.shape # height, width, channels
im_new = np.zeros(im.shape, np.uint8)
# Calculate ioa first then select indexes randomly
ins_flip = deepcopy(instances)
ins_flip.fliplr(w)
im_new = np.zeros(im.shape, np.uint8)
instances2 = labels2.pop("instances", None)
if instances2 is None:
instances2 = deepcopy(instances)
instances2.fliplr(w)
ioa = bbox_ioa(instances2.bboxes, instances.bboxes) # intersection over area, (N, M)
indexes = np.nonzero((ioa < 0.30).all(1))[0] # (N, )
n = len(indexes)
sorted_idx = np.argsort(ioa.max(1)[indexes])
indexes = indexes[sorted_idx]
for j in indexes[: round(self.p * n)]:
cls = np.concatenate((cls, labels2.get("cls", cls)[[j]]), axis=0)
instances = Instances.concatenate((instances, instances2[[j]]), axis=0)
cv2.drawContours(im_new, instances2.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
ioa = bbox_ioa(ins_flip.bboxes, instances.bboxes) # intersection over area, (N, M)
indexes = np.nonzero((ioa < 0.30).all(1))[0] # (N, )
n = len(indexes)
for j in random.sample(list(indexes), k=round(self.p * n)):
cls = np.concatenate((cls, cls[[j]]), axis=0)
instances = Instances.concatenate((instances, ins_flip[[j]]), axis=0)
cv2.drawContours(im_new, instances.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
result = labels2.get("img", cv2.flip(im, 1)) # augment segments
i = im_new.astype(bool)
im[i] = result[i]
result = cv2.flip(im, 1) # augment segments (flip left-right)
i = cv2.flip(im_new, 1).astype(bool)
im[i] = result[i]
labels["img"] = im
labels["cls"] = cls
labels["instances"] = instances
return labels
labels1["img"] = im
labels1["cls"] = cls
labels1["instances"] = instances
return labels1
class Albumentations:
@@ -2259,9 +2272,9 @@ class RandomLoadText:
def v8_transforms(dataset, imgsz, hyp, stretch=False):
"""
Applies a series of image transformations for YOLOv8 training.
Applies a series of image transformations for training.
This function creates a composition of image augmentation techniques to prepare images for YOLOv8 training.
This function creates a composition of image augmentation techniques to prepare images for YOLO training.
It includes operations such as mosaic, copy-paste, random perspective, mixup, and various color adjustments.
Args:
@@ -2280,20 +2293,28 @@ def v8_transforms(dataset, imgsz, hyp, stretch=False):
>>> transforms = v8_transforms(dataset, imgsz=640, hyp=hyp)
>>> augmented_data = transforms(dataset[0])
"""
pre_transform = Compose(
[
Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic),
CopyPaste(p=hyp.copy_paste),
RandomPerspective(
degrees=hyp.degrees,
translate=hyp.translate,
scale=hyp.scale,
shear=hyp.shear,
perspective=hyp.perspective,
pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
),
]
mosaic = Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic)
affine = RandomPerspective(
degrees=hyp.degrees,
translate=hyp.translate,
scale=hyp.scale,
shear=hyp.shear,
perspective=hyp.perspective,
pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
)
pre_transform = Compose([mosaic, affine])
if hyp.copy_paste_mode == "flip":
pre_transform.insert(1, CopyPaste(p=hyp.copy_paste, mode=hyp.copy_paste_mode))
else:
pre_transform.append(
CopyPaste(
dataset,
pre_transform=Compose([Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic), affine]),
p=hyp.copy_paste,
mode=hyp.copy_paste_mode,
)
)
flip_idx = dataset.data.get("flip_idx", []) # for keypoints augmentation
if dataset.use_keypoints:
kpt_shape = dataset.data.get("kpt_shape", None)
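
CopyPaste now subclasses BaseMixTransform: mode="flip" keeps the previous standalone behavior (no dataset needed, objects are mirrored within the same image), while mode="mixup" pulls a second image from the dataset via get_indexes and _mix_transform, which is why v8_transforms wires it in differently depending on hyp.copy_paste_mode. A minimal sketch of the new constructor, assuming this ultralytics release is installed:

from ultralytics.data.augment import CopyPaste

cp = CopyPaste(p=0.5)  # mode defaults to "flip"; dataset and pre_transform may stay None in this mode
print(cp.mode, cp.p)  # flip 0.5

try:
    CopyPaste(p=0.5, mode="cutmix")  # only "flip" and "mixup" are accepted
except AssertionError as err:
    print(err)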


@@ -538,6 +538,8 @@ class BaseTrainer:
self.best.write_bytes(serialized_ckpt) # save best.pt
if (self.save_period > 0) and (self.epoch % self.save_period == 0):
(self.wdir / f"epoch{self.epoch}.pt").write_bytes(serialized_ckpt) # save epoch, i.e. 'epoch3.pt'
# if self.args.close_mosaic and self.epoch == (self.epochs - self.args.close_mosaic - 1):
# (self.wdir / "last_mosaic.pt").write_bytes(serialized_ckpt) # save mosaic checkpoint
def get_dataset(self):
"""
@@ -698,7 +700,12 @@ class BaseTrainer:
resume = True
self.args = get_cfg(ckpt_args)
self.args.model = self.args.resume = str(last) # reinstate model
for k in "imgsz", "batch", "device": # allow arg updates to reduce memory or update device on resume
for k in (
"imgsz",
"batch",
"device",
"close_mosaic",
): # allow arg updates to reduce memory or update device on resume
if k in overrides:
setattr(self.args, k, overrides[k])
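
With close_mosaic added to the resume whitelist, an interrupted run can now be resumed with mosaic shut off earlier, alongside the existing memory-related overrides. A minimal sketch, assuming this ultralytics release is installed; the checkpoint path is illustrative:

from ultralytics import YOLO

model = YOLO("runs/detect/train/weights/last.pt")  # illustrative path to an interrupted run
model.train(resume=True, imgsz=480, batch=8, close_mosaic=0)  # overrides applied by check_resume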


@@ -20,6 +20,7 @@ Example:
from .block import (
C1,
C2,
C2PSA,
C3,
C3TR,
CIB,
@@ -38,7 +39,9 @@ from .block import (
C2f,
C2fAttn,
C2fCIB,
C2fPSA,
C3Ghost,
C3k2,
C3x,
CBFuse,
CBLinear,
@@ -110,6 +113,10 @@ __all__ = (
"C2",
"C3",
"C2f",
"C3k2",
"SCDown",
"C2fPSA",
"C2PSA",
"C2fAttn",
"C3x",
"C3TR",
@@ -149,5 +156,4 @@ __all__ = (
"C2fCIB",
"Attention",
"PSA",
"SCDown",
)


@@ -40,6 +40,9 @@ __all__ = (
"SPPELAN",
"CBFuse",
"CBLinear",
"C3k2",
"C2fPSA",
"C2PSA",
"RepVGGDW",
"CIB",
"C2fCIB",
@@ -696,6 +699,49 @@ class CBFuse(nn.Module):
return torch.sum(torch.stack(res + xs[-1:]), dim=0)
class C3f(nn.Module):
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
"""Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
expansion.
"""
super().__init__()
c_ = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, c_, 1, 1)
self.cv2 = Conv(c1, c_, 1, 1)
self.cv3 = Conv((2 + n) * c_, c2, 1) # optional act=FReLU(c2)
self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
def forward(self, x):
"""Forward pass through C2f layer."""
y = [self.cv2(x), self.cv1(x)]
y.extend(m(y[-1]) for m in self.m)
return self.cv3(torch.cat(y, 1))
class C3k2(C2f):
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
"""Initializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks."""
super().__init__(c1, c2, n, shortcut, g, e)
self.m = nn.ModuleList(
C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
)
class C3k(C3):
"""C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
"""Initializes the C3k module with specified channels, number of layers, and configurations."""
super().__init__(c1, c2, n, shortcut, g, e)
c_ = int(c2 * e) # hidden channels
# self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
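
A minimal shape check for the new C3k2 block, assuming this ultralytics release is installed; the c3k flag is the argument parse_model flips to True for the m/l/x scales:

import torch
from ultralytics.nn.modules import C3k2

x = torch.randn(1, 64, 40, 40)
fast = C3k2(64, 128, n=2, c3k=False)   # plain Bottleneck inner blocks (n/s scales)
heavy = C3k2(64, 128, n=2, c3k=True)   # C3k inner blocks (m/l/x scales)
print(fast(x).shape, heavy(x).shape)   # both torch.Size([1, 128, 40, 40])
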
class RepVGGDW(torch.nn.Module):
"""RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
@@ -873,25 +919,69 @@ class Attention(nn.Module):
return x
class PSA(nn.Module):
class PSABlock(nn.Module):
"""
Position-wise Spatial Attention module.
PSABlock class implementing a Position-Sensitive Attention block for neural networks.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
e (float): Expansion factor for the intermediate channels. Default is 0.5.
This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
with optional shortcut connections.
Attributes:
c (int): Number of intermediate channels.
attn (Attention): Multi-head attention module.
ffn (nn.Sequential): Feed-forward neural network module.
add (bool): Flag indicating whether to add shortcut connections.
Methods:
forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.
Examples:
Create a PSABlock and perform a forward pass
>>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
>>> input_tensor = torch.randn(1, 128, 32, 32)
>>> output_tensor = psablock(input_tensor)
"""
def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
"""Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction."""
super().__init__()
self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False))
self.add = shortcut
def forward(self, x):
"""Executes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor."""
x = x + self.attn(x) if self.add else self.attn(x)
x = x + self.ffn(x) if self.add else self.ffn(x)
return x
class PSA(nn.Module):
"""
PSA class for implementing Position-Sensitive Attention in neural networks.
This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
input tensors, enhancing feature extraction and processing capabilities.
Attributes:
c (int): Number of hidden channels after applying the initial convolution.
cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
attn (Attention): Attention module for spatial attention.
ffn (nn.Sequential): Feed-forward network module.
attn (Attention): Attention module for position-sensitive attention.
ffn (nn.Sequential): Feed-forward network for further processing.
Methods:
forward: Applies position-sensitive attention and feed-forward network to the input tensor.
Examples:
Create a PSA module and apply it to an input tensor
>>> psa = PSA(c1=128, c2=128, e=0.5)
>>> input_tensor = torch.randn(1, 128, 64, 64)
>>> output_tensor = psa.forward(input_tensor)
"""
def __init__(self, c1, c2, e=0.5):
"""Initializes convolution layers, attention module, and feed-forward network with channel reduction."""
"""Initializes the PSA module with input/output channels and attention mechanism for feature extraction."""
super().__init__()
assert c1 == c2
self.c = int(c1 * e)
@@ -902,46 +992,117 @@ class PSA(nn.Module):
self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
def forward(self, x):
"""
Forward pass of the PSA module.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
"""Executes forward pass in PSA module, applying attention and feed-forward layers to the input tensor."""
a, b = self.cv1(x).split((self.c, self.c), dim=1)
b = b + self.attn(b)
b = b + self.ffn(b)
return self.cv2(torch.cat((a, b), 1))
class C2PSA(nn.Module):
"""
C2PSA module with attention mechanism for enhanced feature extraction and processing.
This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.
Attributes:
c (int): Number of hidden channels.
cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.
Methods:
forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.
Notes:
This module is essentially the same as the PSA module, but refactored to allow stacking more PSABlock modules.
Examples:
>>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
>>> input_tensor = torch.randn(1, 256, 64, 64)
>>> output_tensor = c2psa(input_tensor)
"""
def __init__(self, c1, c2, n=1, e=0.5):
"""Initializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio."""
super().__init__()
assert c1 == c2
self.c = int(c1 * e)
self.cv1 = Conv(c1, 2 * self.c, 1, 1)
self.cv2 = Conv(2 * self.c, c1, 1)
self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
def forward(self, x):
"""Processes the input tensor 'x' through a series of PSA blocks and returns the transformed tensor."""
a, b = self.cv1(x).split((self.c, self.c), dim=1)
b = self.m(b)
return self.cv2(torch.cat((a, b), 1))
class C2fPSA(C2f):
"""
C2fPSA module with enhanced feature extraction using PSA blocks.
This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.
Attributes:
c (int): Number of hidden channels.
cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
m (nn.ModuleList): List of PSA blocks for feature extraction.
Methods:
forward: Performs a forward pass through the C2fPSA module.
forward_split: Performs a forward pass using split() instead of chunk().
Examples:
>>> import torch
>>> from ultralytics.nn.modules import C2fPSA
>>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
>>> x = torch.randn(1, 64, 128, 128)
>>> output = model(x)
>>> print(output.shape)
"""
def __init__(self, c1, c2, n=1, e=0.5):
"""Initializes the C2fPSA module, a variant of C2f with PSA blocks for enhanced feature extraction."""
assert c1 == c2
super().__init__(c1, c2, n=n, e=e)
self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))
class SCDown(nn.Module):
"""Spatial Channel Downsample (SCDown) module for reducing spatial and channel dimensions."""
"""
SCDown module for downsampling with separable convolutions.
This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.
Attributes:
cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.
Methods:
forward: Applies the SCDown module to the input tensor.
Examples:
>>> import torch
>>> from ultralytics.nn.modules import SCDown
>>> model = SCDown(c1=64, c2=128, k=3, s=2)
>>> x = torch.randn(1, 64, 128, 128)
>>> y = model(x)
>>> print(y.shape)
torch.Size([1, 128, 64, 64])
"""
def __init__(self, c1, c2, k, s):
"""
Spatial Channel Downsample (SCDown) module.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size for the convolutional layer.
s (int): Stride for the convolutional layer.
"""
"""Initializes the SCDown module with specified input/output channels, kernel size, and stride."""
super().__init__()
self.cv1 = Conv(c1, c2, 1, 1)
self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
def forward(self, x):
"""
Forward pass of the SCDown module.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor after applying the SCDown module.
"""
"""Applies convolution and downsampling to the input tensor in the SCDown module."""
return self.cv2(self.cv1(x))


@@ -209,7 +209,8 @@ class RepConv(nn.Module):
kernelid, biasid = self._fuse_bn_tensor(self.bn)
return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
@staticmethod
def _pad_1x1_to_3x3_tensor(kernel1x1):
"""Pads a 1x1 tensor to a 3x3 tensor."""
if kernel1x1 is None:
return 0


@@ -11,7 +11,7 @@ from torch.nn.init import constant_, xavier_uniform_
from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
from .conv import Conv
from .conv import Conv, DWConv
from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
from .utils import bias_init_with_prob, linear_init
@@ -41,7 +41,14 @@ class Detect(nn.Module):
self.cv2 = nn.ModuleList(
nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
)
self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
self.cv3 = nn.ModuleList(
nn.Sequential(
nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
nn.Conv2d(c3, self.nc, 1),
)
for x in ch
)
self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
if self.end2end:
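
The classification branch cv3 now uses depthwise 3x3 convolutions followed by 1x1 pointwise convolutions instead of two full 3x3 convolutions, which cuts its parameter count substantially. A minimal comparison sketch, assuming this ultralytics release is installed; the channel sizes are illustrative:

import torch.nn as nn
from ultralytics.nn.modules import Conv, DWConv

x, c3, nc = 256, 256, 80  # illustrative channels for one detection level
old_cv3 = nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, nc, 1))
new_cv3 = nn.Sequential(
    nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
    nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
    nn.Conv2d(c3, nc, 1),
)

def n_params(m):
    return sum(p.numel() for p in m.parameters())

print(n_params(old_cv3), n_params(new_cv3))  # the depthwise version is several times smaller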


@@ -13,6 +13,7 @@ from ultralytics.nn.modules import (
AIFI,
C1,
C2,
C2PSA,
C3,
C3TR,
ELAN1,
@@ -28,7 +29,9 @@ from ultralytics.nn.modules import (
C2f,
C2fAttn,
C2fCIB,
C2fPSA,
C3Ghost,
C3k2,
C3x,
CBFuse,
CBLinear,
@@ -968,12 +971,15 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
GhostBottleneck,
SPP,
SPPF,
C2fPSA,
C2PSA,
DWConv,
Focus,
BottleneckCSP,
C1,
C2,
C2f,
C3k2,
RepNCSPELAN4,
ELAN1,
ADown,
@@ -1001,9 +1007,26 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
) # num heads
args = [c1, c2, *args[1:]]
if m in {BottleneckCSP, C1, C2, C2f, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3, C2fCIB}:
if m in {
BottleneckCSP,
C1,
C2,
C2f,
C3k2,
C2fAttn,
C3,
C3TR,
C3Ghost,
C3x,
RepC3,
C2fPSA,
C2fCIB,
C2PSA,
}:
args.insert(2, n) # number of repeats
n = 1
if m is C3k2 and scale in "mlx": # for M/L/X sizes
args[3] = True
elif m is AIFI:
args = [ch[f], *args]
elif m in {HGStem, HGBlock}:
@@ -1080,7 +1103,7 @@ def guess_model_scale(model_path):
with contextlib.suppress(AttributeError):
import re
return re.search(r"yolov\d+([nslmx])", Path(model_path).stem).group(1) # n, s, m, l, or x
return re.search(r"yolo[v]?\d+([nslmx])", Path(model_path).stem).group(1) # n, s, m, l, or x
return ""


@@ -18,6 +18,7 @@ from ultralytics.utils import LOGGER, TQDM, checks, clean_url, emojis, is_online
GITHUB_ASSETS_REPO = "ultralytics/assets"
GITHUB_ASSETS_NAMES = (
[f"yolov8{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb", "-oiv7")]
+ [f"yolo11{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb")]
+ [f"yolov5{k}{resolution}u.pt" for k in "nsmlx" for resolution in ("", "6")]
+ [f"yolov3{k}u.pt" for k in ("", "-spp", "-tiny")]
+ [f"yolov8{k}-world.pt" for k in "smlx"]
@@ -408,7 +409,7 @@ def get_github_assets(repo="ultralytics/assets", version="latest", retry=False):
return data["tag_name"], [x["name"] for x in data["assets"]] # tag, assets i.e. ['yolov8n.pt', 'yolov8s.pt', ...]
def attempt_download_asset(file, repo="ultralytics/assets", release="v8.2.0", **kwargs):
def attempt_download_asset(file, repo="ultralytics/assets", release="v8.3.0", **kwargs):
"""
Attempt to download a file from GitHub release assets if it is not found locally. The function checks for the file
locally first, then tries to download it from the specified GitHub repository release.
@@ -416,7 +417,7 @@ def attempt_download_asset(file, repo="ultralytics/assets", release="v8.2.0", **
Args:
file (str | Path): The filename or file path to be downloaded.
repo (str, optional): The GitHub repository in the format 'owner/repo'. Defaults to 'ultralytics/assets'.
release (str, optional): The specific release version to be downloaded. Defaults to 'v8.2.0'.
release (str, optional): The specific release version to be downloaded. Defaults to 'v8.3.0'.
**kwargs (any): Additional keyword arguments for the download process.
Returns:


@@ -228,8 +228,11 @@ class v8DetectionLoss:
# Pboxes
pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4)
# dfl_conf = pred_distri.view(batch_size, -1, 4, self.reg_max).detach().softmax(-1)
# dfl_conf = (dfl_conf.amax(-1).mean(-1) + dfl_conf.amax(-1).amin(-1)) / 2
_, target_bboxes, target_scores, fg_mask, _ = self.assigner(
# pred_scores.detach().sigmoid() * 0.8 + dfl_conf.unsqueeze(-1) * 0.2,
pred_scores.detach().sigmoid(),
(pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
anchor_points * stride_tensor,


@@ -159,7 +159,7 @@ def select_device(device="", batch=0, newline=False, verbose=True):
if isinstance(device, torch.device):
return device
s = f"Ultralytics YOLOv{__version__} 🚀 Python-{PYTHON_VERSION} torch-{torch.__version__} "
s = f"Ultralytics {__version__} 🚀 Python-{PYTHON_VERSION} torch-{torch.__version__} "
device = str(device).lower()
for remove in "cuda:", "none", "(", ")", "[", "]", "'", " ":
device = device.replace(remove, "") # to string, 'cuda:0' -> '0' and '(0, 1)' -> '0,1'