ultralytics 8.1.23 add YOLOv9-C and E models (#8571)
Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
parent e138d701a0
commit 2071776a36
10 changed files with 360 additions and 10 deletions
@@ -40,6 +40,12 @@ from .block import (
     ResNetLayer,
     ContrastiveHead,
     BNContrastiveHead,
+    RepNCSPELAN4,
+    ADown,
+    SPPELAN,
+    CBFuse,
+    CBLinear,
+    Silence,
 )
 from .conv import (
     CBAM,
@@ -123,4 +129,10 @@ __all__ = (
     "ImagePoolingAttn",
     "ContrastiveHead",
     "BNContrastiveHead",
+    "RepNCSPELAN4",
+    "ADown",
+    "SPPELAN",
+    "CBFuse",
+    "CBLinear",
+    "Silence",
 )
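The two hunks above re-export the new YOLOv9 building blocks from ultralytics.nn.modules. A minimal sketch of what that exposes (assuming a build that contains this commit; the tensor is an arbitrary placeholder):

    import torch
    from ultralytics.nn.modules import RepNCSPELAN4, ADown, SPPELAN, CBFuse, CBLinear, Silence

    x = torch.zeros(1, 64, 32, 32)
    assert torch.equal(Silence()(x), x)  # Silence is a pure pass-through (identity) module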
@@ -5,7 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from .conv import Conv, DWConv, GhostConv, LightConv, RepConv
+from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad
 from .transformer import TransformerBlock
 
 __all__ = (
@@ -31,6 +31,12 @@ __all__ = (
     "Proto",
     "RepC3",
     "ResNetLayer",
+    "RepNCSPELAN4",
+    "ADown",
+    "SPPELAN",
+    "CBFuse",
+    "CBLinear",
+    "Silence",
 )
@@ -531,7 +537,6 @@ class BNContrastiveHead(nn.Module):
 
     Args:
         embed_dims (int): Embed dimensions of text and image features.
-        norm_cfg (dict): Normalization parameters.
     """
 
     def __init__(self, embed_dims: int):
@@ -548,3 +553,146 @@ class BNContrastiveHead(nn.Module):
         w = F.normalize(w, dim=-1, p=2)
         x = torch.einsum("bchw,bkc->bkhw", x, w)
         return x * self.logit_scale.exp() + self.bias
+
+
+class RepBottleneck(nn.Module):
+    """Rep bottleneck."""
+
+    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
+        """Initializes a RepBottleneck module with customizable in/out channels, shortcut option, groups and expansion
+        ratio.
+        """
+        super().__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = RepConv(c1, c_, k[0], 1)
+        self.cv2 = Conv(c_, c2, k[1], 1, g=g)
+        self.add = shortcut and c1 == c2
+
+    def forward(self, x):
+        """Forward pass through RepBottleneck layer."""
+        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
+
+
+class RepCSP(nn.Module):
+    """Rep CSP Bottleneck with 3 convolutions."""
+
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
+        """Initializes RepCSP layer with given channels, repetitions, shortcut, groups and expansion ratio."""
+        super().__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = Conv(c1, c_, 1, 1)
+        self.cv2 = Conv(c1, c_, 1, 1)
+        self.cv3 = Conv(2 * c_, c2, 1)  # optional act=FReLU(c2)
+        self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
+
+    def forward(self, x):
+        """Forward pass through RepCSP layer."""
+        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
+
+
+class RepNCSPELAN4(nn.Module):
+    """CSP-ELAN."""
+
+    def __init__(self, c1, c2, c3, c4, n=1):
+        """Initializes CSP-ELAN layer with specified channel sizes, repetitions, and convolutions."""
+        super().__init__()
+        self.c = c3 // 2
+        self.cv1 = Conv(c1, c3, 1, 1)
+        self.cv2 = nn.Sequential(RepCSP(c3 // 2, c4, n), Conv(c4, c4, 3, 1))
+        self.cv3 = nn.Sequential(RepCSP(c4, c4, n), Conv(c4, c4, 3, 1))
+        self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1)
+
+    def forward(self, x):
+        """Forward pass through RepNCSPELAN4 layer."""
+        y = list(self.cv1(x).chunk(2, 1))
+        y.extend((m(y[-1])) for m in [self.cv2, self.cv3])
+        return self.cv4(torch.cat(y, 1))
+
+    def forward_split(self, x):
+        """Forward pass using split() instead of chunk()."""
+        y = list(self.cv1(x).split((self.c, self.c), 1))
+        y.extend(m(y[-1]) for m in [self.cv2, self.cv3])
+        return self.cv4(torch.cat(y, 1))
+
+
+class ADown(nn.Module):
+    """ADown."""
+
+    def __init__(self, c1, c2):
+        """Initializes ADown module with convolution layers to downsample input from channels c1 to c2."""
+        super().__init__()
+        self.c = c2 // 2
+        self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1)
+        self.cv2 = Conv(c1 // 2, self.c, 1, 1, 0)
+
+    def forward(self, x):
+        """Forward pass through ADown layer."""
+        x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
+        x1, x2 = x.chunk(2, 1)
+        x1 = self.cv1(x1)
+        x2 = torch.nn.functional.max_pool2d(x2, 3, 2, 1)
+        x2 = self.cv2(x2)
+        return torch.cat((x1, x2), 1)
+
+
+class SPPELAN(nn.Module):
+    """SPP-ELAN."""
+
+    def __init__(self, c1, c2, c3, k=5):
+        """Initializes SPP-ELAN block with convolution and max pooling layers for spatial pyramid pooling."""
+        super().__init__()
+        self.c = c3
+        self.cv1 = Conv(c1, c3, 1, 1)
+        self.cv2 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
+        self.cv3 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
+        self.cv4 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
+        self.cv5 = Conv(4 * c3, c2, 1, 1)
+
+    def forward(self, x):
+        """Forward pass through SPPELAN layer."""
+        y = [self.cv1(x)]
+        y.extend(m(y[-1]) for m in [self.cv2, self.cv3, self.cv4])
+        return self.cv5(torch.cat(y, 1))
+
+
+class Silence(nn.Module):
+    """Silence."""
+
+    def __init__(self):
+        """Initializes the Silence module."""
+        super(Silence, self).__init__()
+
+    def forward(self, x):
+        """Forward pass through Silence layer."""
+        return x
+
+
+class CBLinear(nn.Module):
+    """CBLinear."""
+
+    def __init__(self, c1, c2s, k=1, s=1, p=None, g=1):
+        """Initializes the CBLinear module, passing inputs unchanged."""
+        super(CBLinear, self).__init__()
+        self.c2s = c2s
+        self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True)
+
+    def forward(self, x):
+        """Forward pass through CBLinear layer."""
+        outs = self.conv(x).split(self.c2s, dim=1)
+        return outs
+
+
+class CBFuse(nn.Module):
+    """CBFuse."""
+
+    def __init__(self, idx):
+        """Initializes CBFuse module with layer index for selective feature fusion."""
+        super(CBFuse, self).__init__()
+        self.idx = idx
+
+    def forward(self, xs):
+        """Forward pass through CBFuse layer."""
+        target_size = xs[-1].shape[2:]
+        res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
+        out = torch.sum(torch.stack(res + xs[-1:]), dim=0)
+        return out
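For orientation, here is a minimal usage sketch of the three main new blocks defined above (channel sizes and the 80x80 input are illustrative placeholders, not values from a YOLOv9 config; the import path assumes the blocks live in ultralytics.nn.modules.block, as the import hunks above indicate):

    import torch
    from ultralytics.nn.modules.block import RepNCSPELAN4, ADown, SPPELAN

    x = torch.randn(1, 64, 80, 80)  # illustrative feature map: 64 channels, 80x80

    # RepNCSPELAN4(c1, c2, c3, c4): 1x1 projection to c3, two RepCSP+Conv branches of width c4,
    # then a 1x1 fusion of c3 + 2*c4 channels down to c2; spatial size is preserved.
    elan = RepNCSPELAN4(64, 128, 128, 64, n=1)
    print(elan(x).shape)  # torch.Size([1, 128, 80, 80])

    # ADown(c1, c2): average-pool, split channels in half, strided 3x3 conv on one half and
    # max-pool + 1x1 conv on the other, then concatenate -> a stride-2 downsample.
    down = ADown(64, 128)
    print(down(x).shape)  # torch.Size([1, 128, 40, 40])

    # SPPELAN(c1, c2, c3): 1x1 conv followed by three chained max-pools (SPP style),
    # concatenated to 4*c3 channels and projected to c2.
    spp = SPPELAN(64, 128, 64)
    print(spp(x).shape)  # torch.Size([1, 128, 80, 80])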
@@ -43,6 +43,12 @@ from ultralytics.nn.modules import (
     RTDETRDecoder,
     Segment,
     WorldDetect,
+    RepNCSPELAN4,
+    ADown,
+    SPPELAN,
+    CBFuse,
+    CBLinear,
+    Silence,
 )
 from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
 from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml
@@ -570,7 +576,7 @@ class WorldModel(DetectionModel):
         text_token = clip.tokenize(text).to(device)
         txt_feats = model.encode_text(text_token).to(dtype=torch.float32)
         txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
-        self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
+        self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1]).detach()
         self.model[-1].nc = len(text)
 
     def init_criterion(self):
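The only functional change in the hunk above is the added .detach(): the cached text embeddings are stored as plain buffers instead of tensors that keep the CLIP text encoder's autograd graph alive. A standalone sketch of the effect, using stand-in tensors rather than the actual CLIP model:

    import torch

    txt_feats = torch.randn(2, 512, requires_grad=True)  # stand-in for model.encode_text(...) output

    cached = txt_feats.reshape(-1, 2, 512)           # previous behavior: still tracks gradients
    frozen = txt_feats.reshape(-1, 2, 512).detach()  # new behavior: a detached buffer

    print(cached.requires_grad, frozen.requires_grad)  # True False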
@@ -850,6 +856,9 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
            C1,
            C2,
            C2f,
+            RepNCSPELAN4,
+            ADown,
+            SPPELAN,
            C2fAttn,
            C3,
            C3TR,
@@ -892,6 +901,12 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
            args[2] = make_divisible(min(args[2], max_channels) * width, 8)
        elif m is RTDETRDecoder:  # special case, channels arg must be passed in index 1
            args.insert(1, [ch[x] for x in f])
+        elif m is CBLinear:
+            c2 = args[0]
+            c1 = ch[f]
+            args = [c1, c2, *args[1:]]
+        elif m is CBFuse:
+            c2 = ch[f[-1]]
        else:
            c2 = ch[f]
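To make the new parse_model branches concrete, here is a standalone restatement of the channel bookkeeping for CBLinear and CBFuse (the ch list, the from-index f, and the YAML argument tails are made-up examples, not taken from a shipped YOLOv9 config):

    # ch holds the output channel count of every layer parsed so far; f is the 'from' index (or list).
    ch = [3, 64, 128, 256]  # hypothetical: network input plus three previously parsed layers

    # CBLinear: args[0] is the list of per-split output channels (c2s). The input channel count c1
    # is looked up from the source layer, then both are prepended to the remaining args.
    f, args = -1, [[64, 128], 1]  # made-up YAML tail: c2s=[64, 128], kernel size 1
    c2 = args[0]                  # [64, 128]
    c1 = ch[f]                    # 256
    args = [c1, c2, *args[1:]]    # -> CBLinear(256, [64, 128], 1)

    # CBFuse: reads features from several layers and resizes them to match the last source,
    # so its output channel count is simply that of the last layer in the 'from' list.
    f = [1, 2, -1]                # made-up 'from' list
    c2 = ch[f[-1]]                # 256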