ultralytics 8.1.14 new YOLOv8-World models (#8054)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
Laughing 2024-02-14 18:46:26 +08:00 committed by GitHub
parent f9e9cdf2c3
commit 850ca8587f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 683 additions and 32 deletions

View file

@ -18,6 +18,10 @@ __all__ = (
"C2",
"C3",
"C2f",
"C2fAttn",
"ImagePoolingAttn",
"ContrastiveHead",
"BNContrastiveHead",
"C3x",
"C3TR",
"C3Ghost",
@ -390,3 +394,157 @@ class ResNetLayer(nn.Module):
def forward(self, x):
"""Forward pass through the ResNet layer."""
return self.layer(x)
class MaxSigmoidAttnBlock(nn.Module):
"""Max Sigmoid attention block."""
def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False):
"""Initializes MaxSigmoidAttnBlock with specified arguments."""
super().__init__()
self.nh = nh
self.hc = c2 // nh
self.ec = Conv(c1, ec, k=1, act=False) if c1 != ec else None
self.gl = nn.Linear(gc, ec)
self.bias = nn.Parameter(torch.zeros(nh))
self.proj_conv = Conv(c1, c2, k=3, s=1, act=False)
self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0
def forward(self, x, guide):
"""Forward process."""
bs, _, h, w = x.shape
guide = self.gl(guide)
guide = guide.view(bs, -1, self.nh, self.hc)
embed = self.ec(x) if self.ec is not None else x
embed = embed.view(bs, self.nh, self.hc, h, w)
aw = torch.einsum("bmchw,bnmc->bmhwn", embed, guide)
aw = aw.max(dim=-1)[0]
aw = aw / (self.hc**0.5)
aw = aw + self.bias[None, :, None, None]
aw = aw.sigmoid() * self.scale
x = self.proj_conv(x)
x = x.view(bs, self.nh, -1, h, w)
x = x * aw.unsqueeze(2)
return x.view(bs, -1, h, w)
class C2fAttn(nn.Module):
"""C2f module with an additional attn module."""
def __init__(self, c1, c2, n=1, ec=128, nh=1, gc=512, shortcut=False, g=1, e=0.5):
"""Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
expansion.
"""
super().__init__()
self.c = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, 2 * self.c, 1, 1)
self.cv2 = Conv((3 + n) * self.c, c2, 1) # optional act=FReLU(c2)
self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh)
def forward(self, x, guide):
"""Forward pass through C2f layer."""
y = list(self.cv1(x).chunk(2, 1))
y.extend(m(y[-1]) for m in self.m)
y.append(self.attn(y[-1], guide))
return self.cv2(torch.cat(y, 1))
def forward_split(self, x, guide):
"""Forward pass using split() instead of chunk()."""
y = list(self.cv1(x).split((self.c, self.c), 1))
y.extend(m(y[-1]) for m in self.m)
y.append(self.attn(y[-1], guide))
return self.cv2(torch.cat(y, 1))
class ImagePoolingAttn(nn.Module):
"""ImagePoolingAttn: Enhance the text embeddings with image-aware information."""
def __init__(self, ec=256, ch=(), ct=512, nh=8, k=3, scale=False):
"""Initializes ImagePoolingAttn with specified arguments."""
super().__init__()
nf = len(ch)
self.query = nn.Sequential(nn.LayerNorm(ct), nn.Linear(ct, ec))
self.key = nn.Sequential(nn.LayerNorm(ec), nn.Linear(ec, ec))
self.value = nn.Sequential(nn.LayerNorm(ec), nn.Linear(ec, ec))
self.proj = nn.Linear(ec, ct)
self.scale = nn.Parameter(torch.tensor([0.0]), requires_grad=True) if scale else 1.0
self.projections = nn.ModuleList([nn.Conv2d(in_channels, ec, kernel_size=1) for in_channels in ch])
self.im_pools = nn.ModuleList([nn.AdaptiveMaxPool2d((k, k)) for _ in range(nf)])
self.ec = ec
self.nh = nh
self.nf = nf
self.hc = ec // nh
self.k = k
def forward(self, x, text):
"""Executes attention mechanism on input tensor x and guide tensor."""
bs = x[0].shape[0]
assert len(x) == self.nf
num_patches = self.k**2
x = [pool(proj(x)).view(bs, -1, num_patches) for (x, proj, pool) in zip(x, self.projections, self.im_pools)]
x = torch.cat(x, dim=-1).transpose(1, 2)
q = self.query(text)
k = self.key(x)
v = self.value(x)
# q = q.reshape(1, text.shape[1], self.nh, self.hc).repeat(bs, 1, 1, 1)
q = q.reshape(bs, -1, self.nh, self.hc)
k = k.reshape(bs, -1, self.nh, self.hc)
v = v.reshape(bs, -1, self.nh, self.hc)
aw = torch.einsum("bnmc,bkmc->bmnk", q, k)
aw = aw / (self.hc**0.5)
aw = F.softmax(aw, dim=-1)
x = torch.einsum("bmnk,bkmc->bnmc", aw, v)
x = self.proj(x.reshape(bs, -1, self.ec))
return x * self.scale + text
class ContrastiveHead(nn.Module):
"""Contrastive Head for YOLO-World compute the region-text scores according to the similarity between image and text
features.
"""
def __init__(self):
"""Initializes ContrastiveHead with specified region-text similarity parameters."""
super().__init__()
self.bias = nn.Parameter(torch.zeros([]))
self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())
def forward(self, x, w):
"""Forward function of contrastive learning."""
x = F.normalize(x, dim=1, p=2)
w = F.normalize(w, dim=-1, p=2)
x = torch.einsum("bchw,bkc->bkhw", x, w)
return x * self.logit_scale.exp() + self.bias
class BNContrastiveHead(nn.Module):
"""
Batch Norm Contrastive Head for YOLO-World using batch norm instead of l2-normalization.
Args:
embed_dims (int): Embed dimensions of text and image features.
norm_cfg (dict): Normalization parameters.
"""
def __init__(self, embed_dims: int):
"""Initialize ContrastiveHead with region-text similarity parameters."""
super().__init__()
self.norm = nn.BatchNorm2d(embed_dims)
self.bias = nn.Parameter(torch.zeros([]))
# use -1.0 is more stable
self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))
def forward(self, x, w):
"""Forward function of contrastive learning."""
x = self.norm(x)
w = F.normalize(w, dim=-1, p=2)
x = torch.einsum("bchw,bkc->bkhw", x, w)
return x * self.logit_scale.exp() + self.bias