ultralytics 8.0.239 Ultralytics Actions and hub-sdk adoption (#7431)
Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Burhan <62214284+Burhan-Q@users.noreply.github.com>
Co-authored-by: Kayzwer <68285002+Kayzwer@users.noreply.github.com>
This commit is contained in:
parent
e795277391
commit
fe27db2f6e
139 changed files with 6870 additions and 5125 deletions
|
|
@ -17,18 +17,101 @@ Example:
|
|||
```
|
||||
"""
|
||||
|
||||
from .block import (C1, C2, C3, C3TR, DFL, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, GhostBottleneck,
|
||||
HGBlock, HGStem, Proto, RepC3, ResNetLayer)
|
||||
from .conv import (CBAM, ChannelAttention, Concat, Conv, Conv2, ConvTranspose, DWConv, DWConvTranspose2d, Focus,
|
||||
GhostConv, LightConv, RepConv, SpatialAttention)
|
||||
from .block import (
|
||||
C1,
|
||||
C2,
|
||||
C3,
|
||||
C3TR,
|
||||
DFL,
|
||||
SPP,
|
||||
SPPF,
|
||||
Bottleneck,
|
||||
BottleneckCSP,
|
||||
C2f,
|
||||
C3Ghost,
|
||||
C3x,
|
||||
GhostBottleneck,
|
||||
HGBlock,
|
||||
HGStem,
|
||||
Proto,
|
||||
RepC3,
|
||||
ResNetLayer,
|
||||
)
|
||||
from .conv import (
|
||||
CBAM,
|
||||
ChannelAttention,
|
||||
Concat,
|
||||
Conv,
|
||||
Conv2,
|
||||
ConvTranspose,
|
||||
DWConv,
|
||||
DWConvTranspose2d,
|
||||
Focus,
|
||||
GhostConv,
|
||||
LightConv,
|
||||
RepConv,
|
||||
SpatialAttention,
|
||||
)
|
||||
from .head import OBB, Classify, Detect, Pose, RTDETRDecoder, Segment
|
||||
from .transformer import (AIFI, MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer, LayerNorm2d,
|
||||
MLPBlock, MSDeformAttn, TransformerBlock, TransformerEncoderLayer, TransformerLayer)
|
||||
from .transformer import (
|
||||
AIFI,
|
||||
MLP,
|
||||
DeformableTransformerDecoder,
|
||||
DeformableTransformerDecoderLayer,
|
||||
LayerNorm2d,
|
||||
MLPBlock,
|
||||
MSDeformAttn,
|
||||
TransformerBlock,
|
||||
TransformerEncoderLayer,
|
||||
TransformerLayer,
|
||||
)
|
||||
|
||||
__all__ = ('Conv', 'Conv2', 'LightConv', 'RepConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus',
|
||||
'GhostConv', 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'TransformerLayer',
|
||||
'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3',
|
||||
'C2f', 'C3x', 'C3TR', 'C3Ghost', 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'Detect',
|
||||
'Segment', 'Pose', 'Classify', 'TransformerEncoderLayer', 'RepC3', 'RTDETRDecoder', 'AIFI',
|
||||
'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP', 'ResNetLayer',
|
||||
'OBB')
|
||||
__all__ = (
|
||||
"Conv",
|
||||
"Conv2",
|
||||
"LightConv",
|
||||
"RepConv",
|
||||
"DWConv",
|
||||
"DWConvTranspose2d",
|
||||
"ConvTranspose",
|
||||
"Focus",
|
||||
"GhostConv",
|
||||
"ChannelAttention",
|
||||
"SpatialAttention",
|
||||
"CBAM",
|
||||
"Concat",
|
||||
"TransformerLayer",
|
||||
"TransformerBlock",
|
||||
"MLPBlock",
|
||||
"LayerNorm2d",
|
||||
"DFL",
|
||||
"HGBlock",
|
||||
"HGStem",
|
||||
"SPP",
|
||||
"SPPF",
|
||||
"C1",
|
||||
"C2",
|
||||
"C3",
|
||||
"C2f",
|
||||
"C3x",
|
||||
"C3TR",
|
||||
"C3Ghost",
|
||||
"GhostBottleneck",
|
||||
"Bottleneck",
|
||||
"BottleneckCSP",
|
||||
"Proto",
|
||||
"Detect",
|
||||
"Segment",
|
||||
"Pose",
|
||||
"Classify",
|
||||
"TransformerEncoderLayer",
|
||||
"RepC3",
|
||||
"RTDETRDecoder",
|
||||
"AIFI",
|
||||
"DeformableTransformerDecoder",
|
||||
"DeformableTransformerDecoderLayer",
|
||||
"MSDeformAttn",
|
||||
"MLP",
|
||||
"ResNetLayer",
|
||||
"OBB",
|
||||
)
|
||||
|
|
|
|||
|
|
@ -8,8 +8,26 @@ import torch.nn.functional as F
|
|||
from .conv import Conv, DWConv, GhostConv, LightConv, RepConv
|
||||
from .transformer import TransformerBlock
|
||||
|
||||
__all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost',
|
||||
'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3', 'ResNetLayer')
|
||||
__all__ = (
|
||||
"DFL",
|
||||
"HGBlock",
|
||||
"HGStem",
|
||||
"SPP",
|
||||
"SPPF",
|
||||
"C1",
|
||||
"C2",
|
||||
"C3",
|
||||
"C2f",
|
||||
"C3x",
|
||||
"C3TR",
|
||||
"C3Ghost",
|
||||
"GhostBottleneck",
|
||||
"Bottleneck",
|
||||
"BottleneckCSP",
|
||||
"Proto",
|
||||
"RepC3",
|
||||
"ResNetLayer",
|
||||
)
|
||||
|
||||
|
||||
class DFL(nn.Module):
|
||||
|
|
@ -284,9 +302,11 @@ class GhostBottleneck(nn.Module):
|
|||
self.conv = nn.Sequential(
|
||||
GhostConv(c1, c_, 1, 1), # pw
|
||||
DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw
|
||||
GhostConv(c_, c2, 1, 1, act=False)) # pw-linear
|
||||
self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1,
|
||||
act=False)) if s == 2 else nn.Identity()
|
||||
GhostConv(c_, c2, 1, 1, act=False), # pw-linear
|
||||
)
|
||||
self.shortcut = (
|
||||
nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
"""Applies skip connection and concatenation to input tensor."""
|
||||
|
|
@ -359,8 +379,9 @@ class ResNetLayer(nn.Module):
|
|||
self.is_first = is_first
|
||||
|
||||
if self.is_first:
|
||||
self.layer = nn.Sequential(Conv(c1, c2, k=7, s=2, p=3, act=True),
|
||||
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
|
||||
self.layer = nn.Sequential(
|
||||
Conv(c1, c2, k=7, s=2, p=3, act=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
)
|
||||
else:
|
||||
blocks = [ResNetBlock(c1, c2, s, e=e)]
|
||||
blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)])
|
||||
|
|
|
|||
|
|
@ -7,8 +7,21 @@ import numpy as np
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
__all__ = ('Conv', 'Conv2', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv',
|
||||
'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv')
|
||||
__all__ = (
|
||||
"Conv",
|
||||
"Conv2",
|
||||
"LightConv",
|
||||
"DWConv",
|
||||
"DWConvTranspose2d",
|
||||
"ConvTranspose",
|
||||
"Focus",
|
||||
"GhostConv",
|
||||
"ChannelAttention",
|
||||
"SpatialAttention",
|
||||
"CBAM",
|
||||
"Concat",
|
||||
"RepConv",
|
||||
)
|
||||
|
||||
|
||||
def autopad(k, p=None, d=1): # kernel, padding, dilation
|
||||
|
|
@ -22,6 +35,7 @@ def autopad(k, p=None, d=1): # kernel, padding, dilation
|
|||
|
||||
class Conv(nn.Module):
|
||||
"""Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
|
||||
|
||||
default_act = nn.SiLU() # default activation
|
||||
|
||||
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
|
||||
|
|
@ -60,9 +74,9 @@ class Conv2(Conv):
|
|||
"""Fuse parallel convolutions."""
|
||||
w = torch.zeros_like(self.conv.weight.data)
|
||||
i = [x // 2 for x in w.shape[2:]]
|
||||
w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone()
|
||||
w[:, :, i[0] : i[0] + 1, i[1] : i[1] + 1] = self.cv2.weight.data.clone()
|
||||
self.conv.weight.data += w
|
||||
self.__delattr__('cv2')
|
||||
self.__delattr__("cv2")
|
||||
self.forward = self.forward_fuse
|
||||
|
||||
|
||||
|
|
@ -102,6 +116,7 @@ class DWConvTranspose2d(nn.ConvTranspose2d):
|
|||
|
||||
class ConvTranspose(nn.Module):
|
||||
"""Convolution transpose 2d layer."""
|
||||
|
||||
default_act = nn.SiLU() # default activation
|
||||
|
||||
def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
|
||||
|
|
@ -164,6 +179,7 @@ class RepConv(nn.Module):
|
|||
This module is used in RT-DETR.
|
||||
Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
|
||||
"""
|
||||
|
||||
default_act = nn.SiLU() # default activation
|
||||
|
||||
def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
|
||||
|
|
@ -214,7 +230,7 @@ class RepConv(nn.Module):
|
|||
beta = branch.bn.bias
|
||||
eps = branch.bn.eps
|
||||
elif isinstance(branch, nn.BatchNorm2d):
|
||||
if not hasattr(self, 'id_tensor'):
|
||||
if not hasattr(self, "id_tensor"):
|
||||
input_dim = self.c1 // self.g
|
||||
kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
|
||||
for i in range(self.c1):
|
||||
|
|
@ -232,29 +248,31 @@ class RepConv(nn.Module):
|
|||
|
||||
def fuse_convs(self):
|
||||
"""Combines two convolution layers into a single layer and removes unused attributes from the class."""
|
||||
if hasattr(self, 'conv'):
|
||||
if hasattr(self, "conv"):
|
||||
return
|
||||
kernel, bias = self.get_equivalent_kernel_bias()
|
||||
self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
|
||||
out_channels=self.conv1.conv.out_channels,
|
||||
kernel_size=self.conv1.conv.kernel_size,
|
||||
stride=self.conv1.conv.stride,
|
||||
padding=self.conv1.conv.padding,
|
||||
dilation=self.conv1.conv.dilation,
|
||||
groups=self.conv1.conv.groups,
|
||||
bias=True).requires_grad_(False)
|
||||
self.conv = nn.Conv2d(
|
||||
in_channels=self.conv1.conv.in_channels,
|
||||
out_channels=self.conv1.conv.out_channels,
|
||||
kernel_size=self.conv1.conv.kernel_size,
|
||||
stride=self.conv1.conv.stride,
|
||||
padding=self.conv1.conv.padding,
|
||||
dilation=self.conv1.conv.dilation,
|
||||
groups=self.conv1.conv.groups,
|
||||
bias=True,
|
||||
).requires_grad_(False)
|
||||
self.conv.weight.data = kernel
|
||||
self.conv.bias.data = bias
|
||||
for para in self.parameters():
|
||||
para.detach_()
|
||||
self.__delattr__('conv1')
|
||||
self.__delattr__('conv2')
|
||||
if hasattr(self, 'nm'):
|
||||
self.__delattr__('nm')
|
||||
if hasattr(self, 'bn'):
|
||||
self.__delattr__('bn')
|
||||
if hasattr(self, 'id_tensor'):
|
||||
self.__delattr__('id_tensor')
|
||||
self.__delattr__("conv1")
|
||||
self.__delattr__("conv2")
|
||||
if hasattr(self, "nm"):
|
||||
self.__delattr__("nm")
|
||||
if hasattr(self, "bn"):
|
||||
self.__delattr__("bn")
|
||||
if hasattr(self, "id_tensor"):
|
||||
self.__delattr__("id_tensor")
|
||||
|
||||
|
||||
class ChannelAttention(nn.Module):
|
||||
|
|
@ -278,7 +296,7 @@ class SpatialAttention(nn.Module):
|
|||
def __init__(self, kernel_size=7):
|
||||
"""Initialize Spatial-attention module with kernel size argument."""
|
||||
super().__init__()
|
||||
assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
|
||||
assert kernel_size in (3, 7), "kernel size must be 3 or 7"
|
||||
padding = 3 if kernel_size == 7 else 1
|
||||
self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
|
||||
self.act = nn.Sigmoid()
|
||||
|
|
|
|||
|
|
@ -14,11 +14,12 @@ from .conv import Conv
|
|||
from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
|
||||
from .utils import bias_init_with_prob, linear_init_
|
||||
|
||||
__all__ = 'Detect', 'Segment', 'Pose', 'Classify', 'OBB', 'RTDETRDecoder'
|
||||
__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"
|
||||
|
||||
|
||||
class Detect(nn.Module):
|
||||
"""YOLOv8 Detect head for detection models."""
|
||||
|
||||
dynamic = False # force grid reconstruction
|
||||
export = False # export mode
|
||||
shape = None
|
||||
|
|
@ -35,7 +36,8 @@ class Detect(nn.Module):
|
|||
self.stride = torch.zeros(self.nl) # strides computed during build
|
||||
c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels
|
||||
self.cv2 = nn.ModuleList(
|
||||
nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
|
||||
nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
|
||||
)
|
||||
self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
|
||||
self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
|
||||
|
||||
|
|
@ -53,14 +55,14 @@ class Detect(nn.Module):
|
|||
self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
|
||||
self.shape = shape
|
||||
|
||||
if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops
|
||||
box = x_cat[:, :self.reg_max * 4]
|
||||
cls = x_cat[:, self.reg_max * 4:]
|
||||
if self.export and self.format in ("saved_model", "pb", "tflite", "edgetpu", "tfjs"): # avoid TF FlexSplitV ops
|
||||
box = x_cat[:, : self.reg_max * 4]
|
||||
cls = x_cat[:, self.reg_max * 4 :]
|
||||
else:
|
||||
box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
|
||||
dbox = self.decode_bboxes(box)
|
||||
|
||||
if self.export and self.format in ('tflite', 'edgetpu'):
|
||||
if self.export and self.format in ("tflite", "edgetpu"):
|
||||
# Precompute normalization factor to increase numerical stability
|
||||
# See https://github.com/ultralytics/ultralytics/issues/7371
|
||||
img_h = shape[2]
|
||||
|
|
@ -79,7 +81,7 @@ class Detect(nn.Module):
|
|||
# ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
|
||||
for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
|
||||
a[-1].bias.data[:] = 1.0 # box
|
||||
b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
|
||||
b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
|
||||
|
||||
def decode_bboxes(self, bboxes):
|
||||
"""Decode bounding boxes."""
|
||||
|
|
@ -214,26 +216,28 @@ class RTDETRDecoder(nn.Module):
|
|||
and class labels for objects in an image. It integrates features from multiple layers and runs through a series of
|
||||
Transformer decoder layers to output the final predictions.
|
||||
"""
|
||||
|
||||
export = False # export mode
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
nc=80,
|
||||
ch=(512, 1024, 2048),
|
||||
hd=256, # hidden dim
|
||||
nq=300, # num queries
|
||||
ndp=4, # num decoder points
|
||||
nh=8, # num head
|
||||
ndl=6, # num decoder layers
|
||||
d_ffn=1024, # dim of feedforward
|
||||
dropout=0.,
|
||||
act=nn.ReLU(),
|
||||
eval_idx=-1,
|
||||
# Training args
|
||||
nd=100, # num denoising
|
||||
label_noise_ratio=0.5,
|
||||
box_noise_scale=1.0,
|
||||
learnt_init_query=False):
|
||||
self,
|
||||
nc=80,
|
||||
ch=(512, 1024, 2048),
|
||||
hd=256, # hidden dim
|
||||
nq=300, # num queries
|
||||
ndp=4, # num decoder points
|
||||
nh=8, # num head
|
||||
ndl=6, # num decoder layers
|
||||
d_ffn=1024, # dim of feedforward
|
||||
dropout=0.0,
|
||||
act=nn.ReLU(),
|
||||
eval_idx=-1,
|
||||
# Training args
|
||||
nd=100, # num denoising
|
||||
label_noise_ratio=0.5,
|
||||
box_noise_scale=1.0,
|
||||
learnt_init_query=False,
|
||||
):
|
||||
"""
|
||||
Initializes the RTDETRDecoder module with the given parameters.
|
||||
|
||||
|
|
@ -302,28 +306,30 @@ class RTDETRDecoder(nn.Module):
|
|||
feats, shapes = self._get_encoder_input(x)
|
||||
|
||||
# Prepare denoising training
|
||||
dn_embed, dn_bbox, attn_mask, dn_meta = \
|
||||
get_cdn_group(batch,
|
||||
self.nc,
|
||||
self.num_queries,
|
||||
self.denoising_class_embed.weight,
|
||||
self.num_denoising,
|
||||
self.label_noise_ratio,
|
||||
self.box_noise_scale,
|
||||
self.training)
|
||||
dn_embed, dn_bbox, attn_mask, dn_meta = get_cdn_group(
|
||||
batch,
|
||||
self.nc,
|
||||
self.num_queries,
|
||||
self.denoising_class_embed.weight,
|
||||
self.num_denoising,
|
||||
self.label_noise_ratio,
|
||||
self.box_noise_scale,
|
||||
self.training,
|
||||
)
|
||||
|
||||
embed, refer_bbox, enc_bboxes, enc_scores = \
|
||||
self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
|
||||
embed, refer_bbox, enc_bboxes, enc_scores = self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
|
||||
|
||||
# Decoder
|
||||
dec_bboxes, dec_scores = self.decoder(embed,
|
||||
refer_bbox,
|
||||
feats,
|
||||
shapes,
|
||||
self.dec_bbox_head,
|
||||
self.dec_score_head,
|
||||
self.query_pos_head,
|
||||
attn_mask=attn_mask)
|
||||
dec_bboxes, dec_scores = self.decoder(
|
||||
embed,
|
||||
refer_bbox,
|
||||
feats,
|
||||
shapes,
|
||||
self.dec_bbox_head,
|
||||
self.dec_score_head,
|
||||
self.query_pos_head,
|
||||
attn_mask=attn_mask,
|
||||
)
|
||||
x = dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
|
||||
if self.training:
|
||||
return x
|
||||
|
|
@ -331,24 +337,24 @@ class RTDETRDecoder(nn.Module):
|
|||
y = torch.cat((dec_bboxes.squeeze(0), dec_scores.squeeze(0).sigmoid()), -1)
|
||||
return y if self.export else (y, x)
|
||||
|
||||
def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
|
||||
def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device="cpu", eps=1e-2):
|
||||
"""Generates anchor bounding boxes for given shapes with specific grid size and validates them."""
|
||||
anchors = []
|
||||
for i, (h, w) in enumerate(shapes):
|
||||
sy = torch.arange(end=h, dtype=dtype, device=device)
|
||||
sx = torch.arange(end=w, dtype=dtype, device=device)
|
||||
grid_y, grid_x = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
|
||||
grid_y, grid_x = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
|
||||
grid_xy = torch.stack([grid_x, grid_y], -1) # (h, w, 2)
|
||||
|
||||
valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
|
||||
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH # (1, h, w, 2)
|
||||
wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
|
||||
wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0**i)
|
||||
anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4)) # (1, h*w, 4)
|
||||
|
||||
anchors = torch.cat(anchors, 1) # (1, h*w*nl, 4)
|
||||
valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) # 1, h*w*nl, 1
|
||||
anchors = torch.log(anchors / (1 - anchors))
|
||||
anchors = anchors.masked_fill(~valid_mask, float('inf'))
|
||||
anchors = anchors.masked_fill(~valid_mask, float("inf"))
|
||||
return anchors, valid_mask
|
||||
|
||||
def _get_encoder_input(self, x):
|
||||
|
|
@ -415,13 +421,13 @@ class RTDETRDecoder(nn.Module):
|
|||
# NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
|
||||
# linear_init_(self.enc_score_head)
|
||||
constant_(self.enc_score_head.bias, bias_cls)
|
||||
constant_(self.enc_bbox_head.layers[-1].weight, 0.)
|
||||
constant_(self.enc_bbox_head.layers[-1].bias, 0.)
|
||||
constant_(self.enc_bbox_head.layers[-1].weight, 0.0)
|
||||
constant_(self.enc_bbox_head.layers[-1].bias, 0.0)
|
||||
for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
|
||||
# linear_init_(cls_)
|
||||
constant_(cls_.bias, bias_cls)
|
||||
constant_(reg_.layers[-1].weight, 0.)
|
||||
constant_(reg_.layers[-1].bias, 0.)
|
||||
constant_(reg_.layers[-1].weight, 0.0)
|
||||
constant_(reg_.layers[-1].bias, 0.0)
|
||||
|
||||
linear_init_(self.enc_output[0])
|
||||
xavier_uniform_(self.enc_output[0].weight)
|
||||
|
|
|
|||
|
|
@ -11,8 +11,18 @@ from torch.nn.init import constant_, xavier_uniform_
|
|||
from .conv import Conv
|
||||
from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
|
||||
|
||||
__all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'AIFI',
|
||||
'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP')
|
||||
__all__ = (
|
||||
"TransformerEncoderLayer",
|
||||
"TransformerLayer",
|
||||
"TransformerBlock",
|
||||
"MLPBlock",
|
||||
"LayerNorm2d",
|
||||
"AIFI",
|
||||
"DeformableTransformerDecoder",
|
||||
"DeformableTransformerDecoderLayer",
|
||||
"MSDeformAttn",
|
||||
"MLP",
|
||||
)
|
||||
|
||||
|
||||
class TransformerEncoderLayer(nn.Module):
|
||||
|
|
@ -22,9 +32,11 @@ class TransformerEncoderLayer(nn.Module):
|
|||
"""Initialize the TransformerEncoderLayer with specified parameters."""
|
||||
super().__init__()
|
||||
from ...utils.torch_utils import TORCH_1_9
|
||||
|
||||
if not TORCH_1_9:
|
||||
raise ModuleNotFoundError(
|
||||
'TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).')
|
||||
"TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
|
||||
)
|
||||
self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
|
||||
# Implementation of Feedforward model
|
||||
self.fc1 = nn.Linear(c1, cm)
|
||||
|
|
@ -91,12 +103,11 @@ class AIFI(TransformerEncoderLayer):
|
|||
"""Builds 2D sine-cosine position embedding."""
|
||||
grid_w = torch.arange(int(w), dtype=torch.float32)
|
||||
grid_h = torch.arange(int(h), dtype=torch.float32)
|
||||
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
|
||||
assert embed_dim % 4 == 0, \
|
||||
'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
|
||||
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
|
||||
assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
|
||||
pos_dim = embed_dim // 4
|
||||
omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
|
||||
omega = 1. / (temperature ** omega)
|
||||
omega = 1.0 / (temperature**omega)
|
||||
|
||||
out_w = grid_w.flatten()[..., None] @ omega[None]
|
||||
out_h = grid_h.flatten()[..., None] @ omega[None]
|
||||
|
|
@ -213,10 +224,10 @@ class MSDeformAttn(nn.Module):
|
|||
"""Initialize MSDeformAttn with the given parameters."""
|
||||
super().__init__()
|
||||
if d_model % n_heads != 0:
|
||||
raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
|
||||
raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
|
||||
_d_per_head = d_model // n_heads
|
||||
# Better to set _d_per_head to a power of 2 which is more efficient in a CUDA implementation
|
||||
assert _d_per_head * n_heads == d_model, '`d_model` must be divisible by `n_heads`'
|
||||
assert _d_per_head * n_heads == d_model, "`d_model` must be divisible by `n_heads`"
|
||||
|
||||
self.im2col_step = 64
|
||||
|
||||
|
|
@ -234,21 +245,24 @@ class MSDeformAttn(nn.Module):
|
|||
|
||||
def _reset_parameters(self):
|
||||
"""Reset module parameters."""
|
||||
constant_(self.sampling_offsets.weight.data, 0.)
|
||||
constant_(self.sampling_offsets.weight.data, 0.0)
|
||||
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
|
||||
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
|
||||
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(
|
||||
1, self.n_levels, self.n_points, 1)
|
||||
grid_init = (
|
||||
(grid_init / grid_init.abs().max(-1, keepdim=True)[0])
|
||||
.view(self.n_heads, 1, 1, 2)
|
||||
.repeat(1, self.n_levels, self.n_points, 1)
|
||||
)
|
||||
for i in range(self.n_points):
|
||||
grid_init[:, :, i, :] *= i + 1
|
||||
with torch.no_grad():
|
||||
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
|
||||
constant_(self.attention_weights.weight.data, 0.)
|
||||
constant_(self.attention_weights.bias.data, 0.)
|
||||
constant_(self.attention_weights.weight.data, 0.0)
|
||||
constant_(self.attention_weights.bias.data, 0.0)
|
||||
xavier_uniform_(self.value_proj.weight.data)
|
||||
constant_(self.value_proj.bias.data, 0.)
|
||||
constant_(self.value_proj.bias.data, 0.0)
|
||||
xavier_uniform_(self.output_proj.weight.data)
|
||||
constant_(self.output_proj.bias.data, 0.)
|
||||
constant_(self.output_proj.bias.data, 0.0)
|
||||
|
||||
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
|
||||
"""
|
||||
|
|
@ -288,7 +302,7 @@ class MSDeformAttn(nn.Module):
|
|||
add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
|
||||
sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
|
||||
else:
|
||||
raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
|
||||
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {num_points}.")
|
||||
output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
|
||||
return self.output_proj(output)
|
||||
|
||||
|
|
@ -301,7 +315,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
|
|||
https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
|
||||
"""
|
||||
|
||||
def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
|
||||
def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
|
||||
"""Initialize the DeformableTransformerDecoderLayer with the given parameters."""
|
||||
super().__init__()
|
||||
|
||||
|
|
@ -339,14 +353,16 @@ class DeformableTransformerDecoderLayer(nn.Module):
|
|||
|
||||
# Self attention
|
||||
q = k = self.with_pos_embed(embed, query_pos)
|
||||
tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
|
||||
attn_mask=attn_mask)[0].transpose(0, 1)
|
||||
tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
|
||||
0
|
||||
].transpose(0, 1)
|
||||
embed = embed + self.dropout1(tgt)
|
||||
embed = self.norm1(embed)
|
||||
|
||||
# Cross attention
|
||||
tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
|
||||
padding_mask)
|
||||
tgt = self.cross_attn(
|
||||
self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes, padding_mask
|
||||
)
|
||||
embed = embed + self.dropout2(tgt)
|
||||
embed = self.norm2(embed)
|
||||
|
||||
|
|
@ -370,16 +386,17 @@ class DeformableTransformerDecoder(nn.Module):
|
|||
self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
|
||||
|
||||
def forward(
|
||||
self,
|
||||
embed, # decoder embeddings
|
||||
refer_bbox, # anchor
|
||||
feats, # image features
|
||||
shapes, # feature shapes
|
||||
bbox_head,
|
||||
score_head,
|
||||
pos_mlp,
|
||||
attn_mask=None,
|
||||
padding_mask=None):
|
||||
self,
|
||||
embed, # decoder embeddings
|
||||
refer_bbox, # anchor
|
||||
feats, # image features
|
||||
shapes, # feature shapes
|
||||
bbox_head,
|
||||
score_head,
|
||||
pos_mlp,
|
||||
attn_mask=None,
|
||||
padding_mask=None,
|
||||
):
|
||||
"""Perform the forward pass through the entire decoder."""
|
||||
output = embed
|
||||
dec_bboxes = []
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ import torch.nn as nn
|
|||
import torch.nn.functional as F
|
||||
from torch.nn.init import uniform_
|
||||
|
||||
__all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'
|
||||
__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
|
||||
|
||||
|
||||
def _get_clones(module, n):
|
||||
|
|
@ -27,7 +27,7 @@ def linear_init_(module):
|
|||
"""Initialize the weights and biases of a linear module."""
|
||||
bound = 1 / math.sqrt(module.weight.shape[0])
|
||||
uniform_(module.weight, -bound, bound)
|
||||
if hasattr(module, 'bias') and module.bias is not None:
|
||||
if hasattr(module, "bias") and module.bias is not None:
|
||||
uniform_(module.bias, -bound, bound)
|
||||
|
||||
|
||||
|
|
@ -39,9 +39,12 @@ def inverse_sigmoid(x, eps=1e-5):
|
|||
return torch.log(x1 / x2)
|
||||
|
||||
|
||||
def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
|
||||
sampling_locations: torch.Tensor,
|
||||
attention_weights: torch.Tensor) -> torch.Tensor:
|
||||
def multi_scale_deformable_attn_pytorch(
|
||||
value: torch.Tensor,
|
||||
value_spatial_shapes: torch.Tensor,
|
||||
sampling_locations: torch.Tensor,
|
||||
attention_weights: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Multi-scale deformable attention.
|
||||
|
||||
|
|
@ -58,23 +61,25 @@ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shape
|
|||
# bs, H_*W_, num_heads*embed_dims ->
|
||||
# bs, num_heads*embed_dims, H_*W_ ->
|
||||
# bs*num_heads, embed_dims, H_, W_
|
||||
value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_))
|
||||
value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
|
||||
# bs, num_queries, num_heads, num_points, 2 ->
|
||||
# bs, num_heads, num_queries, num_points, 2 ->
|
||||
# bs*num_heads, num_queries, num_points, 2
|
||||
sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
|
||||
# bs*num_heads, embed_dims, num_queries, num_points
|
||||
sampling_value_l_ = F.grid_sample(value_l_,
|
||||
sampling_grid_l_,
|
||||
mode='bilinear',
|
||||
padding_mode='zeros',
|
||||
align_corners=False)
|
||||
sampling_value_l_ = F.grid_sample(
|
||||
value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
|
||||
)
|
||||
sampling_value_list.append(sampling_value_l_)
|
||||
# (bs, num_queries, num_heads, num_levels, num_points) ->
|
||||
# (bs, num_heads, num_queries, num_levels, num_points) ->
|
||||
# (bs, num_heads, 1, num_queries, num_levels*num_points)
|
||||
attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries,
|
||||
num_levels * num_points)
|
||||
output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(
|
||||
bs, num_heads * embed_dims, num_queries))
|
||||
attention_weights = attention_weights.transpose(1, 2).reshape(
|
||||
bs * num_heads, 1, num_queries, num_levels * num_points
|
||||
)
|
||||
output = (
|
||||
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
|
||||
.sum(-1)
|
||||
.view(bs, num_heads * embed_dims, num_queries)
|
||||
)
|
||||
return output.transpose(1, 2).contiguous()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue