ultralytics 8.0.239 Ultralytics Actions and hub-sdk adoption (#7431)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Burhan <62214284+Burhan-Q@users.noreply.github.com>
Co-authored-by: Kayzwer <68285002+Kayzwer@users.noreply.github.com>
Glenn Jocher 2024-01-10 03:16:08 +01:00 committed by GitHub
parent e795277391
commit fe27db2f6e
139 changed files with 6870 additions and 5125 deletions


@ -1,9 +1,29 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .tasks import (BaseModel, ClassificationModel, DetectionModel, SegmentationModel, attempt_load_one_weight,
attempt_load_weights, guess_model_scale, guess_model_task, parse_model, torch_safe_load,
yaml_model_load)
from .tasks import (
BaseModel,
ClassificationModel,
DetectionModel,
SegmentationModel,
attempt_load_one_weight,
attempt_load_weights,
guess_model_scale,
guess_model_task,
parse_model,
torch_safe_load,
yaml_model_load,
)
__all__ = ('attempt_load_one_weight', 'attempt_load_weights', 'parse_model', 'yaml_model_load', 'guess_model_task',
'guess_model_scale', 'torch_safe_load', 'DetectionModel', 'SegmentationModel', 'ClassificationModel',
'BaseModel')
__all__ = (
"attempt_load_one_weight",
"attempt_load_weights",
"parse_model",
"yaml_model_load",
"guess_model_task",
"guess_model_scale",
"torch_safe_load",
"DetectionModel",
"SegmentationModel",
"ClassificationModel",
"BaseModel",
)
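
A minimal sketch of how two of these re-exported helpers are typically called; the config name is assumed to resolve to a model YAML shipped with the package:

from ultralytics.nn import guess_model_task, yaml_model_load

cfg = yaml_model_load("yolov8n.yaml")  # assumed packaged config; returns the model YAML as a dict
task = guess_model_task(cfg)           # e.g. "detect", inferred from the head module in the config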


@ -32,10 +32,12 @@ def check_class_names(names):
names = {int(k): str(v) for k, v in names.items()}
n = len(names)
if max(names.keys()) >= n:
raise KeyError(f'{n}-class dataset requires class indices 0-{n - 1}, but you have invalid class indices '
f'{min(names.keys())}-{max(names.keys())} defined in your dataset YAML.')
if isinstance(names[0], str) and names[0].startswith('n0'): # imagenet class codes, i.e. 'n01440764'
names_map = yaml_load(ROOT / 'cfg/datasets/ImageNet.yaml')['map'] # human-readable names
raise KeyError(
f"{n}-class dataset requires class indices 0-{n - 1}, but you have invalid class indices "
f"{min(names.keys())}-{max(names.keys())} defined in your dataset YAML."
)
if isinstance(names[0], str) and names[0].startswith("n0"): # imagenet class codes, i.e. 'n01440764'
names_map = yaml_load(ROOT / "cfg/datasets/ImageNet.yaml")["map"] # human-readable names
names = {k: names_map[v] for k, v in names.items()}
return names
@ -44,8 +46,8 @@ def default_class_names(data=None):
"""Applies default class names to an input YAML file or returns numerical class names."""
if data:
with contextlib.suppress(Exception):
return yaml_load(check_yaml(data))['names']
return {i: f'class{i}' for i in range(999)} # return default if above errors
return yaml_load(check_yaml(data))["names"]
return {i: f"class{i}" for i in range(999)} # return default if above errors
class AutoBackend(nn.Module):
@ -77,14 +79,16 @@ class AutoBackend(nn.Module):
"""
@torch.no_grad()
def __init__(self,
weights='yolov8n.pt',
device=torch.device('cpu'),
dnn=False,
data=None,
fp16=False,
fuse=True,
verbose=True):
def __init__(
self,
weights="yolov8n.pt",
device=torch.device("cpu"),
dnn=False,
data=None,
fp16=False,
fuse=True,
verbose=True,
):
"""
Initialize the AutoBackend for inference.
@ -100,17 +104,31 @@ class AutoBackend(nn.Module):
super().__init__()
w = str(weights[0] if isinstance(weights, list) else weights)
nn_module = isinstance(weights, torch.nn.Module)
pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, ncnn, triton = \
self._model_type(w)
(
pt,
jit,
onnx,
xml,
engine,
coreml,
saved_model,
pb,
tflite,
edgetpu,
tfjs,
paddle,
ncnn,
triton,
) = self._model_type(w)
fp16 &= pt or jit or onnx or xml or engine or nn_module or triton # FP16
nhwc = coreml or saved_model or pb or tflite or edgetpu # BHWC formats (vs torch BCHW)
stride = 32 # default stride
model, metadata = None, None
# Set device
cuda = torch.cuda.is_available() and device.type != 'cpu' # use CUDA
cuda = torch.cuda.is_available() and device.type != "cpu" # use CUDA
if cuda and not any([nn_module, pt, jit, engine, onnx]): # GPU dataloader formats
device = torch.device('cpu')
device = torch.device("cpu")
cuda = False
# Download if not local
@ -121,77 +139,79 @@ class AutoBackend(nn.Module):
if nn_module: # in-memory PyTorch model
model = weights.to(device)
model = model.fuse(verbose=verbose) if fuse else model
if hasattr(model, 'kpt_shape'):
if hasattr(model, "kpt_shape"):
kpt_shape = model.kpt_shape # pose-only
stride = max(int(model.stride.max()), 32) # model stride
names = model.module.names if hasattr(model, 'module') else model.names # get class names
names = model.module.names if hasattr(model, "module") else model.names # get class names
model.half() if fp16 else model.float()
self.model = model # explicitly assign for to(), cpu(), cuda(), half()
pt = True
elif pt: # PyTorch
from ultralytics.nn.tasks import attempt_load_weights
model = attempt_load_weights(weights if isinstance(weights, list) else w,
device=device,
inplace=True,
fuse=fuse)
if hasattr(model, 'kpt_shape'):
model = attempt_load_weights(
weights if isinstance(weights, list) else w, device=device, inplace=True, fuse=fuse
)
if hasattr(model, "kpt_shape"):
kpt_shape = model.kpt_shape # pose-only
stride = max(int(model.stride.max()), 32) # model stride
names = model.module.names if hasattr(model, 'module') else model.names # get class names
names = model.module.names if hasattr(model, "module") else model.names # get class names
model.half() if fp16 else model.float()
self.model = model # explicitly assign for to(), cpu(), cuda(), half()
elif jit: # TorchScript
LOGGER.info(f'Loading {w} for TorchScript inference...')
extra_files = {'config.txt': ''} # model metadata
LOGGER.info(f"Loading {w} for TorchScript inference...")
extra_files = {"config.txt": ""} # model metadata
model = torch.jit.load(w, _extra_files=extra_files, map_location=device)
model.half() if fp16 else model.float()
if extra_files['config.txt']: # load metadata dict
metadata = json.loads(extra_files['config.txt'], object_hook=lambda x: dict(x.items()))
if extra_files["config.txt"]: # load metadata dict
metadata = json.loads(extra_files["config.txt"], object_hook=lambda x: dict(x.items()))
elif dnn: # ONNX OpenCV DNN
LOGGER.info(f'Loading {w} for ONNX OpenCV DNN inference...')
check_requirements('opencv-python>=4.5.4')
LOGGER.info(f"Loading {w} for ONNX OpenCV DNN inference...")
check_requirements("opencv-python>=4.5.4")
net = cv2.dnn.readNetFromONNX(w)
elif onnx: # ONNX Runtime
LOGGER.info(f'Loading {w} for ONNX Runtime inference...')
check_requirements(('onnx', 'onnxruntime-gpu' if cuda else 'onnxruntime'))
LOGGER.info(f"Loading {w} for ONNX Runtime inference...")
check_requirements(("onnx", "onnxruntime-gpu" if cuda else "onnxruntime"))
import onnxruntime
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if cuda else ["CPUExecutionProvider"]
session = onnxruntime.InferenceSession(w, providers=providers)
output_names = [x.name for x in session.get_outputs()]
metadata = session.get_modelmeta().custom_metadata_map # metadata
elif xml: # OpenVINO
LOGGER.info(f'Loading {w} for OpenVINO inference...')
check_requirements('openvino>=2023.0') # requires openvino-dev: https://pypi.org/project/openvino-dev/
LOGGER.info(f"Loading {w} for OpenVINO inference...")
check_requirements("openvino>=2023.0") # requires openvino-dev: https://pypi.org/project/openvino-dev/
from openvino.runtime import Core, Layout, get_batch # noqa
core = Core()
w = Path(w)
if not w.is_file(): # if not *.xml
w = next(w.glob('*.xml')) # get *.xml file from *_openvino_model dir
ov_model = core.read_model(model=str(w), weights=w.with_suffix('.bin'))
w = next(w.glob("*.xml")) # get *.xml file from *_openvino_model dir
ov_model = core.read_model(model=str(w), weights=w.with_suffix(".bin"))
if ov_model.get_parameters()[0].get_layout().empty:
ov_model.get_parameters()[0].set_layout(Layout('NCHW'))
ov_model.get_parameters()[0].set_layout(Layout("NCHW"))
batch_dim = get_batch(ov_model)
if batch_dim.is_static:
batch_size = batch_dim.get_length()
ov_compiled_model = core.compile_model(ov_model, device_name='AUTO') # AUTO selects best available device
metadata = w.parent / 'metadata.yaml'
ov_compiled_model = core.compile_model(ov_model, device_name="AUTO") # AUTO selects best available device
metadata = w.parent / "metadata.yaml"
elif engine: # TensorRT
LOGGER.info(f'Loading {w} for TensorRT inference...')
LOGGER.info(f"Loading {w} for TensorRT inference...")
try:
import tensorrt as trt # noqa https://developer.nvidia.com/nvidia-tensorrt-download
except ImportError:
if LINUX:
check_requirements('nvidia-tensorrt', cmds='-U --index-url https://pypi.ngc.nvidia.com')
check_requirements("nvidia-tensorrt", cmds="-U --index-url https://pypi.ngc.nvidia.com")
import tensorrt as trt # noqa
check_version(trt.__version__, '7.0.0', hard=True) # require tensorrt>=7.0.0
if device.type == 'cpu':
device = torch.device('cuda:0')
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
check_version(trt.__version__, "7.0.0", hard=True) # require tensorrt>=7.0.0
if device.type == "cpu":
device = torch.device("cuda:0")
Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr"))
logger = trt.Logger(trt.Logger.INFO)
# Read file
with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
meta_len = int.from_bytes(f.read(4), byteorder='little') # read metadata length
metadata = json.loads(f.read(meta_len).decode('utf-8')) # read metadata
with open(w, "rb") as f, trt.Runtime(logger) as runtime:
meta_len = int.from_bytes(f.read(4), byteorder="little") # read metadata length
metadata = json.loads(f.read(meta_len).decode("utf-8")) # read metadata
model = runtime.deserialize_cuda_engine(f.read()) # read engine
context = model.create_execution_context()
bindings = OrderedDict()
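
A hedged sketch of the serialized .engine layout the reader above expects: a 4-byte little-endian metadata length, UTF-8 JSON metadata, then the TensorRT engine bytes (the tail below is a placeholder, not a real engine):

import json

meta = json.dumps({"stride": 32, "batch": 1, "imgsz": [640, 640], "names": {0: "person"}}).encode("utf-8")
blob = len(meta).to_bytes(4, byteorder="little") + meta + b"<serialized TensorRT engine bytes>"  # placeholder tail
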
@ -213,116 +233,124 @@ class AutoBackend(nn.Module):
im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
batch_size = bindings['images'].shape[0] # if dynamic, this is instead max batch size
batch_size = bindings["images"].shape[0] # if dynamic, this is instead max batch size
elif coreml: # CoreML
LOGGER.info(f'Loading {w} for CoreML inference...')
LOGGER.info(f"Loading {w} for CoreML inference...")
import coremltools as ct
model = ct.models.MLModel(w)
metadata = dict(model.user_defined_metadata)
elif saved_model: # TF SavedModel
LOGGER.info(f'Loading {w} for TensorFlow SavedModel inference...')
LOGGER.info(f"Loading {w} for TensorFlow SavedModel inference...")
import tensorflow as tf
keras = False # assume TF1 saved_model
model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w)
metadata = Path(w) / 'metadata.yaml'
metadata = Path(w) / "metadata.yaml"
elif pb: # GraphDef https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
LOGGER.info(f'Loading {w} for TensorFlow GraphDef inference...')
LOGGER.info(f"Loading {w} for TensorFlow GraphDef inference...")
import tensorflow as tf
from ultralytics.engine.exporter import gd_outputs
def wrap_frozen_graph(gd, inputs, outputs):
"""Wrap frozen graphs for deployment."""
x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=''), []) # wrapped
x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=""), []) # wrapped
ge = x.graph.as_graph_element
return x.prune(tf.nest.map_structure(ge, inputs), tf.nest.map_structure(ge, outputs))
gd = tf.Graph().as_graph_def() # TF GraphDef
with open(w, 'rb') as f:
with open(w, "rb") as f:
gd.ParseFromString(f.read())
frozen_func = wrap_frozen_graph(gd, inputs='x:0', outputs=gd_outputs(gd))
frozen_func = wrap_frozen_graph(gd, inputs="x:0", outputs=gd_outputs(gd))
elif tflite or edgetpu: # https://www.tensorflow.org/lite/guide/python#install_tensorflow_lite_for_python
try: # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
from tflite_runtime.interpreter import Interpreter, load_delegate
except ImportError:
import tensorflow as tf
Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate
if edgetpu: # TF Edge TPU https://coral.ai/software/#edgetpu-runtime
LOGGER.info(f'Loading {w} for TensorFlow Lite Edge TPU inference...')
delegate = {
'Linux': 'libedgetpu.so.1',
'Darwin': 'libedgetpu.1.dylib',
'Windows': 'edgetpu.dll'}[platform.system()]
LOGGER.info(f"Loading {w} for TensorFlow Lite Edge TPU inference...")
delegate = {"Linux": "libedgetpu.so.1", "Darwin": "libedgetpu.1.dylib", "Windows": "edgetpu.dll"}[
platform.system()
]
interpreter = Interpreter(model_path=w, experimental_delegates=[load_delegate(delegate)])
else: # TFLite
LOGGER.info(f'Loading {w} for TensorFlow Lite inference...')
LOGGER.info(f"Loading {w} for TensorFlow Lite inference...")
interpreter = Interpreter(model_path=w) # load TFLite model
interpreter.allocate_tensors() # allocate
input_details = interpreter.get_input_details() # inputs
output_details = interpreter.get_output_details() # outputs
# Load metadata
with contextlib.suppress(zipfile.BadZipFile):
with zipfile.ZipFile(w, 'r') as model:
with zipfile.ZipFile(w, "r") as model:
meta_file = model.namelist()[0]
metadata = ast.literal_eval(model.read(meta_file).decode('utf-8'))
metadata = ast.literal_eval(model.read(meta_file).decode("utf-8"))
elif tfjs: # TF.js
raise NotImplementedError('YOLOv8 TF.js inference is not currently supported.')
raise NotImplementedError("YOLOv8 TF.js inference is not currently supported.")
elif paddle: # PaddlePaddle
LOGGER.info(f'Loading {w} for PaddlePaddle inference...')
check_requirements('paddlepaddle-gpu' if cuda else 'paddlepaddle')
LOGGER.info(f"Loading {w} for PaddlePaddle inference...")
check_requirements("paddlepaddle-gpu" if cuda else "paddlepaddle")
import paddle.inference as pdi # noqa
w = Path(w)
if not w.is_file(): # if not *.pdmodel
w = next(w.rglob('*.pdmodel')) # get *.pdmodel file from *_paddle_model dir
config = pdi.Config(str(w), str(w.with_suffix('.pdiparams')))
w = next(w.rglob("*.pdmodel")) # get *.pdmodel file from *_paddle_model dir
config = pdi.Config(str(w), str(w.with_suffix(".pdiparams")))
if cuda:
config.enable_use_gpu(memory_pool_init_size_mb=2048, device_id=0)
predictor = pdi.create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
output_names = predictor.get_output_names()
metadata = w.parents[1] / 'metadata.yaml'
metadata = w.parents[1] / "metadata.yaml"
elif ncnn: # ncnn
LOGGER.info(f'Loading {w} for ncnn inference...')
check_requirements('git+https://github.com/Tencent/ncnn.git' if ARM64 else 'ncnn') # requires ncnn
LOGGER.info(f"Loading {w} for ncnn inference...")
check_requirements("git+https://github.com/Tencent/ncnn.git" if ARM64 else "ncnn") # requires ncnn
import ncnn as pyncnn
net = pyncnn.Net()
net.opt.use_vulkan_compute = cuda
w = Path(w)
if not w.is_file(): # if not *.param
w = next(w.glob('*.param')) # get *.param file from *_ncnn_model dir
w = next(w.glob("*.param")) # get *.param file from *_ncnn_model dir
net.load_param(str(w))
net.load_model(str(w.with_suffix('.bin')))
metadata = w.parent / 'metadata.yaml'
net.load_model(str(w.with_suffix(".bin")))
metadata = w.parent / "metadata.yaml"
elif triton: # NVIDIA Triton Inference Server
check_requirements('tritonclient[all]')
check_requirements("tritonclient[all]")
from ultralytics.utils.triton import TritonRemoteModel
model = TritonRemoteModel(w)
else:
from ultralytics.engine.exporter import export_formats
raise TypeError(f"model='{w}' is not a supported model format. "
'See https://docs.ultralytics.com/modes/predict for help.'
f'\n\n{export_formats()}')
raise TypeError(
f"model='{w}' is not a supported model format. "
"See https://docs.ultralytics.com/modes/predict for help."
f"\n\n{export_formats()}"
)
# Load external metadata YAML
if isinstance(metadata, (str, Path)) and Path(metadata).exists():
metadata = yaml_load(metadata)
if metadata:
for k, v in metadata.items():
if k in ('stride', 'batch'):
if k in ("stride", "batch"):
metadata[k] = int(v)
elif k in ('imgsz', 'names', 'kpt_shape') and isinstance(v, str):
elif k in ("imgsz", "names", "kpt_shape") and isinstance(v, str):
metadata[k] = eval(v)
stride = metadata['stride']
task = metadata['task']
batch = metadata['batch']
imgsz = metadata['imgsz']
names = metadata['names']
kpt_shape = metadata.get('kpt_shape')
stride = metadata["stride"]
task = metadata["task"]
batch = metadata["batch"]
imgsz = metadata["imgsz"]
names = metadata["names"]
kpt_shape = metadata.get("kpt_shape")
elif not (pt or triton or nn_module):
LOGGER.warning(f"WARNING ⚠️ Metadata not found for 'model={weights}'")
# Check names
if 'names' not in locals(): # names missing
if "names" not in locals(): # names missing
names = default_class_names(data)
names = check_class_names(names)
@ -367,26 +395,28 @@ class AutoBackend(nn.Module):
im = im.cpu().numpy() # FP32
y = list(self.ov_compiled_model(im).values())
elif self.engine: # TensorRT
if self.dynamic and im.shape != self.bindings['images'].shape:
i = self.model.get_binding_index('images')
if self.dynamic and im.shape != self.bindings["images"].shape:
i = self.model.get_binding_index("images")
self.context.set_binding_shape(i, im.shape) # reshape if dynamic
self.bindings['images'] = self.bindings['images']._replace(shape=im.shape)
self.bindings["images"] = self.bindings["images"]._replace(shape=im.shape)
for name in self.output_names:
i = self.model.get_binding_index(name)
self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i)))
s = self.bindings['images'].shape
s = self.bindings["images"].shape
assert im.shape == s, f"input size {im.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}"
self.binding_addrs['images'] = int(im.data_ptr())
self.binding_addrs["images"] = int(im.data_ptr())
self.context.execute_v2(list(self.binding_addrs.values()))
y = [self.bindings[x].data for x in sorted(self.output_names)]
elif self.coreml: # CoreML
im = im[0].cpu().numpy()
im_pil = Image.fromarray((im * 255).astype('uint8'))
im_pil = Image.fromarray((im * 255).astype("uint8"))
# im = im.resize((192, 320), Image.BILINEAR)
y = self.model.predict({'image': im_pil}) # coordinates are xywh normalized
if 'confidence' in y:
raise TypeError('Ultralytics only supports inference of non-pipelined CoreML models exported with '
f"'nms=False', but 'model={w}' has an NMS pipeline created by an 'nms=True' export.")
y = self.model.predict({"image": im_pil}) # coordinates are xywh normalized
if "confidence" in y:
raise TypeError(
"Ultralytics only supports inference of non-pipelined CoreML models exported with "
f"'nms=False', but 'model={w}' has an NMS pipeline created by an 'nms=True' export."
)
# TODO: CoreML NMS inference handling
# from ultralytics.utils.ops import xywh2xyxy
# box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]]) # xyxy pixels
@ -425,20 +455,20 @@ class AutoBackend(nn.Module):
if len(y) == 2 and len(self.names) == 999: # segments and names not defined
ip, ib = (0, 1) if len(y[0].shape) == 4 else (1, 0) # index of protos, boxes
nc = y[ib].shape[1] - y[ip].shape[3] - 4 # y = (1, 160, 160, 32), (1, 116, 8400)
self.names = {i: f'class{i}' for i in range(nc)}
self.names = {i: f"class{i}" for i in range(nc)}
else: # Lite or Edge TPU
details = self.input_details[0]
integer = details['dtype'] in (np.int8, np.int16) # is TFLite quantized int8 or int16 model
integer = details["dtype"] in (np.int8, np.int16) # is TFLite quantized int8 or int16 model
if integer:
scale, zero_point = details['quantization']
im = (im / scale + zero_point).astype(details['dtype']) # de-scale
self.interpreter.set_tensor(details['index'], im)
scale, zero_point = details["quantization"]
im = (im / scale + zero_point).astype(details["dtype"]) # de-scale
self.interpreter.set_tensor(details["index"], im)
self.interpreter.invoke()
y = []
for output in self.output_details:
x = self.interpreter.get_tensor(output['index'])
x = self.interpreter.get_tensor(output["index"])
if integer:
scale, zero_point = output['quantization']
scale, zero_point = output["quantization"]
x = (x.astype(np.float32) - zero_point) * scale # re-scale
if x.ndim > 2: # if task is not classification
# Denormalize xywh by image size. See https://github.com/ultralytics/ultralytics/pull/1695
@ -483,13 +513,13 @@ class AutoBackend(nn.Module):
(None): This method runs the forward pass and does not return any value.
"""
warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
if any(warmup_types) and (self.device.type != 'cpu' or self.triton):
if any(warmup_types) and (self.device.type != "cpu" or self.triton):
im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input
for _ in range(2 if self.jit else 1):
self.forward(im) # warmup
@staticmethod
def _model_type(p='path/to/model.pt'):
def _model_type(p="path/to/model.pt"):
"""
This function takes a path to a model file and returns the model type.
@ -499,18 +529,20 @@ class AutoBackend(nn.Module):
# Return model type from model path, i.e. path='path/to/model.onnx' -> type=onnx
# types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle]
from ultralytics.engine.exporter import export_formats
sf = list(export_formats().Suffix) # export suffixes
if not is_url(p, check=False) and not isinstance(p, str):
check_suffix(p, sf) # checks
name = Path(p).name
types = [s in name for s in sf]
types[5] |= name.endswith('.mlmodel') # retain support for older Apple CoreML *.mlmodel formats
types[5] |= name.endswith(".mlmodel") # retain support for older Apple CoreML *.mlmodel formats
types[8] &= not types[9] # tflite &= not edgetpu
if any(types):
triton = False
else:
from urllib.parse import urlsplit
url = urlsplit(p)
triton = url.netloc and url.path and url.scheme in {'http', 'grpc'}
triton = url.netloc and url.path and url.scheme in {"http", "grpc"}
return types + [triton]
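
Putting the refactored constructor, warmup() and forward() together, a minimal usage sketch; the weight file is illustrative and assumed to be present or downloadable:

import torch

from ultralytics.nn.autobackend import AutoBackend

backend = AutoBackend(weights="yolov8n.pt", device=torch.device("cpu"), fp16=False, fuse=True)  # assumed weights
backend.warmup(imgsz=(1, 3, 640, 640))    # skipped on CPU for non-Triton backends, per the check above
y = backend(torch.zeros(1, 3, 640, 640))  # .pt weights dispatch to the PyTorch branch of __init__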


@ -17,18 +17,101 @@ Example:
```
"""
from .block import (C1, C2, C3, C3TR, DFL, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, GhostBottleneck,
HGBlock, HGStem, Proto, RepC3, ResNetLayer)
from .conv import (CBAM, ChannelAttention, Concat, Conv, Conv2, ConvTranspose, DWConv, DWConvTranspose2d, Focus,
GhostConv, LightConv, RepConv, SpatialAttention)
from .block import (
C1,
C2,
C3,
C3TR,
DFL,
SPP,
SPPF,
Bottleneck,
BottleneckCSP,
C2f,
C3Ghost,
C3x,
GhostBottleneck,
HGBlock,
HGStem,
Proto,
RepC3,
ResNetLayer,
)
from .conv import (
CBAM,
ChannelAttention,
Concat,
Conv,
Conv2,
ConvTranspose,
DWConv,
DWConvTranspose2d,
Focus,
GhostConv,
LightConv,
RepConv,
SpatialAttention,
)
from .head import OBB, Classify, Detect, Pose, RTDETRDecoder, Segment
from .transformer import (AIFI, MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer, LayerNorm2d,
MLPBlock, MSDeformAttn, TransformerBlock, TransformerEncoderLayer, TransformerLayer)
from .transformer import (
AIFI,
MLP,
DeformableTransformerDecoder,
DeformableTransformerDecoderLayer,
LayerNorm2d,
MLPBlock,
MSDeformAttn,
TransformerBlock,
TransformerEncoderLayer,
TransformerLayer,
)
__all__ = ('Conv', 'Conv2', 'LightConv', 'RepConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus',
'GhostConv', 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'TransformerLayer',
'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3',
'C2f', 'C3x', 'C3TR', 'C3Ghost', 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'Detect',
'Segment', 'Pose', 'Classify', 'TransformerEncoderLayer', 'RepC3', 'RTDETRDecoder', 'AIFI',
'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP', 'ResNetLayer',
'OBB')
__all__ = (
"Conv",
"Conv2",
"LightConv",
"RepConv",
"DWConv",
"DWConvTranspose2d",
"ConvTranspose",
"Focus",
"GhostConv",
"ChannelAttention",
"SpatialAttention",
"CBAM",
"Concat",
"TransformerLayer",
"TransformerBlock",
"MLPBlock",
"LayerNorm2d",
"DFL",
"HGBlock",
"HGStem",
"SPP",
"SPPF",
"C1",
"C2",
"C3",
"C2f",
"C3x",
"C3TR",
"C3Ghost",
"GhostBottleneck",
"Bottleneck",
"BottleneckCSP",
"Proto",
"Detect",
"Segment",
"Pose",
"Classify",
"TransformerEncoderLayer",
"RepC3",
"RTDETRDecoder",
"AIFI",
"DeformableTransformerDecoder",
"DeformableTransformerDecoderLayer",
"MSDeformAttn",
"MLP",
"ResNetLayer",
"OBB",
)
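
A minimal sketch chaining a few of the re-exported modules; channel counts and spatial sizes are illustrative:

import torch

from ultralytics.nn.modules import C2f, SPPF, Conv

x = torch.zeros(1, 3, 640, 640)   # dummy BCHW input
x = Conv(3, 64, k=3, s=2)(x)      # -> (1, 64, 320, 320)
x = C2f(64, 128, n=1)(x)          # -> (1, 128, 320, 320)
x = SPPF(128, 128)(x)             # -> (1, 128, 320, 320)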


@ -8,8 +8,26 @@ import torch.nn.functional as F
from .conv import Conv, DWConv, GhostConv, LightConv, RepConv
from .transformer import TransformerBlock
__all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost',
'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3', 'ResNetLayer')
__all__ = (
"DFL",
"HGBlock",
"HGStem",
"SPP",
"SPPF",
"C1",
"C2",
"C3",
"C2f",
"C3x",
"C3TR",
"C3Ghost",
"GhostBottleneck",
"Bottleneck",
"BottleneckCSP",
"Proto",
"RepC3",
"ResNetLayer",
)
class DFL(nn.Module):
@ -284,9 +302,11 @@ class GhostBottleneck(nn.Module):
self.conv = nn.Sequential(
GhostConv(c1, c_, 1, 1), # pw
DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw
GhostConv(c_, c2, 1, 1, act=False)) # pw-linear
self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1,
act=False)) if s == 2 else nn.Identity()
GhostConv(c_, c2, 1, 1, act=False), # pw-linear
)
self.shortcut = (
nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
)
def forward(self, x):
"""Applies skip connection and concatenation to input tensor."""
@ -359,8 +379,9 @@ class ResNetLayer(nn.Module):
self.is_first = is_first
if self.is_first:
self.layer = nn.Sequential(Conv(c1, c2, k=7, s=2, p=3, act=True),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
self.layer = nn.Sequential(
Conv(c1, c2, k=7, s=2, p=3, act=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
else:
blocks = [ResNetBlock(c1, c2, s, e=e)]
blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)])
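
A hedged shape check for the two blocks reformatted above; channel counts and input sizes are illustrative:

import torch

from ultralytics.nn.modules import GhostBottleneck, ResNetLayer

print(GhostBottleneck(64, 128, k=3, s=2)(torch.zeros(1, 64, 80, 80)).shape)        # (1, 128, 40, 40), stride-2 path
print(ResNetLayer(3, 64, s=2, is_first=True)(torch.zeros(1, 3, 224, 224)).shape)   # (1, 64, 56, 56), stem variant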


@ -7,8 +7,21 @@ import numpy as np
import torch
import torch.nn as nn
__all__ = ('Conv', 'Conv2', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv',
'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv')
__all__ = (
"Conv",
"Conv2",
"LightConv",
"DWConv",
"DWConvTranspose2d",
"ConvTranspose",
"Focus",
"GhostConv",
"ChannelAttention",
"SpatialAttention",
"CBAM",
"Concat",
"RepConv",
)
def autopad(k, p=None, d=1): # kernel, padding, dilation
@ -22,6 +35,7 @@ def autopad(k, p=None, d=1): # kernel, padding, dilation
class Conv(nn.Module):
"""Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
default_act = nn.SiLU() # default activation
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
@ -60,9 +74,9 @@ class Conv2(Conv):
"""Fuse parallel convolutions."""
w = torch.zeros_like(self.conv.weight.data)
i = [x // 2 for x in w.shape[2:]]
w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone()
w[:, :, i[0] : i[0] + 1, i[1] : i[1] + 1] = self.cv2.weight.data.clone()
self.conv.weight.data += w
self.__delattr__('cv2')
self.__delattr__("cv2")
self.forward = self.forward_fuse
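
A hedged sketch of the padding rule and the Conv wrapper these classes build on; sizes are illustrative:

import torch

from ultralytics.nn.modules.conv import Conv, autopad

assert autopad(3) == 1 and autopad(3, d=2) == 2  # 'same' padding: (d * (k - 1) + 1) // 2 when p is None
m = Conv(3, 16, k=3, s=2)                        # Conv2d + BatchNorm2d + SiLU, padding autopad(3) = 1
print(m(torch.zeros(1, 3, 64, 64)).shape)        # torch.Size([1, 16, 32, 32])
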
@ -102,6 +116,7 @@ class DWConvTranspose2d(nn.ConvTranspose2d):
class ConvTranspose(nn.Module):
"""Convolution transpose 2d layer."""
default_act = nn.SiLU() # default activation
def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
@ -164,6 +179,7 @@ class RepConv(nn.Module):
This module is used in RT-DETR.
Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
"""
default_act = nn.SiLU() # default activation
def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
@ -214,7 +230,7 @@ class RepConv(nn.Module):
beta = branch.bn.bias
eps = branch.bn.eps
elif isinstance(branch, nn.BatchNorm2d):
if not hasattr(self, 'id_tensor'):
if not hasattr(self, "id_tensor"):
input_dim = self.c1 // self.g
kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
for i in range(self.c1):
@ -232,29 +248,31 @@ class RepConv(nn.Module):
def fuse_convs(self):
"""Combines two convolution layers into a single layer and removes unused attributes from the class."""
if hasattr(self, 'conv'):
if hasattr(self, "conv"):
return
kernel, bias = self.get_equivalent_kernel_bias()
self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
out_channels=self.conv1.conv.out_channels,
kernel_size=self.conv1.conv.kernel_size,
stride=self.conv1.conv.stride,
padding=self.conv1.conv.padding,
dilation=self.conv1.conv.dilation,
groups=self.conv1.conv.groups,
bias=True).requires_grad_(False)
self.conv = nn.Conv2d(
in_channels=self.conv1.conv.in_channels,
out_channels=self.conv1.conv.out_channels,
kernel_size=self.conv1.conv.kernel_size,
stride=self.conv1.conv.stride,
padding=self.conv1.conv.padding,
dilation=self.conv1.conv.dilation,
groups=self.conv1.conv.groups,
bias=True,
).requires_grad_(False)
self.conv.weight.data = kernel
self.conv.bias.data = bias
for para in self.parameters():
para.detach_()
self.__delattr__('conv1')
self.__delattr__('conv2')
if hasattr(self, 'nm'):
self.__delattr__('nm')
if hasattr(self, 'bn'):
self.__delattr__('bn')
if hasattr(self, 'id_tensor'):
self.__delattr__('id_tensor')
self.__delattr__("conv1")
self.__delattr__("conv2")
if hasattr(self, "nm"):
self.__delattr__("nm")
if hasattr(self, "bn"):
self.__delattr__("bn")
if hasattr(self, "id_tensor"):
self.__delattr__("id_tensor")
class ChannelAttention(nn.Module):
@ -278,7 +296,7 @@ class SpatialAttention(nn.Module):
def __init__(self, kernel_size=7):
"""Initialize Spatial-attention module with kernel size argument."""
super().__init__()
assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
assert kernel_size in (3, 7), "kernel size must be 3 or 7"
padding = 3 if kernel_size == 7 else 1
self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
self.act = nn.Sigmoid()


@ -14,11 +14,12 @@ from .conv import Conv
from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
from .utils import bias_init_with_prob, linear_init_
__all__ = 'Detect', 'Segment', 'Pose', 'Classify', 'OBB', 'RTDETRDecoder'
__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"
class Detect(nn.Module):
"""YOLOv8 Detect head for detection models."""
dynamic = False # force grid reconstruction
export = False # export mode
shape = None
@ -35,7 +36,8 @@ class Detect(nn.Module):
self.stride = torch.zeros(self.nl) # strides computed during build
c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels
self.cv2 = nn.ModuleList(
nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
)
self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
@ -53,14 +55,14 @@ class Detect(nn.Module):
self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
self.shape = shape
if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops
box = x_cat[:, :self.reg_max * 4]
cls = x_cat[:, self.reg_max * 4:]
if self.export and self.format in ("saved_model", "pb", "tflite", "edgetpu", "tfjs"): # avoid TF FlexSplitV ops
box = x_cat[:, : self.reg_max * 4]
cls = x_cat[:, self.reg_max * 4 :]
else:
box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
dbox = self.decode_bboxes(box)
if self.export and self.format in ('tflite', 'edgetpu'):
if self.export and self.format in ("tflite", "edgetpu"):
# Precompute normalization factor to increase numerical stability
# See https://github.com/ultralytics/ultralytics/issues/7371
img_h = shape[2]
@ -79,7 +81,7 @@ class Detect(nn.Module):
# ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
a[-1].bias.data[:] = 1.0 # box
b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
def decode_bboxes(self, bboxes):
"""Decode bounding boxes."""
@ -214,26 +216,28 @@ class RTDETRDecoder(nn.Module):
and class labels for objects in an image. It integrates features from multiple layers and runs through a series of
Transformer decoder layers to output the final predictions.
"""
export = False # export mode
def __init__(
self,
nc=80,
ch=(512, 1024, 2048),
hd=256, # hidden dim
nq=300, # num queries
ndp=4, # num decoder points
nh=8, # num head
ndl=6, # num decoder layers
d_ffn=1024, # dim of feedforward
dropout=0.,
act=nn.ReLU(),
eval_idx=-1,
# Training args
nd=100, # num denoising
label_noise_ratio=0.5,
box_noise_scale=1.0,
learnt_init_query=False):
self,
nc=80,
ch=(512, 1024, 2048),
hd=256, # hidden dim
nq=300, # num queries
ndp=4, # num decoder points
nh=8, # num head
ndl=6, # num decoder layers
d_ffn=1024, # dim of feedforward
dropout=0.0,
act=nn.ReLU(),
eval_idx=-1,
# Training args
nd=100, # num denoising
label_noise_ratio=0.5,
box_noise_scale=1.0,
learnt_init_query=False,
):
"""
Initializes the RTDETRDecoder module with the given parameters.
@ -302,28 +306,30 @@ class RTDETRDecoder(nn.Module):
feats, shapes = self._get_encoder_input(x)
# Prepare denoising training
dn_embed, dn_bbox, attn_mask, dn_meta = \
get_cdn_group(batch,
self.nc,
self.num_queries,
self.denoising_class_embed.weight,
self.num_denoising,
self.label_noise_ratio,
self.box_noise_scale,
self.training)
dn_embed, dn_bbox, attn_mask, dn_meta = get_cdn_group(
batch,
self.nc,
self.num_queries,
self.denoising_class_embed.weight,
self.num_denoising,
self.label_noise_ratio,
self.box_noise_scale,
self.training,
)
embed, refer_bbox, enc_bboxes, enc_scores = \
self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
embed, refer_bbox, enc_bboxes, enc_scores = self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
# Decoder
dec_bboxes, dec_scores = self.decoder(embed,
refer_bbox,
feats,
shapes,
self.dec_bbox_head,
self.dec_score_head,
self.query_pos_head,
attn_mask=attn_mask)
dec_bboxes, dec_scores = self.decoder(
embed,
refer_bbox,
feats,
shapes,
self.dec_bbox_head,
self.dec_score_head,
self.query_pos_head,
attn_mask=attn_mask,
)
x = dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
if self.training:
return x
@ -331,24 +337,24 @@ class RTDETRDecoder(nn.Module):
y = torch.cat((dec_bboxes.squeeze(0), dec_scores.squeeze(0).sigmoid()), -1)
return y if self.export else (y, x)
def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device="cpu", eps=1e-2):
"""Generates anchor bounding boxes for given shapes with specific grid size and validates them."""
anchors = []
for i, (h, w) in enumerate(shapes):
sy = torch.arange(end=h, dtype=dtype, device=device)
sx = torch.arange(end=w, dtype=dtype, device=device)
grid_y, grid_x = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
grid_y, grid_x = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
grid_xy = torch.stack([grid_x, grid_y], -1) # (h, w, 2)
valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH # (1, h, w, 2)
wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0**i)
anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4)) # (1, h*w, 4)
anchors = torch.cat(anchors, 1) # (1, h*w*nl, 4)
valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) # 1, h*w*nl, 1
anchors = torch.log(anchors / (1 - anchors))
anchors = anchors.masked_fill(~valid_mask, float('inf'))
anchors = anchors.masked_fill(~valid_mask, float("inf"))
return anchors, valid_mask
def _get_encoder_input(self, x):
@ -415,13 +421,13 @@ class RTDETRDecoder(nn.Module):
# NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
# linear_init_(self.enc_score_head)
constant_(self.enc_score_head.bias, bias_cls)
constant_(self.enc_bbox_head.layers[-1].weight, 0.)
constant_(self.enc_bbox_head.layers[-1].bias, 0.)
constant_(self.enc_bbox_head.layers[-1].weight, 0.0)
constant_(self.enc_bbox_head.layers[-1].bias, 0.0)
for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
# linear_init_(cls_)
constant_(cls_.bias, bias_cls)
constant_(reg_.layers[-1].weight, 0.)
constant_(reg_.layers[-1].bias, 0.)
constant_(reg_.layers[-1].weight, 0.0)
constant_(reg_.layers[-1].bias, 0.0)
linear_init_(self.enc_output[0])
xavier_uniform_(self.enc_output[0].weight)
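
A hedged shape sketch for the Detect head earlier in this file; a freshly built module is in training mode, so forward returns the raw per-level maps:

import torch

from ultralytics.nn.modules.head import Detect

m = Detect(nc=80, ch=(64, 128, 256))   # three detection levels, channel counts illustrative
feats = [torch.zeros(1, c, s, s) for c, s in zip((64, 128, 256), (80, 40, 20))]
out = m(feats)   # list of 3 maps, each (1, 4 * reg_max + nc, h, w) = (1, 144, h, w) with reg_max = 16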


@ -11,8 +11,18 @@ from torch.nn.init import constant_, xavier_uniform_
from .conv import Conv
from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
__all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'AIFI',
'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP')
__all__ = (
"TransformerEncoderLayer",
"TransformerLayer",
"TransformerBlock",
"MLPBlock",
"LayerNorm2d",
"AIFI",
"DeformableTransformerDecoder",
"DeformableTransformerDecoderLayer",
"MSDeformAttn",
"MLP",
)
class TransformerEncoderLayer(nn.Module):
@ -22,9 +32,11 @@ class TransformerEncoderLayer(nn.Module):
"""Initialize the TransformerEncoderLayer with specified parameters."""
super().__init__()
from ...utils.torch_utils import TORCH_1_9
if not TORCH_1_9:
raise ModuleNotFoundError(
'TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).')
"TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
)
self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
# Implementation of Feedforward model
self.fc1 = nn.Linear(c1, cm)
@ -91,12 +103,11 @@ class AIFI(TransformerEncoderLayer):
"""Builds 2D sine-cosine position embedding."""
grid_w = torch.arange(int(w), dtype=torch.float32)
grid_h = torch.arange(int(h), dtype=torch.float32)
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
assert embed_dim % 4 == 0, \
'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
pos_dim = embed_dim // 4
omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
omega = 1. / (temperature ** omega)
omega = 1.0 / (temperature**omega)
out_w = grid_w.flatten()[..., None] @ omega[None]
out_h = grid_h.flatten()[..., None] @ omega[None]
@ -213,10 +224,10 @@ class MSDeformAttn(nn.Module):
"""Initialize MSDeformAttn with the given parameters."""
super().__init__()
if d_model % n_heads != 0:
raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
_d_per_head = d_model // n_heads
# Better to set _d_per_head to a power of 2 which is more efficient in a CUDA implementation
assert _d_per_head * n_heads == d_model, '`d_model` must be divisible by `n_heads`'
assert _d_per_head * n_heads == d_model, "`d_model` must be divisible by `n_heads`"
self.im2col_step = 64
@ -234,21 +245,24 @@ class MSDeformAttn(nn.Module):
def _reset_parameters(self):
"""Reset module parameters."""
constant_(self.sampling_offsets.weight.data, 0.)
constant_(self.sampling_offsets.weight.data, 0.0)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(
1, self.n_levels, self.n_points, 1)
grid_init = (
(grid_init / grid_init.abs().max(-1, keepdim=True)[0])
.view(self.n_heads, 1, 1, 2)
.repeat(1, self.n_levels, self.n_points, 1)
)
for i in range(self.n_points):
grid_init[:, :, i, :] *= i + 1
with torch.no_grad():
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
constant_(self.attention_weights.weight.data, 0.)
constant_(self.attention_weights.bias.data, 0.)
constant_(self.attention_weights.weight.data, 0.0)
constant_(self.attention_weights.bias.data, 0.0)
xavier_uniform_(self.value_proj.weight.data)
constant_(self.value_proj.bias.data, 0.)
constant_(self.value_proj.bias.data, 0.0)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
constant_(self.output_proj.bias.data, 0.0)
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
"""
@ -288,7 +302,7 @@ class MSDeformAttn(nn.Module):
add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
else:
raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {num_points}.")
output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
return self.output_proj(output)
@ -301,7 +315,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
"""
def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
"""Initialize the DeformableTransformerDecoderLayer with the given parameters."""
super().__init__()
@ -339,14 +353,16 @@ class DeformableTransformerDecoderLayer(nn.Module):
# Self attention
q = k = self.with_pos_embed(embed, query_pos)
tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
attn_mask=attn_mask)[0].transpose(0, 1)
tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
0
].transpose(0, 1)
embed = embed + self.dropout1(tgt)
embed = self.norm1(embed)
# Cross attention
tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
padding_mask)
tgt = self.cross_attn(
self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes, padding_mask
)
embed = embed + self.dropout2(tgt)
embed = self.norm2(embed)
@ -370,16 +386,17 @@ class DeformableTransformerDecoder(nn.Module):
self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
def forward(
self,
embed, # decoder embeddings
refer_bbox, # anchor
feats, # image features
shapes, # feature shapes
bbox_head,
score_head,
pos_mlp,
attn_mask=None,
padding_mask=None):
self,
embed, # decoder embeddings
refer_bbox, # anchor
feats, # image features
shapes, # feature shapes
bbox_head,
score_head,
pos_mlp,
attn_mask=None,
padding_mask=None,
):
"""Perform the forward pass through the entire decoder."""
output = embed
dec_bboxes = []
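
A hedged sketch of the 2D sine-cosine position embedding built by AIFI earlier in this file; the grid size and embed_dim are illustrative:

from ultralytics.nn.modules.transformer import AIFI

pe = AIFI.build_2d_sincos_position_embedding(w=20, h=20, embed_dim=256)
print(pe.shape)   # torch.Size([1, 400, 256]): one embedding per grid cell, 4 * (embed_dim // 4) channels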


@ -10,7 +10,7 @@ import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import uniform_
__all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'
__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
def _get_clones(module, n):
@ -27,7 +27,7 @@ def linear_init_(module):
"""Initialize the weights and biases of a linear module."""
bound = 1 / math.sqrt(module.weight.shape[0])
uniform_(module.weight, -bound, bound)
if hasattr(module, 'bias') and module.bias is not None:
if hasattr(module, "bias") and module.bias is not None:
uniform_(module.bias, -bound, bound)
@ -39,9 +39,12 @@ def inverse_sigmoid(x, eps=1e-5):
return torch.log(x1 / x2)
def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
sampling_locations: torch.Tensor,
attention_weights: torch.Tensor) -> torch.Tensor:
def multi_scale_deformable_attn_pytorch(
value: torch.Tensor,
value_spatial_shapes: torch.Tensor,
sampling_locations: torch.Tensor,
attention_weights: torch.Tensor,
) -> torch.Tensor:
"""
Multi-scale deformable attention.
@ -58,23 +61,25 @@ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shape
# bs, H_*W_, num_heads*embed_dims ->
# bs, num_heads*embed_dims, H_*W_ ->
# bs*num_heads, embed_dims, H_, W_
value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_))
value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
# bs, num_queries, num_heads, num_points, 2 ->
# bs, num_heads, num_queries, num_points, 2 ->
# bs*num_heads, num_queries, num_points, 2
sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
# bs*num_heads, embed_dims, num_queries, num_points
sampling_value_l_ = F.grid_sample(value_l_,
sampling_grid_l_,
mode='bilinear',
padding_mode='zeros',
align_corners=False)
sampling_value_l_ = F.grid_sample(
value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
)
sampling_value_list.append(sampling_value_l_)
# (bs, num_queries, num_heads, num_levels, num_points) ->
# (bs, num_heads, num_queries, num_levels, num_points) ->
# (bs, num_heads, 1, num_queries, num_levels*num_points)
attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries,
num_levels * num_points)
output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(
bs, num_heads * embed_dims, num_queries))
attention_weights = attention_weights.transpose(1, 2).reshape(
bs * num_heads, 1, num_queries, num_levels * num_points
)
output = (
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
.sum(-1)
.view(bs, num_heads * embed_dims, num_queries)
)
return output.transpose(1, 2).contiguous()
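
A hedged shape walkthrough for the function above; all sizes are illustrative:

import torch

from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch

bs, heads, dims, queries, points = 1, 8, 32, 300, 4
shapes = [(32, 32), (16, 16)]                                  # two feature levels
value = torch.randn(bs, sum(h * w for h, w in shapes), heads, dims)
locs = torch.rand(bs, queries, heads, len(shapes), points, 2)  # normalized sampling locations
weights = torch.rand(bs, queries, heads, len(shapes), points).softmax(-1)
out = multi_scale_deformable_attn_pytorch(value, shapes, locs, weights)
print(out.shape)   # torch.Size([1, 300, 256]) = (bs, num_queries, heads * dims)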


@ -7,16 +7,54 @@ from pathlib import Path
import torch
import torch.nn as nn
from ultralytics.nn.modules import (AIFI, C1, C2, C3, C3TR, OBB, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost,
C3x, Classify, Concat, Conv, Conv2, ConvTranspose, Detect, DWConv,
DWConvTranspose2d, Focus, GhostBottleneck, GhostConv, HGBlock, HGStem, Pose, RepC3,
RepConv, ResNetLayer, RTDETRDecoder, Segment)
from ultralytics.nn.modules import (
AIFI,
C1,
C2,
C3,
C3TR,
OBB,
SPP,
SPPF,
Bottleneck,
BottleneckCSP,
C2f,
C3Ghost,
C3x,
Classify,
Concat,
Conv,
Conv2,
ConvTranspose,
Detect,
DWConv,
DWConvTranspose2d,
Focus,
GhostBottleneck,
GhostConv,
HGBlock,
HGStem,
Pose,
RepC3,
RepConv,
ResNetLayer,
RTDETRDecoder,
Segment,
)
from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml
from ultralytics.utils.loss import v8ClassificationLoss, v8DetectionLoss, v8OBBLoss, v8PoseLoss, v8SegmentationLoss
from ultralytics.utils.plotting import feature_visualization
from ultralytics.utils.torch_utils import (fuse_conv_and_bn, fuse_deconv_and_bn, initialize_weights, intersect_dicts,
make_divisible, model_info, scale_img, time_sync)
from ultralytics.utils.torch_utils import (
fuse_conv_and_bn,
fuse_deconv_and_bn,
initialize_weights,
intersect_dicts,
make_divisible,
model_info,
scale_img,
time_sync,
)
try:
import thop
@ -90,8 +128,10 @@ class BaseModel(nn.Module):
def _predict_augment(self, x):
"""Perform augmentations on input image x and return augmented inference."""
LOGGER.warning(f'WARNING ⚠️ {self.__class__.__name__} does not support augmented inference yet. '
f'Reverting to single-scale inference instead.')
LOGGER.warning(
f"WARNING ⚠️ {self.__class__.__name__} does not support augmented inference yet. "
f"Reverting to single-scale inference instead."
)
return self._predict_once(x)
def _profile_one_layer(self, m, x, dt):
@ -108,14 +148,14 @@ class BaseModel(nn.Module):
None
"""
c = m == self.model[-1] and isinstance(x, list) # is final layer list, copy input as inplace fix
flops = thop.profile(m, inputs=[x.copy() if c else x], verbose=False)[0] / 1E9 * 2 if thop else 0 # FLOPs
flops = thop.profile(m, inputs=[x.copy() if c else x], verbose=False)[0] / 1e9 * 2 if thop else 0 # FLOPs
t = time_sync()
for _ in range(10):
m(x.copy() if c else x)
dt.append((time_sync() - t) * 100)
if m == self.model[0]:
LOGGER.info(f"{'time (ms)':>10s} {'GFLOPs':>10s} {'params':>10s} module")
LOGGER.info(f'{dt[-1]:10.2f} {flops:10.2f} {m.np:10.0f} {m.type}')
LOGGER.info(f"{dt[-1]:10.2f} {flops:10.2f} {m.np:10.0f} {m.type}")
if c:
LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total")
@ -129,15 +169,15 @@ class BaseModel(nn.Module):
"""
if not self.is_fused():
for m in self.model.modules():
if isinstance(m, (Conv, Conv2, DWConv)) and hasattr(m, 'bn'):
if isinstance(m, (Conv, Conv2, DWConv)) and hasattr(m, "bn"):
if isinstance(m, Conv2):
m.fuse_convs()
m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv
delattr(m, 'bn') # remove batchnorm
delattr(m, "bn") # remove batchnorm
m.forward = m.forward_fuse # update forward
if isinstance(m, ConvTranspose) and hasattr(m, 'bn'):
if isinstance(m, ConvTranspose) and hasattr(m, "bn"):
m.conv_transpose = fuse_deconv_and_bn(m.conv_transpose, m.bn)
delattr(m, 'bn') # remove batchnorm
delattr(m, "bn") # remove batchnorm
m.forward = m.forward_fuse # update forward
if isinstance(m, RepConv):
m.fuse_convs()
@ -156,7 +196,7 @@ class BaseModel(nn.Module):
Returns:
(bool): True if the number of BatchNorm layers in the model is less than the threshold, False otherwise.
"""
bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k) # normalization layers, i.e. BatchNorm2d()
bn = tuple(v for k, v in nn.__dict__.items() if "Norm" in k) # normalization layers, i.e. BatchNorm2d()
return sum(isinstance(v, bn) for v in self.modules()) < thresh # True if < 'thresh' BatchNorm layers in model
def info(self, detailed=False, verbose=True, imgsz=640):
@ -196,12 +236,12 @@ class BaseModel(nn.Module):
weights (dict | torch.nn.Module): The pre-trained weights to be loaded.
verbose (bool, optional): Whether to log the transfer progress. Defaults to True.
"""
model = weights['model'] if isinstance(weights, dict) else weights # torchvision models are not dicts
model = weights["model"] if isinstance(weights, dict) else weights # torchvision models are not dicts
csd = model.float().state_dict() # checkpoint state_dict as FP32
csd = intersect_dicts(csd, self.state_dict()) # intersect
self.load_state_dict(csd, strict=False) # load
if verbose:
LOGGER.info(f'Transferred {len(csd)}/{len(self.model.state_dict())} items from pretrained weights')
LOGGER.info(f"Transferred {len(csd)}/{len(self.model.state_dict())} items from pretrained weights")
def loss(self, batch, preds=None):
"""
@ -211,33 +251,33 @@ class BaseModel(nn.Module):
batch (dict): Batch to compute loss on
preds (torch.Tensor | List[torch.Tensor]): Predictions.
"""
if not hasattr(self, 'criterion'):
if not hasattr(self, "criterion"):
self.criterion = self.init_criterion()
preds = self.forward(batch['img']) if preds is None else preds
preds = self.forward(batch["img"]) if preds is None else preds
return self.criterion(preds, batch)
def init_criterion(self):
"""Initialize the loss criterion for the BaseModel."""
raise NotImplementedError('compute_loss() needs to be implemented by task heads')
raise NotImplementedError("compute_loss() needs to be implemented by task heads")
class DetectionModel(BaseModel):
"""YOLOv8 detection model."""
def __init__(self, cfg='yolov8n.yaml', ch=3, nc=None, verbose=True): # model, input channels, number of classes
def __init__(self, cfg="yolov8n.yaml", ch=3, nc=None, verbose=True): # model, input channels, number of classes
"""Initialize the YOLOv8 detection model with the given config and parameters."""
super().__init__()
self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg) # cfg dict
# Define model
ch = self.yaml['ch'] = self.yaml.get('ch', ch) # input channels
if nc and nc != self.yaml['nc']:
ch = self.yaml["ch"] = self.yaml.get("ch", ch) # input channels
if nc and nc != self.yaml["nc"]:
LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
self.yaml['nc'] = nc # override YAML value
self.yaml["nc"] = nc # override YAML value
self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
self.names = {i: f'{i}' for i in range(self.yaml['nc'])} # default names dict
self.inplace = self.yaml.get('inplace', True)
self.names = {i: f"{i}" for i in range(self.yaml["nc"])} # default names dict
self.inplace = self.yaml.get("inplace", True)
# Build strides
m = self.model[-1] # Detect()
@ -255,7 +295,7 @@ class DetectionModel(BaseModel):
initialize_weights(self)
if verbose:
self.info()
LOGGER.info('')
LOGGER.info("")
def _predict_augment(self, x):
"""Perform augmentations on input image x and return augmented inference and train outputs."""
@ -285,9 +325,9 @@ class DetectionModel(BaseModel):
def _clip_augmented(self, y):
"""Clip YOLO augmented inference tails."""
nl = self.model[-1].nl # number of detection layers (P3-P5)
g = sum(4 ** x for x in range(nl)) # grid points
g = sum(4**x for x in range(nl)) # grid points
e = 1 # exclude layer count
i = (y[0].shape[-1] // g) * sum(4 ** x for x in range(e)) # indices
i = (y[0].shape[-1] // g) * sum(4**x for x in range(e)) # indices
y[0] = y[0][..., :-i] # large
i = (y[-1].shape[-1] // g) * sum(4 ** (nl - 1 - x) for x in range(e)) # indices
y[-1] = y[-1][..., i:] # small
@ -301,7 +341,7 @@ class DetectionModel(BaseModel):
class OBBModel(DetectionModel):
""""YOLOv8 Oriented Bounding Box (OBB) model."""
def __init__(self, cfg='yolov8n-obb.yaml', ch=3, nc=None, verbose=True):
def __init__(self, cfg="yolov8n-obb.yaml", ch=3, nc=None, verbose=True):
"""Initialize YOLOv8 OBB model with given config and parameters."""
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
@ -313,7 +353,7 @@ class OBBModel(DetectionModel):
class SegmentationModel(DetectionModel):
"""YOLOv8 segmentation model."""
def __init__(self, cfg='yolov8n-seg.yaml', ch=3, nc=None, verbose=True):
def __init__(self, cfg="yolov8n-seg.yaml", ch=3, nc=None, verbose=True):
"""Initialize YOLOv8 segmentation model with given config and parameters."""
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
@ -325,13 +365,13 @@ class SegmentationModel(DetectionModel):
class PoseModel(DetectionModel):
"""YOLOv8 pose model."""
def __init__(self, cfg='yolov8n-pose.yaml', ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
def __init__(self, cfg="yolov8n-pose.yaml", ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
"""Initialize YOLOv8 Pose model."""
if not isinstance(cfg, dict):
cfg = yaml_model_load(cfg) # load model YAML
if any(data_kpt_shape) and list(data_kpt_shape) != list(cfg['kpt_shape']):
if any(data_kpt_shape) and list(data_kpt_shape) != list(cfg["kpt_shape"]):
LOGGER.info(f"Overriding model.yaml kpt_shape={cfg['kpt_shape']} with kpt_shape={data_kpt_shape}")
cfg['kpt_shape'] = data_kpt_shape
cfg["kpt_shape"] = data_kpt_shape
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
def init_criterion(self):
@ -342,7 +382,7 @@ class PoseModel(DetectionModel):
class ClassificationModel(BaseModel):
"""YOLOv8 classification model."""
def __init__(self, cfg='yolov8n-cls.yaml', ch=3, nc=None, verbose=True):
def __init__(self, cfg="yolov8n-cls.yaml", ch=3, nc=None, verbose=True):
"""Init ClassificationModel with YAML, channels, number of classes, verbose flag."""
super().__init__()
self._from_yaml(cfg, ch, nc, verbose)
@ -352,21 +392,21 @@ class ClassificationModel(BaseModel):
self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg) # cfg dict
# Define model
ch = self.yaml['ch'] = self.yaml.get('ch', ch) # input channels
if nc and nc != self.yaml['nc']:
ch = self.yaml["ch"] = self.yaml.get("ch", ch) # input channels
if nc and nc != self.yaml["nc"]:
LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
self.yaml['nc'] = nc # override YAML value
elif not nc and not self.yaml.get('nc', None):
raise ValueError('nc not specified. Must specify nc in model.yaml or function arguments.')
self.yaml["nc"] = nc # override YAML value
elif not nc and not self.yaml.get("nc", None):
raise ValueError("nc not specified. Must specify nc in model.yaml or function arguments.")
self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
self.stride = torch.Tensor([1]) # no stride constraints
self.names = {i: f'{i}' for i in range(self.yaml['nc'])} # default names dict
self.names = {i: f"{i}" for i in range(self.yaml["nc"])} # default names dict
self.info()
@staticmethod
def reshape_outputs(model, nc):
"""Update a TorchVision classification model to class count 'n' if required."""
name, m = list((model.model if hasattr(model, 'model') else model).named_children())[-1] # last module
name, m = list((model.model if hasattr(model, "model") else model).named_children())[-1] # last module
if isinstance(m, Classify): # YOLO Classify() head
if m.linear.out_features != nc:
m.linear = nn.Linear(m.linear.in_features, nc)
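# A hedged usage sketch of reshape_outputs(); only the Classify() branch is visible in this
# hunk, so the plain nn.Linear handling assumed below for a TorchVision model follows the
# docstring rather than code shown in this diff.
import torchvision
from ultralytics.nn.tasks import ClassificationModel

tv_model = torchvision.models.resnet18(weights=None)  # final module is an nn.Linear with 1000 outputs
ClassificationModel.reshape_outputs(tv_model, nc=10)  # swap in a 10-class head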
@ -409,7 +449,7 @@ class RTDETRDetectionModel(DetectionModel):
predict: Performs a forward pass through the network and returns the output.
"""
def __init__(self, cfg='rtdetr-l.yaml', ch=3, nc=None, verbose=True):
def __init__(self, cfg="rtdetr-l.yaml", ch=3, nc=None, verbose=True):
"""
Initialize the RTDETRDetectionModel.
@ -438,39 +478,39 @@ class RTDETRDetectionModel(DetectionModel):
Returns:
(tuple): A tuple containing the total loss and main three losses in a tensor.
"""
if not hasattr(self, 'criterion'):
if not hasattr(self, "criterion"):
self.criterion = self.init_criterion()
img = batch['img']
img = batch["img"]
# NOTE: preprocess gt_bbox and gt_labels to list.
bs = len(img)
batch_idx = batch['batch_idx']
batch_idx = batch["batch_idx"]
gt_groups = [(batch_idx == i).sum().item() for i in range(bs)]
targets = {
'cls': batch['cls'].to(img.device, dtype=torch.long).view(-1),
'bboxes': batch['bboxes'].to(device=img.device),
'batch_idx': batch_idx.to(img.device, dtype=torch.long).view(-1),
'gt_groups': gt_groups}
"cls": batch["cls"].to(img.device, dtype=torch.long).view(-1),
"bboxes": batch["bboxes"].to(device=img.device),
"batch_idx": batch_idx.to(img.device, dtype=torch.long).view(-1),
"gt_groups": gt_groups,
}
preds = self.predict(img, batch=targets) if preds is None else preds
dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta = preds if self.training else preds[1]
if dn_meta is None:
dn_bboxes, dn_scores = None, None
else:
dn_bboxes, dec_bboxes = torch.split(dec_bboxes, dn_meta['dn_num_split'], dim=2)
dn_scores, dec_scores = torch.split(dec_scores, dn_meta['dn_num_split'], dim=2)
dn_bboxes, dec_bboxes = torch.split(dec_bboxes, dn_meta["dn_num_split"], dim=2)
dn_scores, dec_scores = torch.split(dec_scores, dn_meta["dn_num_split"], dim=2)
dec_bboxes = torch.cat([enc_bboxes.unsqueeze(0), dec_bboxes]) # (7, bs, 300, 4)
dec_scores = torch.cat([enc_scores.unsqueeze(0), dec_scores])
loss = self.criterion((dec_bboxes, dec_scores),
targets,
dn_bboxes=dn_bboxes,
dn_scores=dn_scores,
dn_meta=dn_meta)
loss = self.criterion(
(dec_bboxes, dec_scores), targets, dn_bboxes=dn_bboxes, dn_scores=dn_scores, dn_meta=dn_meta
)
# NOTE: RTDETR computes roughly a dozen loss terms; backpropagate through all of them but report only the main three (giou, class, bbox).
return sum(loss.values()), torch.as_tensor([loss[k].detach() for k in ['loss_giou', 'loss_class', 'loss_bbox']],
device=img.device)
return sum(loss.values()), torch.as_tensor(
[loss[k].detach() for k in ["loss_giou", "loss_class", "loss_bbox"]], device=img.device
)
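# A minimal sketch of the denoising split above, with hypothetical shapes: dn_meta["dn_num_split"]
# holds two sizes, e.g. [198, 300], separating the extra denoising queries from the 300 regular
# matching queries along dim=2.
import torch

dec_bboxes = torch.rand(6, 2, 498, 4)                     # (decoder layers, batch, queries, xywh)
dn_bboxes, dec_bboxes = torch.split(dec_bboxes, [198, 300], dim=2)
print(dn_bboxes.shape, dec_bboxes.shape)                  # (6, 2, 198, 4) and (6, 2, 300, 4)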
def predict(self, x, profile=False, visualize=False, batch=None, augment=False, embed=None):
"""
@ -553,6 +593,7 @@ def temporary_modules(modules=None):
import importlib
import sys
try:
# Set modules in sys.modules under their old name
for old, new in modules.items():
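# A minimal sketch of the aliasing idea behind temporary_modules(): legacy import paths are
# pointed at the new packages in sys.modules so that pickled 8.0-era checkpoints referencing
# the old layout can still be unpickled. alias_module() is a hypothetical helper, not the
# actual context-manager body.
import importlib
import sys

def alias_module(old: str, new: str) -> None:
    sys.modules[old] = importlib.import_module(new)

alias_module("ultralytics.yolo.utils", "ultralytics.utils")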
@ -580,30 +621,38 @@ def torch_safe_load(weight):
"""
from ultralytics.utils.downloads import attempt_download_asset
check_suffix(file=weight, suffix='.pt')
check_suffix(file=weight, suffix=".pt")
file = attempt_download_asset(weight) # search online if missing locally
try:
with temporary_modules({
'ultralytics.yolo.utils': 'ultralytics.utils',
'ultralytics.yolo.v8': 'ultralytics.models.yolo',
'ultralytics.yolo.data': 'ultralytics.data'}): # for legacy 8.0 Classify and Pose models
return torch.load(file, map_location='cpu'), file # load
with temporary_modules(
{
"ultralytics.yolo.utils": "ultralytics.utils",
"ultralytics.yolo.v8": "ultralytics.models.yolo",
"ultralytics.yolo.data": "ultralytics.data",
}
): # for legacy 8.0 Classify and Pose models
return torch.load(file, map_location="cpu"), file # load
except ModuleNotFoundError as e: # e.name is missing module name
if e.name == 'models':
if e.name == "models":
raise TypeError(
emojis(f'ERROR ❌️ {weight} appears to be an Ultralytics YOLOv5 model originally trained '
f'with https://github.com/ultralytics/yolov5.\nThis model is NOT forwards compatible with '
f'YOLOv8 at https://github.com/ultralytics/ultralytics.'
f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to "
f"run a command with an official YOLOv8 model, i.e. 'yolo predict model=yolov8n.pt'")) from e
LOGGER.warning(f"WARNING ⚠️ {weight} appears to require '{e.name}', which is not in ultralytics requirements."
f"\nAutoInstall will run now for '{e.name}' but this feature will be removed in the future."
f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to "
f"run a command with an official YOLOv8 model, i.e. 'yolo predict model=yolov8n.pt'")
emojis(
f"ERROR ❌️ {weight} appears to be an Ultralytics YOLOv5 model originally trained "
f"with https://github.com/ultralytics/yolov5.\nThis model is NOT forwards compatible with "
f"YOLOv8 at https://github.com/ultralytics/ultralytics."
f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to "
f"run a command with an official YOLOv8 model, i.e. 'yolo predict model=yolov8n.pt'"
)
) from e
LOGGER.warning(
f"WARNING ⚠️ {weight} appears to require '{e.name}', which is not in ultralytics requirements."
f"\nAutoInstall will run now for '{e.name}' but this feature will be removed in the future."
f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to "
f"run a command with an official YOLOv8 model, i.e. 'yolo predict model=yolov8n.pt'"
)
check_requirements(e.name) # install missing module
return torch.load(file, map_location='cpu'), file # load
return torch.load(file, map_location="cpu"), file # load
def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
@ -612,25 +661,25 @@ def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
ensemble = Ensemble()
for w in weights if isinstance(weights, list) else [weights]:
ckpt, w = torch_safe_load(w) # load ckpt
args = {**DEFAULT_CFG_DICT, **ckpt['train_args']} if 'train_args' in ckpt else None # combined args
model = (ckpt.get('ema') or ckpt['model']).to(device).float() # FP32 model
args = {**DEFAULT_CFG_DICT, **ckpt["train_args"]} if "train_args" in ckpt else None # combined args
model = (ckpt.get("ema") or ckpt["model"]).to(device).float() # FP32 model
# Model compatibility updates
model.args = args # attach args to model
model.pt_path = w # attach *.pt file path to model
model.task = guess_model_task(model)
if not hasattr(model, 'stride'):
model.stride = torch.tensor([32.])
if not hasattr(model, "stride"):
model.stride = torch.tensor([32.0])
# Append
ensemble.append(model.fuse().eval() if fuse and hasattr(model, 'fuse') else model.eval()) # model in eval mode
ensemble.append(model.fuse().eval() if fuse and hasattr(model, "fuse") else model.eval()) # model in eval mode
# Module updates
for m in ensemble.modules():
t = type(m)
if t in (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Segment, Pose, OBB):
m.inplace = inplace
elif t is nn.Upsample and not hasattr(m, 'recompute_scale_factor'):
elif t is nn.Upsample and not hasattr(m, "recompute_scale_factor"):
m.recompute_scale_factor = None # torch 1.11.0 compatibility
# Return model
@ -638,35 +687,35 @@ def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
return ensemble[-1]
# Return ensemble
LOGGER.info(f'Ensemble created with {weights}\n')
for k in 'names', 'nc', 'yaml':
LOGGER.info(f"Ensemble created with {weights}\n")
for k in "names", "nc", "yaml":
setattr(ensemble, k, getattr(ensemble[0], k))
ensemble.stride = ensemble[torch.argmax(torch.tensor([m.stride.max() for m in ensemble])).int()].stride
assert all(ensemble[0].nc == m.nc for m in ensemble), f'Models differ in class counts {[m.nc for m in ensemble]}'
assert all(ensemble[0].nc == m.nc for m in ensemble), f"Models differ in class counts {[m.nc for m in ensemble]}"
return ensemble
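# A hedged usage sketch of the ensemble path above; file names are hypothetical. A list of
# weights returns an Ensemble sharing names/nc/yaml from its first member, with stride taken
# from the member with the largest maximum stride.
from ultralytics.nn.tasks import attempt_load_weights

ensemble = attempt_load_weights(["yolov8n.pt", "yolov8s.pt"], device="cpu", fuse=True)
print(len(ensemble), ensemble.nc, ensemble.stride)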
def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
"""Loads a single model weights."""
ckpt, weight = torch_safe_load(weight) # load ckpt
args = {**DEFAULT_CFG_DICT, **(ckpt.get('train_args', {}))} # combine model and default args, preferring model args
model = (ckpt.get('ema') or ckpt['model']).to(device).float() # FP32 model
args = {**DEFAULT_CFG_DICT, **(ckpt.get("train_args", {}))} # combine model and default args, preferring model args
model = (ckpt.get("ema") or ckpt["model"]).to(device).float() # FP32 model
# Model compatibility updates
model.args = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS} # attach args to model
model.pt_path = weight # attach *.pt file path to model
model.task = guess_model_task(model)
if not hasattr(model, 'stride'):
model.stride = torch.tensor([32.])
if not hasattr(model, "stride"):
model.stride = torch.tensor([32.0])
model = model.fuse().eval() if fuse and hasattr(model, 'fuse') else model.eval() # model in eval mode
model = model.fuse().eval() if fuse and hasattr(model, "fuse") else model.eval() # model in eval mode
# Module updates
for m in model.modules():
t = type(m)
if t in (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Segment, Pose, OBB):
m.inplace = inplace
elif t is nn.Upsample and not hasattr(m, 'recompute_scale_factor'):
elif t is nn.Upsample and not hasattr(m, "recompute_scale_factor"):
m.recompute_scale_factor = None # torch 1.11.0 compatibility
# Return model and ckpt
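# A hedged usage sketch: attempt_load_one_weight() returns the FP32 model in eval mode
# together with its raw checkpoint dict.
from ultralytics.nn.tasks import attempt_load_one_weight

model, ckpt = attempt_load_one_weight("yolov8n.pt", device="cpu")
print(model.task, model.stride)  # e.g. 'detect' and tensor([8., 16., 32.])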
@ -678,11 +727,11 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
import ast
# Args
max_channels = float('inf')
nc, act, scales = (d.get(x) for x in ('nc', 'activation', 'scales'))
depth, width, kpt_shape = (d.get(x, 1.0) for x in ('depth_multiple', 'width_multiple', 'kpt_shape'))
max_channels = float("inf")
nc, act, scales = (d.get(x) for x in ("nc", "activation", "scales"))
depth, width, kpt_shape = (d.get(x, 1.0) for x in ("depth_multiple", "width_multiple", "kpt_shape"))
if scales:
scale = d.get('scale')
scale = d.get("scale")
if not scale:
scale = tuple(scales.keys())[0]
LOGGER.warning(f"WARNING ⚠️ no model scale passed. Assuming scale='{scale}'.")
@ -697,16 +746,37 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10} {'module':<45}{'arguments':<30}")
ch = [ch]
layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out
for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args
m = getattr(torch.nn, m[3:]) if 'nn.' in m else globals()[m] # get module
for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]): # from, number, module, args
m = getattr(torch.nn, m[3:]) if "nn." in m else globals()[m] # get module
for j, a in enumerate(args):
if isinstance(a, str):
with contextlib.suppress(ValueError):
args[j] = locals()[a] if a in locals() else ast.literal_eval(a)
n = n_ = max(round(n * depth), 1) if n > 1 else n # depth gain
if m in (Classify, Conv, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, Focus,
BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3):
if m in (
Classify,
Conv,
ConvTranspose,
GhostConv,
Bottleneck,
GhostBottleneck,
SPP,
SPPF,
DWConv,
Focus,
BottleneckCSP,
C1,
C2,
C2f,
C3,
C3TR,
C3Ghost,
nn.ConvTranspose2d,
DWConvTranspose2d,
C3x,
RepC3,
):
c1, c2 = ch[f], args[0]
if c2 != nc: # if c2 not equal to number of classes (i.e. for Classify() output)
c2 = make_divisible(min(c2, max_channels) * width, 8)
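# A minimal numeric sketch of the channel scaling above, assuming width=0.25 (the 'n' scale)
# and an effectively unbounded max_channels; ceil-based rounding to the nearest multiple of 8
# is assumed for make_divisible(x, 8).
import math

c2, width = 1024, 0.25
scaled = math.ceil(c2 * width / 8) * 8  # 256 output channels for the 'n' model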
@ -739,11 +809,11 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
c2 = ch[f]
m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module
t = str(m)[8:-2].replace('__main__.', '') # module type
t = str(m)[8:-2].replace("__main__.", "") # module type
m.np = sum(x.numel() for x in m_.parameters()) # number params
m_.i, m_.f, m_.type = i, f, t # attach index, 'from' index, type
if verbose:
LOGGER.info(f'{i:>3}{str(f):>20}{n_:>3}{m.np:10.0f} {t:<45}{str(args):<30}') # print
LOGGER.info(f"{i:>3}{str(f):>20}{n_:>3}{m.np:10.0f} {t:<45}{str(args):<30}") # print
save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist
layers.append(m_)
if i == 0:
@ -757,16 +827,16 @@ def yaml_model_load(path):
import re
path = Path(path)
if path.stem in (f'yolov{d}{x}6' for x in 'nsmlx' for d in (5, 8)):
new_stem = re.sub(r'(\d+)([nslmx])6(.+)?$', r'\1\2-p6\3', path.stem)
LOGGER.warning(f'WARNING ⚠️ Ultralytics YOLO P6 models now use -p6 suffix. Renaming {path.stem} to {new_stem}.')
if path.stem in (f"yolov{d}{x}6" for x in "nsmlx" for d in (5, 8)):
new_stem = re.sub(r"(\d+)([nslmx])6(.+)?$", r"\1\2-p6\3", path.stem)
LOGGER.warning(f"WARNING ⚠️ Ultralytics YOLO P6 models now use -p6 suffix. Renaming {path.stem} to {new_stem}.")
path = path.with_name(new_stem + path.suffix)
unified_path = re.sub(r'(\d+)([nslmx])(.+)?$', r'\1\3', str(path)) # i.e. yolov8x.yaml -> yolov8.yaml
unified_path = re.sub(r"(\d+)([nslmx])(.+)?$", r"\1\3", str(path)) # i.e. yolov8x.yaml -> yolov8.yaml
yaml_file = check_yaml(unified_path, hard=False) or check_yaml(path)
d = yaml_load(yaml_file) # model dict
d['scale'] = guess_model_scale(path)
d['yaml_file'] = str(path)
d["scale"] = guess_model_scale(path)
d["yaml_file"] = str(path)
return d
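# A minimal sketch of the scale-unification regex above: the per-scale letter is stripped from
# the stem so every variant resolves to one base YAML, while the scale itself is recovered
# separately by guess_model_scale().
import re

for stem in ("yolov8x.yaml", "yolov8n-seg.yaml"):
    print(re.sub(r"(\d+)([nslmx])(.+)?$", r"\1\3", stem))  # yolov8.yaml, yolov8-seg.yaml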
@ -784,8 +854,9 @@ def guess_model_scale(model_path):
"""
with contextlib.suppress(AttributeError):
import re
return re.search(r'yolov\d+([nslmx])', Path(model_path).stem).group(1) # n, s, m, l, or x
return ''
return re.search(r"yolov\d+([nslmx])", Path(model_path).stem).group(1) # n, s, m, l, or x
return ""
def guess_model_task(model):
@ -804,17 +875,17 @@ def guess_model_task(model):
def cfg2task(cfg):
"""Guess from YAML dictionary."""
m = cfg['head'][-1][-2].lower() # output module name
if m in ('classify', 'classifier', 'cls', 'fc'):
return 'classify'
if m == 'detect':
return 'detect'
if m == 'segment':
return 'segment'
if m == 'pose':
return 'pose'
if m == 'obb':
return 'obb'
m = cfg["head"][-1][-2].lower() # output module name
if m in ("classify", "classifier", "cls", "fc"):
return "classify"
if m == "detect":
return "detect"
if m == "segment":
return "segment"
if m == "pose":
return "pose"
if m == "obb":
return "obb"
# Guess from model cfg
if isinstance(model, dict):
@ -823,40 +894,42 @@ def guess_model_task(model):
# Guess from PyTorch model
if isinstance(model, nn.Module): # PyTorch model
for x in 'model.args', 'model.model.args', 'model.model.model.args':
for x in "model.args", "model.model.args", "model.model.model.args":
with contextlib.suppress(Exception):
return eval(x)['task']
for x in 'model.yaml', 'model.model.yaml', 'model.model.model.yaml':
return eval(x)["task"]
for x in "model.yaml", "model.model.yaml", "model.model.model.yaml":
with contextlib.suppress(Exception):
return cfg2task(eval(x))
for m in model.modules():
if isinstance(m, Detect):
return 'detect'
return "detect"
elif isinstance(m, Segment):
return 'segment'
return "segment"
elif isinstance(m, Classify):
return 'classify'
return "classify"
elif isinstance(m, Pose):
return 'pose'
return "pose"
elif isinstance(m, OBB):
return 'obb'
return "obb"
# Guess from model filename
if isinstance(model, (str, Path)):
model = Path(model)
if '-seg' in model.stem or 'segment' in model.parts:
return 'segment'
elif '-cls' in model.stem or 'classify' in model.parts:
return 'classify'
elif '-pose' in model.stem or 'pose' in model.parts:
return 'pose'
elif '-obb' in model.stem or 'obb' in model.parts:
return 'obb'
elif 'detect' in model.parts:
return 'detect'
if "-seg" in model.stem or "segment" in model.parts:
return "segment"
elif "-cls" in model.stem or "classify" in model.parts:
return "classify"
elif "-pose" in model.stem or "pose" in model.parts:
return "pose"
elif "-obb" in model.stem or "obb" in model.parts:
return "obb"
elif "detect" in model.parts:
return "detect"
# Unable to determine task from model
LOGGER.warning("WARNING ⚠️ Unable to automatically guess model task, assuming 'task=detect'. "
"Explicitly define task for your model, i.e. 'task=detect', 'segment', 'classify','pose' or 'obb'.")
return 'detect' # assume detect
LOGGER.warning(
"WARNING ⚠️ Unable to automatically guess model task, assuming 'task=detect'. "
"Explicitly define task for your model, i.e. 'task=detect', 'segment', 'classify','pose' or 'obb'."
)
return "detect" # assume detect