ultralytics 8.3.52 AutoBatch CUDA computation improvements (#18291)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>

parent 00e239a5da
commit e3c46920e7

3 changed files with 52 additions and 18 deletions
docs/en/reference/utils/torch_utils.md

```diff
@@ -127,6 +127,10 @@ keywords: Ultralytics, torch utils, model optimization, device selection, infere
 
 <br><br><hr><br>
 
+## ::: ultralytics.utils.torch_utils.cuda_memory_usage
+
+<br><br><hr><br>
+
 ## ::: ultralytics.utils.torch_utils.profile
 
 <br><br>
```
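(The `## ::: <symbol>` lines are mkdocstrings identifiers used throughout the Ultralytics reference docs; each expands into the rendered API documentation for the named symbol, so this hunk simply registers the new `cuda_memory_usage` function on the `torch_utils` reference page.)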
ultralytics/__init__.py

```diff
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.3.51"
+__version__ = "8.3.52"
 
 import os
 
```
ultralytics/utils/torch_utils.py

```diff
@@ -617,6 +617,32 @@ def convert_optimizer_state_dict_to_fp16(state_dict):
     return state_dict
 
 
+@contextmanager
+def cuda_memory_usage(device=None):
+    """
+    Monitor and manage CUDA memory usage.
+
+    This function checks if CUDA is available and, if so, empties the CUDA cache to free up unused memory.
+    It then yields a dictionary containing memory usage information, which can be updated by the caller.
+    Finally, it updates the dictionary with the amount of memory reserved by CUDA on the specified device.
+
+    Args:
+        device (torch.device, optional): The CUDA device to query memory usage for. Defaults to None.
+
+    Yields:
+        (dict): A dictionary with a key 'memory' initialized to 0, which will be updated with the reserved memory.
+    """
+    cuda_info = dict(memory=0)
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        try:
+            yield cuda_info
+        finally:
+            cuda_info["memory"] = torch.cuda.memory_reserved(device)
+    else:
+        yield cuda_info
+
+
 def profile(input, ops, n=10, device=None, max_num_obj=0):
     """
     Ultralytics speed, memory and FLOPs profiler.
```
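For orientation, here is a minimal usage sketch of the new context manager outside of `profile` (the `Conv2d` op, input shape, and device selection below are illustrative, not part of the commit):

```python
import torch

from ultralytics.utils.torch_utils import cuda_memory_usage

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torch.nn.Conv2d(3, 64, kernel_size=3, padding=1).to(device)  # illustrative op
x = torch.randn(1, 3, 640, 640, device=device)

# cuda_info["memory"] is written in the context manager's `finally` clause,
# so the reserved-memory reading is only meaningful after the block exits.
with cuda_memory_usage(device) as cuda_info:
    y = model(x)

print(f"CUDA memory reserved: {cuda_info['memory'] / 1e9:.3f} GB")  # stays 0.000 on CPU
```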
```diff
@@ -653,27 +679,31 @@ def profile(input, ops, n=10, device=None, max_num_obj=0):
                 flops = 0
 
             try:
+                mem = 0
                 for _ in range(n):
-                    t[0] = time_sync()
-                    y = m(x)
-                    t[1] = time_sync()
-                    try:
-                        (sum(yi.sum() for yi in y) if isinstance(y, list) else y).sum().backward()
-                        t[2] = time_sync()
-                    except Exception:  # no backward method
-                        # print(e)  # for debug
-                        t[2] = float("nan")
+                    with cuda_memory_usage(device) as cuda_info:
+                        t[0] = time_sync()
+                        y = m(x)
+                        t[1] = time_sync()
+                        try:
+                            (sum(yi.sum() for yi in y) if isinstance(y, list) else y).sum().backward()
+                            t[2] = time_sync()
+                        except Exception:  # no backward method
+                            # print(e)  # for debug
+                            t[2] = float("nan")
+                    mem += cuda_info["memory"] / 1e9  # (GB)
                     tf += (t[1] - t[0]) * 1000 / n  # ms per op forward
                     tb += (t[2] - t[1]) * 1000 / n  # ms per op backward
                     if max_num_obj:  # simulate training with predictions per image grid (for AutoBatch)
-                        torch.randn(
-                            x.shape[0],
-                            max_num_obj,
-                            int(sum((x.shape[-1] / s) * (x.shape[-2] / s) for s in m.stride.tolist())),
-                            device=device,
-                            dtype=torch.float32,
-                        )
-                mem = torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0  # (GB)
+                        with cuda_memory_usage(device) as cuda_info:
+                            torch.randn(
+                                x.shape[0],
+                                max_num_obj,
+                                int(sum((x.shape[-1] / s) * (x.shape[-2] / s) for s in m.stride.tolist())),
+                                device=device,
+                                dtype=torch.float32,
+                            )
+                        mem += cuda_info["memory"] / 1e9  # (GB)
                 s_in, s_out = (tuple(x.shape) if isinstance(x, torch.Tensor) else "list" for x in (x, y))  # shapes
                 p = sum(x.numel() for x in m.parameters()) if isinstance(m, nn.Module) else 0  # parameters
                 LOGGER.info(f"{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}")
```
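And a sketch of calling the updated profiler itself; the op, tensor shape, and iteration count are arbitrary examples (with the default `max_num_obj=0`, the AutoBatch grid simulation and its `m.stride` lookup are skipped):

```python
import torch
from torch import nn

from ultralytics.utils.torch_utils import profile

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Forward/backward timing plus the per-iteration accumulated CUDA memory added
# by this commit; results are emitted through the LOGGER.info call shown above.
op = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1)
x = torch.randn(1, 3, 256, 256)
profile(x, op, n=3, device=device)
```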