From 5af8a5c0fb6812cf8e9ac26aa79d488eb6f1e5a9 Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Tue, 1 Oct 2024 11:53:11 +0200
Subject: [PATCH] `ultralytics 8.3.2` fix AMP checks with `imgsz=256` (#16583)

---
 tests/test_cuda.py            | 8 ++++++++
 ultralytics/__init__.py       | 2 +-
 ultralytics/utils/__init__.py | 1 +
 ultralytics/utils/checks.py   | 5 +++--
 4 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/test_cuda.py b/tests/test_cuda.py
index 0b3429d0..3b08edc6 100644
--- a/tests/test_cuda.py
+++ b/tests/test_cuda.py
@@ -10,6 +10,7 @@ from tests import CUDA_DEVICE_COUNT, CUDA_IS_AVAILABLE, MODEL, SOURCE
 from ultralytics import YOLO
 from ultralytics.cfg import TASK2DATA, TASK2MODEL, TASKS
 from ultralytics.utils import ASSETS, WEIGHTS_DIR
+from ultralytics.utils.checks import check_amp
 
 
 def test_checks():
@@ -18,6 +19,13 @@ def test_checks():
     assert torch.cuda.device_count() == CUDA_DEVICE_COUNT
 
 
+@pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason="CUDA is not available")
+def test_amp():
+    """Test that check_amp() passes for the YOLO11n model after it is moved to a CUDA device."""
+    model = YOLO("yolo11n.pt").model.cuda()
+    assert check_amp(model)
+
+
 @pytest.mark.slow
 @pytest.mark.skipif(True, reason="CUDA export tests disabled pending additional Ultralytics GPU server availability")
 @pytest.mark.skipif(not CUDA_IS_AVAILABLE, reason="CUDA is not available")
diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index daff29f8..f3d639ad 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.3.1"
+__version__ = "8.3.2"
 
 import os
 
diff --git a/ultralytics/utils/__init__.py b/ultralytics/utils/__init__.py
index 02610b88..e122d4b5 100644
--- a/ultralytics/utils/__init__.py
+++ b/ultralytics/utils/__init__.py
@@ -111,6 +111,7 @@ torch.set_printoptions(linewidth=320, precision=4, profile="default")
 np.set_printoptions(linewidth=320, formatter={"float_kind": "{:11.5g}".format})  # format short g, %precision=5
 cv2.setNumThreads(0)  # prevent OpenCV from multithreading (incompatible with PyTorch DataLoader)
 os.environ["NUMEXPR_MAX_THREADS"] = str(NUM_THREADS)  # NumExpr max threads
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # cuBLAS workspace config for deterministic training (avoids CUDA warning)
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # suppress verbose TF compiler warnings in Colab
 os.environ["TORCH_CPP_LOG_LEVEL"] = "ERROR"  # suppress "NNPACK.cpp could not initialize NNPACK" warnings
 os.environ["KINETO_LOG_LEVEL"] = "5"  # suppress verbose PyTorch profiler output when computing FLOPs
diff --git a/ultralytics/utils/checks.py b/ultralytics/utils/checks.py
index 383c8562..85eccf67 100644
--- a/ultralytics/utils/checks.py
+++ b/ultralytics/utils/checks.py
@@ -657,9 +657,10 @@ def check_amp(model):
     def amp_allclose(m, im):
         """All close FP32 vs AMP results."""
         batch = [im] * 8
-        a = m(batch, imgsz=128, device=device, verbose=False)[0].boxes.data  # FP32 inference
+        imgsz = max(256, int(model.stride.max() * 4))  # at least 256 px, or 4x the checked model's max stride (P5=32, P6=64)
+        a = m(batch, imgsz=imgsz, device=device, verbose=False)[0].boxes.data  # FP32 inference
         with autocast(enabled=True):
-            b = m(batch, imgsz=128, device=device, verbose=False)[0].boxes.data  # AMP inference
+            b = m(batch, imgsz=imgsz, device=device, verbose=False)[0].boxes.data  # AMP inference
         del m
         return a.shape == b.shape and torch.allclose(a, b.float(), atol=0.5)  # close to 0.5 absolute tolerance