From ee6fde0beb75e96c44fe6665c6296e81ea1f686a Mon Sep 17 00:00:00 2001
From: Burhan <62214284+Burhan-Q@users.noreply.github.com>
Date: Mon, 25 Nov 2024 05:33:11 -0500
Subject: [PATCH] `ultralytics 8.3.37` TensorRT auto-workspace size (#17748)

Co-authored-by: Glenn Jocher
---
 docs/en/integrations/tensorrt.md |  6 +++---
 docs/en/macros/export-args.md    | 30 +++++++++++++++---------------
 ultralytics/__init__.py          |  2 +-
 ultralytics/cfg/default.yaml     |  2 +-
 ultralytics/engine/exporter.py   |  8 ++++----
 5 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/docs/en/integrations/tensorrt.md b/docs/en/integrations/tensorrt.md
index 1a8e5a91..ec1cfc3c 100644
--- a/docs/en/integrations/tensorrt.md
+++ b/docs/en/integrations/tensorrt.md
@@ -127,11 +127,11 @@ The arguments provided when using [export](../modes/export.md) for an Ultralytic
     - Adjust the `workspace` value according to your calibration needs and resource availability. While a larger `workspace` may increase calibration time, it allows TensorRT to explore a wider range of optimization tactics, potentially enhancing model performance and [accuracy](https://www.ultralytics.com/glossary/accuracy). Conversely, a smaller `workspace` can reduce calibration time but may limit the optimization strategies, affecting the quality of the quantized model.
 
-    - Default is `workspace=4` (GiB), this value may need to be increased if calibration crashes (exits without warning).
+    - Default is `workspace=None`, which lets TensorRT allocate memory automatically; when configuring manually, this value may need to be increased if calibration crashes (exits without warning).
 
-    - TensorRT will report `UNSUPPORTED_STATE` during export if the value for `workspace` is larger than the memory available to the device, which means the value for `workspace` should be lowered.
+    - TensorRT will report `UNSUPPORTED_STATE` during export if the value for `workspace` is larger than the memory available to the device, which means the value for `workspace` should be lowered or set to `None`.
 
-    - If `workspace` is set to max value and calibration fails/crashes, consider reducing the values for `imgsz` and `batch` to reduce memory requirements.
+    - If `workspace` is set to the maximum value and calibration fails/crashes, consider using `None` for auto-allocation or reducing the values for `imgsz` and `batch` to lower memory requirements.
 
     - Remember calibration for INT8 is specific to each device, borrowing a "high-end" GPU for calibration, might result in poor performance when inference is run on another device.
 
diff --git a/docs/en/macros/export-args.md b/docs/en/macros/export-args.md
index 242090d7..803ce149 100644
--- a/docs/en/macros/export-args.md
+++ b/docs/en/macros/export-args.md
@@ -1,15 +1,15 @@
-| Argument    | Type             | Default         | Description                                                                                                                                                                                    |
-| ----------- | ---------------- | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `format`    | `str`            | `'torchscript'` | Target format for the exported model, such as `'onnx'`, `'torchscript'`, `'tensorflow'`, or others, defining compatibility with various deployment environments.                              |
-| `imgsz`     | `int` or `tuple` | `640`           | Desired image size for the model input. Can be an integer for square images or a tuple `(height, width)` for specific dimensions.                                                             |
-| `keras`     | `bool`           | `False`         | Enables export to Keras format for [TensorFlow](https://www.ultralytics.com/glossary/tensorflow) SavedModel, providing compatibility with TensorFlow serving and APIs.                        |
-| `optimize`  | `bool`           | `False`         | Applies optimization for mobile devices when exporting to TorchScript, potentially reducing model size and improving performance.                                                             |
-| `half`      | `bool`           | `False`         | Enables FP16 (half-precision) quantization, reducing model size and potentially speeding up inference on supported hardware.                                                                  |
-| `int8`      | `bool`           | `False`         | Activates INT8 quantization, further compressing the model and speeding up inference with minimal [accuracy](https://www.ultralytics.com/glossary/accuracy) loss, primarily for edge devices. |
-| `dynamic`   | `bool`           | `False`         | Allows dynamic input sizes for ONNX, TensorRT and OpenVINO exports, enhancing flexibility in handling varying image dimensions.                                                               |
-| `simplify`  | `bool`           | `True`          | Simplifies the model graph for ONNX exports with `onnxslim`, potentially improving performance and compatibility.                                                                             |
-| `opset`     | `int`            | `None`          | Specifies the ONNX opset version for compatibility with different ONNX parsers and runtimes. If not set, uses the latest supported version.                                                   |
-| `workspace` | `float`          | `4.0`           | Sets the maximum workspace size in GiB for TensorRT optimizations, balancing memory usage and performance.                                                                                    |
-| `nms`       | `bool`           | `False`         | Adds Non-Maximum Suppression (NMS) to the CoreML export, essential for accurate and efficient detection post-processing.                                                                      |
-| `batch`     | `int`            | `1`             | Specifies export model batch inference size or the max number of images the exported model will process concurrently in `predict` mode.                                                       |
-| `device`    | `str`            | `None`          | Specifies the device for exporting: GPU (`device=0`), CPU (`device=cpu`), MPS for Apple silicon (`device=mps`) or DLA for NVIDIA Jetson (`device=dla:0` or `device=dla:1`).                   |
+| Argument    | Type              | Default         | Description                                                                                                                                                                                    |
+| ----------- | ----------------- | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `format`    | `str`             | `'torchscript'` | Target format for the exported model, such as `'onnx'`, `'torchscript'`, `'tensorflow'`, or others, defining compatibility with various deployment environments.                              |
+| `imgsz`     | `int` or `tuple`  | `640`           | Desired image size for the model input. Can be an integer for square images or a tuple `(height, width)` for specific dimensions.                                                             |
+| `keras`     | `bool`            | `False`         | Enables export to Keras format for [TensorFlow](https://www.ultralytics.com/glossary/tensorflow) SavedModel, providing compatibility with TensorFlow serving and APIs.                        |
+| `optimize`  | `bool`            | `False`         | Applies optimization for mobile devices when exporting to TorchScript, potentially reducing model size and improving performance.                                                             |
+| `half`      | `bool`            | `False`         | Enables FP16 (half-precision) quantization, reducing model size and potentially speeding up inference on supported hardware.                                                                  |
+| `int8`      | `bool`            | `False`         | Activates INT8 quantization, further compressing the model and speeding up inference with minimal [accuracy](https://www.ultralytics.com/glossary/accuracy) loss, primarily for edge devices. |
+| `dynamic`   | `bool`            | `False`         | Allows dynamic input sizes for ONNX, TensorRT and OpenVINO exports, enhancing flexibility in handling varying image dimensions.                                                               |
+| `simplify`  | `bool`            | `True`          | Simplifies the model graph for ONNX exports with `onnxslim`, potentially improving performance and compatibility.                                                                             |
+| `opset`     | `int`             | `None`          | Specifies the ONNX opset version for compatibility with different ONNX parsers and runtimes. If not set, uses the latest supported version.                                                   |
+| `workspace` | `float` or `None` | `None`          | Sets the maximum workspace size in GiB for TensorRT optimizations, balancing memory usage and performance; use `None` for auto-allocation by TensorRT up to the device maximum.               |
+| `nms`       | `bool`            | `False`         | Adds Non-Maximum Suppression (NMS) to the CoreML export, essential for accurate and efficient detection post-processing.                                                                      |
+| `batch`     | `int`             | `1`             | Specifies export model batch inference size or the max number of images the exported model will process concurrently in `predict` mode.                                                       |
+| `device`    | `str`             | `None`          | Specifies the device for exporting: GPU (`device=0`), CPU (`device=cpu`), MPS for Apple silicon (`device=mps`) or DLA for NVIDIA Jetson (`device=dla:0` or `device=dla:1`).                   |
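To see how the new `workspace` default behaves in practice, the following is a minimal sketch using the Ultralytics Python export API (it assumes `ultralytics` and TensorRT are installed on a CUDA device; `yolo11n.pt` and `coco8.yaml` are placeholder weights and calibration dataset, not values this patch prescribes):

```python
from ultralytics import YOLO

# Placeholder weights; any Ultralytics YOLO checkpoint works here
model = YOLO("yolo11n.pt")

# New default: workspace=None lets TensorRT auto-allocate builder memory
model.export(format="engine")

# Manual cap: limit the TensorRT builder workspace to 2 GiB
model.export(format="engine", workspace=2)

# INT8 calibration with auto-allocated workspace; `data` points to a
# calibration dataset YAML (placeholder shown)
model.export(format="engine", int8=True, data="coco8.yaml")
```

If an explicit `workspace` value triggers `UNSUPPORTED_STATE` or a silent calibration crash, falling back to the `None` default is the first remedy suggested by the updated docs above.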
diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index 35c06301..e17c6c7d 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.3.36"
+__version__ = "8.3.37"
 
 import os
diff --git a/ultralytics/cfg/default.yaml b/ultralytics/cfg/default.yaml
index 2ef1f428..9f6eb51a 100644
--- a/ultralytics/cfg/default.yaml
+++ b/ultralytics/cfg/default.yaml
@@ -83,7 +83,7 @@ int8: False # (bool) CoreML/TF INT8 quantization
 dynamic: False # (bool) ONNX/TF/TensorRT: dynamic axes
 simplify: True # (bool) ONNX: simplify model using `onnxslim`
 opset: # (int, optional) ONNX: opset version
-workspace: 4 # (int) TensorRT: workspace size (GB)
+workspace: None # (float, optional) TensorRT: workspace size (GiB), `None` will let TensorRT auto-allocate memory
 nms: False # (bool) CoreML: add NMS
 
 # Hyperparameters ------------------------------------------------------------------------------------------------------
diff --git a/ultralytics/engine/exporter.py b/ultralytics/engine/exporter.py
index 77996f0f..f24e7929 100644
--- a/ultralytics/engine/exporter.py
+++ b/ultralytics/engine/exporter.py
@@ -781,10 +781,10 @@ class Exporter:
         # Engine builder
         builder = trt.Builder(logger)
         config = builder.create_builder_config()
-        workspace = int(self.args.workspace * (1 << 30))
-        if is_trt10:
+        workspace = int(self.args.workspace * (1 << 30)) if self.args.workspace is not None else 0
+        if is_trt10 and workspace > 0:
             config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
-        else:  # TensorRT versions 7, 8
+        elif workspace > 0:  # TensorRT versions 7, 8
             config.max_workspace_size = workspace
         flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
         network = builder.create_network(flag)
@@ -823,7 +823,7 @@ class Exporter:
                 LOGGER.warning(f"{prefix} WARNING ⚠️ 'dynamic=True' model requires max batch size, i.e. 'batch=16'")
             profile = builder.create_optimization_profile()
             min_shape = (1, shape[1], 32, 32)  # minimum input shape
-            max_shape = (*shape[:2], *(int(max(1, self.args.workspace) * d) for d in shape[2:]))  # max input shape
+            max_shape = (*shape[:2], *(int(max(1, self.args.workspace or 1) * d) for d in shape[2:]))  # max input shape
             for inp in inputs:
                 profile.set_shape(inp.name, min=min_shape, opt=shape, max=max_shape)
             config.add_optimization_profile(profile)
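Condensed from the `exporter.py` hunks above, the new workspace handling reads as the following standalone sketch (the helper name `apply_trt_workspace` and its signature are illustrative only, not part of the Ultralytics API):

```python
import tensorrt as trt  # requires the NVIDIA TensorRT Python package


def apply_trt_workspace(config, workspace_gib, is_trt10):
    """Apply a user workspace limit in GiB to a TensorRT builder config.

    workspace_gib=None means "auto-allocate": no limit is set, and TensorRT
    may use up to the device's available memory.
    """
    # Convert GiB to bytes; None maps to 0, i.e. "no explicit limit"
    workspace = int(workspace_gib * (1 << 30)) if workspace_gib is not None else 0
    if is_trt10 and workspace > 0:
        # TensorRT 10+: memory pool limits replace max_workspace_size
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
    elif workspace > 0:
        # TensorRT 7/8: legacy attribute
        config.max_workspace_size = workspace
    return workspace
```

Note that in the dynamic-shape hunk the `max_shape` multiplier uses the GiB value (`self.args.workspace or 1`), not the byte count `workspace`, so the optimization profile's maximum input dimensions stay within a sane range.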