From ee6fde0beb75e96c44fe6665c6296e81ea1f686a Mon Sep 17 00:00:00 2001
From: Burhan <62214284+Burhan-Q@users.noreply.github.com>
Date: Mon, 25 Nov 2024 05:33:11 -0500
Subject: [PATCH] `ultralytics 8.3.37` TensorRT auto-workspace size (#17748)

Co-authored-by: Glenn Jocher
---
 docs/en/integrations/tensorrt.md |  6 +++---
 docs/en/macros/export-args.md    | 30 +++++++++++++++---------------
 ultralytics/__init__.py          |  2 +-
 ultralytics/cfg/default.yaml     |  2 +-
 ultralytics/engine/exporter.py   |  8 ++++----
 5 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/docs/en/integrations/tensorrt.md b/docs/en/integrations/tensorrt.md
index 1a8e5a91..ec1cfc3c 100644
--- a/docs/en/integrations/tensorrt.md
+++ b/docs/en/integrations/tensorrt.md
@@ -127,11 +127,11 @@ The arguments provided when using [export](../modes/export.md) for an Ultralytic
     - Adjust the `workspace` value according to your calibration needs and resource availability. While a larger `workspace` may increase calibration time, it allows TensorRT to explore a wider range of optimization tactics, potentially enhancing model performance and [accuracy](https://www.ultralytics.com/glossary/accuracy). Conversely, a smaller `workspace` can reduce calibration time but may limit the optimization strategies, affecting the quality of the quantized model.
 
-    - Default is `workspace=4` (GiB), this value may need to be increased if calibration crashes (exits without warning).
+    - Default is `workspace=None`, which lets TensorRT allocate memory automatically; when configuring manually, this value may need to be increased if calibration crashes (exits without warning).
 
-    - TensorRT will report `UNSUPPORTED_STATE` during export if the value for `workspace` is larger than the memory available to the device, which means the value for `workspace` should be lowered.
+    - TensorRT will report `UNSUPPORTED_STATE` during export if the value for `workspace` is larger than the memory available to the device, which means the value for `workspace` should be lowered or set to `None`.
 
-    - If `workspace` is set to max value and calibration fails/crashes, consider reducing the values for `imgsz` and `batch` to reduce memory requirements.
+    - If `workspace` is set to the maximum value and calibration fails/crashes, consider using `None` for auto-allocation or reducing the values for `imgsz` and `batch` to lower memory requirements.
 
     - Remember calibration for INT8 is specific to each device, borrowing a "high-end" GPU for calibration, might result in poor performance when inference is run on another device.
 
diff --git a/docs/en/macros/export-args.md b/docs/en/macros/export-args.md
index 242090d7..803ce149 100644
--- a/docs/en/macros/export-args.md
+++ b/docs/en/macros/export-args.md
@@ -1,15 +1,15 @@
-| Argument    | Type             | Default         | Description                                                                                                                                                                                    |
-| ----------- | ---------------- | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `format`    | `str`            | `'torchscript'` | Target format for the exported model, such as `'onnx'`, `'torchscript'`, `'tensorflow'`, or others, defining compatibility with various deployment environments.                              |
-| `imgsz`     | `int` or `tuple` | `640`           | Desired image size for the model input. Can be an integer for square images or a tuple `(height, width)` for specific dimensions.                                                             |
-| `keras`     | `bool`           | `False`         | Enables export to Keras format for [TensorFlow](https://www.ultralytics.com/glossary/tensorflow) SavedModel, providing compatibility with TensorFlow serving and APIs.                        |
-| `optimize`  | `bool`           | `False`         | Applies optimization for mobile devices when exporting to TorchScript, potentially reducing model size and improving performance.                                                             |
-| `half`      | `bool`           | `False`         | Enables FP16 (half-precision) quantization, reducing model size and potentially speeding up inference on supported hardware.                                                                  |
-| `int8`      | `bool`           | `False`         | Activates INT8 quantization, further compressing the model and speeding up inference with minimal [accuracy](https://www.ultralytics.com/glossary/accuracy) loss, primarily for edge devices. |
-| `dynamic`   | `bool`           | `False`         | Allows dynamic input sizes for ONNX, TensorRT and OpenVINO exports, enhancing flexibility in handling varying image dimensions.                                                               |
-| `simplify`  | `bool`           | `True`          | Simplifies the model graph for ONNX exports with `onnxslim`, potentially improving performance and compatibility.                                                                             |
-| `opset`     | `int`            | `None`          | Specifies the ONNX opset version for compatibility with different ONNX parsers and runtimes. If not set, uses the latest supported version.                                                   |
-| `workspace` | `float`          | `4.0`           | Sets the maximum workspace size in GiB for TensorRT optimizations, balancing memory usage and performance.                                                                                    |
-| `nms`       | `bool`           | `False`         | Adds Non-Maximum Suppression (NMS) to the CoreML export, essential for accurate and efficient detection post-processing.                                                                      |
-| `batch`     | `int`            | `1`             | Specifies export model batch inference size or the max number of images the exported model will process concurrently in `predict` mode.                                                       |
-| `device`    | `str`            | `None`          | Specifies the device for exporting: GPU (`device=0`), CPU (`device=cpu`), MPS for Apple silicon (`device=mps`) or DLA for NVIDIA Jetson (`device=dla:0` or `device=dla:1`).                   |
+| Argument    | Type              | Default         | Description                                                                                                                                                                                    |
+| ----------- | ----------------- | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `format`    | `str`             | `'torchscript'` | Target format for the exported model, such as `'onnx'`, `'torchscript'`, `'tensorflow'`, or others, defining compatibility with various deployment environments.                              |
+| `imgsz`     | `int` or `tuple`  | `640`           | Desired image size for the model input. Can be an integer for square images or a tuple `(height, width)` for specific dimensions.                                                             |
+| `keras`     | `bool`            | `False`         | Enables export to Keras format for [TensorFlow](https://www.ultralytics.com/glossary/tensorflow) SavedModel, providing compatibility with TensorFlow serving and APIs.                        |
+| `optimize`  | `bool`            | `False`         | Applies optimization for mobile devices when exporting to TorchScript, potentially reducing model size and improving performance.                                                             |
+| `half`      | `bool`            | `False`         | Enables FP16 (half-precision) quantization, reducing model size and potentially speeding up inference on supported hardware.                                                                  |
+| `int8`      | `bool`            | `False`         | Activates INT8 quantization, further compressing the model and speeding up inference with minimal [accuracy](https://www.ultralytics.com/glossary/accuracy) loss, primarily for edge devices. |
+| `dynamic`   | `bool`            | `False`         | Allows dynamic input sizes for ONNX, TensorRT and OpenVINO exports, enhancing flexibility in handling varying image dimensions.                                                               |
+| `simplify`  | `bool`            | `True`          | Simplifies the model graph for ONNX exports with `onnxslim`, potentially improving performance and compatibility.                                                                             |
+| `opset`     | `int`             | `None`          | Specifies the ONNX opset version for compatibility with different ONNX parsers and runtimes. If not set, uses the latest supported version.                                                   |
+| `workspace` | `float` or `None` | `None`          | Sets the maximum workspace size in GiB for TensorRT optimizations, balancing memory usage and performance; use `None` for auto-allocation by TensorRT up to the device maximum.               |
+| `nms`       | `bool`            | `False`         | Adds Non-Maximum Suppression (NMS) to the CoreML export, essential for accurate and efficient detection post-processing.                                                                      |
+| `batch`     | `int`             | `1`             | Specifies export model batch inference size or the max number of images the exported model will process concurrently in `predict` mode.                                                       |
+| `device`    | `str`             | `None`          | Specifies the device for exporting: GPU (`device=0`), CPU (`device=cpu`), MPS for Apple silicon (`device=mps`) or DLA for NVIDIA Jetson (`device=dla:0` or `device=dla:1`).                   |
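To see how the new `workspace` default behaves in practice, the following is a minimal sketch using the Ultralytics Python export API (it assumes `ultralytics` and TensorRT are installed on a CUDA device; `yolo11n.pt` and `coco8.yaml` are placeholder weights and calibration dataset, not values this patch prescribes):

```python
from ultralytics import YOLO

# Placeholder weights; any Ultralytics YOLO checkpoint works here
model = YOLO("yolo11n.pt")

# New default: workspace=None lets TensorRT auto-allocate builder memory
model.export(format="engine")

# Manual cap: limit the TensorRT builder workspace to 2 GiB
model.export(format="engine", workspace=2)

# INT8 calibration with auto-allocated workspace; `data` points to a
# calibration dataset YAML (placeholder shown)
model.export(format="engine", int8=True, data="coco8.yaml")
```

If an explicit `workspace` value triggers `UNSUPPORTED_STATE` or a silent calibration crash, falling back to the `None` default is the first remedy suggested by the updated docs above.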
diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index 35c06301..e17c6c7d 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.3.36"
+__version__ = "8.3.37"
 
 import os
diff --git a/ultralytics/cfg/default.yaml b/ultralytics/cfg/default.yaml
index 2ef1f428..9f6eb51a 100644
--- a/ultralytics/cfg/default.yaml
+++ b/ultralytics/cfg/default.yaml
@@ -83,7 +83,7 @@ int8: False # (bool) CoreML/TF INT8 quantization
 dynamic: False # (bool) ONNX/TF/TensorRT: dynamic axes
 simplify: True # (bool) ONNX: simplify model using `onnxslim`
 opset: # (int, optional) ONNX: opset version
-workspace: 4 # (int) TensorRT: workspace size (GB)
+workspace: None # (float, optional) TensorRT: workspace size (GiB), `None` will let TensorRT auto-allocate memory
 nms: False # (bool) CoreML: add NMS
 
 # Hyperparameters ------------------------------------------------------------------------------------------------------
diff --git a/ultralytics/engine/exporter.py b/ultralytics/engine/exporter.py
index 77996f0f..f24e7929 100644
--- a/ultralytics/engine/exporter.py
+++ b/ultralytics/engine/exporter.py
@@ -781,10 +781,10 @@ class Exporter:
         # Engine builder
         builder = trt.Builder(logger)
         config = builder.create_builder_config()
-        workspace = int(self.args.workspace * (1 << 30))
-        if is_trt10:
+        workspace = int(self.args.workspace * (1 << 30)) if self.args.workspace is not None else 0
+        if is_trt10 and workspace > 0:
             config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
-        else:  # TensorRT versions 7, 8
+        elif workspace > 0:  # TensorRT versions 7, 8
             config.max_workspace_size = workspace
         flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
         network = builder.create_network(flag)
@@ -823,7 +823,7 @@ class Exporter:
                 LOGGER.warning(f"{prefix} WARNING ⚠️ 'dynamic=True' model requires max batch size, i.e. 'batch=16'")
             profile = builder.create_optimization_profile()
             min_shape = (1, shape[1], 32, 32)  # minimum input shape
-            max_shape = (*shape[:2], *(int(max(1, self.args.workspace) * d) for d in shape[2:]))  # max input shape
+            max_shape = (*shape[:2], *(int(max(1, self.args.workspace or 1) * d) for d in shape[2:]))  # max input shape
             for inp in inputs:
                 profile.set_shape(inp.name, min=min_shape, opt=shape, max=max_shape)
             config.add_optimization_profile(profile)
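Condensed from the `exporter.py` hunks above, the new workspace handling reads as the following standalone sketch (the helper name `apply_trt_workspace` and its signature are illustrative only, not part of the Ultralytics API):

```python
import tensorrt as trt  # requires the NVIDIA TensorRT Python package


def apply_trt_workspace(config, workspace_gib, is_trt10):
    """Apply a user workspace limit in GiB to a TensorRT builder config.

    workspace_gib=None means "auto-allocate": no limit is set, and TensorRT
    may use up to the device's available memory.
    """
    # Convert GiB to bytes; None maps to 0, i.e. "no explicit limit"
    workspace = int(workspace_gib * (1 << 30)) if workspace_gib is not None else 0
    if is_trt10 and workspace > 0:
        # TensorRT 10+: memory pool limits replace max_workspace_size
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
    elif workspace > 0:
        # TensorRT 7/8: legacy attribute
        config.max_workspace_size = workspace
    return workspace
```

Note that in the dynamic-shape hunk the `max_shape` multiplier uses the GiB value (`self.args.workspace or 1`), not the byte count `workspace`, so the optimization profile's maximum input dimensions stay within a sane range.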