Update Triton Inference Server guide (#17059)
Co-authored-by: UltralyticsAssistant <web@ultralytics.com> Co-authored-by: Ultralytics Assistant <135830346+UltralyticsAssistant@users.noreply.github.com> Co-authored-by: Burhan <62214284+Burhan-Q@users.noreply.github.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
parent
235f2d95af
commit
c4dae56e1a
1 changed file with 24 additions and 2 deletions
|
|
@ -80,6 +80,28 @@ The Triton Model Repository is a storage location where Triton can access and lo
|
|||
|
||||
# Create config file
|
||||
(triton_model_path / "config.pbtxt").touch()
|
||||
|
||||
# (Optional) Enable TensorRT for GPU inference
|
||||
# First run will be slow due to TensorRT engine conversion
|
||||
import json
|
||||
|
||||
data = {
|
||||
"optimization": {
|
||||
"execution_accelerators": {
|
||||
"gpu_execution_accelerator": [
|
||||
{
|
||||
"name": "tensorrt",
|
||||
"parameters": {"key": "precision_mode", "value": "FP16"},
|
||||
"parameters": {"key": "max_workspace_size_bytes", "value": "3221225472"},
|
||||
"parameters": {"key": "trt_engine_cache_enable", "value": "1"},
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
with open(triton_model_path / "config.pbtxt", "w") as f:
|
||||
json.dump(data, f, indent=4)
|
||||
```
|
||||
|
||||
## Running Triton Inference Server
|
||||
|
|
@ -94,7 +116,7 @@ import time
|
|||
from tritonclient.http import InferenceServerClient
|
||||
|
||||
# Define image https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver
|
||||
tag = "nvcr.io/nvidia/tritonserver:23.09-py3" # 6.4 GB
|
||||
tag = "nvcr.io/nvidia/tritonserver:24.09-py3" # 8.57 GB
|
||||
|
||||
# Pull the image
|
||||
subprocess.call(f"docker pull {tag}", shell=True)
|
||||
|
|
@ -187,7 +209,7 @@ Setting up [Ultralytics YOLO11](https://docs.ultralytics.com/models/yolo11/) wit
|
|||
from tritonclient.http import InferenceServerClient
|
||||
|
||||
# Define image https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver
|
||||
tag = "nvcr.io/nvidia/tritonserver:23.09-py3"
|
||||
tag = "nvcr.io/nvidia/tritonserver:24.09-py3"
|
||||
|
||||
subprocess.call(f"docker pull {tag}", shell=True)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue