ultralytics 8.2.71 Multinode DDP training (#14879)

Co-authored-by: Haris Rehman <haris.rehman.cowlar@gmail.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com> Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
2024-08-01 20:31:03 +05:00 · 2024-08-01 20:31:03 +05:00 · 9c5d1a2451
commit 9c5d1a2451
parent 16fc325308
4 changed files with 9 additions and 4 deletions
--- a/ultralytics/utils/torch_utils.py
+++ b/ultralytics/utils/torch_utils.py
@@ -48,11 +48,12 @@ TORCHVISION_0_18 = check_version(TORCHVISION_VERSION, "0.18.0")
 def torch_distributed_zero_first(local_rank: int):
    """Ensures all processes in distributed training wait for the local master (rank 0) to complete a task first."""
    initialized = dist.is_available() and dist.is_initialized()
+
    if initialized and local_rank not in {-1, 0}:
        dist.barrier(device_ids=[local_rank])
    yield
    if initialized and local_rank == 0:
-        dist.barrier(device_ids=[0])
+        dist.barrier(device_ids=[local_rank])


 def smart_inference_mode():