CLI DDP fixes (#135)

This commit is contained in:
Glenn Jocher 2023-01-02 19:55:04 +01:00 committed by GitHub
parent 8f3cd52844
commit c5c86a3acd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 4 deletions

View file

@@ -29,10 +29,11 @@ WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """Context manager making all DDP processes wait for the local master.

    Non-master ranks block on a barrier before the `with` body runs, so the
    local master (rank 0 / -1) performs the guarded work (e.g. dataset
    download or cache creation) exactly once; the master then releases the
    other ranks via a second barrier after the body completes.

    Args:
        local_rank: this process's local rank; -1 or 0 denotes the master.
    """
    # Checking is_initialized() up front prevents
    # 'Default process group has not been initialized' errors when running
    # without DDP (e.g. single-GPU or CPU training via the CLI).
    initialized = torch.distributed.is_initialized()
    if initialized and local_rank not in {-1, 0}:
        # Non-master ranks wait here until the master reaches the post-yield barrier.
        dist.barrier(device_ids=[local_rank])
    yield
    if initialized and local_rank == 0:
        # Master signals completion, releasing the waiting non-master ranks.
        dist.barrier(device_ids=[0])