CLI DDP fixes (#135)
This commit is contained in:
parent 8f3cd52844
commit c5c86a3acd
2 changed files with 5 additions and 4 deletions
@@ -29,10 +29,11 @@ WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
 @contextmanager
 def torch_distributed_zero_first(local_rank: int):
     # Decorator to make all processes in distributed training wait for each local_master to do something
-    if local_rank not in {-1, 0}:
+    initialized = torch.distributed.is_initialized()  # prevent 'Default process group has not been initialized' errors
+    if initialized and local_rank not in {-1, 0}:
         dist.barrier(device_ids=[local_rank])
     yield
-    if local_rank == 0:
+    if initialized and local_rank == 0:
         dist.barrier(device_ids=[0])
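For context, a minimal runnable sketch of how the patched context manager behaves. The new torch.distributed.is_initialized() guard makes both barriers no-ops when the script runs outside DDP (e.g. plain single-GPU CLI use), instead of raising the process-group error quoted in the diff. The LOCAL_RANK lookup and the load_or_download_dataset() helper below are illustrative assumptions, not part of this commit.

import os
from contextlib import contextmanager

import torch
import torch.distributed as dist

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # set by torchrun; -1 means no DDP


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    # Decorator to make all processes in distributed training wait for each local_master to do something
    initialized = torch.distributed.is_initialized()  # prevent 'Default process group has not been initialized' errors
    if initialized and local_rank not in {-1, 0}:
        dist.barrier(device_ids=[local_rank])
    yield
    if initialized and local_rank == 0:
        dist.barrier(device_ids=[0])


def load_or_download_dataset():
    # Hypothetical stand-in for work that should run once on rank 0 and then
    # be picked up from a local cache by the remaining ranks.
    print(f'rank {LOCAL_RANK}: dataset ready')


if __name__ == '__main__':
    # Under DDP, non-zero ranks block at the first barrier; rank 0 runs the
    # body and releases them at the exit barrier, so its cached result is in
    # place before the other ranks execute the same body. Without an
    # initialized process group, both barriers are skipped and the body
    # simply runs.
    with torch_distributed_zero_first(LOCAL_RANK):
        load_or_download_dataset()

Launched under torchrun the body is serialized so rank 0 goes first; run directly as a plain script, is_initialized() returns False and the context manager is inert rather than crashing, which appears to be the CLI failure mode the commit title refers to.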