Deprecations fix TORCH_NCCL_BLOCKING_WAIT (#9448)

This commit is contained in:
Glenn Jocher 2024-03-31 18:36:50 +02:00 committed by GitHub
parent aa756069d4
commit 7df821e6ea
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -210,7 +210,7 @@ class BaseTrainer:
         torch.cuda.set_device(RANK)
         self.device = torch.device("cuda", RANK)
         # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
-        os.environ["NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
+        os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
         dist.init_process_group(
             "nccl" if dist.is_nccl_available() else "gloo",
             timeout=timedelta(seconds=10800), # 3 hours