Deprecations fix TORCH_NCCL_BLOCKING_WAIT (#9448)
This commit is contained in:
parent
aa756069d4
commit
7df821e6ea
1 changed file with 1 addition and 1 deletion
|
|
@@ -210,7 +210,7 @@ class BaseTrainer:
|
||||||
torch.cuda.set_device(RANK)
|
torch.cuda.set_device(RANK)
|
||||||
self.device = torch.device("cuda", RANK)
|
self.device = torch.device("cuda", RANK)
|
||||||
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
|
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
|
||||||
os.environ["NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
|
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
|
||||||
dist.init_process_group(
|
dist.init_process_group(
|
||||||
"nccl" if dist.is_nccl_available() else "gloo",
|
"nccl" if dist.is_nccl_available() else "gloo",
|
||||||
timeout=timedelta(seconds=10800), # 3 hours
|
timeout=timedelta(seconds=10800), # 3 hours
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue