ultralytics 8.1.41 DDP resume untrained-checkpoint fix (#9453)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
This commit is contained in:
Glenn Jocher 2024-04-01 19:46:04 +02:00 committed by GitHub
parent 2cee8893d9
commit 959acf67db
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 12 additions and 15 deletions

View file

@@ -212,7 +212,7 @@ class BaseTrainer:
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
dist.init_process_group(
"nccl" if dist.is_nccl_available() else "gloo",
backend="nccl" if dist.is_nccl_available() else "gloo",
timeout=timedelta(seconds=10800), # 3 hours
rank=RANK,
world_size=world_size,
@@ -648,8 +648,8 @@ class BaseTrainer:
resume = True
self.args = get_cfg(ckpt_args)
self.args.model = str(last) # reinstate model
for k in "imgsz", "batch": # allow arg updates to reduce memory on resume if crashed due to CUDA OOM
self.args.model = self.args.resume = str(last) # reinstate model
for k in "imgsz", "batch", "device": # allow arg updates to reduce memory or update device on resume
if k in overrides:
setattr(self.args, k, overrides[k])
@@ -662,7 +662,7 @@ class BaseTrainer:
def resume_training(self, ckpt):
"""Resume YOLO training from given epoch and best fitness."""
if ckpt is None:
if ckpt is None or not self.resume:
return
best_fitness = 0.0
start_epoch = ckpt.get("epoch", -1) + 1
@@ -672,14 +672,11 @@ class BaseTrainer:
if self.ema and ckpt.get("ema"):
self.ema.ema.load_state_dict(ckpt["ema"].float().state_dict()) # EMA
self.ema.updates = ckpt["updates"]
if self.resume:
assert start_epoch > 0, (
f"{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n"
f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
)
LOGGER.info(
f"Resuming training from {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs"
)
assert start_epoch > 0, (
f"{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n"
f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
)
LOGGER.info(f"Resuming training {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs")
if self.epochs < start_epoch:
LOGGER.info(
f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs."