Refine the error message when auto_scale_lr is not set correctly (#1181)
alexander-soare committed Jun 6, 2023
1 parent a2e410b commit 8593691
Showing 1 changed file with 12 additions and 7 deletions.
19 changes: 12 additions & 7 deletions mmengine/runner/runner.py
@@ -1975,16 +1975,21 @@ def resume(self,
             if (previous_gpu_ids is not None and len(previous_gpu_ids) > 0
                     and len(previous_gpu_ids) != self._world_size):
                 # TODO, should we modify the iteration?
-                self.logger.info(
-                    'Number of GPU used for current experiment is not '
-                    'consistent with resuming from checkpoint')
                 if (self.auto_scale_lr is None
                         or not self.auto_scale_lr.get('enable', False)):
                     raise RuntimeError(
-                        'Cannot automatically rescale lr in resuming. Please '
-                        'make sure the number of GPU is consistent with the '
-                        'previous training state resuming from the checkpoint '
-                        'or set `enable` in `auto_scale_lr to False.')
+                        'Number of GPUs used for current experiment is not '
+                        'consistent with the checkpoint being resumed from. '
+                        'This will result in poor performance due to the '
+                        'learning rate. You must set the '
+                        '`auto_scale_lr` parameter for Runner and make '
+                        '`auto_scale_lr["enable"]=True`.')
+                else:
+                    self.logger.info(
+                        'Number of GPUs used for current experiment is not '
+                        'consistent with resuming from checkpoint, but the '
+                        'learning rate will be adjusted according to the '
+                        f'setting in auto_scale_lr={self.auto_scale_lr}')
 
         # resume random seed
         resumed_seed = checkpoint['meta'].get('seed', None)

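The new RuntimeError points users at the `auto_scale_lr` option on the Runner. Below is a minimal sketch of a config fragment that opts in, assuming the usual mmengine convention that `auto_scale_lr` is a dict with `enable` and `base_batch_size` keys; the batch size value and checkpoint path are placeholders and are not part of this commit.

```python
# Illustrative config fragment only -- not from this commit.
# When enabled, mmengine scales the configured learning rate by roughly
# (actual total batch size / base_batch_size), so resuming on a different
# number of GPUs takes the `else` branch above instead of raising.
auto_scale_lr = dict(
    enable=True,         # without this, resume() now raises the RuntimeError above
    base_batch_size=16,  # placeholder: effective batch size the base LR was tuned for
)

# Resume an earlier run; the checkpoint path is a placeholder.
load_from = 'work_dirs/example_exp/epoch_10.pth'
resume = True
```

The same dict can also be passed directly as the `auto_scale_lr` argument when constructing `Runner`, or picked up from a config via `Runner.from_cfg`.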