diff --git a/docs/zh_cn/tutorials/optim_wrapper.md b/docs/zh_cn/tutorials/optim_wrapper.md
index f54468cf19..7f9ff79b44 100644
--- a/docs/zh_cn/tutorials/optim_wrapper.md
+++ b/docs/zh_cn/tutorials/optim_wrapper.md
@@ -338,17 +338,19 @@ optimizer = build_optim_wrapper(ToyModel(), optim_wrapper)
 
 `decay_mult`: weight decay multiplier for all parameters
 
-`bias_lr_mult`: learning rate multiplier for bias parameters (excluding the bias of normalization layers and the offset of deformable convolutions); defaults to 1
+`bias_lr_mult`: learning rate multiplier for bias parameters (excluding the bias of normalization layers and the offset of deformable convolutions)
 
-`bias_decay_mult`: weight decay multiplier for bias parameters (excluding the bias of normalization layers and the offset of deformable convolutions); defaults to 1
+`bias_decay_mult`: weight decay multiplier for bias parameters (excluding the bias of normalization layers and the offset of deformable convolutions)
 
-`norm_decay_mult`: weight decay multiplier for the weight and bias of normalization layers; defaults to 1
+`norm_decay_mult`: weight decay multiplier for the weight and bias of normalization layers
 
-`dwconv_decay_mult`: weight decay multiplier for depth-wise convolutions; defaults to 1
+`flat_decay_mult`: weight decay multiplier for one-dimensional parameters
+
+`dwconv_decay_mult`: weight decay multiplier for depth-wise convolutions
 
 `bypass_duplicate`: whether to skip duplicated parameters; defaults to `False`
 
-`dcn_offset_lr_mult`: learning rate multiplier for deformable convolutions (Deformable Convolution); defaults to 1
+`dcn_offset_lr_mult`: learning rate multiplier for deformable convolutions (Deformable Convolution)
 
 ### Set different hyperparameter multipliers for different parts of the model
 
diff --git a/mmengine/optim/optimizer/default_constructor.py b/mmengine/optim/optimizer/default_constructor.py
index 09ce17993c..e89cf53fb1 100644
--- a/mmengine/optim/optimizer/default_constructor.py
+++ b/mmengine/optim/optimizer/default_constructor.py
@@ -42,6 +42,8 @@ class DefaultOptimWrapperConstructor:
     - ``norm_decay_mult`` (float): It will be multiplied to the weight
       decay for all weight and bias parameters of normalization
       layers.
+    - ``flat_decay_mult`` (float): It will be multiplied to the weight
+      decay for all one-dimensional parameters.
     - ``dwconv_decay_mult`` (float): It will be multiplied to the weight
       decay for all weight and bias parameters of depthwise conv
       layers.
@@ -185,12 +187,13 @@ def add_params(self,
         # first sort with alphabet order and then sort with reversed len of str
         sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True)
 
-        bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.)
-        bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.)
-        norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.)
-        dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', 1.)
+        bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', None)
+        bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None)
+        norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None)
+        dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', None)
+        flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None)
         bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False)
-        dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', 1.)
+        dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', None)
 
         # special rules for norm layers and depth-wise conv layers
         is_norm = isinstance(module,
@@ -226,10 +229,12 @@ def add_params(self,
             if not is_custom:
                 # bias_lr_mult affects all bias parameters
                 # except for norm.bias dcn.conv_offset.bias
-                if name == 'bias' and not (is_norm or is_dcn_module):
+                if name == 'bias' and not (
+                        is_norm or is_dcn_module) and bias_lr_mult is not None:
                     param_group['lr'] = self.base_lr * bias_lr_mult
 
                 if (prefix.find('conv_offset') != -1 and is_dcn_module
+                        and dcn_offset_lr_mult is not None
                         and isinstance(module, torch.nn.Conv2d)):
                     # deal with both dcn_offset's bias & weight
                     param_group['lr'] = self.base_lr * dcn_offset_lr_mult
@@ -237,18 +242,23 @@ def add_params(self,
             # apply weight decay policies
             if self.base_wd is not None:
                 # norm decay
-                if is_norm:
+                if is_norm and norm_decay_mult is not None:
                     param_group[
                         'weight_decay'] = self.base_wd * norm_decay_mult
+                # bias lr and decay
+                elif (name == 'bias' and not is_dcn_module
+                      and bias_decay_mult is not None):
+                    param_group[
+                        'weight_decay'] = self.base_wd * bias_decay_mult
                 # depth-wise conv
-                elif is_dwconv:
+                elif is_dwconv and dwconv_decay_mult is not None:
                     param_group[
                         'weight_decay'] = self.base_wd * dwconv_decay_mult
-                # bias lr and decay
-                elif name == 'bias' and not is_dcn_module:
-                    # TODO: current bias_decay_mult will have affect on DCN
+                # flatten parameters except dcn offset
+                elif (param.ndim == 1 and not is_dcn_module
+                      and flat_decay_mult is not None):
                     param_group[
-                        'weight_decay'] = self.base_wd * bias_decay_mult
+                        'weight_decay'] = self.base_wd * flat_decay_mult
             params.append(param_group)
             for key, value in param_group.items():
                 if key == 'params':
diff --git a/tests/test_optim/test_optimizer/test_optimizer.py b/tests/test_optim/test_optimizer/test_optimizer.py
index ddbda7e58d..d82849760c 100644
--- a/tests/test_optim/test_optimizer/test_optimizer.py
+++ b/tests/test_optim/test_optimizer/test_optimizer.py
@@ -123,6 +123,7 @@ def _check_sgd_optimizer(self,
                              norm_decay_mult=1,
                              dwconv_decay_mult=1,
                              dcn_offset_lr_mult=1,
+                             flat_decay_mult=1,
                              bypass_duplicate=False):
         param_groups = optimizer.param_groups
         assert isinstance(optimizer, torch.optim.SGD)
@@ -139,7 +140,7 @@ def _check_sgd_optimizer(self,
         # param1
         param1 = param_groups[0]
         assert param1['lr'] == self.base_lr
-        assert param1['weight_decay'] == self.base_wd
+        assert param1['weight_decay'] == self.base_wd * flat_decay_mult
         # conv1.weight
         conv1_weight = param_groups[1]
         assert conv1_weight['lr'] == self.base_lr
@@ -163,7 +164,7 @@ def _check_sgd_optimizer(self,
         # sub.param1
         sub_param1 = param_groups[6]
         assert sub_param1['lr'] == self.base_lr
-        assert sub_param1['weight_decay'] == self.base_wd
+        assert sub_param1['weight_decay'] == self.base_wd * flat_decay_mult
         # sub.conv1.weight
         sub_conv1_weight = param_groups[7]
         assert sub_conv1_weight['lr'] == self.base_lr
@@ -172,8 +173,7 @@ def _check_sgd_optimizer(self,
         # sub.conv1.bias
         sub_conv1_bias = param_groups[8]
         assert sub_conv1_bias['lr'] == self.base_lr * bias_lr_mult
-        assert sub_conv1_bias[
-            'weight_decay'] == self.base_wd * dwconv_decay_mult
+        assert sub_conv1_bias['weight_decay'] == self.base_wd * bias_decay_mult
         # sub.gn.weight
         sub_gn_weight = param_groups[9]
         assert sub_gn_weight['lr'] == self.base_lr
@@ -258,7 +258,8 @@ def test_build_default_optimizer_constructor(self):
             bias_decay_mult=0.5,
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
-            dcn_offset_lr_mult=0.1)
+            dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3)
         optim_constructor_cfg = dict(
             type='DefaultOptimWrapperConstructor',
             optim_wrapper_cfg=optim_wrapper,
@@ -390,7 +391,8 @@ def test_default_optimizer_constructor_with_model_wrapper(self):
             bias_decay_mult=0.5,
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
-            dcn_offset_lr_mult=0.1)
+            dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3)
         optim_constructor = DefaultOptimWrapperConstructor(
             optim_wrapper_cfg, paramwise_cfg)
         optim_wrapper = optim_constructor(model)
@@ -429,7 +431,8 @@ def test_default_optimizer_constructor_with_model_wrapper(self):
             bias_decay_mult=0.5,
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
-            dcn_offset_lr_mult=0.1)
+            dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3)
         optim_constructor = DefaultOptimWrapperConstructor(
             optim_wrapper_cfg, paramwise_cfg)
         optim_wrapper = optim_constructor(model)
@@ -484,7 +487,8 @@ def test_default_optimizer_constructor_with_paramwise_cfg(self):
             bias_decay_mult=0.5,
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
-            dcn_offset_lr_mult=0.1)
+            dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3)
         optim_constructor = DefaultOptimWrapperConstructor(
             optim_wrapper_cfg, paramwise_cfg)
         optim_wrapper = optim_constructor(self.model)
@@ -554,6 +558,7 @@ def test_default_optimizer_constructor_bypass_duplicate(self):
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
             dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3,
             bypass_duplicate=True)
         optim_constructor = DefaultOptimWrapperConstructor(
             optim_wrapper_cfg, paramwise_cfg)
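
As a quick illustration of the behavior this patch introduces, the sketch below builds an optimizer wrapper the way the patched tutorial does (via `build_optim_wrapper`) and uses the new `flat_decay_mult` key. The `ToyModel` and the chosen multiplier values are illustrative assumptions, not part of the patch; the point is that with the new `None` defaults, only the keys explicitly listed in `paramwise_cfg` modify a parameter group, while omitted keys leave the optimizer's global settings untouched instead of silently multiplying by 1.

```python
import torch.nn as nn
from mmengine.optim import build_optim_wrapper


class ToyModel(nn.Module):
    """Illustrative stand-in: conv.weight is 4-D, while conv.bias and the
    BatchNorm weight/bias are one-dimensional parameters."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, 3)
        self.bn = nn.BatchNorm2d(8)


optim_wrapper_cfg = dict(
    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4),
    # Only the multipliers listed here take effect; e.g. bias_lr_mult is
    # omitted, so bias learning rates stay at the global lr.
    paramwise_cfg=dict(
        norm_decay_mult=0.,   # no weight decay on BatchNorm weight/bias
        flat_decay_mult=0.))  # no weight decay on other 1-D params (conv.bias)

optim_wrapper = build_optim_wrapper(ToyModel(), optim_wrapper_cfg)

# conv.weight keeps the global weight_decay; the 1-D parameters get 0.
for group in optim_wrapper.optimizer.param_groups:
    print(group['weight_decay'], [tuple(p.shape) for p in group['params']])
```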