[Docs] Fix the docstring of model sub-package #573

Merged · 1 commit · Oct 8, 2022

2 changes: 1 addition & 1 deletion docs/zh_cn/tutorials/hook.md
@@ -114,7 +114,7 @@ default_hooks = dict(checkpoint=dict(type='CheckpointHook', save_best='auto'))

You can also set `save_best` directly to an evaluation metric. For example, in a classification task you can set `save_best='top-1'`, and whether the current checkpoint is the best will then be judged by the value of `'top-1'`.

Besides the `save_best` parameter, the parameters related to saving the best checkpoint also include `rule`, `greater_keys` and `less_keys`, which are used to decide whether a larger or a smaller value of `save_bes` is better. For example, with `save_best='top-1'` specified, you can set `rule='greater'`, meaning that a larger value indicates a better checkpoint.
Besides the `save_best` parameter, the parameters related to saving the best checkpoint also include `rule`, `greater_keys` and `less_keys`, which are used to decide whether a larger or a smaller value of `save_best` is better. For example, with `save_best='top-1'` specified, you can set `rule='greater'`, meaning that a larger value indicates a better checkpoint.

- Specify the path for saving checkpoints

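For readers following the tutorial change above, a minimal sketch of how `save_best` and `rule` combine in the checkpoint hook config (the metric name `'top-1'` is only an illustrative assumption):

```python
# Keep the checkpoint whose 'top-1' metric is highest.
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        save_best='top-1',  # metric used to pick the best checkpoint
        rule='greater',     # a larger 'top-1' means a better checkpoint
    ))
```
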
7 changes: 4 additions & 3 deletions mmengine/model/averaged_model.py
@@ -29,12 +29,13 @@ class BaseAveragedModel(nn.Module):
In mmengine, we provide two ways to use the model averaging:

1. Use the model averaging module in hook:
We provide an EMAHook to apply the model averaging during training.
Add ``custom_hooks=[dict(type='EMAHook')]`` to the config or the runner.
The hook is implemented in mmengine/hooks/ema_hook.py
We provide an :class:`mmengine.hooks.EMAHook` to apply the model
averaging during training. Add ``custom_hooks=[dict(type='EMAHook')]``
to the config or the runner.

2. Use the model averaging module directly in the algorithm. Take the ema
teacher in semi-supervise as an example:

>>> from mmengine.model import ExponentialMovingAverage
>>> student = ResNet(depth=50)
>>> # use ema model as teacher
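To make the two usage routes above concrete, here is a hedged sketch; the tiny `nn.Sequential` model and the `momentum` value are illustrative placeholders, not part of the original docstring:

```python
import torch.nn as nn
from mmengine.model import ExponentialMovingAverage

# Route 2: maintain an EMA "teacher" copy of a student model by hand.
student = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
teacher = ExponentialMovingAverage(student, momentum=0.001)

# ... after each optimizer step on the student ...
teacher.update_parameters(student)  # averaging update from BaseAveragedModel

# Route 1 is config-only: register the hook and let the runner update the EMA.
custom_hooks = [dict(type='EMAHook')]
```
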
8 changes: 4 additions & 4 deletions mmengine/model/base_model/base_model.py
@@ -92,12 +92,12 @@ def train_step(self, data: Union[dict, tuple, list],
:class:`IterBasedTrainLoop` will call this method to update model
parameters. The default parameter update process is as follows:

1. Calls ``self.data_processor(data, training=False) to collect
batch_inputs and corresponding data_samples(labels).
1. Calls ``self.data_processor(data, training=False)`` to collect
batch_inputs and corresponding data_samples(labels).
2. Calls ``self(batch_inputs, data_samples, mode='loss')`` to get raw
loss
loss
3. Calls ``self.parse_losses`` to get ``parsed_losses`` tensor used to
backward and dict of loss tensor used to log messages.
backward and dict of loss tensor used to log messages.
4. Calls ``optim_wrapper.update_params(loss)`` to update model.

Args:
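As a rough sketch of the four documented steps (simplified: the real `BaseModel.train_step` also enters the optimizer wrapper's AMP context, and the `data_preprocessor` attribute name and `training=True` flag below are assumptions rather than quotes from the docstring):

```python
from typing import Dict, Union

import torch


def train_step(self, data: Union[dict, tuple, list],
               optim_wrapper) -> Dict[str, torch.Tensor]:
    """Simplified illustration of the update process listed above."""
    # 1. collect batch_inputs and data_samples from the raw dataloader output
    data = self.data_preprocessor(data, training=True)
    # 2. forward in 'loss' mode to get the raw loss dict
    losses = (self(**data, mode='loss') if isinstance(data, dict)
              else self(*data, mode='loss'))
    # 3. reduce the losses to a single tensor plus a dict of log variables
    parsed_losses, log_vars = self.parse_losses(losses)
    # 4. backward and parameter update through the optimizer wrapper
    optim_wrapper.update_params(parsed_losses)
    return log_vars
```
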
23 changes: 12 additions & 11 deletions mmengine/model/wrappers/distributed.py
@@ -26,9 +26,10 @@ class MMDistributedDataParallel(DistributedDataParallel):
default model forward, gradient back propagation, parameter updating
logic. To take advantage of DistributedDataParallel's automatic gradient
synchronization, ``train_step`` calls ``DistributedDataParallel.forward``
to calculate the losses, and call other methods of :obj:`BaseModel` to
to calculate the losses, and call other methods of :class:`BaseModel` to
pre-process data and parse losses. Finally, update model parameters by
:obj:``OptimWrapper`` and return the loss dictionary used for logging.
:class:`OptimWrapper` and return the loss dictionary used
for logging.

- ``val_step``: Called by ``runner.val_loop`` and get the inference
results. Since there is no gradient synchronization requirement,
@@ -43,11 +44,10 @@ class MMDistributedDataParallel(DistributedDataParallel):
the computational graph with `loss` as the root.
There are two cases

- Parameters were not used during
forward pass.
- Parameters were not used to produce
loss.
Default: False.
- Parameters were not used during forward pass.
- Parameters were not used to produce loss.

Defaults to False.

**kwargs: keyword arguments passed to ``DistributedDataParallel``.

@@ -57,8 +57,8 @@ class MMDistributedDataParallel(DistributedDataParallel):
output for single-device CUDA modules.
- dim (int): Defaults to 0.
- broadcast_buffers (bool): Flag that enables syncing (
broadcasting) buffers of the module at beginning of the
``forward`` function. Defaults to True
broadcasting) buffers of the module at beginning of the
``forward`` function. Defaults to True
- find_unused_parameters (bool): Whether to find parameters of
module, which are not in the forward graph. Defaults to False.
- process_group (ProcessGroup, optional): The process group to be
@@ -70,7 +70,8 @@ class MMDistributedDataParallel(DistributedDataParallel):
- gradient_as_bucket_view (bool): Defaults to False.
- static_graph (bool): Defaults to False.

See more information about arguments in `https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel`_ # noqa E501
See more information about arguments in
:class:`torch.nn.parallel.DistributedDataParallel`.

Note:
If model has multiple submodules and each module has
@@ -100,7 +101,7 @@ def train_step(self, data: Union[dict, tuple, list],
:meth:`train_step` will perform the following steps in order:

- If :attr:`module` defines the preprocess method,
call ``module.preprocess`` to pre-processing data.
call ``module.preprocess`` to pre-processing data.
- Call ``module.forward(**data)`` and get losses.
- Parse losses.
- Call ``optim_wrapper.optimizer_step`` to update parameters.
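For illustration only, wrapping a model with the options documented above could look like the following sketch (a toy `BaseModel` subclass stands in for a real model, and the distributed process group is assumed to be initialized already, e.g. by the runner or `torchrun`):

```python
import torch.nn as nn

from mmengine.model import BaseModel, MMDistributedDataParallel


class ToyModel(BaseModel):
    """Minimal BaseModel subclass used only to illustrate wrapping."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 1)

    def forward(self, inputs, data_samples=None, mode='tensor'):
        outputs = self.linear(inputs)
        if mode == 'loss':
            return {'loss': outputs.abs().mean()}
        return outputs


local_rank = 0  # GPU index owned by this process (assumed)
ddp_model = MMDistributedDataParallel(
    module=ToyModel().cuda(local_rank),
    device_ids=[local_rank],       # single-device CUDA module
    broadcast_buffers=False,       # skip buffer syncing on every forward
    find_unused_parameters=False,  # set True if some params never receive grads
)
# runner.train_loop then drives updates via ddp_model.train_step(...)
```
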
7 changes: 4 additions & 3 deletions mmengine/model/wrappers/seperate_distributed.py
@@ -41,7 +41,7 @@ class MMSeparateDistributedDataParallel(DistributedDataParallel):
find_unused_parameters (bool): Same as that in
``torch.nn.parallel.distributed.DistributedDataParallel``.
Traverse the autograd graph of all tensors contained in returned
value of the wrapped modules forward function. Defaults to False.
value of the wrapped module's forward function. Defaults to False.
**kwargs: Keyword arguments passed to ``MMDistributedDataParallel``.

- device_ids (List[int] or torch.device, optional): CUDA devices
@@ -58,7 +58,8 @@ class MMSeparateDistributedDataParallel(DistributedDataParallel):
- gradient_as_bucket_view (bool): Defaults to False.
- static_graph (bool): Defaults to False.

See more information about arguments in `https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel`_ # noqa E501
See more information about arguments in
:class:`torch.nn.parallel.DistributedDataParallel`.
"""

def __init__(self,
@@ -144,7 +145,7 @@ def train(self, mode: bool = True) -> 'MMSeparateDistributedDataParallel':

Args:
mode (bool): whether to set training mode (``True``) or evaluation
mode (``False``). Default: ``True``.
mode (``False``). Defaults to ``True``.

Returns:
Module: self.
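Finally, a hedged sketch of the separate wrapper described above. It targets models whose submodules (for example a GAN's generator and discriminator) are optimized independently; `gan_model` below is a hypothetical placeholder for such a model:

```python
from mmengine.model import MMSeparateDistributedDataParallel

# Assumes `gan_model` is a BaseModel whose submodules are trained with
# separate optimizer wrappers, and that torch.distributed is initialized.
wrapped = MMSeparateDistributedDataParallel(
    module=gan_model.cuda(),
    broadcast_buffers=False,
    find_unused_parameters=True,
)

# train(mode) propagates the mode to every wrapped submodule, as the
# docstring above notes; mode defaults to True (training).
wrapped.train()
wrapped.eval()
```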