From 504fa4f5cb7ab75a16bbfc51aeda65fad581b2ee Mon Sep 17 00:00:00 2001 From: del-zhenwu Date: Sat, 23 Dec 2023 11:15:20 +0800 Subject: [PATCH 01/16] [Fix] Use ImportError to cover ModuleNotFoundError raised by opencv-python (#1438) --- mmengine/utils/dl_utils/collect_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmengine/utils/dl_utils/collect_env.py b/mmengine/utils/dl_utils/collect_env.py index 6406677a73..044cef19ef 100644 --- a/mmengine/utils/dl_utils/collect_env.py +++ b/mmengine/utils/dl_utils/collect_env.py @@ -138,7 +138,7 @@ def collect_env(): try: import cv2 env_info['OpenCV'] = cv2.__version__ - except ModuleNotFoundError: + except ImportError: pass env_info['MMEngine'] = mmengine.__version__ From efcd36412459c7e1228fb76aae01db9a0be65b95 Mon Sep 17 00:00:00 2001 From: SCZwangxiao <31362395+SCZwangxiao@users.noreply.github.com> Date: Sat, 23 Dec 2023 11:17:46 +0800 Subject: [PATCH 02/16] [Fix] Fix load_model_state_dict in BaseStrategy (#1447) --- mmengine/_strategy/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mmengine/_strategy/base.py b/mmengine/_strategy/base.py index 708d0dbe1e..5df3a79c92 100644 --- a/mmengine/_strategy/base.py +++ b/mmengine/_strategy/base.py @@ -799,7 +799,8 @@ def load_model_state_dict( else: model = self.model - _load_checkpoint_to_model(model, state_dict, strict, revise_keys) + _load_checkpoint_to_model( + model, state_dict, strict=strict, revise_keys=revise_keys) def load_optim_state_dict(self, state_dict: dict) -> None: """Load optimizer state from dict.""" From 671f3bcdf41a66730279e36f0bb82fee5c75136a Mon Sep 17 00:00:00 2001 From: fanqiNO1 <75657629+fanqiNO1@users.noreply.github.com> Date: Sat, 23 Dec 2023 16:24:39 +0800 Subject: [PATCH 03/16] [Fix] Fix placement policy in ColossalAIStrategy (#1440) --- mmengine/_strategy/colossalai.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/mmengine/_strategy/colossalai.py b/mmengine/_strategy/colossalai.py index 1bdc96c794..cfbb925c67 100644 --- a/mmengine/_strategy/colossalai.py +++ b/mmengine/_strategy/colossalai.py @@ -120,8 +120,9 @@ def backward(self, loss: torch.Tensor, **kwargs) -> None: self.optimizer.backward(loss, **kwargs) -@MODEL_WRAPPERS.register_module() -class CollosalAIModelWrapper: +@MODEL_WRAPPERS.register_module( + name=['ColossalAIModelWrapper', 'CollosalAIModelWrapper']) +class ColossalAIModelWrapper: def __init__(self, model_wrapper: ModelWrapper, model: nn.Module): self.model_wrapper = model_wrapper @@ -238,7 +239,7 @@ class ColossalAIStrategy(BaseStrategy): OPTIMIZER_DIR = 'optimizer' # directory to save optimizer state. 
MODEL_DIR = 'model' # directory to save model SCHEDULER_DIR = 'scheduler' # directory to save scheduelrs - model: CollosalAIModelWrapper # type: ignore + model: ColossalAIModelWrapper # type: ignore optim_wrapper: ColossalAIOptimWrapper # type: ignore def __init__( @@ -468,8 +469,14 @@ def save_checkpoint( def _build_plugin(self, plugin: Union[str, dict]): if isinstance(plugin, str): if plugin == 'gemini': - plugin = colo_plugin.GeminiPlugin( - precision='bf16', placement_policy='cuda') + try: + plugin = colo_plugin.GeminiPlugin( + precision='bf16', placement_policy='auto') + except AssertionError: + from colossalai.zero.gemini.placement_policy import \ + PlacementPolicyFactory as colo_placement + raise ValueError('placement policy must be one of ' + + f'{list(colo_placement.policies.keys())}') elif plugin == 'lowlevel-zero': plugin = colo_plugin.LowLevelZeroPlugin() else: @@ -508,11 +515,11 @@ def _wrap( self, model: nn.Module, optim_wrapper: Optional[OptimWrapper] = None, - ) -> Union[Tuple[CollosalAIModelWrapper, ColossalAIOptimWrapper], - CollosalAIModelWrapper]: # type: ignore + ) -> Union[Tuple[ColossalAIModelWrapper, ColossalAIOptimWrapper], + ColossalAIModelWrapper]: # type: ignore """Wrap model with :class:`ModelWrapper`.""" if self.model_wrapper is None: - self.model_wrapper = {'type': 'CollosalAIModelWrapper'} + self.model_wrapper = {'type': 'ColossalAIModelWrapper'} # For zero series parallel, move `data_preprocessor` to current device # is reasonable. We need to `BaseDataPreprocessor.to` manually since From 8e6fb12b1f08464faa603ac5148b8331b714df89 Mon Sep 17 00:00:00 2001 From: lanzeshun <56623740+shun001@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:14:45 +0800 Subject: [PATCH 04/16] [Fix] Support multi-node distributed training with NPU backend (#1459) --- mmengine/dist/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py index d1d19d8f68..4e77700141 100644 --- a/mmengine/dist/utils.py +++ b/mmengine/dist/utils.py @@ -99,9 +99,10 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None: **kwargs: keyword arguments are passed to ``init_process_group``. 
""" rank = int(os.environ['RANK']) + # LOCAL_RANK is set by `torch.distributed.launch` since PyTorch 1.1 + local_rank = int(os.environ['LOCAL_RANK']) if is_mlu_available(): import torch_mlu # noqa: F401 - local_rank = int(os.environ['LOCAL_RANK']) torch.mlu.set_device(local_rank) torch_dist.init_process_group( backend='cncl', @@ -110,15 +111,13 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None: **kwargs) elif is_npu_available(): import torch_npu # noqa: F401 - torch.npu.set_device(rank) + torch.npu.set_device(local_rank) torch_dist.init_process_group( backend='hccl', rank=rank, world_size=int(os.environ['WORLD_SIZE']), **kwargs) else: - # LOCAL_RANK is set by `torch.distributed.launch` since PyTorch 1.1 - local_rank = int(os.environ['LOCAL_RANK']) torch.cuda.set_device(local_rank) if init_backend == 'torch': From 1398e4200e19386894829a4418c03a491b434a9f Mon Sep 17 00:00:00 2001 From: fanqiNO1 <75657629+fanqiNO1@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:30:01 +0800 Subject: [PATCH 05/16] bump version to v0.10.2 (#1460) --- README.md | 4 ++-- README_zh-CN.md | 4 ++-- docs/en/notes/changelog.md | 16 ++++++++++++++++ mmengine/version.py | 2 +- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7aa77a5462..ff51643e12 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ English | [简体中文](README_zh-CN.md) ## What's New -v0.10.1 was released on 2023-11-22. +v0.10.2 was released on 2023-12-26. Highlights: @@ -70,7 +70,7 @@ Highlights: - Supports multiple visualization backends, including `NeptuneVisBackend`, `DVCLiveVisBackend` and `AimVisBackend`. Refer to [Visualization Backends](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html) for more details. -Read [Changelog](./docs/en/notes/changelog.md#v0101-22112023) for more details. +Read [Changelog](./docs/en/notes/changelog.md#v0102-26122023) for more details. ## Table of Contents diff --git a/README_zh-CN.md b/README_zh-CN.md index 62f32c31b6..0fd5d272fd 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -58,7 +58,7 @@ ## 最近进展 -最新版本 v0.10.1 在 2023.11.22 发布。 +最新版本 v0.10.2 在 2023.12.26 发布。 亮点: @@ -70,7 +70,7 @@ - 支持多种可视化后端,包括`NeptuneVisBackend`、`DVCLiveVisBackend` 和 `AimVisBackend`。可阅读[可视化后端](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/visualize_training_log.html)了解用法。 -如果想了解更多版本更新细节和历史信息,请阅读[更新日志](./docs/en/notes/changelog.md#v0101-22112023) +如果想了解更多版本更新细节和历史信息,请阅读[更新日志](./docs/en/notes/changelog.md#v0102-26122023) ## 目录 diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index b1aa854eb0..7433be503a 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,21 @@ # Changelog of v0.x +## v0.10.2 (26/12/2023) + +### New Features & Enhancements + +- Support multi-node distributed training with NPU backend by [@shun001](https://github.com/shun001) in https://github.com/open-mmlab/mmengine/pull/1459 +- Use `ImportError` to cover `ModuleNotFoundError` by [@del-zhenwu](https://github.com/del-zhenwu) in https://github.com/open-mmlab/mmengine/pull/1438 + +### Bug Fixes + +- Fix bug in `load_model_state_dict` of `BaseStrategy` by [@SCZwangxiao](https://github.com/SCZwangxiao) in https://github.com/open-mmlab/mmengine/pull/1447 +- Fix placement policy in ColossalAIStrategy by [@fanqiNO1](https://github.com/fanqiNO1) in https://github.com/open-mmlab/mmengine/pull/1440 + +### Contributors + +A total of 4 developers contributed to this release. 
Thanks [@shun001](https://github.com/shun001), [@del-zhenwu](https://github.com/del-zhenwu), [@SCZwangxiao](https://github.com/SCZwangxiao), [@fanqiNO1](https://github.com/fanqiNO1) + ## v0.10.1 (22/11/2023) ### Bug Fixes diff --git a/mmengine/version.py b/mmengine/version.py index eb41800589..d36dfb8380 100644 --- a/mmengine/version.py +++ b/mmengine/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -__version__ = '0.10.1' +__version__ = '0.10.2' def parse_version_info(version_str): From 369f15e27acc0593405ec185d7789bb305b8a12b Mon Sep 17 00:00:00 2001 From: XiwuChen Date: Tue, 2 Jan 2024 10:42:58 +0800 Subject: [PATCH 06/16] [Docs] Fix nnodes in the doc of ddp training (#1462) --- docs/en/common_usage/distributed_training.md | 6 +++--- docs/zh_cn/common_usage/distributed_training.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/common_usage/distributed_training.md b/docs/en/common_usage/distributed_training.md index 9ab328128a..308c7dd95a 100644 --- a/docs/en/common_usage/distributed_training.md +++ b/docs/en/common_usage/distributed_training.md @@ -26,7 +26,7 @@ On the first machine: ```bash python -m torch.distributed.launch \ - --nnodes 8 \ + --nnodes 2 \ --node_rank 0 \ --master_addr 127.0.0.1 \ --master_port 29500 \ @@ -38,9 +38,9 @@ On the second machine: ```bash python -m torch.distributed.launch \ - --nnodes 8 \ + --nnodes 2 \ --node_rank 1 \ - --master_addr 127.0.0.1 \ + --master_addr "ip_of_the_first_machine" \ --master_port 29500 \ --nproc_per_node=8 \ examples/distributed_training.py --launcher pytorch diff --git a/docs/zh_cn/common_usage/distributed_training.md b/docs/zh_cn/common_usage/distributed_training.md index 8afedc96b2..2385e4ff53 100644 --- a/docs/zh_cn/common_usage/distributed_training.md +++ b/docs/zh_cn/common_usage/distributed_training.md @@ -26,7 +26,7 @@ CUDA_VISIBLE_DEVICES=0,3 python -m torch.distributed.launch --nproc_per_node=2 e ```bash python -m torch.distributed.launch \ - --nnodes 8 \ + --nnodes 2 \ --node_rank 0 \ --master_addr 127.0.0.1 \ --master_port 29500 \ @@ -38,9 +38,9 @@ python -m torch.distributed.launch \ ```bash python -m torch.distributed.launch \ - --nnodes 8 \ + --nnodes 2 \ --node_rank 1 \ - --master_addr 127.0.0.1 \ + --master_addr "ip_of_the_first_machine" \ --master_port 29500 \ --nproc_per_node=8 \ examples/distributed_training.py --launcher pytorch From e4600a69938b512d718ff3fa79838e9e8450f552 Mon Sep 17 00:00:00 2001 From: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Date: Tue, 2 Jan 2024 15:59:37 +0800 Subject: [PATCH 07/16] [Docs] Add the usage of ProfilerHook (#1466) --- docs/en/tutorials/hook.md | 25 ++++++++++++++++++++----- docs/zh_cn/tutorials/hook.md | 25 ++++++++++++++++++++----- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/docs/en/tutorials/hook.md b/docs/en/tutorials/hook.md index ac969bf984..34f12d0084 100644 --- a/docs/en/tutorials/hook.md +++ b/docs/en/tutorials/hook.md @@ -31,11 +31,12 @@ Each hook has a corresponding priority. 
At each mount point, hooks with higher p **custom hooks** -| Name | Function | Priority | -| :---------------------------------: | :----------------------------------------------------------------------: | :---------: | -| [EMAHook](#emahook) | apply Exponential Moving Average (EMA) on the model during training | NORMAL (50) | -| [EmptyCacheHook](#emptycachehook) | Releases all unoccupied cached GPU memory during the process of training | NORMAL (50) | -| [SyncBuffersHook](#syncbuffershook) | Synchronize model buffers at the end of each epoch | NORMAL (50) | +| Name | Function | Priority | +| :---------------------------------: | :----------------------------------------------------------------------: | :-----------: | +| [EMAHook](#emahook) | Apply Exponential Moving Average (EMA) on the model during training | NORMAL (50) | +| [EmptyCacheHook](#emptycachehook) | Releases all unoccupied cached GPU memory during the process of training | NORMAL (50) | +| [SyncBuffersHook](#syncbuffershook) | Synchronize model buffers at the end of each epoch | NORMAL (50) | +| [ProfilerHook](#profilerhook) | Analyze the execution time and GPU memory usage of model operators | VERY_LOW (90) | ```{note} It is not recommended to modify the priority of the default hooks, as hooks with lower priority may depend on hooks with higher priority. For example, `CheckpointHook` needs to have a lower priority than ParamSchedulerHook so that the saved optimizer state is correct. Also, the priority of custom hooks defaults to `NORMAL (50)`. @@ -211,6 +212,20 @@ runner = Runner(custom_hooks=custom_hooks, ...) runner.train() ``` +### ProfilerHook + +The [ProfilerHook](mmengine.hooks.ProfilerHook) is used to analyze the execution time and GPU memory occupancy of model operators. + +```python +custom_hooks = [dict(type='ProfilerHook', on_trace_ready=dict(type='tb_trace'))] +runner = Runner(custom_hooks=custom_hooks, ...) +runner.train() +``` + +The profiling results will be saved in the tf_tracing_logs directory under `work_dirs/{timestamp}`, and can be visualized using TensorBoard with the command `tensorboard --logdir work_dirs/{timestamp}/tf_tracing_logs`. + +For more information on the usage of the ProfilerHook, please refer to the [ProfilerHook](mmengine.hooks.ProfilerHook) documentation. + ## Customize Your Hooks If the built-in hooks provided by MMEngine do not cover your demands, you are encouraged to customize your own hooks by simply inheriting the base [hook](mmengine.hooks.Hook) class and overriding the corresponding mount point methods. 
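For readers who want to act on the "Customize Your Hooks" pointer above, here is a minimal sketch of such a hook. It is not part of this patch series: the class name, the `interval` argument and the early-stop behaviour are illustrative assumptions layered on the documented `Hook` mount-point methods.

```python
import torch

from mmengine.hooks import Hook
from mmengine.registry import HOOKS


@HOOKS.register_module()
class CheckInvalidLossHook(Hook):
    """Stop training once the loss becomes NaN or Inf (hypothetical example)."""

    def __init__(self, interval: int = 50):
        self.interval = interval

    def after_train_iter(self, runner, batch_idx, data_batch=None, outputs=None):
        # ``outputs`` is the dict returned by ``train_step``; a scalar ``loss``
        # entry is assumed to be present.
        if self.every_n_train_iters(runner, self.interval):
            if not torch.isfinite(outputs['loss']):
                runner.logger.warning('Loss became NaN/Inf, stopping training.')
                runner.train_loop.stop_training = True
```

Once registered, it would be enabled the same way as the built-in examples above, e.g. `custom_hooks = [dict(type='CheckInvalidLossHook', interval=50)]`.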
diff --git a/docs/zh_cn/tutorials/hook.md b/docs/zh_cn/tutorials/hook.md index 36f931446d..1ab44a8a3e 100644 --- a/docs/zh_cn/tutorials/hook.md +++ b/docs/zh_cn/tutorials/hook.md @@ -31,11 +31,12 @@ MMEngine 提供了很多内置的钩子,将钩子分为两类,分别是默 **自定义钩子** -| 名称 | 用途 | 优先级 | -| :---------------------------------: | :-------------------: | :---------: | -| [EMAHook](#emahook) | 模型参数指数滑动平均 | NORMAL (50) | -| [EmptyCacheHook](#emptycachehook) | PyTorch CUDA 缓存清理 | NORMAL (50) | -| [SyncBuffersHook](#syncbuffershook) | 同步模型的 buffer | NORMAL (50) | +| 名称 | 用途 | 优先级 | +| :---------------------------------: | :--------------------------------: | :-----------: | +| [EMAHook](#emahook) | 模型参数指数滑动平均 | NORMAL (50) | +| [EmptyCacheHook](#emptycachehook) | PyTorch CUDA 缓存清理 | NORMAL (50) | +| [SyncBuffersHook](#syncbuffershook) | 同步模型的 buffer | NORMAL (50) | +| [ProfilerHook](#profilerhook) | 分析算子的执行时间以及显存占用情况 | VERY_LOW (90) | ```{note} 不建议修改默认钩子的优先级,因为优先级低的钩子可能会依赖优先级高的钩子。例如 CheckpointHook 的优先级需要比 ParamSchedulerHook 低,这样保存的优化器状态才是正确的状态。另外,自定义钩子的优先级默认为 `NORMAL (50)`。 @@ -206,6 +207,20 @@ runner = Runner(custom_hooks=custom_hooks, ...) runner.train() ``` +### ProfilerHook + +[ProfilerHook](mmengine.hooks.ProfilerHook) 用于分析模型算子的执行时间以及显存占用情况。 + +```python +custom_hooks = [dict(type='ProfilerHook', on_trace_ready=dict(type='tb_trace'))] +runner = Runner(custom_hooks=custom_hooks, ...) +runner.train() +``` + +profile 的结果会保存在 `work_dirs/{timestamp}` 下的 `tf_tracing_logs` 目录,通过 `tensorboard --logdir work_dirs/{timestamp}tf_tracing_logs`。 + +更多关于 ProfilerHook 的用法请阅读 [ProfilerHook](mmengine.hooks.ProfilerHook) 文档。 + ## 自定义钩子 如果 MMEngine 提供的默认钩子不能满足需求,用户可以自定义钩子,只需继承钩子基类并重写相应的位点方法。 From 4a50213c6954d42536faa8b69b48bb6dd8f32f4f Mon Sep 17 00:00:00 2001 From: Mashiro <57566630+HAOCHENYE@users.noreply.github.com> Date: Tue, 2 Jan 2024 16:07:54 +0800 Subject: [PATCH 08/16] [Fix] Fix Config.to_dict (#1465) --- mmengine/config/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mmengine/config/config.py b/mmengine/config/config.py index 316ac65d4d..a782c1a00c 100644 --- a/mmengine/config/config.py +++ b/mmengine/config/config.py @@ -48,9 +48,11 @@ def _lazy2string(cfg_dict, dict_type=None): if isinstance(cfg_dict, dict): dict_type = dict_type or type(cfg_dict) - return dict_type({k: _lazy2string(v) for k, v in dict.items(cfg_dict)}) + return dict_type( + {k: _lazy2string(v, dict_type) + for k, v in dict.items(cfg_dict)}) elif isinstance(cfg_dict, (tuple, list)): - return type(cfg_dict)(_lazy2string(v) for v in cfg_dict) + return type(cfg_dict)(_lazy2string(v, dict_type) for v in cfg_dict) elif isinstance(cfg_dict, (LazyAttr, LazyObject)): return f'{cfg_dict.module}.{str(cfg_dict)}' else: From b51bf60964357edbf55d54eb01735a4bfac15d48 Mon Sep 17 00:00:00 2001 From: Zhihao Lin <36994684+LZHgrla@users.noreply.github.com> Date: Thu, 11 Jan 2024 10:47:05 +0800 Subject: [PATCH 09/16] [Fix] Fix the resume of iteration (#1471) --- mmengine/runner/loops.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mmengine/runner/loops.py b/mmengine/runner/loops.py index 6a874a6ad6..1f6551ab62 100644 --- a/mmengine/runner/loops.py +++ b/mmengine/runner/loops.py @@ -271,6 +271,14 @@ def run(self) -> None: # In iteration-based training loop, we treat the whole training process # as a big epoch and execute the corresponding hook. 
self.runner.call_hook('before_train_epoch') + if self._iter > 0: + print_log( + f'Advance dataloader {self._iter} steps to skip data ' + 'that has already been trained', + logger='current', + level=logging.WARNING) + for _ in range(self._iter): + next(self.dataloader_iterator) while self._iter < self._max_iters and not self.stop_training: self.runner.model.train() From 109cd44c7ea2a384ace8a255b8d20c3b9b3dd351 Mon Sep 17 00:00:00 2001 From: Zhihao Lin <36994684+LZHgrla@users.noreply.github.com> Date: Thu, 11 Jan 2024 10:50:36 +0800 Subject: [PATCH 10/16] [Fix] Fix dist.collect_results to keep all ranks' elements (#1469) --- mmengine/dist/dist.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mmengine/dist/dist.py b/mmengine/dist/dist.py index b6dd769f90..1dbedb3430 100644 --- a/mmengine/dist/dist.py +++ b/mmengine/dist/dist.py @@ -13,7 +13,7 @@ from torch._utils import (_flatten_dense_tensors, _take_tensors, _unflatten_dense_tensors) from torch.distributed import ProcessGroup - +from itertools import zip_longest, chain import mmengine from .utils import (get_world_size, get_rank, get_backend, get_dist_info, get_default_group, barrier, get_data_device, @@ -1010,8 +1010,10 @@ def collect_results_cpu(result_part: list, part_list.append(pickle.load(f)) # sort the results ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) + zipped_results = zip_longest(*part_list) + ordered_results = [ + i for i in chain.from_iterable(zipped_results) if i is not None + ] # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir @@ -1032,8 +1034,10 @@ def _collect_results_device(result_part: list, size: int) -> Optional[list]: if rank == 0: # sort the results ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) + zipped_results = zip_longest(*part_list) + ordered_results = [ + i for i in chain.from_iterable(zipped_results) if i is not None + ] # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results From 3d8a611eec15cf89e3850ad0c8ea48e927a583fb Mon Sep 17 00:00:00 2001 From: hanhaowen-mt <144977798+hanhaowen-mt@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:25:01 +0800 Subject: [PATCH 11/16] [Feature] Add the support for musa device support (#1453) --- mmengine/device/__init__.py | 10 ++-- mmengine/device/utils.py | 38 ++++++++++++++- mmengine/dist/dist.py | 17 +++++++ mmengine/dist/utils.py | 14 +++++- mmengine/hooks/empty_cache_hook.py | 16 +++++-- mmengine/logging/logger.py | 46 +++++++++++++------ mmengine/model/base_model/base_model.py | 15 ++++++ .../model/base_model/data_preprocessor.py | 9 ++++ .../optim/optimizer/amp_optimizer_wrapper.py | 7 +-- mmengine/runner/amp.py | 8 +++- mmengine/runner/log_processor.py | 12 +++-- mmengine/runner/utils.py | 6 ++- mmengine/structures/base_data_element.py | 11 +++++ mmengine/structures/instance_data.py | 3 ++ mmengine/utils/dl_utils/collect_env.py | 27 ++++++++++- mmengine/utils/dl_utils/time_counter.py | 16 +++++-- tests/test_device/test_device.py | 5 +- tests/test_dist/test_dist.py | 2 + tests/test_hooks/test_ema_hook.py | 5 ++ tests/test_hooks/test_empty_cache_hook.py | 5 ++ tests/test_runner/test_amp.py | 18 +++++++- tests/test_runner/test_log_processor.py | 6 ++- 22 files changed, 253 insertions(+), 43 deletions(-) diff --git a/mmengine/device/__init__.py b/mmengine/device/__init__.py index bfd82a858a..88937d5592 100644 --- a/mmengine/device/__init__.py +++ 
b/mmengine/device/__init__.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .utils import (get_device, get_max_cuda_memory, is_cuda_available, - is_dipu_available, is_mlu_available, is_mps_available, - is_npu_available, is_npu_support_full_precision) +from .utils import (get_device, get_max_cuda_memory, get_max_musa_memory, + is_cuda_available, is_dipu_available, is_mlu_available, + is_mps_available, is_musa_available, is_npu_available, + is_npu_support_full_precision) __all__ = [ 'get_max_cuda_memory', 'get_device', 'is_cuda_available', 'is_mlu_available', 'is_mps_available', 'is_npu_available', - 'is_dipu_available', 'is_npu_support_full_precision' + 'is_dipu_available', 'get_max_musa_memory', 'is_musa_available', + 'is_npu_support_full_precision' ] diff --git a/mmengine/device/utils.py b/mmengine/device/utils.py index 0bb69d2ea9..2fd56d80ed 100644 --- a/mmengine/device/utils.py +++ b/mmengine/device/utils.py @@ -22,6 +22,12 @@ except Exception: IS_DIPU_AVAILABLE = False +try: + import torch_musa # noqa: F401 + IS_MUSA_AVAILABLE = True +except Exception: + IS_MUSA_AVAILABLE = False + def get_max_cuda_memory(device: Optional[torch.device] = None) -> int: """Returns the maximum GPU memory occupied by tensors in megabytes (MB) for @@ -73,6 +79,34 @@ def is_dipu_available() -> bool: return IS_DIPU_AVAILABLE +def get_max_musa_memory(device: Optional[torch.device] = None) -> int: + """Returns the maximum GPU memory occupied by tensors in megabytes (MB) for + a given device. By default, this returns the peak allocated memory since + the beginning of this program. + + Args: + device (torch.device, optional): selected device. Returns + statistic for the current device, given by + :func:`~torch.musa.current_device`, if ``device`` is None. + Defaults to None. + + Returns: + int: The maximum GPU memory occupied by tensors in megabytes + for a given device. + """ + mem = torch.musa.max_memory_allocated(device=device) + mem_mb = torch.tensor([int(mem) // (1024 * 1024)], + dtype=torch.int, + device=device) + # TODO:haowen.han@mthreads.com: This function is not supported by musa yet. + # torch.musa.reset_peak_memory_stats() + return int(mem_mb.item()) + + +def is_musa_available() -> bool: + return IS_MUSA_AVAILABLE + + def is_npu_support_full_precision() -> bool: """Returns True if npu devices support full precision training.""" version_of_support_full_precision = 220 @@ -91,12 +125,14 @@ def is_npu_support_full_precision() -> bool: DEVICE = 'mps' elif is_dipu_available(): DEVICE = 'dipu' +elif is_musa_available(): + DEVICE = 'musa' def get_device() -> str: """Returns the currently existing device type. Returns: - str: cuda | npu | mlu | mps | cpu. + str: cuda | npu | mlu | mps | musa | cpu. 
""" return DEVICE diff --git a/mmengine/dist/dist.py b/mmengine/dist/dist.py index 1dbedb3430..f70cc3ef46 100644 --- a/mmengine/dist/dist.py +++ b/mmengine/dist/dist.py @@ -415,12 +415,16 @@ def _broadcast_object_list(object_list: List[Any], current_device = torch.device('cpu') is_hccl_backend = group_backend == 'hccl' is_cncl_backend = group_backend == 'cncl' + is_mccl_backend = group_backend == 'mccl' if is_hccl_backend: current_device = torch.device('npu', torch.npu.current_device()) object_sizes_tensor = object_sizes_tensor.to(current_device) elif is_cncl_backend: current_device = torch.device('mlu', torch.mlu.current_device()) object_sizes_tensor = object_sizes_tensor.to(current_device) + elif is_mccl_backend: + current_device = torch.device('musa', torch.musa.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) elif is_nccl_backend: # See note about using torch.cuda.current_device() here in # docstring. We cannot simply use my_rank since rank == device is @@ -624,6 +628,7 @@ def _all_gather_object(object_list: List[Any], group_backend = get_backend(group) current_device = torch.device('cpu') is_nccl_backend = group_backend == torch_dist.Backend.NCCL + is_mccl_backend = group_backend == 'mccl' if is_nccl_backend: # See note about using torch.cuda.current_device() here in docstring. # We cannot simply use my_rank since rank == device is not necessarily @@ -631,6 +636,13 @@ def _all_gather_object(object_list: List[Any], current_device = torch.device('cuda', torch.cuda.current_device()) input_tensor = input_tensor.to(current_device) local_size = local_size.to(current_device) + elif is_mccl_backend: + # See note about using torch.musa.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. + current_device = torch.device('musa', torch.musa.current_device()) + input_tensor = input_tensor.to(current_device) + local_size = local_size.to(current_device) # Gather all local sizes. This is so that we can find the max size, and # index until the correct size when deserializing the tensors. group_size = get_world_size(group=group) @@ -776,10 +788,15 @@ def _gather_object(obj: Any, group_backend = get_backend(group) current_device = torch.device('cpu') is_nccl_backend = group_backend == torch_dist.Backend.NCCL + is_mccl_backend = group_backend == 'mccl' if is_nccl_backend: current_device = torch.device('cuda', torch.cuda.current_device()) input_tensor = input_tensor.to(current_device) local_size = local_size.to(current_device) + elif is_mccl_backend: + current_device = torch.device('musa', torch.musa.current_device()) + input_tensor = input_tensor.to(current_device) + local_size = local_size.to(current_device) # Gather all local sizes. This is so that we can find the max size, and # index until the correct size when deserializing the tensors. 
group_size = get_world_size(group=group) diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py index 4e77700141..3c136973bb 100644 --- a/mmengine/dist/utils.py +++ b/mmengine/dist/utils.py @@ -11,7 +11,8 @@ from torch import Tensor from torch import distributed as torch_dist from torch.distributed import ProcessGroup -from mmengine.device import is_mlu_available, is_npu_available +from mmengine.device import (is_mlu_available, is_npu_available, + is_musa_available) from collections.abc import Iterable, Mapping @@ -117,6 +118,14 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None: rank=rank, world_size=int(os.environ['WORLD_SIZE']), **kwargs) + elif is_musa_available(): + import torch_musa # noqa: F401 + torch.musa.set_device(rank) + torch_dist.init_process_group( + backend='mccl', + rank=rank, + world_size=int(os.environ['WORLD_SIZE']), + **kwargs) else: torch.cuda.set_device(local_rank) @@ -527,6 +536,9 @@ def get_comm_device(group: Optional[ProcessGroup] = None) -> torch.device: return torch.device('mlu', torch.mlu.current_device()) elif backend == 'smddp': return torch.device('cuda', torch.cuda.current_device()) + elif backend == 'mccl': + import torch_musa + return torch.device('musa', torch_musa.current_device()) else: # GLOO and MPI backends use cpu device by default return torch.device('cpu') diff --git a/mmengine/hooks/empty_cache_hook.py b/mmengine/hooks/empty_cache_hook.py index b9b5eba0ed..9a92cdebfe 100644 --- a/mmengine/hooks/empty_cache_hook.py +++ b/mmengine/hooks/empty_cache_hook.py @@ -4,6 +4,7 @@ import torch from mmengine.registry import HOOKS +from ..device import is_cuda_available, is_musa_available from .hook import Hook DATA_BATCH = Optional[Union[dict, tuple, list]] @@ -49,7 +50,10 @@ def _after_iter(self, mode (str): Current mode of runner. Defaults to 'train'. """ if self._do_after_iter: - torch.cuda.empty_cache() + if is_cuda_available(): + torch.cuda.empty_cache() + elif is_musa_available(): + torch.musa.empty_cache() def _before_epoch(self, runner, mode: str = 'train') -> None: """Empty cache before an epoch. @@ -59,7 +63,10 @@ def _before_epoch(self, runner, mode: str = 'train') -> None: mode (str): Current mode of runner. Defaults to 'train'. """ if self._do_before_epoch: - torch.cuda.empty_cache() + if is_cuda_available(): + torch.cuda.empty_cache() + elif is_musa_available(): + torch.musa.empty_cache() def _after_epoch(self, runner, mode: str = 'train') -> None: """Empty cache after an epoch. @@ -69,4 +76,7 @@ def _after_epoch(self, runner, mode: str = 'train') -> None: mode (str): Current mode of runner. Defaults to 'train'. """ if self._do_after_epoch: - torch.cuda.empty_cache() + if is_cuda_available(): + torch.cuda.empty_cache() + elif is_musa_available(): + torch.musa.empty_cache() diff --git a/mmengine/logging/logger.py b/mmengine/logging/logger.py index 9b2cb9da66..839a08cdda 100644 --- a/mmengine/logging/logger.py +++ b/mmengine/logging/logger.py @@ -398,22 +398,38 @@ def _get_device_id(): except ImportError: return 0 else: - local_rank = int(os.getenv('LOCAL_RANK', '0')) - # TODO: return device id of npu and mlu. 
- if not torch.cuda.is_available(): - return local_rank - cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) - if cuda_visible_devices is None: - num_device = torch.cuda.device_count() - cuda_visible_devices = list(range(num_device)) - else: - cuda_visible_devices = cuda_visible_devices.split(',') + MUSA_AVAILABLE = False try: - return int(cuda_visible_devices[local_rank]) - except ValueError: - # handle case for Multi-Instance GPUs - # see #1148 for details - return cuda_visible_devices[local_rank] + import torch_musa + MUSA_AVAILABLE = True + except ImportError: + pass + if MUSA_AVAILABLE: + local_rank = int(os.getenv('LOCAL_RANK', '0')) + musa_visible_devices = os.getenv('MUSA_VISIBLE_DEVICES', None) + if musa_visible_devices is None: + num_device = torch_musa.device_count() + musa_visible_devices = list(range(num_device)) + else: + musa_visible_devices = musa_visible_devices.split(',') + return int(musa_visible_devices[local_rank]) + else: + local_rank = int(os.getenv('LOCAL_RANK', '0')) + # TODO: return device id of npu and mlu. + if not torch.cuda.is_available(): + return local_rank + cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + if cuda_visible_devices is None: + num_device = torch.cuda.device_count() + cuda_visible_devices = list(range(num_device)) + else: + cuda_visible_devices = cuda_visible_devices.split(',') + try: + return int(cuda_visible_devices[local_rank]) + except ValueError: + # handle case for Multi-Instance GPUs + # see #1148 for details + return cuda_visible_devices[local_rank] def _get_host_info() -> str: diff --git a/mmengine/model/base_model/base_model.py b/mmengine/model/base_model/base_model.py index 14c91eb6ca..299cd67557 100644 --- a/mmengine/model/base_model/base_model.py +++ b/mmengine/model/base_model/base_model.py @@ -222,6 +222,21 @@ def cuda( self._set_device(torch.device(device)) return super().cuda(device) + def musa( + self, + device: Optional[Union[int, str, torch.device]] = None, + ) -> nn.Module: + """Overrides this method to call :meth:`BaseDataPreprocessor.musa` + additionally. + + Returns: + nn.Module: The model itself. + """ + if device is None or isinstance(device, int): + device = torch.device('musa', index=device) + self._set_device(torch.device(device)) + return super().musa(device) + def mlu( self, device: Union[int, str, torch.device, None] = None, diff --git a/mmengine/model/base_model/data_preprocessor.py b/mmengine/model/base_model/data_preprocessor.py index 1f285aca62..af84246874 100644 --- a/mmengine/model/base_model/data_preprocessor.py +++ b/mmengine/model/base_model/data_preprocessor.py @@ -113,6 +113,15 @@ def cuda(self, *args, **kwargs) -> nn.Module: self._device = torch.device(torch.cuda.current_device()) return super().cuda() + def musa(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. 
+ """ + self._device = torch.device(torch.musa.current_device()) + return super().musa() + def npu(self, *args, **kwargs) -> nn.Module: """Overrides this method to set the :attr:`device` diff --git a/mmengine/optim/optimizer/amp_optimizer_wrapper.py b/mmengine/optim/optimizer/amp_optimizer_wrapper.py index 7a82d16603..4f3323f2cc 100644 --- a/mmengine/optim/optimizer/amp_optimizer_wrapper.py +++ b/mmengine/optim/optimizer/amp_optimizer_wrapper.py @@ -6,7 +6,7 @@ import torch.nn as nn from mmengine.device import (is_cuda_available, is_mlu_available, - is_npu_available) + is_musa_available, is_npu_available) from mmengine.registry import OPTIM_WRAPPERS from mmengine.utils import digit_version from mmengine.utils.dl_utils import TORCH_VERSION @@ -74,8 +74,9 @@ def __init__(self, assert digit_version(TORCH_VERSION) >= digit_version('1.6.0'), ( '`torch.cuda.amp` is only available when pytorch version >= 1.6') assert is_cuda_available() or is_npu_available() or is_mlu_available( - ), ('``AmpOptimizerWrapper`` is only available training ' - 'on gpu, npu or mlu') + ) or is_musa_available(), ( + '``AmpOptimizerWrapper`` is only available training ' + 'on gpu, npu, mlu or musa') super().__init__(**kwargs) self._scale_update_param = None diff --git a/mmengine/runner/amp.py b/mmengine/runner/amp.py index 964518fc90..198babc582 100644 --- a/mmengine/runner/amp.py +++ b/mmengine/runner/amp.py @@ -135,7 +135,13 @@ def autocast(device_type: Optional[str] = None, elif device_type == 'npu': pass - + elif device_type == 'musa': + if dtype is None: + dtype = torch.get_autocast_gpu_dtype() + with torch.musa.amp.autocast( + enabled=enabled, dtype=dtype, cache_enabled=cache_enabled): + yield + return else: # Device like MPS does not support fp16 training or testing. # If an inappropriate device is set and fp16 is enabled, an error diff --git a/mmengine/runner/log_processor.py b/mmengine/runner/log_processor.py index 0453377d0f..98183ae317 100644 --- a/mmengine/runner/log_processor.py +++ b/mmengine/runner/log_processor.py @@ -9,7 +9,8 @@ import numpy as np import torch -from mmengine.device import get_max_cuda_memory, is_cuda_available +from mmengine.device import (get_max_cuda_memory, get_max_musa_memory, + is_cuda_available, is_musa_available) from mmengine.registry import LOG_PROCESSORS @@ -226,11 +227,13 @@ def get_log_after_iter(self, runner, batch_idx: int, log_tag.pop('time') log_tag.pop('data_time') - # If cuda is available, the max memory occupied should be calculated. - if is_cuda_available(): + # If cuda/musa is available, + # the max memory occupied should be calculated. + if is_cuda_available() or is_musa_available(): max_memory = self._get_max_memory(runner) log_str += f'memory: {max_memory} ' tag['memory'] = max_memory + # Loop left keys to fill `log_str`. 
if mode in ('train', 'val'): log_items = [] @@ -498,6 +501,9 @@ def _get_max_memory(self, runner) -> int: """ device = getattr(runner.model, 'output_device', None) + + if is_musa_available(): + return get_max_musa_memory(device) return get_max_cuda_memory(device) def _get_iter(self, runner, batch_idx: int) -> int: diff --git a/mmengine/runner/utils.py b/mmengine/runner/utils.py index d7098c7295..b91025eb07 100644 --- a/mmengine/runner/utils.py +++ b/mmengine/runner/utils.py @@ -7,6 +7,7 @@ import torch from torch.utils.data import DataLoader +from mmengine.device import is_cuda_available, is_musa_available from mmengine.dist import get_rank, sync_random_seed from mmengine.logging import print_log from mmengine.utils import digit_version, is_list_of @@ -69,7 +70,10 @@ def set_random_seed(seed: Optional[int] = None, np.random.seed(seed) torch.manual_seed(seed) # torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + if is_cuda_available(): + torch.cuda.manual_seed_all(seed) + elif is_musa_available(): + torch.musa.manual_seed_all(seed) # os.environ['PYTHONHASHSEED'] = str(seed) if deterministic: if torch.backends.cudnn.benchmark: diff --git a/mmengine/structures/base_data_element.py b/mmengine/structures/base_data_element.py index 454a224371..53bcd5babf 100644 --- a/mmengine/structures/base_data_element.py +++ b/mmengine/structures/base_data_element.py @@ -510,6 +510,17 @@ def cuda(self) -> 'BaseDataElement': new_data.set_data(data) return new_data + # Tensor-like methods + def musa(self) -> 'BaseDataElement': + """Convert all tensors to musa in data.""" + new_data = self.new() + for k, v in self.items(): + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.musa() + data = {k: v} + new_data.set_data(data) + return new_data + # Tensor-like methods def npu(self) -> 'BaseDataElement': """Convert all tensors to NPU in data.""" diff --git a/mmengine/structures/instance_data.py b/mmengine/structures/instance_data.py index 8df9727a00..369d445f28 100644 --- a/mmengine/structures/instance_data.py +++ b/mmengine/structures/instance_data.py @@ -18,6 +18,9 @@ elif get_device() == 'mlu': BoolTypeTensor = Union[torch.BoolTensor, torch.mlu.BoolTensor] LongTypeTensor = Union[torch.LongTensor, torch.mlu.LongTensor] +elif get_device() == 'musa': + BoolTypeTensor = Union[torch.BoolTensor, torch.musa.BoolTensor] + LongTypeTensor = Union[torch.LongTensor, torch.musa.LongTensor] else: BoolTypeTensor = Union[torch.BoolTensor, torch.cuda.BoolTensor] LongTypeTensor = Union[torch.LongTensor, torch.cuda.LongTensor] diff --git a/mmengine/utils/dl_utils/collect_env.py b/mmengine/utils/dl_utils/collect_env.py index 044cef19ef..0ee99abad2 100644 --- a/mmengine/utils/dl_utils/collect_env.py +++ b/mmengine/utils/dl_utils/collect_env.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. """This file holding some environment constant for sharing by other files.""" +import os import os.path as osp import subprocess import sys @@ -9,6 +10,7 @@ import torch import mmengine +from mmengine.device import is_cuda_available, is_musa_available from .parrots_wrapper import TORCH_VERSION, get_build_config, is_rocm_pytorch @@ -24,6 +26,10 @@ def _get_cuda_home(): return CUDA_HOME +def _get_musa_home(): + return os.environ.get('MUSA_HOME') + + def collect_env(): """Collect the information of the running environments. 
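As a quick sanity check of the MUSA fields this patch adds to the environment report, the collected dictionary can simply be printed. This is a sketch that assumes `mmengine` is importable in the current environment; on hosts without `torch_musa`, the new key is expected to read `MUSA available: False`.

```python
# Print the environment report produced by collect_env(), including the
# newly added MUSA entries.
from mmengine.utils.dl_utils import collect_env

for name, value in collect_env().items():
    print(f'{name}: {value}')
```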
@@ -51,9 +57,10 @@ def collect_env(): env_info['sys.platform'] = sys.platform env_info['Python'] = sys.version.replace('\n', '') - cuda_available = torch.cuda.is_available() + cuda_available = is_cuda_available() + musa_available = is_musa_available() env_info['CUDA available'] = cuda_available - + env_info['MUSA available'] = musa_available env_info['numpy_random_seed'] = np.random.get_state()[1][0] if cuda_available: @@ -89,7 +96,23 @@ def collect_env(): except subprocess.SubprocessError: nvcc = 'Not Available' env_info['NVCC'] = nvcc + elif musa_available: + devices = defaultdict(list) + for k in range(torch.musa.device_count()): + devices[torch.musa.get_device_name(k)].append(str(k)) + for name, device_ids in devices.items(): + env_info['GPU ' + ','.join(device_ids)] = name + + MUSA_HOME = _get_musa_home() + env_info['MUSA_HOME'] = MUSA_HOME + if MUSA_HOME is not None and osp.isdir(MUSA_HOME): + try: + mcc = osp.join(MUSA_HOME, 'bin/mcc') + subprocess.check_output(f'"{mcc}" -v', shell=True) + except subprocess.SubprocessError: + mcc = 'Not Available' + env_info['mcc'] = mcc try: # Check C++ Compiler. # For Unix-like, sysconfig has 'CC' variable like 'gcc -pthread ...', diff --git a/mmengine/utils/dl_utils/time_counter.py b/mmengine/utils/dl_utils/time_counter.py index 4a1fb42ee0..e4a155dd72 100644 --- a/mmengine/utils/dl_utils/time_counter.py +++ b/mmengine/utils/dl_utils/time_counter.py @@ -4,6 +4,7 @@ import torch +from mmengine.device import is_cuda_available, is_musa_available from mmengine.dist.utils import master_only from mmengine.logging import MMLogger, print_log @@ -84,15 +85,20 @@ def __call__(self, fn): def wrapper(*args, **kwargs): self.__count += 1 - if self.with_sync and torch.cuda.is_available(): - torch.cuda.synchronize() + if self.with_sync: + if is_cuda_available(): + torch.cuda.synchronize() + elif is_musa_available(): + torch.musa.synchronize() start_time = time.perf_counter() result = fn(*args, **kwargs) - if self.with_sync and torch.cuda.is_available(): - torch.cuda.synchronize() - + if self.with_sync: + if is_cuda_available(): + torch.cuda.synchronize() + elif is_musa_available(): + torch.musa.synchronize() elapsed = time.perf_counter() - start_time self.print_time(elapsed) diff --git a/tests/test_device/test_device.py b/tests/test_device/test_device.py index 19bd1f7f19..d2171afa58 100644 --- a/tests/test_device/test_device.py +++ b/tests/test_device/test_device.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from mmengine.device import (get_device, is_cuda_available, is_mlu_available, - is_mps_available, is_npu_available) + is_mps_available, is_musa_available, + is_npu_available) def test_get_device(): @@ -13,5 +14,7 @@ def test_get_device(): assert device == 'mlu' elif is_mps_available(): assert device == 'mps' + elif is_musa_available(): + assert device == 'musa' else: assert device == 'cpu' diff --git a/tests/test_dist/test_dist.py b/tests/test_dist/test_dist.py index d89f5eb878..a2ef07b713 100644 --- a/tests/test_dist/test_dist.py +++ b/tests/test_dist/test_dist.py @@ -11,6 +11,7 @@ import torch.distributed as torch_dist import mmengine.dist as dist +from mmengine.device import is_musa_available from mmengine.dist.dist import sync_random_seed from mmengine.testing._internal import MultiProcessTestCase from mmengine.utils import digit_version @@ -117,6 +118,7 @@ def test_all_reduce_params(self): self.assertTrue(torch.allclose(item1, item2)) +@unittest.skipIf(is_musa_available(), reason='musa do not support gloo yet') class TestDistWithGLOOBackend(MultiProcessTestCase): def _init_dist_env(self, rank, world_size): diff --git a/tests/test_hooks/test_ema_hook.py b/tests/test_hooks/test_ema_hook.py index 4ceebe9088..6dad7ba4f0 100644 --- a/tests/test_hooks/test_ema_hook.py +++ b/tests/test_hooks/test_ema_hook.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy import os.path as osp +import unittest import torch import torch.nn as nn from mmengine.config import ConfigDict +from mmengine.device import is_musa_available from mmengine.hooks import EMAHook from mmengine.model import BaseModel, ExponentialMovingAverage from mmengine.registry import MODELS @@ -45,6 +47,9 @@ def forward(self, *args, **kwargs): return super().forward(*args, **kwargs) +# TODO:haowen.han@mtheads.com +@unittest.skipIf(is_musa_available(), + "musa backend do not support 'aten::lerp.Scalar_out'") class TestEMAHook(RunnerTestCase): def setUp(self) -> None: diff --git a/tests/test_hooks/test_empty_cache_hook.py b/tests/test_hooks/test_empty_cache_hook.py index 4a9ea99752..d30972d360 100644 --- a/tests/test_hooks/test_empty_cache_hook.py +++ b/tests/test_hooks/test_empty_cache_hook.py @@ -1,11 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. from unittest.mock import patch +import pytest + +from mmengine.device import is_cuda_available from mmengine.testing import RunnerTestCase class TestEmptyCacheHook(RunnerTestCase): + @pytest.mark.skipif( + not is_cuda_available(), reason='cuda should be available') def test_with_runner(self): with patch('torch.cuda.empty_cache') as mock_empty_cache: cfg = self.epoch_based_cfg diff --git a/tests/test_runner/test_amp.py b/tests/test_runner/test_amp.py index a80c7f35cb..7208e25079 100644 --- a/tests/test_runner/test_amp.py +++ b/tests/test_runner/test_amp.py @@ -5,7 +5,8 @@ import torch.nn as nn import mmengine -from mmengine.device import get_device, is_mlu_available, is_npu_available +from mmengine.device import (get_device, is_mlu_available, is_musa_available, + is_npu_available) from mmengine.runner import autocast from mmengine.utils import digit_version from mmengine.utils.dl_utils import TORCH_VERSION @@ -44,6 +45,21 @@ def test_autocast(self): layer = nn.Conv2d(1, 1, 1).to(device) res = layer(torch.randn(1, 1, 1, 1).to(device)) self.assertEqual(res.dtype, torch.float32) + elif is_musa_available(): + device = 'musa' + with autocast(device_type=device): + # torch.autocast support mlu mode. 
+ layer = nn.Conv2d(1, 1, 1).to(device) + res = layer(torch.randn(1, 1, 1, 1).to(device)) + self.assertIn(res.dtype, (torch.bfloat16, torch.float16)) + with autocast(enabled=False, device_type=device): + res = layer(torch.randn(1, 1, 1, 1).to(device)) + self.assertEqual(res.dtype, torch.float32) + # Test with fp32_enabled + with autocast(enabled=False, device_type=device): + layer = nn.Conv2d(1, 1, 1).to(device) + res = layer(torch.randn(1, 1, 1, 1).to(device)) + self.assertEqual(res.dtype, torch.float32) elif not torch.cuda.is_available(): if digit_version(TORCH_VERSION) < digit_version('1.10.0'): # `torch.cuda.amp.autocast` is only support in gpu mode, if diff --git a/tests/test_runner/test_log_processor.py b/tests/test_runner/test_log_processor.py index 9b93a9a8ea..d7fae5722a 100644 --- a/tests/test_runner/test_log_processor.py +++ b/tests/test_runner/test_log_processor.py @@ -7,6 +7,7 @@ import torch from parameterized import parameterized +from mmengine.device import is_cuda_available, is_musa_available from mmengine.logging import HistoryBuffer, MessageHub, MMLogger from mmengine.runner import LogProcessor from mmengine.testing import RunnerTestCase @@ -113,7 +114,7 @@ def test_get_log_after_iter(self, by_epoch, mode, log_with_hierarchy): f"time: {train_logs['time']:.4f} " f"data_time: {train_logs['data_time']:.4f} ") - if torch.cuda.is_available(): + if is_cuda_available() or is_musa_available(): log_str += 'memory: 100 ' if mode == 'train': log_str += f"loss_cls: {train_logs['loss_cls']:.4f}" @@ -141,7 +142,7 @@ def test_get_log_after_iter(self, by_epoch, mode, log_with_hierarchy): f"time: {train_logs['time']:.4f} " f"data_time: {train_logs['data_time']:.4f} ") - if torch.cuda.is_available(): + if is_cuda_available() or is_musa_available(): log_str += 'memory: 100 ' if mode == 'train': @@ -249,6 +250,7 @@ def test_collect_non_scalars(self): assert tag['metric1'] is metric1 assert tag['metric2'] is metric2 + # TODO:haowen.han@mtheads.com MUSA does not support it yet! @patch('torch.cuda.max_memory_allocated', MagicMock()) @patch('torch.cuda.reset_peak_memory_stats', MagicMock()) def test_get_max_memory(self): From 396cac19cd6b6c3f10141d3bfe4da21327c4591a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anm=E5=8D=8A=E5=A4=8F?= <67933480+Anm-pinellia@users.noreply.github.com> Date: Tue, 23 Jan 2024 11:09:05 +0800 Subject: [PATCH 12/16] Fix a typo in visualizer.py (#1476) --- mmengine/visualization/visualizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmengine/visualization/visualizer.py b/mmengine/visualization/visualizer.py index 0e4c1b50df..3b31c67727 100644 --- a/mmengine/visualization/visualizer.py +++ b/mmengine/visualization/visualizer.py @@ -961,7 +961,7 @@ def draw_featmap(featmap: torch.Tensor, if topk <= 0, tensor_chw is assert to be one or three. Defaults to 20. arrangement (Tuple[int, int]): The arrangement of featmap when - channel_reduction is not None and topk > 0. Defaults to (4, 5). + channel_reduction is None and topk > 0. Defaults to (4, 5). resize_shape (tuple, optional): The shape to scale the feature map. Defaults to None. alpha (Union[int, List[int]]): The transparency of featmap. 
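To make the corrected `arrangement` description concrete, a small usage sketch follows. It is illustrative only (a random tensor stands in for a real feature map) and assumes `Visualizer.draw_featmap` is called as the static method documented above.

```python
import torch

from mmengine.visualization import Visualizer

featmap = torch.rand(32, 28, 28)  # a (C, H, W) feature map

# `arrangement` takes effect here because channel_reduction is None and
# topk > 0: the top-12 channels are tiled on a 3x4 grid.
grid_image = Visualizer.draw_featmap(
    featmap, channel_reduction=None, topk=12, arrangement=(3, 4))

# With a reduction the channels are collapsed first, so `arrangement` is unused.
mean_image = Visualizer.draw_featmap(featmap, channel_reduction='squeeze_mean')
```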
From cd298e30861b960066ba78f76f7fc91a2b444de0 Mon Sep 17 00:00:00 2001 From: Zhihao Lin <36994684+LZHgrla@users.noreply.github.com> Date: Wed, 24 Jan 2024 11:12:54 +0800 Subject: [PATCH 13/16] [Feature] Support save_optimizer=False for DeepSpeed (#1474) --- mmengine/_strategy/deepspeed.py | 73 ++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/mmengine/_strategy/deepspeed.py b/mmengine/_strategy/deepspeed.py index 378616db3d..44e7c2e692 100644 --- a/mmengine/_strategy/deepspeed.py +++ b/mmengine/_strategy/deepspeed.py @@ -6,18 +6,23 @@ import torch +from mmengine.logging import print_log + try: import deepspeed except ImportError: deepspeed = None +import logging + import torch.nn as nn import mmengine -from mmengine.dist import init_dist +from mmengine.dist import init_dist, is_main_process from mmengine.optim import BaseOptimWrapper, _ParamScheduler from mmengine.registry import (MODEL_WRAPPERS, OPTIM_WRAPPERS, OPTIMIZERS, STRATEGIES) +from mmengine.runner.checkpoint import save_checkpoint, weights_to_cpu from mmengine.utils import apply_to, digit_version, get_git_hash from .base import BaseStrategy @@ -506,7 +511,7 @@ def save_checkpoint( """Save checkpoint to given ``filename``. Warning: - `save_optimizer` and `callback` parameters are not supported yet. + `callback` parameter is not supported yet. Args: filename (str): Filename to save checkpoint. @@ -527,25 +532,53 @@ def save_checkpoint( mmengine=mmengine.__version__ + get_git_hash(), ) - if save_optimizer and hasattr(self, 'optim_wrapper'): - # The key can not be 'optimizer', otherwise error will be thrown - # when loading or resuming checkpoint. - extra_ckpt['optim_wrapper'] = self.optim_state_dict() - if save_param_scheduler and hasattr(self, 'param_schedulers'): extra_ckpt['param_schedulers'] = self.scheduler_state_dict() - dirname, basename = osp.split(filename) - if digit_version(deepspeed.__version__) >= digit_version('0.10.1'): - self.model.save_checkpoint( - dirname, - tag=basename, - client_state=extra_ckpt, - save_latest=False, - exclude_frozen_parameters=self.exclude_frozen_parameters) + if (not save_optimizer + and self.model.zero_optimization_partition_weights() + and not self.model.zero_gather_16bit_weights_on_model_save()): + print_log( + 'Configured to `save_optimizer=False`, but currently using ' + "DeepSpeed's ZeRO stage 3 with " + '`gather_16bit_weights_on_model_save=False`. In ' + 'this configuration, the model cannot be saved properly ' + 'and will be saved with the optimizer state. ' + 'To support `save_optimizer=False`, please set ' + '`gather_16bit_weights_on_model_save=True` in your ' + 'DeepSpeed config.', + logger='current', + level=logging.WARNING) + save_optimizer = True + + if save_optimizer: + if hasattr(self, 'optim_wrapper'): + # The key can not be 'optimizer', otherwise error will be + # thrown when loading or resuming checkpoint. 
+ extra_ckpt['optim_wrapper'] = self.optim_state_dict() + + dirname, basename = osp.split(filename) + if digit_version(deepspeed.__version__) >= digit_version('0.10.1'): + self.model.save_checkpoint( + dirname, + tag=basename, + client_state=extra_ckpt, + save_latest=False, + exclude_frozen_parameters=self.exclude_frozen_parameters) + else: + self.model.save_checkpoint( + dirname, + tag=basename, + client_state=extra_ckpt, + save_latest=False) else: - self.model.save_checkpoint( - dirname, - tag=basename, - client_state=extra_ckpt, - save_latest=False) + if self.model.zero_optimization_partition_weights(): + # TODO: `_zero3_consolidated_16bit_state_dict` doesn't support + # `exclude_frozen_parameters`. + state_dict = self.model._zero3_consolidated_16bit_state_dict() + else: + state_dict = self.model.module_state_dict( + exclude_frozen_parameters=self.exclude_frozen_parameters) + if is_main_process(): + ckpt = {'state_dict': weights_to_cpu(state_dict), **extra_ckpt} + save_checkpoint(ckpt, filename) From 02f80e8bdd38f6713e04a872304861b02157905a Mon Sep 17 00:00:00 2001 From: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Date: Wed, 24 Jan 2024 12:45:00 +0800 Subject: [PATCH 14/16] Bump version to 0.10.3 (#1478) --- README.md | 4 ++-- README_zh-CN.md | 4 ++-- docs/en/notes/changelog.md | 19 +++++++++++++++++++ mmengine/version.py | 2 +- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ff51643e12..c01b53ab21 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ English | [简体中文](README_zh-CN.md) ## What's New -v0.10.2 was released on 2023-12-26. +v0.10.3 was released on 2024-1-24. Highlights: @@ -70,7 +70,7 @@ Highlights: - Supports multiple visualization backends, including `NeptuneVisBackend`, `DVCLiveVisBackend` and `AimVisBackend`. Refer to [Visualization Backends](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html) for more details. -Read [Changelog](./docs/en/notes/changelog.md#v0102-26122023) for more details. +Read [Changelog](./docs/en/notes/changelog.md#v0103-2412024) for more details. 
## Table of Contents diff --git a/README_zh-CN.md b/README_zh-CN.md index 0fd5d272fd..3400ca35e5 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -58,7 +58,7 @@ ## 最近进展 -最新版本 v0.10.2 在 2023.12.26 发布。 +最新版本 v0.10.3 在 2024.1.24 发布。 亮点: @@ -70,7 +70,7 @@ - 支持多种可视化后端,包括`NeptuneVisBackend`、`DVCLiveVisBackend` 和 `AimVisBackend`。可阅读[可视化后端](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/visualize_training_log.html)了解用法。 -如果想了解更多版本更新细节和历史信息,请阅读[更新日志](./docs/en/notes/changelog.md#v0102-26122023) +如果想了解更多版本更新细节和历史信息,请阅读[更新日志](./docs/en/notes/changelog.md#v0103-2412024)。 ## 目录 diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index 7433be503a..30f9b0e1e6 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,24 @@ # Changelog of v0.x +## v0.10.3 (24/1/2024) + +### New Features & Enhancements + +- Add the support for musa device support by [@hanhaowen-mt](https://github.com/hanhaowen-mt) in https://github.com/open-mmlab/mmengine/pull/1453 +- Support `save_optimizer=False` for DeepSpeed by [@LZHgrla](https://github.com/LZHgrla) in https://github.com/open-mmlab/mmengine/pull/1474 +- Update visualizer.py by [@Anm-pinellia](https://github.com/Anm-pinellia) in https://github.com/open-mmlab/mmengine/pull/1476 + +### Bug Fixes + +- Fix `Config.to_dict` by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/1465 +- Fix the resume of iteration by [@LZHgrla](https://github.com/LZHgrla) in https://github.com/open-mmlab/mmengine/pull/1471 +- Fix `dist.collect_results` to keep all ranks' elements by [@LZHgrla](https://github.com/LZHgrla) in https://github.com/open-mmlab/mmengine/pull/1469 + +### Docs + +- Add the usage of ProfilerHook by [@zhouzaida](https://github.com/zhouzaida) in https://github.com/open-mmlab/mmengine/pull/1466 +- Fix the nnodes in the doc of ddp training by [@XiwuChen](https://github.com/XiwuChen) in https://github.com/open-mmlab/mmengine/pull/1462 + ## v0.10.2 (26/12/2023) ### New Features & Enhancements diff --git a/mmengine/version.py b/mmengine/version.py index d36dfb8380..cfc82fdadc 100644 --- a/mmengine/version.py +++ b/mmengine/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-__version__ = '0.10.2' +__version__ = '0.10.3' def parse_version_info(version_str): From b5f2d5860daf7f6f51ba66272382f3e5d3e16bfb Mon Sep 17 00:00:00 2001 From: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Date: Wed, 24 Jan 2024 19:27:02 +0800 Subject: [PATCH 15/16] Refine mmengine introduction (#1479) --- README.md | 53 +++++++++------------- README_zh-CN.md | 61 +++++++++++--------------- docs/en/get_started/introduction.md | 35 +++++++-------- docs/zh_cn/get_started/introduction.md | 33 +++++++++----- requirements/docs.txt | 4 +- 5 files changed, 87 insertions(+), 99 deletions(-) diff --git a/README.md b/README.md index c01b53ab21..1919c78c30 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,10 @@ [![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmengine.svg)](https://github.com/open-mmlab/mmengine/issues) [![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmengine.svg)](https://github.com/open-mmlab/mmengine/issues) +[Introduction](#introduction) +[Installation](#installation) +[Get Started](#get-started) [📘Documentation](https://mmengine.readthedocs.io/en/latest/) | -[🛠️Installation](https://mmengine.readthedocs.io/en/latest/get_started/installation.html) | [🤔Reporting Issues](https://github.com/open-mmlab/mmengine/issues/new/choose) @@ -62,51 +64,36 @@ v0.10.3 was released on 2024-1-24. Highlights: -- Support installing mmengine-lite with no dependency on opencv. Refer to the [Installation](https://mmengine.readthedocs.io/en/latest/get_started/installation.html#install-mmengine) for more details. - -- Support training with [ColossalAI](https://colossalai.org/). Refer to the [Training Large Models](https://mmengine.readthedocs.io/en/latest/common_usage/large_model_training.html#colossalai) for more detailed usages. - -- Support gradient checkpointing. Refer to the [Save Memory on GPU](https://mmengine.readthedocs.io/en/latest/common_usage/save_gpu_memory.html#gradient-checkpointing) for more details. - -- Supports multiple visualization backends, including `NeptuneVisBackend`, `DVCLiveVisBackend` and `AimVisBackend`. Refer to [Visualization Backends](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html) for more details. +- Add the support for musa device [#1453](#1453) +- Add the usage of ProfilerHook [#1466](#1466) Read [Changelog](./docs/en/notes/changelog.md#v0103-2412024) for more details. -## Table of Contents - -- [Introduction](#introduction) -- [Installation](#installation) -- [Get Started](#get-started) -- [Learn More](#learn-more) -- [Contributing](#contributing) -- [Citation](#citation) -- [License](#license) -- [Ecosystem](#ecosystem) -- [Projects in OpenMMLab](#projects-in-openmmlab) - ## Introduction -MMEngine is a foundational library for training deep learning models based on PyTorch. It provides a solid engineering foundation and frees developers from writing redundant codes on workflows. It serves as the training engine of all OpenMMLab codebases, which support hundreds of algorithms in various research areas. Moreover, MMEngine is also generic to be applied to non-OpenMMLab projects. +MMEngine is a foundational library for training deep learning models based on PyTorch. It serves as the training engine of all OpenMMLab codebases, which support hundreds of algorithms in various research areas. Moreover, MMEngine is also generic to be applied to non-OpenMMLab projects. Its highlights are as follows: -Major features: +**Integrate mainstream large-scale model training frameworks** -1. 
**A universal and powerful runner**: +- [ColossalAI](https://mmengine.readthedocs.io/en/latest/common_usage/large_model_training.html#colossalai) +- [DeepSpeed](https://mmengine.readthedocs.io/en/latest/common_usage/large_model_training.html#deepspeed) +- [FSDP](https://mmengine.readthedocs.io/en/latest/common_usage/large_model_training.html#fullyshardeddataparallel-fsdp) - - Supports training different tasks with a small amount of code, e.g., ImageNet can be trained with only 80 lines of code (400 lines of the original PyTorch example). - - Easily compatible with models from popular algorithm libraries such as TIMM, TorchVision, and Detectron2. +**Supports a variety of training strategies** -2. **Open architecture with unified interfaces**: +- [Mixed Precision Training](https://mmengine.readthedocs.io/en/latest/common_usage/speed_up_training.html#mixed-precision-training) +- [Gradient Accumulation](https://mmengine.readthedocs.io/en/latest/common_usage/save_gpu_memory.html#gradient-accumulation) +- [Gradient Checkpointing](https://mmengine.readthedocs.io/en/latest/common_usage/save_gpu_memory.html#gradient-checkpointing) - - Handles different algorithm tasks with unified APIs, e.g., implement a method and apply it to all compatible models. - - Provides a unified abstraction for upper-level algorithm libraries, which supports various back-end devices such as Nvidia CUDA, Mac MPS, AMD, MLU, and more for model training. +**Provides a user-friendly configuration system** -3. **Customizable training process**: +- [Pure Python-style configuration files, easy to navigate](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) +- [Plain-text-style configuration files, supporting JSON and YAML](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html) - - Defines the training process just like playing with Legos. - - Provides rich components and strategies. - - Complete controls on the training process with different levels of APIs. 
+**Covers mainstream training monitoring platforms** -![mmengine_dataflow](https://github.com/open-mmlab/mmengine/assets/58739961/267db9cb-72e4-4af2-a58b-877b30091acc) +- [TensorBoard](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html#tensorboard) | [WandB](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html#wandb) | [MLflow](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html#mlflow-wip) +- [ClearML](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html#clearml) | [Neptune](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html#neptune) | [DVCLive](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html#dvclive) | [Aim](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html#aim) ## Installation diff --git a/README_zh-CN.md b/README_zh-CN.md index 3400ca35e5..171d66dac0 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -24,8 +24,10 @@ [![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmengine.svg)](https://github.com/open-mmlab/mmengine/issues) [![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmengine.svg)](https://github.com/open-mmlab/mmengine/issues) -[📘使用文档](https://mmengine.readthedocs.io/zh_CN/latest/) | -[🛠️安装教程](https://mmengine.readthedocs.io/zh_CN/latest/get_started/installation.html) | +[简介](#简介) | +[安装](#安装) | +[快速上手](#快速上手) | +[📘用户文档](https://mmengine.readthedocs.io/zh_CN/latest/) | [🤔报告问题](https://github.com/open-mmlab/mmengine/issues/new/choose) @@ -60,54 +62,43 @@ 最新版本 v0.10.3 在 2024.1.24 发布。 -亮点: +版本亮点: -- 支持安装不依赖于 opencv 的 mmengine-lite 版本。可阅读[安装文档](https://mmengine.readthedocs.io/zh-cn/latest/get_started/installation.html#mmengine)了解用法。 - -- 支持使用 [ColossalAI](https://colossalai.org/) 进行训练。可阅读[大模型训练](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/large_model_training.html#colossalai)了解用法。 - -- 支持梯度检查点。详见[用法](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/save_gpu_memory.html#id3)。 - -- 支持多种可视化后端,包括`NeptuneVisBackend`、`DVCLiveVisBackend` 和 `AimVisBackend`。可阅读[可视化后端](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/visualize_training_log.html)了解用法。 +- 适配摩尔线程 [#1453](#1453) +- 添加 ProfilerHook 使用文档 [#1466](#1466) 如果想了解更多版本更新细节和历史信息,请阅读[更新日志](./docs/en/notes/changelog.md#v0103-2412024)。 -## 目录 +## 简介 -- [简介](#简介) -- [安装](#安装) -- [快速上手](#快速上手) -- [了解更多](#了解更多) -- [贡献指南](#贡献指南) -- [引用](#引用) -- [开源许可证](#开源许可证) -- [生态项目](#生态项目) -- [OpenMMLab 的其他项目](#openmmlab-的其他项目) -- [欢迎加入 OpenMMLab 社区](#欢迎加入-openmmlab-社区) +MMEngine 是一个基于 PyTorch 实现的,用于训练深度学习模型的基础库。它作为 OpenMMLab 所有代码库的训练引擎,其在不同研究领域支持了上百个算法。此外,MMEngine 也可以用于非 OpenMMLab 项目中。它的亮点如下: -## 简介 +**集成主流的大模型训练框架** -MMEngine 是一个基于 PyTorch 实现的,用于训练深度学习模型的基础库。它为开发人员提供了坚实的工程基础,以此避免在工作流上编写冗余代码。作为 OpenMMLab 所有代码库的训练引擎,其在不同研究领域支持了上百个算法。此外,MMEngine 也可以用于非 OpenMMLab 项目中。 +- [ColossalAI](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/large_model_training.html#colossalai) +- [DeepSpeed](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/large_model_training.html#deepspeed) +- [FSDP](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/large_model_training.html#fullyshardeddataparallel-fsdp) -主要特性: +**支持丰富的训练策略** -1. 
**通用且强大的执行器**: +- [混合精度训练(Mixed Precision Training)](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/speed_up_training.html#id3) +- [梯度累积(Gradient Accumulation)](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/save_gpu_memory.html#id2) +- [梯度检查点(Gradient Checkpointing)](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/save_gpu_memory.html#id3) - - 支持用少量代码训练不同的任务,例如仅使用 80 行代码就可以训练 ImageNet(原始 PyTorch 示例需要 400 行)。 - - 轻松兼容流行的算法库(如 TIMM、TorchVision 和 Detectron2)中的模型。 +**提供易用的配置系统** -2. **接口统一的开放架构**: +- [纯 Python 风格的配置文件,易于跳转](https://mmengine.readthedocs.io/zh-cn/latest/advanced_tutorials/config.html#python-beta) +- [纯文本风格的配置文件,支持 JSON 和 YAML](https://mmengine.readthedocs.io/zh-cn/latest/advanced_tutorials/config.html#id1) - - 使用统一的接口处理不同的算法任务,例如,实现一个方法并应用于所有的兼容性模型。 - - 上下游的对接更加统一便捷,在为上层算法库提供统一抽象的同时,支持多种后端设备。目前 MMEngine 支持 Nvidia CUDA、Mac MPS、AMD、MLU 等设备进行模型训练。 +**覆盖主流的训练监测平台** -3. **可定制的训练流程**: +- [TensorBoard](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/visualize_training_log.html#tensorboard) | [WandB](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/visualize_training_log.html#wandb) | [MLflow](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/visualize_training_log.html#mlflow-wip) +- [ClearML](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/visualize_training_log.html#clearml) | [Neptune](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/visualize_training_log.html#neptune) | [DVCLive](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/visualize_training_log.html#dvclive) | [Aim](https://mmengine.readthedocs.io/zh-cn/latest/common_usage/visualize_training_log.html#aim) - - 定义了“乐高”式的训练流程。 - - 提供了丰富的组件和策略。 - - 使用不同等级的 API 控制训练过程。 +**兼容主流的训练芯片** -![mmengine_dataflow](https://github.com/open-mmlab/mmengine/assets/58739961/267db9cb-72e4-4af2-a58b-877b30091acc) +- 英伟达 CUDA | 苹果 MPS +- 华为 Ascend | 寒武纪 MLU | 摩尔线程 MUSA ## 安装 diff --git a/docs/en/get_started/introduction.md b/docs/en/get_started/introduction.md index 3c70e28e4d..f15073ed14 100644 --- a/docs/en/get_started/introduction.md +++ b/docs/en/get_started/introduction.md @@ -1,30 +1,29 @@ # Introduction MMEngine is a foundational library for training deep learning models based on -PyTorch. It supports running on Linux, Windows, and macOS. It has the -following three features: +PyTorch. It supports running on Linux, Windows, and macOS. Its highlights are as follows: -1. **Universal and powerful executor**: +**Integrate mainstream large-scale model training frameworks** - - Supports training different tasks with minimal code, such as training - ImageNet with just 80 lines of code (original PyTorch examples require - 400 lines). - - Easily compatible with models from popular algorithm libraries like TIMM, - TorchVision, and Detectron2. +- [ColossalAI](../common_usage/large_model_training.md#colossalai) +- [DeepSpeed](../common_usage/large_model_training.md#deepspeed) +- [FSDP](../common_usage/large_model_training.md#fullyshardeddataparallel-fsdp) -2. **Open architecture with unified interfaces**: +**Supports a variety of training strategies** - - Handles different tasks with a unified API: you can implement a method - once and apply it to all compatible models. - - Supports various backend devices through a simple, high-level - abstraction. Currently, MMEngine supports model training on Nvidia CUDA, - Mac MPS, AMD, MLU, and other devices. 
+- [Mixed Precision Training](../common_usage/speed_up_training.md#mixed-precision-training) +- [Gradient Accumulation](../common_usage/save_gpu_memory.md#gradient-accumulation) +- [Gradient Checkpointing](../common_usage/save_gpu_memory.md#gradient-checkpointing) -3. **Customizable training process**: +**Provides a user-friendly configuration system** - - Defines a highly modular training engine with "Lego"-like composability. - - Offers a rich set of components and strategies. - - Total control over the training process with different levels of APIs. +- [Pure Python-style configuration files, easy to navigate](../advanced_tutorials/config.md#a-pure-python-style-configuration-file-beta) +- [Plain-text-style configuration files, supporting JSON and YAML](../advanced_tutorials/config.html) + +**Covers mainstream training monitoring platforms** + +- [TensorBoard](../common_usage/visualize_training_log.md#tensorboard) | [WandB](../common_usage/visualize_training_log.md#wandb) | [MLflow](../common_usage/visualize_training_log.md#mlflow-wip) +- [ClearML](../common_usage/visualize_training_log.md#clearml) | [Neptune](../common_usage/visualize_training_log.md#neptune) | [DVCLive](../common_usage/visualize_training_log.md#dvclive) | [Aim](../common_usage/visualize_training_log.md#aim) ## Architecture diff --git a/docs/zh_cn/get_started/introduction.md b/docs/zh_cn/get_started/introduction.md index e39581515c..34fd28630a 100644 --- a/docs/zh_cn/get_started/introduction.md +++ b/docs/zh_cn/get_started/introduction.md @@ -1,22 +1,33 @@ # 介绍 -MMEngine 是一个基于 PyTorch 实现的,用于训练深度学习模型的基础库,支持在 Linux、Windows、macOS 上运行。它具有如下三个特性: +MMEngine 是一个基于 PyTorch 实现的,用于训练深度学习模型的基础库,支持在 Linux、Windows、macOS 上运行。它的亮点如下: -1. **通用且强大的执行器**: +**集成主流的大模型训练框架** - - 支持用少量代码训练不同的任务,例如仅使用 80 行代码就可以训练 ImageNet(原始 PyTorch 示例需要 400 行)。 - - 轻松兼容流行的算法库(如 TIMM、TorchVision 和 Detectron2)中的模型。 +- [ColossalAI](../common_usage/large_model_training.md#colossalai) +- [DeepSpeed](../common_usage/large_model_training.md#deepspeed) +- [FSDP](../common_usage/large_model_training.md#fullyshardeddataparallel-fsdp) -2. **接口统一的开放架构**: +**支持丰富的训练策略** - - 使用统一的接口处理不同的算法任务,例如,实现一个方法并应用于所有的兼容性模型。 - - 上下游的对接更加统一便捷,在为上层算法库提供统一抽象的同时,支持多种后端设备。目前 MMEngine 支持 Nvidia CUDA、Mac MPS、AMD、MLU 等设备进行模型训练。 +- [混合精度训练(Mixed Precision Training)](../common_usage/speed_up_training.md#混合精度训练) +- [梯度累积(Gradient Accumulation)](../common_usage/save_gpu_memory.md#梯度累加) +- [梯度检查点(Gradient Checkpointing)](../common_usage/save_gpu_memory.md#梯度检查点) -3. 
**可定制的训练流程**: +**提供易用的配置系统** - - 定义了“乐高”式的训练流程。 - - 提供了丰富的组件和策略。 - - 使用不同等级的 API 控制训练过程。 +- [纯 Python 风格的配置文件,易于跳转](../advanced_tutorials/config.md#纯-python-风格的配置文件beta) +- [纯文本风格的配置文件,支持 JSON 和 YAML](../advanced_tutorials/config.md) + +**覆盖主流的训练监测平台** + +- [TensorBoard](../common_usage/visualize_training_log.md#tensorboard) | [WandB](../common_usage/visualize_training_log.md#wandb) | [MLflow](../common_usage/visualize_training_log.md#mlflow-wip) +- [ClearML](../common_usage/visualize_training_log.md#clearml) | [Neptune](../common_usage/visualize_training_log.md#neptune) | [DVCLive](../common_usage/visualize_training_log.md#dvclive) | [Aim](../common_usage/visualize_training_log.md#aim) + +**兼容主流的训练芯片** + +- 英伟达 CUDA | 苹果 MPS +- 华为 Ascend | 寒武纪 MLU | 摩尔线程 MUSA ## 架构 diff --git a/requirements/docs.txt b/requirements/docs.txt index a0d0e05000..ebde0ef9d3 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,8 +1,8 @@ -docutils==0.17.1 +docutils==0.18.1 myst-parser opencv-python -e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme -sphinx==4.5.0 +sphinx==6.2.1 sphinx-copybutton sphinx-tabs sphinx_markdown_tables From f79111ecc0eea9fbb1b7d1361a79f7062ca1ac10 Mon Sep 17 00:00:00 2001 From: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Date: Wed, 24 Jan 2024 19:31:08 +0800 Subject: [PATCH 16/16] fix typo (#1481) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1919c78c30..c2379cbfd8 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,9 @@ [![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmengine.svg)](https://github.com/open-mmlab/mmengine/issues) [![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmengine.svg)](https://github.com/open-mmlab/mmengine/issues) -[Introduction](#introduction) -[Installation](#installation) -[Get Started](#get-started) +[Introduction](#introduction) | +[Installation](#installation) | +[Get Started](#get-started) | [📘Documentation](https://mmengine.readthedocs.io/en/latest/) | [🤔Reporting Issues](https://github.com/open-mmlab/mmengine/issues/new/choose)