Azure · tocean · Feb 26, 2024 · Feb 26, 2024 · Feb 26, 2024 · Feb 26, 2024
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 __MS-AMP__ is an automatic mixed precision package for deep learning developed by Microsoft.
 
-📢 [v0.3.0](https://github.com/Azure/MS-AMP/releases/tag/v0.3.0) has been released!
+📢 [v0.4.0](https://github.com/Azure/MS-AMP/releases/tag/v0.4.0) has been released!
 
 ## _Check [aka.ms/msamp/doc](https://aka.ms/msamp/doc) for more details._
 

diff --git a/docs/developer-guides/using-docker.mdx b/docs/developer-guides/using-docker.mdx
@@ -16,19 +16,19 @@ You need to [clone the code](./development.md#set-up) first before building the
 
 <Tabs
   groupId='gpu-platform'
-  defaultValue='cuda-12.1'
+  defaultValue='cuda-12.2'
   values={[
-    {label: 'CUDA-12.1', value: 'cuda-12.1'},
+    {label: 'CUDA-12.2', value: 'cuda-12.2'},
     {label: 'CUDA-11.8', value: 'cuda-11.8'},
   ]
 }>
-<TabItem value='cuda-12.1'>
+<TabItem value='cuda-12.2'>
 
 ```bash
 export DOCKER_BUILDKIT=1
 docker buildx build \
   --platform linux/amd64 --cache-to type=inline,mode=max \
-  --tag msamp-dev-cuda121 --file dockerfile/torch2.1-cuda12.1.dockerfile .
+  --tag msamp-dev-cuda122 --file dockerfile/torch2.1-cuda12.2.dockerfile .
 ```
 
 </TabItem>
@@ -48,21 +48,21 @@ docker buildx build \
 
 <Tabs
   groupId='gpu-platform'
-  defaultValue='cuda-12.1'
+  defaultValue='cuda-12.2'
   values={[
-    {label: 'CUDA-12.1', value: 'cuda-12.1'},
+    {label: 'CUDA-12.2', value: 'cuda-12.2'},
     {label: 'CUDA-11.8', value: 'cuda-11.8'},
   ]
 }>
-<TabItem value='cuda-12.1'>
+<TabItem value='cuda-12.2'>
 
 ```bash
 docker run \
   -itd --name=msamp-dev \
   --privileged --net=host --ipc=host \
   --gpus=all \
   -w /root -v /mnt:/mnt \
-  msamp-dev-cuda121 bash
+  msamp-dev-cuda122 bash
 ```
 
 </TabItem>

diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx
@@ -25,6 +25,8 @@ You can use MS-AMP image by `ghcr.io/azure/msamp:${tag}`, available tags are lis
 
 | Tag               | Description                        |
 |-------------------|------------------------------------|
+| v0.4.0-cuda12.2   | MS-AMP v0.4.0 with CUDA 12.2       |
+| v0.4.0-cuda11.8   | MS-AMP v0.4.0 with CUDA 11.8       |
 | v0.3.0-cuda12.1   | MS-AMP v0.3.0 with CUDA 12.1       |
 | v0.3.0-cuda11.8   | MS-AMP v0.3.0 with CUDA 11.8       |
 | v0.2.0-cuda12.1   | MS-AMP v0.2.0 with CUDA 12.1       |

diff --git a/msamp/__init__.py b/msamp/__init__.py
@@ -100,6 +100,6 @@ def initialize(model, optimizer=None, opt_level='O1', use_te=False):    # noqa:
     return cast_model, cast_optimizer
 
 
-__version__ = '0.3.0'
+__version__ = '0.4.0'
 __author__ = 'Microsoft'
 __all__ = ['clip_grad_norm_', 'initialize']
diff --git a/msamp/deepspeed/runtime/engine.py b/msamp/deepspeed/runtime/engine.py
@@ -11,6 +11,7 @@
                                      FP16, BFLOAT16, logger, DeepSpeedEngine, instrument_w_nvtx, log_dist, \
                                      see_memory_usage, DummyOptim, DeepSpeedZeroOptimizer, DeepSpeedZeRoOffload, \
                                      PipelineModule, ZeroStageEnum
+from deepspeed.utils.timer import NoopTimer
 from deepspeed.moe.utils import is_moe_param
 from deepspeed.accelerator import get_accelerator
 
@@ -191,7 +192,8 @@ def _configure_zero_optimizer(self, optimizer):
             ZeROOptimizer: zero optimizer.
         """
         zero_stage = self.zero_optimization_stage()
-        timers = self.timers if self.wall_clock_breakdown() else None
+        timers = self.timers if self.wall_clock_breakdown() else NoopTimer()
+        model_dtype, gradient_accumulation_dtype = self.get_data_types()
 
         if optimizer is None:
             optimizer = DummyOptim(list(self.module.parameters()))
@@ -232,6 +234,7 @@ def _configure_zero_optimizer(self, optimizer):
                 clip_grad=self.gradient_clipping(),
                 contiguous_gradients=contiguous_gradients,
                 reduce_bucket_size=self.zero_reduce_bucket_size(),
+                use_multi_rank_bucket_allreduce=self.zero_multi_rank_bucket_allreduce(),
                 allgather_bucket_size=self.zero_allgather_bucket_size(),
                 dp_process_group=self.data_parallel_group,
                 expert_parallel_group=self.expert_parallel_group if self.has_moe_layers else None,
@@ -248,6 +251,7 @@ def _configure_zero_optimizer(self, optimizer):
                 round_robin_gradients=round_robin_gradients,
                 has_moe_layers=self.has_moe_layers,
                 fp16_master_weights_and_gradients=self.fp16_master_weights_and_gradients(),
+                gradient_accumulation_dtype=gradient_accumulation_dtype,
                 communication_data_type=self.communication_data_type,
                 elastic_checkpoint=self.zero_elastic_checkpoint()
             )

diff --git a/website/blog/2024-02-26-release-0-4.md b/website/blog/2024-02-26-release-0-4.md
@@ -0,0 +1,36 @@
+---
+slug: release-msamp-v0.4
+title: Releasing MS-AMP v0.4
+author: Yuxiang Yang
+author_title: MS-AMP Team
+author_url: https://github.com/tocean
+tags: [MS-AMP, announcement, release]
+---
+
+We are very happy to announce that **MS-AMP version 0.4.0** is officially released today!
+
+You can install and try MS-AMP by following the [Getting Started Tutorial](https://azure.github.io/MS-AMP/docs/getting-started/installation).
+
+## MS-AMP 0.4.0 Release Notes
+
+### MS-AMP Improvements
+
+- Improve GPT-3 performance by optimizing the FP8-gradient accumulation with kernel fusion technology
+- Support FP8 in FSDP
+- Support DeepSpeed+TE+MSAMP and add cifar10 example
+- Support MSAMP+TE+DDP
+- Update DeepSpeed to latest version
+- Update TransformerEngine to v1.1 and flash-attn to latest version
+- Support CUDA 12.2
+- Fix several bugs in DeepSpeed integration
+
+### MS-AMP-Examples Improvements
+
+- Improve documentation for data processing in GPT-3
+- Add launch script for pretraining GPT-6b7
+- Use new API of TransformerEngine in Megatron-LM
+
+### Document Improvements
+
+- Add Docker usage to the Installation page
+- Explain how to run the FSDP and DeepSpeed+TE+MSAMP examples in the "Run Examples" page
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
@@ -91,7 +91,7 @@ module.exports = {
     announcementBar: {
       id: 'supportus',
       content:
-        '📢 <a href="https://azure.github.io/MS-AMP/blog/release-msamp-v0.3">v0.3.0</a> has been released! ' +
+        '📢 <a href="https://azure.github.io/MS-AMP/blog/release-msamp-v0.4">v0.4.0</a> has been released! ' +
         '⭐️ If you like MS-AMP, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/Azure/MS-AMP">GitHub</a>! ⭐️',
     },
     algolia: {

diff --git a/website/package-lock.json b/website/package-lock.json
diff --git a/website/package.json b/website/package.json
@@ -1,6 +1,6 @@
 {
   "name": "msamp-website",
-  "version": "0.3.0",
+  "version": "0.4.0",
   "private": true,
   "scripts": {
     "docusaurus": "docusaurus",