From 587e1c24cfd0ec4d288ddb580f82dede69a17ea4 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Fri, 16 Sep 2022 07:59:12 +0800 Subject: [PATCH 1/6] Support both use_calc_stream and sync_op in send recv APIs (#46023) --- .../distributed/collective/ProcessGroup.h | 50 +++- .../collective/ProcessGroupNCCL.cc | 226 +++++++++++++++++- .../distributed/collective/ProcessGroupNCCL.h | 49 +++- .../collective/ProcessGroupStream.cc | 84 +++++++ .../collective/ProcessGroupStream.h | 52 ++++ paddle/fluid/pybind/distributed_py.cc | 170 +++++++++++++ .../communication/stream/__init__.py | 4 +- .../communication/stream/all_reduce.py | 8 +- .../distributed/communication/stream/recv.py | 82 +++++++ .../distributed/communication/stream/send.py | 82 +++++++ .../tests/unittests/collective/CMakeLists.txt | 32 ++- ...mmunication_stream_sendrecv_api_dygraph.py | 68 ++++++ .../test_communication_stream_sendrecv_api.py | 50 ++++ .../tests/unittests/collective/testslist.csv | 5 +- 14 files changed, 922 insertions(+), 40 deletions(-) create mode 100644 python/paddle/distributed/communication/stream/recv.py create mode 100644 python/paddle/distributed/communication/stream/send.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 10b1686ddb85f..3db2464e59afd 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -134,24 +134,56 @@ class ProcessGroup { "ProcessGroup%s does not support send", GetBackendName())); } + virtual std::shared_ptr Send( + std::vector&, int, bool) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send with sync_op flag", + GetBackendName())); + } + virtual std::shared_ptr Recv( - std::vector& tensors, int) { // NOLINT + std::vector&, int) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support receive", GetBackendName())); + "ProcessGroup%s does not support recv", GetBackendName())); } - virtual std::shared_ptr Send_Partial(phi::DenseTensor&, - int, - int, - int) { // NOLINT + virtual std::shared_ptr Recv( + std::vector&, int, bool) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support send", GetBackendName())); + "ProcessGroup%s does not support recv with sync_op flag", + GetBackendName())); + } + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor&, // NOLINT + int, + int, + int) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send_partial", GetBackendName())); + } + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor&, int, int, int, bool) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send_partial with sync_op flag", + GetBackendName())); } virtual std::shared_ptr Recv_Partial( - phi::DenseTensor& tensors, int, int, int) { // NOLINT + phi::DenseTensor&, // NOLINT + int, + int, + int) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support receive", GetBackendName())); + "ProcessGroup%s does not support recv_partial", GetBackendName())); + } + + virtual std::shared_ptr Recv_Partial( + phi::DenseTensor&, int, int, int, bool) { // NOLINT + 
PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support recv_partial with sync_op flag", + GetBackendName())); } virtual std::shared_ptr AllGather( diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 239114ae6188c..368008d9cc0ce 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -51,6 +51,17 @@ std::shared_ptr ProcessGroupNCCL::CreateTask( places, rank, comm_type, inputs); } +std::shared_ptr ProcessGroupNCCL::CreateTask( + const std::vector& places, + int rank, + CommType comm_type, + const std::vector& inputs, + bool is_sync, + bool use_calc_stream) { + return std::make_shared( + places, rank, comm_type, inputs, is_sync, use_calc_stream); +} + ProcessGroupNCCL::NCCLTask::NCCLTask( const std::vector& places, int rank, @@ -264,10 +275,12 @@ std::shared_ptr ProcessGroupNCCL::Collective( auto& nccl_comms = places_to_ncclcomm_[key]; - SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + if (!use_calc_stream) { + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + } - auto task = std::make_shared( - places, rank_, comm_type, inputs, sync_op, use_calc_stream); + auto task = + CreateTask(places, rank_, comm_type, inputs, sync_op, use_calc_stream); platform::CUDADeviceGuard cuda_guard; @@ -406,6 +419,78 @@ void ProcessGroupNCCL::Collective(const phi::DenseTensor* in, cuda_guard.SetDevice(places[0]); } +template +std::shared_ptr ProcessGroupNCCL::PointToPoint( + std::vector& tensors, + Fn fn, + int dst_rank, + CommType op_type, + bool sync_op, + bool use_calc_stream) { + const auto& places = GetPlaceList(tensors); + const auto& key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + if (!use_calc_stream) { + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + } + + auto task = + CreateTask(places, rank_, op_type, tensors, sync_op, use_calc_stream); + + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + gpuStream_t nccl_stream; + if (use_calc_stream) { + nccl_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(places[i])) + ->stream(); + } else { + nccl_stream = places_to_ctx_[key][i]->stream(); + } + memory::RecordStream(tensors[i].Holder(), nccl_stream); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + gpuStream_t nccl_stream; + if (use_calc_stream) { + nccl_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(places[i])) + ->stream(); + } else { + nccl_stream = places_to_ctx_[key][i]->stream(); + } + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + } + } + + if (!use_calc_stream) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + } + + return task; +} + template std::shared_ptr ProcessGroupNCCL::PointToPoint( std::vector& tensors, @@ -617,6 +702,34 @@ std::shared_ptr ProcessGroupNCCL::Send( return task; } +std::shared_ptr ProcessGroupNCCL::Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool 
use_calc_stream) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& input, + ncclComm_t comm, + const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, + comm, + stream); + }, + dst_rank, + CommType::SEND, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Recv( std::vector& tensors, int src_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); @@ -640,6 +753,34 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } +std::shared_ptr ProcessGroupNCCL::Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), + output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, + comm, + stream); + }, + src_rank, + CommType::RECV, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Send_Partial( phi::DenseTensor& tensors, int dst_rank, int offset, int length) { // CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); @@ -647,10 +788,8 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( phi::DenseTensor flatten_tensor; flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); - - std::vector shared_tensors; - shared_tensors.push_back(shared_input); + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; auto task = PointToPoint( shared_tensors, @@ -671,16 +810,49 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( return task; } +std::shared_ptr ProcessGroupNCCL::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) { + phi::DenseTensor flatten_tensor; + flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; + + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& input, + ncclComm_t comm, + const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, + comm, + stream); + }, + dst_rank, + CommType::SEND, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Recv_Partial( phi::DenseTensor& tensors, int src_rank, int offset, int length) { // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length); phi::DenseTensor flatten_tensor; flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); - std::vector shared_tensors; - shared_tensors.push_back(shared_input); + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; auto task = PointToPoint( shared_tensors, @@ -701,6 +873,40 @@ std::shared_ptr ProcessGroupNCCL::Recv_Partial( return task; } +std::shared_ptr ProcessGroupNCCL::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) { + phi::DenseTensor flatten_tensor; + 
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; + + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), + output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, + comm, + stream); + }, + src_rank, + CommType::RECV, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::AllGather( std::vector& in_tensors, std::vector& out_tensors) { diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index e0e298e9113e9..0b8fa54cd337e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -60,7 +60,7 @@ class ProcessGroupNCCL : public ProcessGroupStream { int rank, CommType comm_type, const std::vector& inputs, - bool is_sync, + bool sync_op, bool use_calc_stream); bool IsCompleted(); @@ -122,19 +122,47 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::shared_ptr Send( std::vector& tensors, int dst_rank) override; + std::shared_ptr Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Recv( std::vector& tensors, int src_rank) override; + std::shared_ptr Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Send_Partial(phi::DenseTensor& tensors, int dst_rank, int offset, int length) override; + std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Recv_Partial(phi::DenseTensor& tensors, int src_rank, int offset, int length) override; + std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr AllGather( std::vector& in_tensors, std::vector& out_tensors) override; @@ -180,9 +208,17 @@ class ProcessGroupNCCL : public ProcessGroupStream { virtual std::shared_ptr CreateTask( std::vector places, int rank, - CommType opType, + CommType op_type, const std::vector& inputs); + virtual std::shared_ptr CreateTask( + const std::vector& places, + int rank, + CommType op_type, + const std::vector& inputs, + bool sync_op, + bool use_calc_stream); + protected: std::shared_ptr store_; std::shared_ptr nccl_comm_; @@ -233,6 +269,15 @@ class ProcessGroupNCCL : public ProcessGroupStream { int dst_rank, CommType op_type); + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, + int dst_rank, + CommType op_type, + bool sync_op, + bool use_calc_stream); + void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 9a20b8e6eaf79..51c8fe7bd9b1b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -45,5 +45,89 @@ std::shared_ptr ProcessGroupStream::AllReduce( "ProcessGroup%s does not support do allreduce", GetBackendName())); } +std::shared_ptr ProcessGroupStream::Send( + std::vector& tensors, int dst_rank, bool sync_op) { + return Send(tensors, + dst_rank, + sync_op, + 
/*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do send", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int offset, + int length, + bool sync_op) { + return Send_Partial(tensors, + dst_rank, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do send_partial", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Recv( + std::vector& tensors, int src_rank, bool sync_op) { + return Recv(tensors, + src_rank, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int offset, + int length, + bool sync_op) { + return Recv_Partial(tensors, + src_rank, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv_partial", GetBackendName())); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index 81a05ee2416e0..4cd17ac72562e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -66,6 +66,58 @@ class ProcessGroupStream : public ProcessGroup { const AllreduceOptions& options, bool sync_op, bool use_calc_stream); + + std::shared_ptr Send( + std::vector& tensors, // NOLINT + int dst_rank, + bool sync_op) override; + + virtual std::shared_ptr Send( + std::vector& tensors, // NOLINT + int dst_rank, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, // NOLINT + int dst_rank, + int offset, + int length, + bool sync_op) override; + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, // NOLINT + int dst_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Recv( + std::vector& tensors, // NOLINT + int src_rank, + bool sync_op) override; + + virtual std::shared_ptr Recv( + std::vector& tensors, // NOLINT + int src_rank, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, // NOLINT + int src_rank, + int offset, + int length, + bool sync_op) override; + + virtual std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, // NOLINT + int src_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream); }; } // namespace distributed diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 5a7e2355f64eb..8a434f42811a8 100644 --- a/paddle/fluid/pybind/distributed_py.cc 
+++ b/paddle/fluid/pybind/distributed_py.cc @@ -196,6 +196,23 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::call_guard()) + .def( + "send", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int dst, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, dst, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("sync_op"), + py::call_guard()) + .def( "send_partial", [](distributed::ProcessGroup &self, @@ -217,6 +234,30 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()) + .def( + "send_partial", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int dst_rank, + int nranks, + int rank_id, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int send_numel = numel / nranks; + int offset = send_numel * rank_id; + return self.Send_Partial( + *dense, dst_rank, offset, send_numel, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("num"), + py::arg("id"), + py::arg("sync_op"), + py::call_guard()) + .def( "recv", [](distributed::ProcessGroup &self, @@ -232,6 +273,23 @@ void BindDistributed(py::module *m) { py::arg("src"), py::call_guard()) + .def( + "recv", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, src, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + .def( "recv_partial", [](distributed::ProcessGroup &self, @@ -253,6 +311,30 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()) + .def( + "recv_partial", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src_rank, + int nranks, + int rank_id, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int recv_numel = numel / nranks; + int offset = recv_numel * rank_id; + return self.Recv_Partial( + *dense, src_rank, offset, recv_numel, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("num"), + py::arg("id"), + py::arg("sync_op"), + py::call_guard()) + .def( "all_gather", [](distributed::ProcessGroup &self, @@ -427,6 +509,94 @@ void BindDistributed(py::module *m) { }, py::arg("tensor"), py::arg("op"), + py::call_guard()) + + .def( + "send_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int dst) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, + dst, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::call_guard()) + + .def( + "send_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int dst_rank, + int nranks, + int rank_id) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int send_numel = numel / nranks; + int offset = send_numel * rank_id; + return self.Send_Partial(*dense, + dst_rank, + offset, + send_numel, + /*sync_op*/ true, + 
/*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("num"), + py::arg("id"), + py::call_guard()) + + .def( + "recv_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, + src, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::call_guard()) + + .def( + "recv_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src_rank, + int nranks, + int rank_id) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int recv_numel = numel / nranks; + int offset = recv_numel * rank_id; + return self.Recv_Partial(*dense, + src_rank, + offset, + recv_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("num"), + py::arg("id"), py::call_guard()); #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index 24194dd9fb1e2..3dd9f60b81295 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -13,5 +13,7 @@ # limitations under the License. from .all_reduce import all_reduce +from .send import send +from .recv import recv -__all__ = ["all_reduce"] +__all__ = ["all_reduce", "send", "recv"] diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 6a0b622cf0dfe..f94422f4bd0a6 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle.distributed.collective as collective import paddle.fluid.framework as framework -from ...collective import _get_default_group, _get_reduce_op, ReduceOp def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): - op_type = _get_reduce_op(op, "all_reduce") - group = _get_default_group() if group is None else group + op_type = collective._get_reduce_op(op, "all_reduce") + group = collective._get_default_group() if group is None else group if use_calc_stream: return group.process_group.allreduce_on_calc_stream(tensor, op_type) @@ -30,7 +30,7 @@ def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): def all_reduce(tensor, - op=ReduceOp.SUM, + op=collective.ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False): diff --git a/python/paddle/distributed/communication/stream/recv.py b/python/paddle/distributed/communication/stream/recv.py new file mode 100644 index 0000000000000..b225f64b8b4d2 --- /dev/null +++ b/python/paddle/distributed/communication/stream/recv.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.collective as collective +import paddle.fluid.framework as framework + + +def _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.recv_on_calc_stream(tensor, src) + + task = group.process_group.recv(tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Receive a tensor from the source device. + + Args: + tensor (Tensor): The tensor to receive. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + task = dist.stream.send(data, dst=1, sync_op=False) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.recv(data, src=0, sync_op=False) + task.wait() + out = data.numpy() + # [[4, 5, 6], [4, 5, 6] + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.recv is only supported in dygraph mode now.") diff --git a/python/paddle/distributed/communication/stream/send.py b/python/paddle/distributed/communication/stream/send.py new file mode 100644 index 0000000000000..fa052734c7ee7 --- /dev/null +++ b/python/paddle/distributed/communication/stream/send.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.collective as collective +import paddle.fluid.framework as framework + + +def _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.send_on_calc_stream(tensor, dst) + + task = group.process_group.send(tensor, dst, sync_op) + if sync_op: + task.wait() + + return task + + +def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Send a tensor to the destination device. + + Args: + tensor (Tensor): The tensor to send. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + dst (int, optional): Rank of the destination device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + task = dist.stream.send(data, dst=1, sync_op=False) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.recv(data, src=0, sync_op=False) + task.wait() + out = data.numpy() + # [[4, 5, 6], [4, 5, 6] + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." 
+ ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.send is only supported in dygraph mode now.") diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 5a1a6df2dd7ec..55f4453b1ab08 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -268,17 +268,26 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_eager_dist_api MODULES test_eager_dist_api ENVS - "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") - set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS - "RUN_TYPE=DIST") + test_communication_stream_allreduce_api MODULES + test_communication_stream_allreduce_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_allreduce_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_new_group_api MODULES test_new_group_api ENVS + test_communication_stream_sendrecv_api MODULES + test_communication_stream_sendrecv_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_sendrecv_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_eager_dist_api MODULES test_eager_dist_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") - set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS - "RUN_TYPE=DIST") + set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM @@ -298,11 +307,10 @@ if((WITH_GPU endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_communication_stream_allreduce_api MODULES - test_communication_stream_allreduce_api ENVS - "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") - set_tests_properties(test_communication_stream_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + test_new_group_api MODULES test_new_group_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=DIST") endif() if((WITH_ROCM OR WITH_GPU) AND (LINUX)) bash_test_modules( diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py new file mode 100644 index 0000000000000..175e24c3d0d86 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py @@ -0,0 +1,68 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.fluid as fluid +import test_collective_api_base as test_collective_base +import test_communication_api_base as test_base + + +class StreamSendRecvTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + if rank == 0: + task = dist.stream.send(tensor, + dst=1, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + else: + task = dist.stream.recv(tensor, + src=0, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + result = test_data_list[0] + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamSendRecvTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py new file mode 100644 index 0000000000000..9590519bc2e13 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import test_communication_api_base as test_base + + +class TestCommunicationStreamSendRecvAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamSendRecvAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_sendrecv_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_sendrecv_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamSendRecvAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 16eb200565f73..b4ba281f45420 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -32,8 +32,9 @@ test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_ test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=, From 0e4df2d338af092c1f7f1ab044a0a76e23c5838a Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Fri, 30 Sep 2022 20:52:21 +0800 Subject: [PATCH 2/6] Support both use_calc_stream and sync_op in allgather API (#46295) --- .../distributed/collective/ProcessGroup.h | 11 +- .../collective/ProcessGroupNCCL.cc | 56 ++++++- .../distributed/collective/ProcessGroupNCCL.h | 9 ++ .../collective/ProcessGroupStream.cc | 27 +++- .../collective/ProcessGroupStream.h | 14 ++ paddle/fluid/distributed/collective/Utils.h | 145 ++++++++++++++++++ paddle/fluid/pybind/distributed_py.cc | 106 +++++++++++++ .../communication/stream/__init__.py | 3 +- .../communication/stream/all_gather.py | 136 ++++++++++++++++ .../tests/unittests/collective/CMakeLists.txt | 8 + ...munication_stream_allgather_api_dygraph.py | 91 +++++++++++ ...test_communication_stream_allgather_api.py | 51 ++++++ .../tests/unittests/collective/testslist.csv | 1 + 13 files changed, 648 
insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/distributed/collective/Utils.h create mode 100644 python/paddle/distributed/communication/stream/all_gather.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 3db2464e59afd..40a27db7601f1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -190,7 +190,16 @@ class ProcessGroup { std::vector&, // NOLINT std::vector&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support AllGather", GetBackendName())); + "ProcessGroup%s does not support all_gather", GetBackendName())); + } + + virtual std::shared_ptr AllGather( + std::vector&, // NOLINT + std::vector&, // NOLINT + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support all_gather with sync_op flag", + GetBackendName())); } virtual std::shared_ptr AllGather_Partial( diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 368008d9cc0ce..12f60faf80053 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -936,6 +936,39 @@ std::shared_ptr ProcessGroupNCCL::AllGather( CommType::ALLGATHER); } +std::shared_ptr ProcessGroupNCCL::AllGather( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](const phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + comm, + stream); + }, + CommType::ALLGATHER, + sync_op, + use_calc_stream); +} + void* GetPointerByOffset(void* raw_pointer, size_t offset, experimental::DataType type) { @@ -1250,13 +1283,22 @@ ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( const Place& place) const { - std::vector places = {place}; - const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places)); - PADDLE_ENFORCE_NE(iter, - places_to_ctx_.end(), - platform::errors::InvalidArgument( - "Cannot find device context in process group.")); - return iter->second[0].get(); + return GetDeviceContext(place, /*use_calc_stream*/ false); +} + +phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( + const Place& place, bool use_calc_stream) const { + if (use_calc_stream) { + return platform::DeviceContextPool::Instance().Get(place); + } else { + std::vector places = {place}; + const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places)); + PADDLE_ENFORCE_NE(iter, + places_to_ctx_.end(), + platform::errors::InvalidArgument( + "Cannot find device context in process group.")); + return iter->second[0].get(); + } } } // namespace distributed diff --git 
a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 0b8fa54cd337e..24ba7c86b1838 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -98,6 +98,9 @@ class ProcessGroupNCCL : public ProcessGroupStream { phi::DeviceContext* GetDeviceContext(const Place& place) const override; + phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const override; + std::shared_ptr AllReduce( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT @@ -167,6 +170,12 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::vector& in_tensors, std::vector& out_tensors) override; + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 51c8fe7bd9b1b..43ca0bbb36d3f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -23,6 +23,31 @@ ProcessGroupStream::ProcessGroupStream(int rank, int gid) : ProcessGroup(rank, size, place, gid) {} +phi::DeviceContext* ProcessGroupStream::GetDeviceContext( + const Place& place, bool use_calc_stream) const { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support get device_context.", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::AllGather( + std::vector& input_tensors, // NOLINT + std::vector& output_tensors, // NOLINT + bool sync_op) { + return AllGather(input_tensors, + output_tensors, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllGather( + std::vector& input_tensors, // NOLINT + std::vector& output_tensors, // NOLINT + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do all_gather", GetBackendName())); +} + std::shared_ptr ProcessGroupStream::AllReduce( std::vector& input_tensors, // NOLINT std::vector& output_tensors, // NOLINT @@ -42,7 +67,7 @@ std::shared_ptr ProcessGroupStream::AllReduce( bool sync_op, bool use_calc_stream) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do allreduce", GetBackendName())); + "ProcessGroup%s does not support do all_reduce", GetBackendName())); } std::shared_ptr ProcessGroupStream::Send( diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index 4cd17ac72562e..f8ab562ad075c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -54,6 +54,20 @@ class ProcessGroupStream : public ProcessGroup { ProcessGroupStream(int rank, int size, const platform::Place& place, int gid); virtual ~ProcessGroupStream() = default; + virtual phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const; + + std::shared_ptr AllGather( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllGather( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op, + bool use_calc_stream); + std::shared_ptr AllReduce( std::vector& 
input_tensors, // NOLINT std::vector& output_tensors, // NOLINT diff --git a/paddle/fluid/distributed/collective/Utils.h b/paddle/fluid/distributed/collective/Utils.h new file mode 100644 index 0000000000000..79146febdf809 --- /dev/null +++ b/paddle/fluid/distributed/collective/Utils.h @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/device_manager.h" + +namespace paddle { +namespace distributed { + +template +struct SplitDenseTensor { + void operator()(const DeviceContext *context, + const phi::DenseTensor &in, + std::vector *out, + int axis = 0) { + std::vector shape_refer; + shape_refer.reserve(out->size()); + for (auto *p_tensor : *out) { + shape_refer.emplace_back(p_tensor); + } + operators::math::SplitFunctor split_functor_; + split_functor_(*context, in, shape_refer, axis, out); + } +}; + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +struct SplitDenseTensor { + void operator()(const platform::CustomDeviceContext *context, + const phi::DenseTensor &in, + std::vector *out) { + auto *in_data = in.data(); + auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); + size_t offset = 0; + for (auto *p_tensor : *out) { + auto *out_data = p_tensor->data(); + auto sz = p_tensor->numel() * sizeof(T); + device->MemoryCopyD2D(out_data, in_data + offset, sz, nullptr); + offset += sz; + } + } +}; +#endif + +template +void SplitDenseTensorWithType(const DeviceContext *dev_ctx, + const phi::DenseTensor &p_dense, + std::vector *p_list, + phi::DataType type) { + switch (type) { + case phi::DataType::BOOL: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::UINT8: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::INT8: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::INT32: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::INT64: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::FLOAT16: + SplitDenseTensor()( + dev_ctx, p_dense, p_list); + break; + case phi::DataType::FLOAT32: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::FLOAT64: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allgather.", + type)); + } +} + +void SplitTensor(const phi::DeviceContext *dev_ctx, + const phi::DenseTensor &tensor, + const std::vector *tensor_list) { + std::vector dense_list; + for (auto &tensor : *tensor_list) { + auto p_tensor = + std::dynamic_pointer_cast(tensor.impl()).get(); + dense_list.emplace_back(p_tensor); + } + + const auto &place = dev_ctx->GetPlace(); + if (platform::is_gpu_place(place)) { +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + SplitDenseTensorWithType(static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split tensor since it's not support NCCL/RCCL, please " + "recompile or reinstall Paddle with NCCL/RCCL support.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + SplitDenseTensorWithType( + static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split tensor since it's not compiled with CUSTOM_DEVICE, " + "please recompile or reinstall Paddle with CUSTOM_DEVICE support.")); +#endif + } else if (platform::is_cpu_place(place)) { + SplitDenseTensorWithType(static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split tensor not supported on place (%s)", place)); + } +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 8a434f42811a8..bec3c93cbd8b7 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/ProcessGroupStream.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/Utils.h" #include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -354,6 +355,57 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "allgather", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor_list, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + const auto *dev_ctx = self.GetDeviceContext(in_tensor.place()); + auto task = self.AllGather(in_wrapper, out_wrapper, sync_op); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "allgather_base", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllGather(in_wrapper, out_wrapper, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + .def( "all_gather_partial", [](distributed::ProcessGroup &self, @@ -490,6 +542,60 @@ void BindDistributed(py::module *m) { py::class_>( *m, "ProcessGroupStream", ProcessGroup) + .def( + "allgather_on_calc_stream", + [](distributed::ProcessGroupStream 
&self, + py::handle py_in_tensor, + py::handle py_out_tensor_list) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + const auto *dev_ctx = + self.GetDeviceContext(in_tensor.place(), true); + auto task = self.AllGather(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "allgather_base_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllGather(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + .def( "allreduce_on_calc_stream", [](distributed::ProcessGroupStream &self, diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index 3dd9f60b81295..deab1f97ea28e 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .all_gather import all_gather from .all_reduce import all_reduce from .send import send from .recv import recv -__all__ = ["all_reduce", "send", "recv"] +__all__ = ["all_gather", "all_reduce", "send", "recv"] diff --git a/python/paddle/distributed/communication/stream/all_gather.py b/python/paddle/distributed/communication/stream/all_gather.py new file mode 100644 index 0000000000000..dca2957309068 --- /dev/null +++ b/python/paddle/distributed/communication/stream/all_gather.py @@ -0,0 +1,136 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import paddle
+import paddle.fluid.framework as framework
+from paddle.distributed import collective
+
+
+def _check_tensor_shape(tensor, shape, nranks=1):
+    expect_shape = list(shape)
+    expect_shape[0] *= nranks
+    if list(tensor.shape) != expect_shape:
+        raise RuntimeError('The tensor for all_gather is not correctly-sized.')
+
+
+def _check_tensor_list_shape(tensor_list, shape, nranks=1):
+    if len(tensor_list) != nranks:
+        raise RuntimeError(
+            'The tensor_list for all_gather is not correctly-sized.')
+    for tensor in tensor_list:
+        if tensor.shape != shape:
+            raise RuntimeError(
+                'The tensor_list for all_gather is not correctly-sized.')
+
+
+def _all_gather_base_in_dygraph(out_tensor, in_tensor, group, sync_op,
+                                use_calc_stream):
+    group = collective._get_default_group() if group is None else group
+
+    _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks)
+
+    if use_calc_stream:
+        return group.process_group.allgather_base_on_calc_stream(
+            in_tensor, out_tensor)
+
+    task = group.process_group.allgather_base(in_tensor, out_tensor, sync_op)
+    if sync_op:
+        task.wait()
+
+    return task
+
+
+def _all_gather_in_dygraph(tensor_list, tensor, group, sync_op,
+                           use_calc_stream):
+    group = collective._get_default_group() if group is None else group
+
+    if len(tensor_list) == 0:
+        tensor_list += [paddle.empty_like(tensor) for _ in range(group.nranks)]
+    else:
+        _check_tensor_list_shape(tensor_list, tensor.shape, group.nranks)
+
+    if use_calc_stream:
+        return group.process_group.allgather_on_calc_stream(
+            tensor, tensor_list)
+
+    task = group.process_group.allgather(tensor, tensor_list, sync_op)
+    if sync_op:
+        task.wait()
+
+    return task
+
+
+def all_gather(tensor_or_tensor_list,
+               tensor,
+               group=None,
+               sync_op=True,
+               use_calc_stream=False):
+    """
+
+    Gather tensors across devices to a correctly-sized tensor or a tensor list.
+
+    Args:
+        tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The output. If it is a tensor, it should be correctly-sized.
+            If it is a list, it should be empty or contain correctly-sized tensors.
+        tensor (Tensor): The input tensor on each rank. The result will overwrite this tensor after communication. Support
+            float16, float32, float64, int32 or int64 as the input data type.
+        group (Group, optional): Communicate in which group. If none is given, use the global group as default.
+        sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
+        use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given,
+            use false as default. This option is designed for high performance demands; only turn it on when you clearly
+            understand what it does.
+
+    Returns:
+        Return a task object.
+
+    Warning:
+        This API only supports the dygraph mode now.
+
+    Examples:
+        .. code-block:: python
+
+            # required: distributed
+            import paddle
+            import paddle.distributed as dist
+
+            dist.init_parallel_env()
+            local_rank = dist.get_rank()
+            tensor_list = []
+            if local_rank == 0:
+                data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
+            else:
+                data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
+            task = dist.stream.all_gather(tensor_list, data, sync_op=False)
+            task.wait()
+            print(tensor_list)
+            # [[[4, 5, 6], [4, 5, 6]], [[1, 2, 3], [1, 2, 3]]] (2 GPUs)
+    """
+    if group is not None and not group.is_member():
+        raise RuntimeError(
+            "The group should not be None and all ranks which invoke this operation should be the member of this group."
+ ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _all_gather_base_in_dygraph(tensor_or_tensor_list, tensor, + group, sync_op, use_calc_stream) + else: + return _all_gather_in_dygraph(tensor_or_tensor_list, tensor, group, + sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.all_gather is only supported in dygraph mode now." + ) diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 55f4453b1ab08..d7ee67c10f435 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -266,6 +266,14 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) set_tests_properties(test_collective_wait PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_allgather_api MODULES + test_communication_stream_allgather_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_allgather_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_communication_stream_allreduce_api MODULES diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py new file mode 100644 index 0000000000000..d0e46600b8b50 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py @@ -0,0 +1,91 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
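+# A minimal, self-contained sketch of the call pattern exercised by the test
+# case below. It is illustrative only (the helper name and tensor values are
+# made up here) and assumes dist.init_parallel_env() has been called under a
+# multi-rank launch, e.g. via `python -m paddle.distributed.launch`.
+def _example_stream_all_gather():
+    import paddle
+    import paddle.distributed as dist
+
+    data = paddle.to_tensor([dist.get_rank()])
+    tensor_list = []
+    # With sync_op=False the call returns a task that must be waited on
+    # before tensor_list can be read.
+    task = dist.stream.all_gather(tensor_list, data, sync_op=False)
+    task.wait()
+    return tensor_list  # one gathered tensor per rank
+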
+ +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllgatherTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + # case 1: pass an empty tensor list + empty_tensor_list = [] + task = dist.stream.all_gather(empty_tensor_list, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(empty_tensor_list, + test_data_list, + rtol=1e-05, + atol=1e-05) + + # case 2: pass a pre-sized tensor list + full_tensor_list = [paddle.empty_like(tensor) for _ in test_data_list] + task = dist.stream.all_gather(full_tensor_list, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(full_tensor_list, + test_data_list, + rtol=1e-05, + atol=1e-05) + + # case 3: pass a pre-sized tensor + result_tensor = paddle.concat( + [paddle.to_tensor(data) for data in test_data_list]) + out_tensor = paddle.empty_like(result_tensor) + task = dist.stream.all_gather(out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(out_tensor, result_tensor, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamAllgatherTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py new file mode 100644 index 0000000000000..254b64907ce07 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
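+# Env-variable contract between this driver and
+# communication_stream_allgather_api_dygraph.py (summarized from both files):
+# the driver launches the dygraph script on 2 devices with backend, shape,
+# dtype and seeds fixed, while sync_op and use_calc_stream are swept over
+# True/False; the combination use_calc_stream=True with sync_op=False is
+# skipped, since use_calc_stream is only allowed for synchronous ops.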
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllgatherAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllgatherAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_allgather_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_allgather_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllgatherAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index b4ba281f45420..c6c7c13937f38 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -32,6 +32,7 @@ test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_ test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_communication_stream_allgather_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., From 9641b9342e52f459b04eb5c1a38a160f3a493506 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Tue, 11 Oct 2022 19:14:09 +0800 Subject: [PATCH 3/6] Support both use_calc_stream and sync_op in collective communication API (#46761) --- .../distributed/collective/ProcessGroup.h | 96 ++- .../collective/ProcessGroupCustom.cc | 4 +- .../collective/ProcessGroupCustom.h | 8 +- .../collective/ProcessGroupNCCL.cc | 323 +++++++++- .../distributed/collective/ProcessGroupNCCL.h | 76 ++- .../collective/ProcessGroupStream.cc | 173 +++++- .../collective/ProcessGroupStream.h | 109 +++- paddle/fluid/distributed/collective/Utils.h | 144 ++++- paddle/fluid/pybind/distributed_py.cc | 555 +++++++++++++++++- .../communication/stream/__init__.py | 13 +- .../communication/stream/all_gather.py | 16 +- .../communication/stream/all_reduce.py | 4 +- .../communication/stream/alltoall.py | 157 +++++ .../communication/stream/alltoall_single.py | 128 ++++ .../communication/stream/broadcast.py | 83 +++ .../distributed/communication/stream/recv.py | 2 +- .../communication/stream/reduce.py | 93 +++ .../communication/stream/reduce_scatter.py | 216 +++++++ .../communication/stream/scatter.py | 162 +++++ .../distributed/communication/stream/send.py | 2 +- .../tests/unittests/collective/CMakeLists.txt | 48 ++ ...mmunication_stream_alltoall_api_dygraph.py | 
113 ++++ ...tion_stream_alltoall_single_api_dygraph.py | 74 +++ ...munication_stream_broadcast_api_dygraph.py | 54 ++ ...communication_stream_reduce_api_dygraph.py | 66 +++ ...ation_stream_reduce_scatter_api_dygraph.py | 94 +++ ...ommunication_stream_scatter_api_dygraph.py | 84 +++ ...mmunication_stream_sendrecv_api_dygraph.py | 9 +- .../test_communication_stream_alltoall_api.py | 51 ++ ...ommunication_stream_alltoall_single_api.py | 53 ++ ...test_communication_stream_broadcast_api.py | 51 ++ .../test_communication_stream_reduce_api.py | 51 ++ ...communication_stream_reduce_scatter_api.py | 53 ++ .../test_communication_stream_scatter_api.py | 51 ++ .../tests/unittests/collective/testslist.csv | 6 + 35 files changed, 3108 insertions(+), 114 deletions(-) create mode 100644 python/paddle/distributed/communication/stream/alltoall.py create mode 100644 python/paddle/distributed/communication/stream/alltoall_single.py create mode 100644 python/paddle/distributed/communication/stream/broadcast.py create mode 100644 python/paddle/distributed/communication/stream/reduce.py create mode 100644 python/paddle/distributed/communication/stream/reduce_scatter.py create mode 100644 python/paddle/distributed/communication/stream/scatter.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 40a27db7601f1..fb4832442a469 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -122,6 +122,16 @@ class ProcessGroup { "ProcessGroup%s does not support broadcast", GetBackendName())); } + virtual std::shared_ptr Broadcast( + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT + const BroadcastOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support broadcast with sync_op flag", + GetBackendName())); + } + virtual std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -157,14 +167,14 @@ class ProcessGroup { virtual std::shared_ptr Send_Partial( phi::DenseTensor&, // NOLINT int, - int, - int) { + int64_t, + int64_t) { 
PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support send_partial", GetBackendName())); } virtual std::shared_ptr Send_Partial( - phi::DenseTensor&, int, int, int, bool) { // NOLINT + phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support send_partial with sync_op flag", GetBackendName())); @@ -173,14 +183,14 @@ class ProcessGroup { virtual std::shared_ptr Recv_Partial( phi::DenseTensor&, // NOLINT int, - int, - int) { + int64_t, + int64_t) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support recv_partial", GetBackendName())); } virtual std::shared_ptr Recv_Partial( - phi::DenseTensor&, int, int, int, bool) { // NOLINT + phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support recv_partial with sync_op flag", GetBackendName())); @@ -205,8 +215,18 @@ class ProcessGroup { virtual std::shared_ptr AllGather_Partial( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT - int offset, - int length) { // NOLINT + int64_t offset, + int64_t length) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllGather_Partial", GetBackendName())); + } + + virtual std::shared_ptr AllGather_Partial( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + int64_t offset, + int64_t length, + bool) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support AllGather_Partial", GetBackendName())); } @@ -218,6 +238,14 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll", GetBackendName())); } + virtual std::shared_ptr AllToAll( + std::vector&, // NOLINT + std::vector&, // NOLINT + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support alltoall", GetBackendName())); + } + virtual std::shared_ptr AllToAll_Single( std::vector&, // NOLINT std::vector&, // NOLINT @@ -227,26 +255,66 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll_Single", GetBackendName())); } + virtual std::shared_ptr AllToAllSingle( + std::vector&, // NOLINT + std::vector&, // NOLINT + std::vector&, + std::vector&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support alltoall_single", GetBackendName())); + } + virtual std::shared_ptr Reduce( std::vector&, // NOLINT std::vector&, // NOLINT const ReduceOptions& opts) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support Reduce", GetBackendName())); + "ProcessGroup%s does not support reduce", GetBackendName())); + } + + virtual std::shared_ptr Reduce( + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT + const ReduceOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support reduce with sync_op flag", + GetBackendName())); } virtual std::shared_ptr Scatter( std::vector&, // NOLINT std::vector&, // NOLINT - const ScatterOptions&) { // NOLINT + const ScatterOptions&) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support Scatter", GetBackendName())); + "ProcessGroup%s does not support scatter", GetBackendName())); + } + + virtual std::shared_ptr Scatter( + std::vector&, // NOLINT + std::vector&, // NOLINT + const ScatterOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support scatter 
with sync_op flag", + GetBackendName())); + } + + virtual std::shared_ptr ReduceScatter( + std::vector&, // NOLINT + std::vector&, // NOLINT + const ReduceScatterOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support reduce_scatter with sync_op flag", + GetBackendName())); } virtual std::shared_ptr _ReduceScatterBase( - phi::DenseTensor&, // NOLINT - phi::DenseTensor&, // NOLINT - const ReduceScatterOptions&) { // NOLINT + phi::DenseTensor&, // NOLINT + phi::DenseTensor&, // NOLINT + const ReduceScatterOptions&) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support ReduceScatter", GetBackendName())); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index ad9356b368ea2..f18765a05f619 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -267,8 +267,8 @@ void* XcclGetPointerByOffset(void* raw_pointer, std::shared_ptr ProcessGroupCustom::AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) { + int64_t offset, + int64_t length) { PADDLE_ENFORCE_EQ( CheckTensorsInCustomPlace(in_tensors, device_type_), true, diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index ccce66603afe6..ce3532bbb6f0e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -80,8 +80,8 @@ class ProcessGroupCustom : public ProcessGroup { std::shared_ptr AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) override; + int64_t offset, + int64_t length) override; std::shared_ptr AllReduce( std::vector& in_tensors, @@ -117,8 +117,8 @@ class ProcessGroupCustom : public ProcessGroup { std::set used_place_ids_; private: - void BcastCustomId(std::vector& ccl_ids, - int root, // NOLINT + void BcastCustomId(std::vector& ccl_ids, // NOLINT + int root, int server_fd); void BroadcastUniqueCustomID( diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 12f60faf80053..bc5eb4885ee52 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -628,6 +628,40 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CommType::BROADCAST); } +std::shared_ptr ProcessGroupNCCL::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + const auto root = + opts.source_rank * in_tensors.size() + opts.source_root; + return platform::dynload::ncclBroadcast( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.type()), + root, + comm, + stream); + }, + CommType::BROADCAST, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Barrier( const BarrierOptions& opts) { // Only support single card single process @@ -782,7 +816,7 @@ std::shared_ptr ProcessGroupNCCL::Recv( } std::shared_ptr 
ProcessGroupNCCL::Send_Partial( - phi::DenseTensor& tensors, int dst_rank, int offset, int length) { + phi::DenseTensor& tensors, int dst_rank, int64_t offset, int64_t length) { // CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); phi::DenseTensor flatten_tensor; @@ -813,8 +847,8 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( std::shared_ptr ProcessGroupNCCL::Send_Partial( phi::DenseTensor& tensors, int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) { phi::DenseTensor flatten_tensor; @@ -845,7 +879,7 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( } std::shared_ptr ProcessGroupNCCL::Recv_Partial( - phi::DenseTensor& tensors, int src_rank, int offset, int length) { + phi::DenseTensor& tensors, int src_rank, int64_t offset, int64_t length) { // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length); phi::DenseTensor flatten_tensor; @@ -876,8 +910,8 @@ std::shared_ptr ProcessGroupNCCL::Recv_Partial( std::shared_ptr ProcessGroupNCCL::Recv_Partial( phi::DenseTensor& tensors, int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) { phi::DenseTensor flatten_tensor; @@ -1006,8 +1040,8 @@ void* GetPointerByOffset(void* raw_pointer, std::shared_ptr ProcessGroupNCCL::AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) { + int64_t offset, + int64_t length) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, @@ -1034,6 +1068,41 @@ std::shared_ptr ProcessGroupNCCL::AllGather_Partial( CommType::ALLGATHER); } +std::shared_ptr ProcessGroupNCCL::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + GetPointerByOffset(input.data(), offset, input.dtype()), + output.data(), + length, + platform::ToNCCLDataType(input.dtype()), + comm, + stream); + }, + CommType::ALLGATHER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::AllToAll( std::vector& in_tensors, std::vector& out_tensors) { @@ -1076,6 +1145,52 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( CommType::ALLTOALL); } +std::shared_ptr ProcessGroupNCCL::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + size_t offset = 0; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), offset, input.dtype()), + 
input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output.data(), offset, input.dtype()), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + offset += input.numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::AllToAll_Single( std::vector& in_tensors, std::vector& out_tensors, @@ -1138,6 +1253,72 @@ std::shared_ptr ProcessGroupNCCL::AllToAll_Single( CommType::ALLTOALL_SINGLE); } +std::shared_ptr ProcessGroupNCCL::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(), + true, + platform::errors::InvalidArgument( + "The dtypes of input and output must be equal.")); + + std::vector in_dims = phi::vectorize(input.dims()); + std::vector out_dims = phi::vectorize(output.dims()); + CheckSplitSizes(&in_sizes, in_dims); + CheckSplitSizes(&out_sizes, out_dims); + + size_t in_offset = 0, out_offset = 0; + size_t in_length = 0, out_length = 0; + size_t in_row_size = input.numel() / in_dims[0]; + size_t out_row_size = output.numel() / out_dims[0]; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + in_length = in_sizes[i] * in_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), in_offset, input.dtype()), + in_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + in_offset += in_length; + + out_length = out_sizes[i] * out_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output.data(), out_offset, input.dtype()), + out_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + out_offset += out_length; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL_SINGLE, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Reduce( std::vector& in_tensors, std::vector& out_tensors, @@ -1166,6 +1347,70 @@ std::shared_ptr ProcessGroupNCCL::Reduce( CommType::REDUCE); } +std::shared_ptr ProcessGroupNCCL::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](const phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + opts.root_rank, + comm, + stream)); + }, + CommType::REDUCE, + 
sync_op, + use_calc_stream); +} + +std::shared_ptr ProcessGroupNCCL::ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator) { + platform::CUDADeviceGuard cuda_guard; + cuda_guard.SetDevice(output.place()); + memory::RecordStream(output.Holder(), stream); + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( + input.data(), + output.data(), + output.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + comm, + stream)); + }, + CommType::REDUCE_SCATTER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Scatter( std::vector& in_tensors, std::vector& out_tensors, @@ -1219,6 +1464,68 @@ std::shared_ptr ProcessGroupNCCL::Scatter( CommType::SCATTER); } +std::shared_ptr ProcessGroupNCCL::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + output.numel(), + input.numel() / size_, + platform::errors::InvalidArgument( + "Input and output tensors should have the same shape.")); + size_t offset = 0; + if (rank_ == opts.root_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), offset, input.dtype()), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + offset += input.numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); + } + }, + CommType::SCATTER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::_ReduceScatterBase( phi::DenseTensor& out_tensor, phi::DenseTensor& in_tensor, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 24ba7c86b1838..6427e9e3e2ab1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -119,6 +119,13 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::vector& out_tensors, const BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; @@ -142,27 +149,27 @@ class 
ProcessGroupNCCL : public ProcessGroupStream { std::shared_ptr Send_Partial(phi::DenseTensor& tensors, int dst_rank, - int offset, - int length) override; + int64_t offset, + int64_t length) override; std::shared_ptr Send_Partial( phi::DenseTensor& tensors, int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) override; std::shared_ptr Recv_Partial(phi::DenseTensor& tensors, int src_rank, - int offset, - int length) override; + int64_t offset, + int64_t length) override; std::shared_ptr Recv_Partial( phi::DenseTensor& tensors, int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) override; @@ -179,12 +186,26 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::shared_ptr AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) override; + int64_t offset, + int64_t length) override; + + std::shared_ptr AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr AllToAll( - std::vector& in, - std::vector& out) override; + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr AllToAll_Single( std::vector& in, @@ -192,15 +213,44 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::vector& in_sizes, std::vector& out_sizes) override; + std::shared_ptr AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Reduce( std::vector& tensors, std::vector& out_tensors, const ReduceOptions& opts) override; + std::shared_ptr Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) override; + + std::shared_ptr ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Scatter( std::vector& in_tensors, std::vector& out_tensors, - const ScatterOptions&) override; + const ScatterOptions& opts) override; + + std::shared_ptr Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr _ReduceScatterBase( phi::DenseTensor&, // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 43ca0bbb36d3f..b2cfae088b227 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -70,6 +70,138 @@ std::shared_ptr ProcessGroupStream::AllReduce( "ProcessGroup%s does not support do all_reduce", GetBackendName())); } +std::shared_ptr ProcessGroupStream::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op) { + return AllToAll(in_tensors, + out_tensors, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do alltoall", GetBackendName())); +} + +std::shared_ptr 
ProcessGroupStream::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op) { + return AllToAllSingle(in_tensors, + out_tensors, + in_sizes, + out_sizes, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do alltoall_single", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op) { + return Broadcast(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do broadcast", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op) { + return Reduce(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do reduce", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op) { + return ReduceScatter(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do reduce_scatter", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op) { + return Scatter(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do scatter", GetBackendName())); +} + std::shared_ptr ProcessGroupStream::Send( std::vector& tensors, int dst_rank, bool sync_op) { return Send(tensors, @@ -90,8 +222,8 @@ std::shared_ptr ProcessGroupStream::Send( std::shared_ptr ProcessGroupStream::Send_Partial( phi::DenseTensor& tensors, int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op) { return Send_Partial(tensors, dst_rank, @@ -104,8 +236,8 @@ std::shared_ptr ProcessGroupStream::Send_Partial( std::shared_ptr ProcessGroupStream::Send_Partial( phi::DenseTensor& tensors, int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -132,8 +264,8 @@ 
std::shared_ptr ProcessGroupStream::Recv( std::shared_ptr ProcessGroupStream::Recv_Partial( phi::DenseTensor& tensors, int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op) { return Recv_Partial(tensors, src_rank, @@ -146,8 +278,33 @@ std::shared_ptr ProcessGroupStream::Recv_Partial( std::shared_ptr ProcessGroupStream::Recv_Partial( phi::DenseTensor& tensors, int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv_partial", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op) { + return AllGather_Partial(in_tensors, + out_tensors, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index f8ab562ad075c..2f0aa139104e9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -81,6 +81,84 @@ class ProcessGroupStream : public ProcessGroup { bool sync_op, bool use_calc_stream); + std::shared_ptr AllToAll( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllToAll( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op, + bool use_calc_stream); + + std::shared_ptr AllToAllSingle( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + std::vector& in_sizes, // NOLINT + std::vector& out_sizes, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllToAllSingle( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + std::vector& in_sizes, // NOLINT + std::vector& out_sizes, // NOLINT + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Broadcast( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const BroadcastOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Broadcast( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Reduce( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Reduce( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr ReduceScatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceScatterOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr ReduceScatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Scatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ScatterOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Scatter( + 
std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream); + std::shared_ptr Send( std::vector& tensors, // NOLINT int dst_rank, @@ -95,15 +173,15 @@ class ProcessGroupStream : public ProcessGroup { std::shared_ptr Send_Partial( phi::DenseTensor& tensors, // NOLINT int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op) override; virtual std::shared_ptr Send_Partial( phi::DenseTensor& tensors, // NOLINT int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream); @@ -121,15 +199,30 @@ class ProcessGroupStream : public ProcessGroup { std::shared_ptr Recv_Partial( phi::DenseTensor& tensors, // NOLINT int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op) override; virtual std::shared_ptr Recv_Partial( phi::DenseTensor& tensors, // NOLINT int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op) override; + + virtual std::shared_ptr AllGather_Partial( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream); }; diff --git a/paddle/fluid/distributed/collective/Utils.h b/paddle/fluid/distributed/collective/Utils.h index 79146febdf809..c06c0345163ed 100644 --- a/paddle/fluid/distributed/collective/Utils.h +++ b/paddle/fluid/distributed/collective/Utils.h @@ -14,14 +14,26 @@ #pragma once -#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace paddle { namespace distributed { +template +struct ConcatDenseTensor { + void operator()(const DeviceContext *context, + const std::vector &in, + phi::DenseTensor *out, + int axis = 0) { + phi::funcs::ConcatFunctor concat_functor; + concat_functor(*context, in, axis, out); + } +}; + template struct SplitDenseTensor { void operator()(const DeviceContext *context, @@ -33,17 +45,36 @@ struct SplitDenseTensor { for (auto *p_tensor : *out) { shape_refer.emplace_back(p_tensor); } - operators::math::SplitFunctor split_functor_; - split_functor_(*context, in, shape_refer, axis, out); + phi::funcs::SplitFunctor split_functor; + split_functor(*context, in, shape_refer, axis, out); } }; #ifdef PADDLE_WITH_CUSTOM_DEVICE +template +struct ConcatDenseTensor { + void operator()(const platform::CustomDeviceContext *context, + const std::vector &in, + phi::DenseTensor *out, + int axis = 0) { + auto *out_data = out->data(); + auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); + size_t offset = 0; + for (const auto &tensor : in) { + const auto *in_data = tensor.data(); + auto sz = tensor.numel() * sizeof(T); + device->MemoryCopyD2D(out_data + offset, in_data, sz, nullptr); + offset += sz; + } + } +}; + template struct SplitDenseTensor { void operator()(const platform::CustomDeviceContext *context, const phi::DenseTensor &in, - std::vector *out) { + std::vector *out, + int axis = 0) { auto *in_data = in.data(); auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); size_t 
offset = 0; @@ -57,42 +88,119 @@ struct SplitDenseTensor { }; #endif +template +void ConcatDenseTensorWithType(const DeviceContext *dev_ctx, + const std::vector &t_list, + phi::DenseTensor *p_out, + phi::DataType type) { + switch (type) { + case phi::DataType::BOOL: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::UINT8: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT8: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT32: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT64: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT16: + ConcatDenseTensor()( + dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT32: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT64: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors.", type)); + } +} + template void SplitDenseTensorWithType(const DeviceContext *dev_ctx, - const phi::DenseTensor &p_dense, + const phi::DenseTensor &t_in, std::vector *p_list, phi::DataType type) { switch (type) { case phi::DataType::BOOL: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::UINT8: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::INT8: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::INT32: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::INT64: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::FLOAT16: SplitDenseTensor()( - dev_ctx, p_dense, p_list); + dev_ctx, t_in, p_list); break; case phi::DataType::FLOAT32: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::FLOAT64: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; default: PADDLE_THROW(platform::errors::Unimplemented( - "Data type (%s) is not supported when it splits tensors for " - "allgather.", - type)); + "Data type (%s) is not supported when it splits tensors.", type)); + } +} + +void ConcatTensor(const phi::DeviceContext *dev_ctx, + const std::vector &tensor_list, + const experimental::Tensor *tensor) { + auto *dense_tensor = + std::dynamic_pointer_cast(tensor->impl()).get(); + + const auto &place = dev_ctx->GetPlace(); + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + ConcatDenseTensorWithType(static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat tensor since it's not support GPU, please " + "recompile or reinstall Paddle with GPU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + ConcatDenseTensorWithType( + static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat tensor since it's not compiled with " + "CUSTOM_DEVICE, please recompile or reinstall Paddle with " + "CUSTOM_DEVICE support.")); +#endif + } else if 
(platform::is_cpu_place(place)) { + ConcatDenseTensorWithType(static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat tensor not supported on place (%s)", place)); } } @@ -115,8 +223,8 @@ void SplitTensor(const phi::DeviceContext *dev_ctx, tensor.dtype()); #else PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't split tensor since it's not support NCCL/RCCL, please " - "recompile or reinstall Paddle with NCCL/RCCL support.")); + "Paddle can't split tensor since it's not support GPU, please " + "recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index bec3c93cbd8b7..29e6e9e5d1e79 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -172,6 +172,24 @@ void BindDistributed(py::module *m) { py::arg("source_rank"), py::call_guard()) + .def( + "broadcast", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts{src}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, tensors, opts, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + .def( "barrier", [](distributed::ProcessGroup &self, std::vector place_ids) { @@ -224,9 +242,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.Send_Partial(*dense, dst_rank, offset, send_numel); }, py::arg("tensor"), @@ -246,9 +264,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.Send_Partial( *dense, dst_rank, offset, send_numel, sync_op); }, @@ -301,9 +319,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int recv_numel = numel / nranks; - int offset = recv_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; return self.Recv_Partial(*dense, src_rank, offset, recv_numel); }, py::arg("tensor"), @@ -323,9 +341,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int recv_numel = numel / nranks; - int offset = recv_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; return self.Recv_Partial( *dense, src_rank, offset, recv_numel, sync_op); }, @@ -384,7 +402,7 @@ void BindDistributed(py::module *m) { py::call_guard()) .def( - 
"allgather_base", + "allgather_into_tensor", [](distributed::ProcessGroup &self, py::handle py_in_tensor, py::handle py_out_tensor, @@ -421,9 +439,9 @@ void BindDistributed(py::module *m) { out_tensor.impl()); std::vector in_tensors = {*in_dense}; std::vector out_tensors = {*out_dense}; - int numel = (*in_dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*in_dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.AllGather_Partial( in_tensors, out_tensors, offset, send_numel); }, @@ -452,6 +470,61 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "alltoall", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor_list, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + // in_tensor_list should not be empty + const auto *dev_ctx = + self.GetDeviceContext(in_tensor_list.back().place()); + auto task = self.AllToAll(in_wrapper, out_wrapper, sync_op); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "alltoall_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAll(in_wrapper, out_wrapper, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + .def( "alltoall_single", [](distributed::ProcessGroup &self, @@ -476,6 +549,34 @@ void BindDistributed(py::module *m) { py::arg("out_sizes"), py::call_guard()) + .def( + "alltoall_single", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + std::vector &in_sizes, + std::vector &out_sizes, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAllSingle( + in_wrapper, out_wrapper, in_sizes, out_sizes, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("in_sizes"), + py::arg("out_sizes"), + py::arg("sync_op"), + py::call_guard()) + .def( "reduce", [](distributed::ProcessGroup &self, @@ -495,6 +596,83 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::arg("op") = distributed::ReduceOp::SUM, py::call_guard()) + + .def( + "reduce", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + int dst, + distributed::ReduceOp op, 
+ bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts{op, dst}; + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, tensors, opts, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "reduce_scatter", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + distributed::ReduceOp op, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter( + in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "reduce_scatter_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + distributed::ReduceOp op, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter( + in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + .def( "scatter", [](distributed::ProcessGroup &self, @@ -517,6 +695,61 @@ void BindDistributed(py::module *m) { py::arg("out"), py::arg("src"), py::call_guard()) + + .def( + "scatter", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + int src, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "scatter_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int src, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + 
py::arg("sync_op"), + py::call_guard()) + .def( "_reduce_scatter_base", [](distributed::ProcessGroup &self, @@ -573,7 +806,7 @@ void BindDistributed(py::module *m) { py::call_guard()) .def( - "allgather_base_on_calc_stream", + "allgather_into_tensor_on_calc_stream", [](distributed::ProcessGroupStream &self, py::handle py_in_tensor, py::handle py_out_tensor) { @@ -596,6 +829,37 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "all_gather_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int nranks, + int rank_id) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + int64_t numel = (*in_dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; + return self.AllGather_Partial(in_tensors, + out_tensors, + offset, + send_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("num"), + py::arg("id"), + py::call_guard()) + .def( "allreduce_on_calc_stream", [](distributed::ProcessGroupStream &self, @@ -617,6 +881,251 @@ void BindDistributed(py::module *m) { py::arg("op"), py::call_guard()) + .def( + "alltoall_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor_list) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + // in_tensor_list must not be empty + const auto *dev_ctx = self.GetDeviceContext( + in_tensor_list.back().place(), /*use_calc_stream*/ true); + auto task = self.AllToAll(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "alltoall_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAll(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "alltoall_single_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + std::vector &in_sizes, + std::vector &out_sizes) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = 
{*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAllSingle(in_wrapper, + out_wrapper, + in_sizes, + out_sizes, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("in_sizes"), + py::arg("out_sizes"), + py::call_guard()) + + .def( + "broadcast_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts{src}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, + tensors, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::call_guard()) + + .def( + "reduce_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + int dst, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts{op, dst}; + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, + tensors, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("op"), + py::call_guard()) + + .def( + "reduce_scatter_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + distributed::ReduceOp op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::call_guard()) + + .def( + "reduce_scatter_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::call_guard()) + + .def( + "scatter_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + int src) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = 
std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::call_guard()) + + .def( + "scatter_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int src) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::call_guard()) + .def( "send_on_calc_stream", [](distributed::ProcessGroupStream &self, @@ -645,9 +1154,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.Send_Partial(*dense, dst_rank, offset, @@ -689,9 +1198,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int recv_numel = numel / nranks; - int offset = recv_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; return self.Recv_Partial(*dense, src_rank, offset, diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index deab1f97ea28e..a1844decf9478 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -14,7 +14,16 @@ from .all_gather import all_gather from .all_reduce import all_reduce -from .send import send +from .alltoall import alltoall +from .alltoall_single import alltoall_single +from .broadcast import broadcast +from .reduce import reduce +from .reduce_scatter import _reduce_scatter_base, reduce_scatter from .recv import recv +from .scatter import scatter +from .send import send -__all__ = ["all_gather", "all_reduce", "send", "recv"] +__all__ = [ + "_reduce_scatter_base", "all_reduce", "alltoall", "alltoall_single", + "broadcast", "reduce", "reduce_scatter", "recv", "scatter", "send" +] diff --git a/python/paddle/distributed/communication/stream/all_gather.py b/python/paddle/distributed/communication/stream/all_gather.py index dca2957309068..9eb961cda171d 100644 --- a/python/paddle/distributed/communication/stream/all_gather.py +++ b/python/paddle/distributed/communication/stream/all_gather.py @@ -34,17 +34,18 @@ def _check_tensor_list_shape(tensor_list, shape, nranks=1): 'The tensor_list for all_gather is not correctly-sized.') -def _all_gather_base_in_dygraph(out_tensor, in_tensor, group, sync_op, - use_calc_stream): +def _all_gather_into_tensor_in_dygraph(out_tensor, in_tensor, group, sync_op, + use_calc_stream): group = 
collective._get_default_group() if group is None else group _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) if use_calc_stream: - return group.process_group.allgather_base_on_calc_stream( + return group.process_group.allgather_into_tensor_on_calc_stream( in_tensor, out_tensor) - task = group.process_group.allgather_base(in_tensor, out_tensor, sync_op) + task = group.process_group.allgather_into_tensor(in_tensor, out_tensor, + sync_op) if sync_op: task.wait() @@ -83,7 +84,7 @@ def all_gather(tensor_or_tensor_list, tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The output. If it is a tensor, it should be correctly-sized. If it is a list, it should be empty or contain correctly-sized tensors. tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support - float16, float32, float64, int32 or int64 as the input data type. + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. group (Group, optional): Communicate in which group. If none is given, use the global group as default. sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This @@ -125,8 +126,9 @@ def all_gather(tensor_or_tensor_list, if framework.in_dygraph_mode(): if paddle.is_tensor(tensor_or_tensor_list): - return _all_gather_base_in_dygraph(tensor_or_tensor_list, tensor, - group, sync_op, use_calc_stream) + return _all_gather_into_tensor_in_dygraph(tensor_or_tensor_list, + tensor, group, sync_op, + use_calc_stream) else: return _all_gather_in_dygraph(tensor_or_tensor_list, tensor, group, sync_op, use_calc_stream) diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index f94422f4bd0a6..67fc4c8b63a0c 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -40,8 +40,8 @@ def all_reduce(tensor, Args: tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support - float16, float32, float64, int32 or int64 as the input data type. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. group (Group, optional): Communicate in which group. If none is given, use the global group as default. sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This diff --git a/python/paddle/distributed/communication/stream/alltoall.py b/python/paddle/distributed/communication/stream/alltoall.py new file mode 100644 index 0000000000000..b216906d04568 --- /dev/null +++ b/python/paddle/distributed/communication/stream/alltoall.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _check_tensor_shape(tensor, shape, nranks=1): + if tensor.shape != shape: + raise RuntimeError('The tensor for alltoall is not correctly-sized.') + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + 'The tensor_list for alltoall is not correctly-sized.') + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + 'The tensor_list for alltoall is not correctly-sized.') + + +def _alltoall_tensor_in_dygraph(out_tensor, in_tensor, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.alltoall_tensor_on_calc_stream( + in_tensor, out_tensor) + + task = group.process_group.alltoall_tensor(in_tensor, out_tensor, sync_op) + if sync_op: + task.wait() + + return task + + +def _alltoall_in_dygraph(out_tensor_list, in_tensor_list, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + if len(in_tensor_list) == 0: + raise RuntimeError("The input tensor_list should not be empty.") + + if len(out_tensor_list) == 0: + out_tensor_list += [ + paddle.empty_like(tensor) for tensor in in_tensor_list + ] + else: + _check_tensor_list_shape(out_tensor_list, in_tensor_list[0].shape, + group.nranks) + + if use_calc_stream: + return group.process_group.alltoall_on_calc_stream( + in_tensor_list, out_tensor_list) + + task = group.process_group.alltoall(in_tensor_list, out_tensor_list, + sync_op) + if sync_op: + task.wait() + + return task + + +def alltoall(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Scatter a tensor (or a tensor list) across devices and gather outputs to another tensor (or a tensor list, respectively). + + Args: + out_tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The output. If it is a tensor, it should be correctly-sized. + If it is a list, it should be empty or contain correctly-sized tensors. Its data type should be the same as the input. + in_tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The input to scatter (must be specified on the source rank). + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. 
This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + out_tensor_list = [] + if dist.get_rank() == 0: + data1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + data2 = paddle.to_tensor([[7, 8, 9], [10, 11, 12]]) + else: + data1 = paddle.to_tensor([[13, 14, 15], [16, 17, 18]]) + data2 = paddle.to_tensor([[19, 20, 21], [22, 23, 24]]) + task = dist.stream.alltoall(out_tensor_list, [data1, data2], sync_op=False) + task.wait() + print(out_tensor_list) + # [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0) + # [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if out_tensor_or_tensor_list is None: + raise RuntimeError("The output should be specified.") + if in_tensor_or_tensor_list is None: + raise RuntimeError("The input should be specified.") + + if framework.in_dygraph_mode(): + out_is_tensor = paddle.is_tensor(out_tensor_or_tensor_list) + in_is_tensor = paddle.is_tensor(in_tensor_or_tensor_list) + if out_is_tensor and in_is_tensor: + return _alltoall_tensor_in_dygraph(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, group, + sync_op, use_calc_stream) + elif not out_is_tensor and not in_is_tensor: + return _alltoall_in_dygraph(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, group, + sync_op, use_calc_stream) + else: + raise RuntimeError( + "The output and input should be both tensor or tensor list.") + + raise RuntimeError( + "paddle.distributed.stream.alltoall is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/alltoall_single.py b/python/paddle/distributed/communication/stream/alltoall_single.py new file mode 100644 index 0000000000000..b2187cc06e343 --- /dev/null +++ b/python/paddle/distributed/communication/stream/alltoall_single.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _alltoall_single_in_dygraph(out_tensor, in_tensor, out_split_sizes, + in_split_sizes, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + if out_split_sizes is None: + out_split_sizes = [] + if in_split_sizes is None: + in_split_sizes = [] + + if use_calc_stream: + return group.process_group.alltoall_single_on_calc_stream( + in_tensor, out_tensor, in_split_sizes, out_split_sizes) + + task = group.process_group.alltoall_single(in_tensor, out_tensor, + in_split_sizes, out_split_sizes, + sync_op) + if sync_op: + task.wait() + + return task + + +def alltoall_single(out_tensor, + in_tensor, + out_split_sizes=None, + in_split_sizes=None, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Split the input tensor and scatter the split pieces to the output tensor across devices. + + Args: + out_tensor(Tensor): The output tensor. Its data type should be the same as the input. + in_tensor (Tensor): The input tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + out_split_sizes (List[int], optional): Split sizes of out_tensor for dim[0]. If not given, dim[0] of out_tensor must be divisible + by group size and out_tensor will be gathered evenly from all participants. If none is given, use an empty list as default. + in_split_sizes (List[int], optional): Split sizes of in_tensor for dim[0]. If not given, dim[0] of in_tensor must be divisible + by group size and in_tensor will be scattered evenly to all participants. If none is given, use an empty list as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand; only turn it on when you clearly understand its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + ..
code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + + # case 1 + output = paddle.empty([2], dtype="int64") + if local_rank == 0: + data = paddle.to_tensor([0, 1]) + else: + data = paddle.to_tensor([2, 3]) + task = dist.stream.alltoall_single(output, data, sync_op=False) + task.wait() + out = output.numpy() + # [0, 2] (2 GPUs, out for rank 0) + # [1, 3] (2 GPUs, out for rank 1) + + # case 2 + size = dist.get_world_size() + output = paddle.empty([(local_rank + 1) * size, size], dtype='float32') + if local_rank == 0: + data = paddle.to_tensor([[0., 0.], [0., 0.], [0., 0.]]) + else: + data = paddle.to_tensor([[1., 1.], [1., 1.], [1., 1.]]) + out_split_sizes = [local_rank + 1 for i in range(size)] + in_split_sizes = [i + 1 for i in range(size)] + task = dist.stream.alltoall_single(output, + data, + out_split_sizes, + in_split_sizes, + sync_op=False) + task.wait() + out = output.numpy() + # [[0., 0.], [1., 1.]] (2 GPUs, out for rank 0) + # [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _alltoall_single_in_dygraph(out_tensor, in_tensor, + out_split_sizes, in_split_sizes, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.alltoall_single is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/broadcast.py b/python/paddle/distributed/communication/stream/broadcast.py new file mode 100644 index 0000000000000..06bde316937a9 --- /dev/null +++ b/python/paddle/distributed/communication/stream/broadcast.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _broadcast_in_dygraph(tensor, src, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.broadcast_on_calc_stream(tensor, src) + + task = group.process_group.broadcast(tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Broadcast a tensor to all devices. + + Args: + tensor (Tensor): The tensor to broadcast. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. 
+ sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.broadcast(data, src=1, sync_op=False) + task.wait() + out = data.numpy() + # [[1, 2, 3], [1, 2, 3]] (2 GPUs) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _broadcast_in_dygraph(tensor, src, group, sync_op, + use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.broadcast is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/recv.py b/python/paddle/distributed/communication/stream/recv.py index b225f64b8b4d2..d572dd44622b8 100644 --- a/python/paddle/distributed/communication/stream/recv.py +++ b/python/paddle/distributed/communication/stream/recv.py @@ -64,7 +64,7 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): task = dist.stream.recv(data, src=0, sync_op=False) task.wait() out = data.numpy() - # [[4, 5, 6], [4, 5, 6] + # [[4, 5, 6], [4, 5, 6]] (2 GPUs) """ if group is not None and not group.is_member(): raise RuntimeError( diff --git a/python/paddle/distributed/communication/stream/reduce.py b/python/paddle/distributed/communication/stream/reduce.py new file mode 100644 index 0000000000000..b0f7f5c884743 --- /dev/null +++ b/python/paddle/distributed/communication/stream/reduce.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid.framework as framework +from paddle.distributed.communication.group import _get_global_group +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp + + +def _reduce_in_dygraph(tensor, dst, op, group, sync_op, use_calc_stream): + op_type = _get_reduce_op(op, "reduce") + group = _get_global_group() if group is None else group + if use_calc_stream: + return group.process_group.reduce_on_calc_stream(tensor, dst, op_type) + + task = group.process_group.reduce(tensor, dst, op_type, sync_op) + if sync_op: + task.wait() + + return task + + +def reduce(tensor, + dst=0, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Perform specific reduction (for example, sum, max) on a tensor across devices and send to the destintion device. + + Args: + tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + dst (int, optional): Rank of the destination device. If none is given, use `0` as default. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.reduce(data, dst=0, sync_op=False) + task.wait() + out = data.numpy() + # [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0) + # [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _reduce_in_dygraph(tensor, dst, op, group, sync_op, + use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.reduce is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/reduce_scatter.py b/python/paddle/distributed/communication/stream/reduce_scatter.py new file mode 100644 index 0000000000000..a4aeae6312a30 --- /dev/null +++ b/python/paddle/distributed/communication/stream/reduce_scatter.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.distributed as dist +import paddle.fluid.framework as framework +from paddle.distributed.communication.group import _get_global_group +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp + + +def _check_tensor_shape(tensor, shape, nranks=1): + expect_shape = list(shape) + expect_shape[0] //= nranks + if list(tensor.shape) != expect_shape: + raise RuntimeError( + "The in_tensor for reduce_scatter is not correctly-sized.") + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + f"The tensor_list for reduce_scatter is not correctly-sized.") + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + f"The tensor_list for reduce_scatter is not correctly-sized.") + + +def _reduce_scatter_tensor_in_dygraph(out_tensor, + in_tensor, + op, + group, + sync_op, + use_calc_stream, + caller="reduce_scatter"): + op_type = _get_reduce_op(op, caller) + group = _get_global_group() if group is None else group + + _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.reduce_scatter_tensor_on_calc_stream( + in_tensor, out_tensor, op_type) + + task = group.process_group.reduce_scatter_tensor(in_tensor, out_tensor, + op_type, sync_op) + if sync_op: + task.wait() + + return task + + +def _reduce_scatter_in_dygraph(tensor, tensor_list, op, group, sync_op, + use_calc_stream): + op_type = _get_reduce_op(op, "reduce_scatter") + group = _get_global_group() if group is None else group + + _check_tensor_list_shape(tensor_list, tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.reduce_scatter_on_calc_stream( + tensor_list, tensor, op_type) + + task = group.process_group.reduce_scatter(tensor_list, tensor, op_type, + sync_op) + if sync_op: + task.wait() + + return task + + +def reduce_scatter(tensor, + tensor_or_tensor_list, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Reduce, then scatter a tensor (or a tensor list) across devices. + + Args: + tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + tensor_list (List[Tensor]]): The input to scatter. + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. 
+ + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data1 = paddle.to_tensor([0, 1]) + data2 = paddle.to_tensor([2, 3]) + else: + data1 = paddle.to_tensor([4, 5]) + data2 = paddle.to_tensor([6, 7]) + dist.stream.reduce_scatter(data1, [data1, data2]) + out = data1.numpy() + # [4, 6] (2 GPUs, out for rank 0) + # [8, 10] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _reduce_scatter_tensor_in_dygraph(tensor, + tensor_or_tensor_list, op, + group, sync_op, + use_calc_stream) + else: + return _reduce_scatter_in_dygraph(tensor, tensor_or_tensor_list, op, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.reduce_scatter is only supported in dygraph mode now." + ) + + +def _reduce_scatter_base(out_tensor, + in_tensor, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Reduce, then scatter a flattened tensor across devices. + + Args: + out_tensor (Tensor): The output tensor on each rank. The result will overwrite this tensor after communication. Support + float16, float32, float64, int32 or int64 as the input data type. + in_tensor (Tensor): The input tensor to reduce and scatter. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand; only turn it on when you clearly understand its meaning. + + Returns: + Return a task object. + + Warning: + This API will be deprecated in the future, and only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data = paddle.to_tensor([0, 1, 2, 3]) + else: + data = paddle.to_tensor([4, 5, 6, 7]) + output = paddle.empty([2], dtype="int64") + task = dist.stream._reduce_scatter_base(output, data, sync_op=False) + task.wait() + out = output.numpy() + # [4, 6] (2 GPUs, out for rank 0) + # [8, 10] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group."
+ ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _reduce_scatter_tensor_in_dygraph(out_tensor, in_tensor, op, + group, sync_op, + use_calc_stream, + "_reduce_scatter_base") + + raise RuntimeError( + "paddle.distributed.stream._reduce_scatter_base is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/scatter.py b/python/paddle/distributed/communication/stream/scatter.py new file mode 100644 index 0000000000000..3a3fb00534783 --- /dev/null +++ b/python/paddle/distributed/communication/stream/scatter.py @@ -0,0 +1,162 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.distributed as dist +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _check_tensor_shape(tensor, shape, nranks=1): + expect_shape = list(shape) + expect_shape[0] //= nranks + if list(tensor.shape) != expect_shape: + raise RuntimeError("The in_tensor for scatter is not correctly-sized.") + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + f"The tensor_list for scatter is not correctly-sized.") + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + f"The tensor_list for scatter is not correctly-sized.") + + +def _scatter_tensor_in_dygraph(out_tensor, in_tensor, src, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + src_rank = group.get_group_rank(src) + if src_rank == -1: + raise RuntimeError("Src rank out of group.") + + nranks = group.nranks + rank = dist.get_rank() + if rank == src_rank: + _check_tensor_shape(out_tensor, in_tensor.shape, nranks) + + if use_calc_stream: + return group.process_group.scatter_tensor_on_calc_stream( + in_tensor, out_tensor, src) + + task = group.process_group.scatter_tensor(in_tensor, out_tensor, src, + sync_op) + if sync_op: + task.wait() + + return task + + +def _scatter_in_dygraph(tensor, tensor_list, src, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + src_rank = group.get_group_rank(src) + if src_rank == -1: + raise RuntimeError("Src rank out of group.") + + nranks = group.nranks + rank = dist.get_rank() + if rank == src_rank: + if len(tensor_list) == 0: + raise RuntimeError( + "The tensor_list should not be empty on src rank.") + _check_tensor_list_shape(tensor_list, tensor.shape, nranks) + else: + tensor_list = [tensor for _ in range(nranks)] + + if use_calc_stream: + return group.process_group.scatter_on_calc_stream( + tensor_list, tensor, src) + + task = group.process_group.scatter(tensor_list, tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def scatter(tensor, + tensor_or_tensor_list=None, + src=0, + group=None, + sync_op=True, + 
use_calc_stream=False): + """ + + Scatter a tensor (or a tensor list) across devices. + + Args: + tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The input to scatter (default is `None`, must be specified on the source rank). + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data1 = paddle.to_tensor([7, 8, 9]) + data2 = paddle.to_tensor([10, 11, 12]) + dist.stream.scatter(data1, src=1) + else: + data1 = paddle.to_tensor([1, 2, 3]) + data2 = paddle.to_tensor([4, 5, 6]) + dist.stream.scatter(data1, [data1, data2], src=1) + out = data1.numpy() + # [1, 2, 3] (2 GPUs, out for rank 0) + # [4, 5, 6] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if tensor_or_tensor_list is None: + raise RuntimeError("The input should be specified.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _scatter_tensor_in_dygraph(tensor, tensor_or_tensor_list, + src, group, sync_op, + use_calc_stream) + else: + return _scatter_in_dygraph(tensor, tensor_or_tensor_list, src, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.scatter is only supported in dygraph mode now." 
+ ) diff --git a/python/paddle/distributed/communication/stream/send.py b/python/paddle/distributed/communication/stream/send.py index fa052734c7ee7..49eceed55e31f 100644 --- a/python/paddle/distributed/communication/stream/send.py +++ b/python/paddle/distributed/communication/stream/send.py @@ -64,7 +64,7 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False): task = dist.stream.recv(data, src=0, sync_op=False) task.wait() out = data.numpy() - # [[4, 5, 6], [4, 5, 6] + # [[4, 5, 6], [4, 5, 6]] (2 GPUs) """ if group is not None and not group.is_member(): raise RuntimeError( diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index d7ee67c10f435..a9db1e0bc7eec 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -282,6 +282,54 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) set_tests_properties(test_communication_stream_allreduce_api PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_alltoall_api MODULES + test_communication_stream_alltoall_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_alltoall_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_alltoall_single_api MODULES + test_communication_stream_alltoall_single_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_alltoall_single_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_broadcast_api MODULES + test_communication_stream_broadcast_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_broadcast_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_reduce_api MODULES + test_communication_stream_reduce_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_reduce_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_reduce_scatter_api MODULES + test_communication_stream_reduce_scatter_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_reduce_scatter_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_scatter_api MODULES + test_communication_stream_scatter_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_scatter_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_communication_stream_sendrecv_api MODULES diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py new file mode 100644 index 
0000000000000..8e65ea8d8aee5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py @@ -0,0 +1,113 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllToAllTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + nranks = len(test_data_list) + data1 = test_data_list[0] + data2 = test_data_list[1] + result1 = np.vstack( + [data1[0:data1.shape[0] // 2, :], data2[0:data2.shape[0] // 2, :]]) + result2 = np.vstack( + [data1[data1.shape[0] // 2:, :], data2[data2.shape[0] // 2:, :]]) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + t1, t2 = paddle.split(tensor, nranks, axis=0) + + # case 1: pass an empty tensor list + empty_tensor_list = [] + task = dist.stream.alltoall(empty_tensor_list, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + result_tensor_list = np.vstack(empty_tensor_list) + if rank == 0: + assert np.allclose(result_tensor_list, + result1, + rtol=1e-05, + atol=1e-05) + else: + assert np.allclose(result_tensor_list, + result2, + rtol=1e-05, + atol=1e-05) + + # case 2: pass a pre-sized tensor list + full_tensor_list = [paddle.empty_like(t1) for _ in test_data_list] + task = dist.stream.alltoall(full_tensor_list, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + result_tensor_list = np.vstack(full_tensor_list) + if rank == 0: + assert np.allclose(result_tensor_list, + result1, + rtol=1e-05, + atol=1e-05) + else: + assert np.allclose(result_tensor_list, + result2, + rtol=1e-05, + atol=1e-05) + + # case 3: pass a pre-sized tensor + out_tensor = paddle.empty_like(tensor) + task = dist.stream.alltoall(out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(out_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(out_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + 
StreamAllToAllTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py new file mode 100644 index 0000000000000..9bdfe124b0b49 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllToAllSingleTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + nranks = len(test_data_list) + data1 = paddle.to_tensor(test_data_list[0]) + data2 = paddle.to_tensor(test_data_list[1]) + result1 = np.vstack( + (data1[0:data1.shape[0] // 2, :], data2[0:data2.shape[0] // 2, :])) + result2 = np.vstack( + (data1[data1.shape[0] // 2:, :], data2[data2.shape[0] // 2:, :])) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + out_tensor = paddle.empty_like(tensor) + task = dist.stream.alltoall_single( + out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(out_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(out_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamAllToAllSingleTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py new file mode 100644 index 0000000000000..487dfd6ae6894 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamBroadcastTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + src_rank = 1 + result = test_collective_base.create_test_data( + shape=self._shape, dtype=self._dtype, seed=self._seeds[src_rank]) + tensor = paddle.to_tensor(result) + task = dist.stream.broadcast(tensor, + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamBroadcastTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py new file mode 100644 index 0000000000000..a487eac566ab5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py @@ -0,0 +1,66 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
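For reference, a minimal dygraph sketch of how the stream broadcast API exercised by the test above is driven; it assumes two processes started via paddle.distributed.launch, and the tensor values are illustrative only:

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    # every rank holds a buffer; rank 1 acts as the source, mirroring the test above
    data = paddle.to_tensor([1.0, 2.0, 3.0]) * (dist.get_rank() + 1)
    task = dist.stream.broadcast(data, src=1, sync_op=False, use_calc_stream=False)
    task.wait()  # after wait(), every rank holds rank 1's buffer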
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamReduceTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + task = dist.stream.reduce(tensor, + dst=1, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + result = sum(test_data_list) + if rank == 1: + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(tensor, + test_data_list[rank], + rtol=1e-05, + atol=1e-05) + + +if __name__ == "__main__": + StreamReduceTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py new file mode 100644 index 0000000000000..8f66d67e0d58c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
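Along the same lines, a minimal sketch of the stream reduce path covered by the test above (two ranks assumed; the destination rank and values are illustrative):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    data = paddle.to_tensor([1.0, 2.0, 3.0]) * (dist.get_rank() + 1)
    # the reduced sum lands on rank 1; the other rank keeps its local buffer
    task = dist.stream.reduce(data, dst=1, sync_op=False, use_calc_stream=False)
    task.wait()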
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamReduceScatterTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + reduce_result = sum(test_data_list) + result1 = reduce_result[0:reduce_result.shape[0] // 2] + result2 = reduce_result[reduce_result.shape[0] // 2:] + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + # case 1: pass a pre-sized tensor list + t1, t2 = paddle.split(tensor, 2, axis=0) + result_tensor = paddle.empty_like(t1) + task = dist.stream.reduce_scatter(result_tensor, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + # case 2: pass a pre-sized tensor + result_tensor = paddle.empty_like(t1) + task = dist.stream.reduce_scatter(result_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + # case 3: test the legacy API + result_tensor = paddle.empty_like(t1) + task = dist.stream._reduce_scatter_base( + result_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamReduceScatterTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py new file mode 100644 index 0000000000000..6060e5050ca09 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
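The reduce_scatter test above exercises three call shapes; a condensed sketch under the same two-rank assumption (keyword defaults are assumed to match the tests, and sync_op=True avoids an explicit wait):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    full = paddle.arange(4, dtype="float32") + dist.get_rank()
    t1, t2 = paddle.split(full, 2, axis=0)
    out = paddle.empty_like(t1)

    dist.stream.reduce_scatter(out, [t1, t2], sync_op=True)    # list of shards
    dist.stream.reduce_scatter(out, full, sync_op=True)        # single concatenated tensor
    dist.stream._reduce_scatter_base(out, full, sync_op=True)  # legacy flattened form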
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamScatterTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + src_rank = 1 + src_data = test_data_list[src_rank] + result1 = src_data[0:src_data.shape[0] // 2] + result2 = src_data[src_data.shape[0] // 2:] + + rank = dist.get_rank() + + # case 1: pass a pre-sized tensor list + tensor = paddle.to_tensor(test_data_list[rank]) + t1, t2 = paddle.split(tensor, 2, axis=0) + task = dist.stream.scatter(t1, [t1, t2], + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == src_rank: + assert np.allclose(t1, result2, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(t1, result1, rtol=1e-05, atol=1e-05) + + # case 2: pass a pre-sized tensor + tensor = paddle.to_tensor(src_data) + t1 = paddle.empty_like(t1) + task = dist.stream.scatter(t1, + tensor, + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == src_rank: + assert np.allclose(t1, result2, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(t1, result1, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamScatterTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py index 175e24c3d0d86..c22e734adf2fb 100644 --- a/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py @@ -45,22 +45,25 @@ def run_test_case(self): dtype=self._dtype, seed=seed)) + src_rank = 0 + dst_rank = 1 + rank = dist.get_rank() tensor = paddle.to_tensor(test_data_list[rank]) if rank == 0: task = dist.stream.send(tensor, - dst=1, + dst=dst_rank, sync_op=self._sync_op, use_calc_stream=self._use_calc_stream) else: task = dist.stream.recv(tensor, - src=0, + src=src_rank, sync_op=self._sync_op, use_calc_stream=self._use_calc_stream) if not self._sync_op: task.wait() - result = test_data_list[0] + result = test_data_list[src_rank] assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py new file mode 100644 index 0000000000000..4fa55d86840bc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllToAllAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllToAllAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_alltoall_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_alltoall_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllToAllAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py new file mode 100644 index 0000000000000..f1f099b9571f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
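A compact sketch combining the scatter and send/recv paths added above (two ranks; rank 1 scatters its shards and rank 0 sends; shapes and values are illustrative):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    rank = dist.get_rank()

    # scatter: the source rank's two shards are handed out, one per rank
    buf = paddle.arange(4, dtype="float32") + rank
    shard0, shard1 = paddle.split(buf, 2, axis=0)
    dist.stream.scatter(shard0, [shard0, shard1], src=1, sync_op=True)

    # point-to-point: rank 0 sends its buffer, rank 1 receives into its own
    if rank == 0:
        dist.stream.send(buf, dst=1, sync_op=True)
    else:
        dist.stream.recv(buf, src=0, sync_op=True)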
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllToAllSingleAPI( + test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllToAllSingleAPI, + self).setUp(num_of_devices=2, timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_alltoall_single_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case( + "communication_stream_alltoall_single_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllToAllSingleAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py new file mode 100644 index 0000000000000..07537a480e851 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamBroadcastAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamBroadcastAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_broadcast_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_broadcast_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamBroadcastAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py new file mode 100644 index 0000000000000..c8a04c8d893e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamReduceAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamReduceAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_reduce_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamReduceAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py new file mode 100644 index 0000000000000..a90e634860d95 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
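Each driver above sweeps the Cartesian product of the changeable settings and skips the one unsupported combination; a trimmed sketch of that loop, where the script name is a placeholder and the test utilities are assumed to be importable from the test directory:

    import test_communication_api_base as test_base

    default_envs = {"backend": "nccl", "shape": "(100, 200)", "dtype": "float32"}
    changeable_envs = {"sync_op": ["True", "False"],
                       "use_calc_stream": ["True", "False"]}

    for envs in test_base.gen_product_envs_list(default_envs, changeable_envs):
        # use_calc_stream=True is only valid when the op is also a sync op
        if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]):
            continue
        print("would launch communication_stream_<op>_api_dygraph.py with", envs)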
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamReduceScatterAPI( + test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamReduceScatterAPI, + self).setUp(num_of_devices=2, timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_scatter_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case( + "communication_stream_reduce_scatter_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamReduceScatterAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py new file mode 100644 index 0000000000000..d96d931f43fbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamScatterAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamScatterAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_scatter_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamScatterAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index c6c7c13937f38..60a2c4af239ed 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -34,6 +34,12 @@ test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_ test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_communication_stream_allgather_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_alltoall_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_alltoall_single_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_broadcast_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_reduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_reduce_scatter_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_scatter_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., From 868b1e1c59bcc052af91c53ee55603dbcbadf102 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 10 Oct 2022 11:32:01 +0800 Subject: [PATCH 4/6] Move group and all reduce from collective to communication (#45848) --- .../collective/ProcessGroupGloo.cc | 8 + .../distributed/collective/ProcessGroupGloo.h | 6 + python/paddle/distributed/collective.py | 173 ++---------------- .../distributed/communication/all_reduce.py | 87 +++++++++ .../paddle/distributed/communication/group.py | 94 ++++++++++ .../{comm_utils.py => reduce.py} | 28 ++- .../communication/stream/all_reduce.py | 48 +++-- .../paddle/distributed/fleet/base/topology.py | 4 +- .../distributed/fleet/layers/mpu/mp_ops.py | 2 +- python/paddle/distributed/parallel.py | 9 +- .../distributed/models/moe/moe_layer.py | 1 - 11 files changed, 276 insertions(+), 184 
deletions(-) create mode 100644 python/paddle/distributed/communication/all_reduce.py create mode 100644 python/paddle/distributed/communication/group.py rename python/paddle/distributed/communication/{comm_utils.py => reduce.py} (59%) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index b23942b114f3b..097c9799b70f2 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -293,6 +293,14 @@ std::shared_ptr ProcessGroupGloo::AllReduce( std::vector& inputs, std::vector& outputs, const AllreduceOptions& opts) { + return AllReduce(inputs, outputs, opts, true); +} + +std::shared_ptr ProcessGroupGloo::AllReduce( + std::vector& inputs, + std::vector& outputs, + const AllreduceOptions& opts, + bool sync_op) { auto tag = next_tag(); std::shared_ptr task; auto context = get_context(); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 95ce18c1d8217..d911da91eb1a3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -120,6 +120,12 @@ class ProcessGroupGloo : public ProcessGroup { std::vector& outputs, const AllreduceOptions& opts = AllreduceOptions()) override; + std::shared_ptr AllReduce( + std::vector& inputs, + std::vector& outputs, + const AllreduceOptions& opts, + bool sync_op) override; + std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index e1ee362cadfd7..2af34d942e0b1 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -52,54 +52,12 @@ from .fleet.layers.mpu.mp_ops import _linear from .fleet.layers.mpu.mp_ops import _parallel_linear from .fleet.layers.mpu.mp_ops import _parallel_embedding -from .communication.comm_utils import ReduceOp +from .communication.group import Group, _add_new_group +from .communication.all_reduce import all_reduce +from .communication.reduce import _get_reduce_op, ReduceOp __all__ = [] - -class Group(): - """ - The abstract representation of group. 
- """ - - def __init__(self, rank, rank_num, id=0, ranks=[], pg=None, name=None): - self.rank = rank - self.nranks = rank_num - self.id = id - self.ranks = ranks - self.pg = pg - self.name = name - - def is_member(self): - if self.rank < 0: - return False - if self.nranks < 2: - return False - return True - - def get_group_rank(self, rank): - if self.is_member() and rank in self.ranks: - return self.ranks.index(rank) - else: - return -1 - - @property - def process_group(self): - return self.pg - - @property - def world_size(self): - return self.nranks if self.rank >= 0 else -1 - - def __repr__(self): - debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( - self.rank, self.nranks, self.id) - debug_str += ", ".join(map(str, self.ranks)) - debug_str += "; name: " - debug_str += self.name if self.name else "None" - return debug_str - - _global_env = None @@ -147,9 +105,8 @@ def _get_group_map(): global _group_map if _global_env_gid not in _group_map: genv = _get_global_env() - _group_map[_global_env_gid] = Group(genv.rank, - genv.world_size, - ranks=list(range(genv.world_size))) + _group_map[_global_env_gid] = Group(genv.rank, 0, + list(range(genv.world_size))) return _group_map @@ -197,19 +154,6 @@ def _new_ring_id(): return len(_get_group_map()) + max(_get_global_env().nrings, 9) -def _get_reduce_op(reduce_op, func_name): - if reduce_op == ReduceOp.SUM: - return core.ReduceOp.SUM - elif reduce_op == ReduceOp.MAX: - return core.ReduceOp.MAX - elif reduce_op == ReduceOp.MIN: - return core.ReduceOp.MIN - elif reduce_op == ReduceOp.PROD: - return core.ReduceOp.PRODUCT - else: - raise ValueError("Unknown reduce_op type for {}.".format(func_name)) - - def get_group(id=0): """ @@ -451,10 +395,13 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): else: rank = -1 pg = None - group = Group(rank, size, id=gid, ranks=ranks, pg=pg, name=group_name) + group = Group(rank, gid, ranks, pg=pg, name=group_name) _group_map_by_name[group_name] = group _group_map[gid] = group _group_map_backend[group] = backend + #TODO: The method below is a new method for group management, will replace the previous + # three in the future. + _add_new_group(group) # TODO(shenliang03): This is a temporary solution to solve the problem of # hang caused by tcp @@ -476,13 +423,13 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): ring_id = _new_ring_id() if global_rank not in ranks: - gp = Group(-1, -1, ring_id, ranks) + gp = Group(-1, ring_id, ranks) _group_map[ring_id] = gp else: ranks = sorted(ranks) group_rank = ranks.index(global_rank) group_size = len(ranks) - gp = Group(group_rank, group_size, ring_id, ranks) + gp = Group(group_rank, ring_id, ranks) _group_map[ring_id] = gp if group_size >= 2: @@ -748,104 +695,6 @@ def broadcast(tensor, src, group=None, sync_op=True): }) -def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True): - """ - - Reduce a tensor over all ranks so that all get the result. - As shown below, one process is started with a GPU and the data of this process is represented - by its group rank. The reduce operator is sum. Through all_reduce operator, - each GPU will have the sum of the data from all GPUs. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png - :width: 800 - :alt: all_reduce - :align: center - - Args: - tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. 
- op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. - group (Group, optional): The group instance return by new_group or None for global default group. - sync_op (bool, optional): Wether this op is a sync op. Default value is True. - - Returns: - None. - - Examples: - .. code-block:: python - - # required: distributed - import paddle - import paddle.distributed as dist - - dist.init_parallel_env() - if dist.get_rank() == 0: - data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) - else: - data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) - dist.all_reduce(data) - print(data) - # [[5, 7, 9], [5, 7, 9]] (2 GPUs) - """ - if group is not None and not group.is_member(): - return - - if in_dygraph_mode(): - op_type = _get_reduce_op(op, "all_reduce") - group = _get_default_group() if group is None else group - task = group.process_group.allreduce(tensor, op_type) - if sync_op: - task.wait() - return None - else: - return task - - use_calc_stream = sync_op - ring_id = 0 if group is None else group.id - if _non_static_mode(): - if op == ReduceOp.SUM: - return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.MAX: - return _legacy_C_ops.c_allreduce_max_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.MIN: - return _legacy_C_ops.c_allreduce_min_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.PROD: - return _legacy_C_ops.c_allreduce_prod_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - else: - raise ValueError("Unknown parameter: {}.".format(op)) - - check_variable_and_dtype(tensor, 'tensor', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' - ], 'all_reduce') - if op == ReduceOp.SUM: - op_type = 'c_allreduce_sum' - elif op == ReduceOp.MAX: - op_type = 'c_allreduce_max' - elif op == ReduceOp.MIN: - op_type = 'c_allreduce_min' - elif op == ReduceOp.PROD: - op_type = 'c_allreduce_prod' - if not isinstance(ring_id, int): - raise ValueError("The type of 'ring_id' for all_reduce should be int.") - helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream - }) - - def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): """ diff --git a/python/paddle/distributed/communication/all_reduce.py b/python/paddle/distributed/communication/all_reduce.py new file mode 100644 index 0000000000000..737e0cbbfb56c --- /dev/null +++ b/python/paddle/distributed/communication/all_reduce.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
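The relocated helper defined below keeps the public signature; in eager mode it simply forwards to the stream API, as the comment in this sketch notes (two ranks assumed, values illustrative):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
    # in eager mode this forwards to the stream API, roughly:
    #   dist.stream.all_reduce(data, op=dist.ReduceOp.SUM, sync_op=True, use_calc_stream=False)
    dist.all_reduce(data)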
+ +import paddle +import paddle.fluid.framework as framework +from paddle.distributed.communication import stream as stream +from paddle.distributed.communication.reduce import ReduceOp + + +def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True): + """ + + Reduce a tensor over all ranks so that all get the result. + As shown below, one process is started with a GPU and the data of this process is represented + by its group rank. The reduce operator is sum. Through all_reduce operator, + each GPU will have the sum of the data from all GPUs. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png + :width: 800 + :alt: all_reduce + :align: center + + Args: + tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type + should be float16, float32, float64, int32, int64, int8, uint8 or bool. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. + group (Group, optional): The group instance return by new_group or None for global default group. + sync_op (bool, optional): Wether this op is a sync op. Default value is True. + + Returns: + Return a task object. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + dist.all_reduce(data) + print(data) + # [[5, 7, 9], [5, 7, 9]] (2 GPUs) + """ + if not framework._in_legacy_dygraph(): + return stream.all_reduce(tensor, + op=op, + group=group, + sync_op=sync_op, + use_calc_stream=False) + + # code below will be removed after we remove the old dygraph + use_calc_stream = sync_op + ring_id = 0 if group is None else group.id + if op == ReduceOp.SUM: + return paddle._legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.MAX: + return paddle._legacy_C_ops.c_allreduce_max_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.MIN: + return paddle._legacy_C_ops.c_allreduce_min_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.PROD: + return paddle._legacy_C_ops.c_allreduce_prod_(tensor, 'use_calc_stream', + use_calc_stream, + 'ring_id', ring_id) + else: + raise ValueError("Unknown parameter: {}.".format(op)) diff --git a/python/paddle/distributed/communication/group.py b/python/paddle/distributed/communication/group.py new file mode 100644 index 0000000000000..6b4e545b245d1 --- /dev/null +++ b/python/paddle/distributed/communication/group.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Group(): + """ + The abstract representation of group. 
+ """ + + def __init__(self, rank_in_group, id, ranks, pg=None, name=None): + self._rank_in_group = rank_in_group + self._world_size = len(ranks) if rank_in_group >= 0 else -1 + self._id = id + self._ranks = ranks + self._pg = pg + self._name = name + + @property + def rank(self): + return self._rank_in_group + + @property + def ranks(self): + return self._ranks + + @property + def nranks(self): + return len(self._ranks) + + @property + def name(self): + return self._name + + @property + def process_group(self): + return self._pg + + @property + def world_size(self): + return self._world_size + + @property + def id(self): + return self._id + + def is_member(self): + if self.rank < 0: + return False + if self.nranks < 2: + return False + return True + + def get_group_rank(self, rank): + if self.is_member(): + return self.ranks.index(rank) + else: + return -1 + + def __repr__(self): + debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( + self.rank, self.nranks, self.id) + debug_str += ", ".join(map(str, self.ranks)) + debug_str += "; name: " + debug_str += self.name if self.name else "None" + return debug_str + + +class _GroupManager(): + global_group_id = 0 + group_map_by_id = {} + + +def _get_global_group(): + if _GroupManager.global_group_id not in _GroupManager.group_map_by_id: + raise RuntimeError("The global group is not initialized.") + return _GroupManager.group_map_by_id[_GroupManager.global_group_id] + + +def _add_new_group(group): + if group.id in _GroupManager.group_map_by_id: + raise RuntimeError("The group with id {} already exist.".format( + group.id)) + _GroupManager.group_map_by_id[group.id] = group diff --git a/python/paddle/distributed/communication/comm_utils.py b/python/paddle/distributed/communication/reduce.py similarity index 59% rename from python/paddle/distributed/communication/comm_utils.py rename to python/paddle/distributed/communication/reduce.py index 62e1bcb4cca94..5caa5bebedfd8 100644 --- a/python/paddle/distributed/communication/comm_utils.py +++ b/python/paddle/distributed/communication/reduce.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle.fluid.framework as framework +import paddle.fluid.core as core + class ReduceOp: """ @@ -48,3 +51,26 @@ class ReduceOp: MIN = 2 PROD = 3 AVG = 4 + + +def _get_reduce_op(reduce_op, func_name): + if framework.in_dygraph_mode(): + if reduce_op == ReduceOp.SUM: + return core.ReduceOp.SUM + elif reduce_op == ReduceOp.MAX: + return core.ReduceOp.MAX + elif reduce_op == ReduceOp.MIN: + return core.ReduceOp.MIN + elif reduce_op == ReduceOp.PROD: + return core.ReduceOp.PRODUCT + else: + if reduce_op == ReduceOp.SUM: + return 'c_allreduce_sum' + elif reduce_op == ReduceOp.MAX: + return 'c_allreduce_max' + elif reduce_op == ReduceOp.MIN: + return 'c_allreduce_min' + elif reduce_op == ReduceOp.PROD: + return 'c_allreduce_prod' + + raise ValueError("Unknown reduce_op type for {}.".format(func_name)) diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 67fc4c8b63a0c..0ba161a078ab8 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -12,13 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.distributed.collective as collective import paddle.fluid.framework as framework +import paddle.fluid.data_feeder as data_feeder +import paddle.fluid.layer_helper as layer_helper +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp +from paddle.distributed.communication.group import _get_global_group def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): - op_type = collective._get_reduce_op(op, "all_reduce") - group = collective._get_default_group() if group is None else group + op_type = _get_reduce_op(op, "all_reduce") + + group = _get_global_group() if group is None else group if use_calc_stream: return group.process_group.allreduce_on_calc_stream(tensor, op_type) @@ -29,8 +33,34 @@ def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): return task +def _all_reduce_in_static_mode(tensor, op, group, sync_op, use_calc_stream): + data_feeder.check_variable_and_dtype(tensor, 'tensor', [ + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' + ], 'all_reduce') + + op_type = _get_reduce_op(op, "all_reduce") + ring_id = 0 if group is None else group.id + + if not isinstance(ring_id, int): + raise ValueError("The type of 'ring_id' for all_reduce should be int.") + + # TODO: Support task and use task.wait in static mode + # Use use_calc_stream rather than sync_op + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op(type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': sync_op + }) + + return None + + def all_reduce(tensor, - op=collective.ReduceOp.SUM, + op=ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False): @@ -50,9 +80,6 @@ def all_reduce(tensor, Returns: Return a task object. - Warning: - This API only supports the dygraph mode now. - Examples: .. code-block:: python @@ -84,7 +111,6 @@ def all_reduce(tensor, if framework.in_dygraph_mode(): return _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream) - - raise RuntimeError( - "paddle.distributed.stream.all_reduce is only supported in dygraph mode now." 
- ) + else: + return _all_reduce_in_static_mode(tensor, op, group, sync_op, + use_calc_stream) diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index bbaca8951205b..305452e99f380 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -378,8 +378,8 @@ def __init__(self): def set_comm_group(self, group_name, group_rank, group_size, ring_id, group_ranks): - group = paddle.distributed.collective.Group(group_rank, group_size, - ring_id, group_ranks) + group = paddle.distributed.collective.Group(group_rank, ring_id, + group_ranks) self.groups[group_name] = group def get_group(self, group_name): diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py index dc4dc05c7ba41..a2f3bde6cfc64 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -22,7 +22,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.dygraph import layers from paddle.distributed import collective -from ....communication.comm_utils import ReduceOp +from ....communication.reduce import ReduceOp from paddle.fluid.data_feeder import check_dtype import paddle.fluid.dygraph_utils as dygraph_utils diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 6cda451a266e8..507a765d0c550 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -43,6 +43,7 @@ from paddle.distributed.collective import _new_process_group_impl from paddle.distributed.collective import Group from paddle.distributed.collective import _set_group_map_backend +from paddle.distributed.communication.group import _add_new_group __all__ = [] @@ -258,15 +259,11 @@ def train(): _default_group_name, pg_options=None) ranks = list(range(world_size)) - group = Group(rank, - world_size, - id=0, - ranks=ranks, - pg=pg, - name=_default_group_name) + group = Group(rank, 0, ranks, pg=pg, name=_default_group_name) _set_group_map_by_name(_default_group_name, group) _set_group_map(0, group) _set_group_map_backend(group, backend) + _add_new_group(group) parallel_helper._set_parallel_ctx(True) paddle.distributed.barrier(group=group) diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index e74a712f09682..0a0fe32a8e918 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -265,7 +265,6 @@ class MoELayer(nn.Layer): from paddle.distributed import fleet moe_group = Group(fleet.worker_index(), - fleet.worker_num(), 0, list(range(fleet.worker_num()))) mp_group = None From ae3d48deb6ab6052a2dd9624e3aec99bbf85c5f7 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Tue, 11 Oct 2022 19:08:57 +0800 Subject: [PATCH 5/6] Completes bfloat16 dtype for collective api in eager mode (#45844) --- .../collective/ProcessGroupGloo.cc | 3 + .../collective/ProcessGroupNCCL.cc | 3 + .../fluid/platform/device/gpu/nccl_helper.h | 4 +- python/paddle/distributed/collective.py | 37 +-- .../tests/unittests/collective/CMakeLists.txt | 18 +- .../collective_allgather_api_dygraph.py | 15 +- .../collective_allreduce_api_dygraph.py | 13 +- .../collective_alltoall_api_dygraph.py | 29 ++- .../collective_alltoall_single_api_dygraph.py | 16 +- 
.../collective_broadcast_api_dygraph.py | 13 +- .../collective_isend_irecv_api_dygraph.py | 23 +- .../collective_reduce_api_dygraph.py | 13 +- .../collective_reduce_scatter_api_dygraph.py | 16 +- .../collective_scatter_api_dygraph.py | 29 ++- .../collective_sendrecv_api_dygraph.py | 33 +-- .../test_collective_allgather_api.py | 244 +++--------------- .../test_collective_allreduce_api.py | 12 +- .../test_collective_alltoall_api.py | 8 +- .../test_collective_alltoall_single_api.py | 8 +- .../test_collective_broadcast_api.py | 12 +- .../test_collective_isend_irecv_api.py | 8 +- .../collective/test_collective_reduce_api.py | 12 +- .../test_collective_reduce_scatter_api.py | 8 +- .../collective/test_collective_scatter_api.py | 12 +- .../test_collective_sendrecv_api.py | 8 +- .../tests/unittests/collective/testslist.csv | 18 +- .../unittests/test_collective_api_base.py | 47 +++- 27 files changed, 318 insertions(+), 344 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 097c9799b70f2..07065ac908e4e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -88,6 +88,9 @@ namespace distributed { case experimental::DataType::BOOL: \ func(args); \ break; \ + case experimental::DataType::BFLOAT16: \ + func(args); \ + break; \ default: \ VLOG(0) << "Error: Unknown DataType."; \ exit(-1); \ diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index bc5eb4885ee52..2e18dfcc3ba12 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -1030,6 +1030,9 @@ void* GetPointerByOffset(void* raw_pointer, } else if (type == experimental::DataType::BOOL) { return reinterpret_cast(reinterpret_cast(raw_pointer) + offset); + } else if (type == experimental::DataType::BFLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index a5d89f6001fa1..5d89da86efa6c 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -59,7 +59,7 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclUint8; } else if (type == framework::proto::VarType::BOOL) { return ncclUint8; -#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 } else if (type == framework::proto::VarType::BF16) { return ncclBfloat16; #endif @@ -86,7 +86,7 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) { return ncclInt8; } else if (type == experimental::DataType::BOOL) { return ncclUint8; -#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 } else if (type == experimental::DataType::BFLOAT16) { return ncclBfloat16; #endif diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 2af34d942e0b1..95b63cb0518a5 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -478,7 +478,8 @@ def is_initialized(): Check whether the distributed environment has been initialized - Returns (bool): `True` if distributed environment has been initialized, otherwise 
`False`. + Returns: + `True` if distributed environment has been initialized, otherwise `False`. Examples: .. code-block:: python @@ -626,7 +627,7 @@ def broadcast(tensor, src, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send if current rank is the source, or the Tensor to receive otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -709,7 +710,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. @@ -817,7 +818,7 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True): Args: tensor_list (list): A list of output Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. + should be float16, float32, float64, int32, int64, int8, uint8, bool, bfloat16, complex64 or complex128. tensor (Tensor): The Tensor to send. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. group (Group, optional): The group instance return by new_group or None for global default group. @@ -999,9 +1000,9 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. Default value is None. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None. src (int): The source rank id. Default value is 0. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1096,7 +1097,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True): Args: in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor_list (list): A list of output Tensors. The data type of its elements should be the same as the data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1197,7 +1198,7 @@ def alltoall_single(in_tensor, ``alltoall_single`` is only supported in eager mode. 
Args: - in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor. in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. @@ -1286,7 +1287,7 @@ def send(tensor, dst=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1352,7 +1353,7 @@ def recv(tensor, src=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1435,7 +1436,7 @@ def isend(tensor, dst, group=None): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1485,7 +1486,7 @@ def irecv(tensor, src=None, group=None): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1594,7 +1595,7 @@ def batch_isend_irecv(p2p_op_list): corresponding tasks. NCCL are currently supported. Args: - p2p_op_list: A list of point-to-point operations(type of each operator is + p2p_op_list (List[P2POp]): A list of point-to-point operations(type of each operator is ``paddle.distributed.P2POp``). The order of the isend/irecv in the list matters and it needs to match with corresponding isend/irecv on the remote end. @@ -1668,9 +1669,9 @@ def reduce_scatter(tensor, Reduces, then scatters a list of tensors to all processes in a group Args: - tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. 
+ should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1736,9 +1737,9 @@ def _reduce_scatter_base(output, Reduces, then scatters a flattened tensor to all processes in a group. Args: - output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. - input (Tensor): Input tensor that is of size output tensor size times world size. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. + input (Tensor): Input tensor that is of size output tensor size times world size. Its data type + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index a9db1e0bc7eec..69cfef8e58fba 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -71,14 +71,14 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_allreduce_api MODULES test_collective_allreduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_collective_alltoall_api MODULES test_collective_alltoall_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -98,7 +98,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_alltoall_single_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_single_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -125,7 +125,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_broadcast_api MODULES test_collective_broadcast_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_broadcast_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -154,7 +154,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_isend_irecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() 
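For reference, a minimal sketch of the blocking send/recv pattern that the isend/irecv and sendrecv API tests configured in this CMake file exercise, based on the ``send``/``recv`` signatures documented earlier in this patch; the tensor shape, dtype, rank layout and launch command below are illustrative assumptions, not taken from the tests themselves:

.. code-block:: python

    # two-rank sketch (assumed setup): rank 0 sends a small float32 tensor, rank 1 receives it
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    if dist.get_rank() == 0:
        data = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
        dist.send(data, dst=1, sync_op=True)   # sync_op=True blocks until the send has finished
    else:
        data = paddle.zeros([2, 2], dtype="float32")
        dist.recv(data, src=0, sync_op=True)   # sync_op=True blocks until the tensor has arrived
    print(data.numpy())

    # assumed launch command, e.g.:
    #   python -m paddle.distributed.launch --gpus "0,1" send_recv_demo.py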
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -187,7 +187,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_api MODULES test_collective_reduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -207,7 +207,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -221,7 +221,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_scatter_api MODULES test_collective_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -235,7 +235,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_sendrecv_api MODULES test_collective_sendrecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_sendrecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index d485fd23d9571..38e1cc555da75 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,10 +28,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) tensor_list = [] - paddle.distributed.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_gather(tensor_list, tindata) + return [ + tensor.cast("float32").numpy() for tensor in tensor_list + ] + else: + tindata = paddle.to_tensor(indata) + dist.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 83588d450a7c9..92cc5a1623d9f 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import 
print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.all_reduce(tindata) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_reduce(tindata) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index fcabaffd614d0..da6c5ec1b3ad2 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -25,30 +25,31 @@ from six import string_types import math import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -import paddle.fluid.unique_name as nameGen -from paddle.fluid import core -import unittest -from multiprocessing import Process -import paddle.fluid.layers as layers -from functools import reduce -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveAllToAllAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - paddle.distributed.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.cast("float32").numpy() for data in toutdata] + else: + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": - runtime_main(TestCollectiveAllToAllAPI, "alltoall") + test_base.runtime_main(TestCollectiveAllToAllAPI, "alltoall") diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index 5fac73989a606..1100a4a481b5a 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -26,10 +27,17 @@ def __init__(self): def get_model(self, main_prog, 
startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - paddle.distributed.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") + dist.alltoall_single(tindata, toutdata) + return [toutdata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + dist.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index 29f0b74bb405b..de80e3b99a2d5 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.broadcast(tindata, src=1) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.broadcast(tindata, src=1) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 70437216a8f85..da3d4c064300c 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,13 +28,23 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - task = paddle.distributed.isend(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.cast("float32").numpy()] else: - task = paddle.distributed.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return 
[tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index 257fc27ceee9f..6e2b1e86bcd14 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.reduce(tindata, dst=0) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.reduce(tindata, dst=0) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index 1b0eb6aef9d47..c5b5756ac74ba 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,10 +28,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index f37f5653806ec..255b4e7e0ac59 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,15 +28,27 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with 
fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - paddle.distributed.scatter(subdata1, src=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.cast("float32").numpy()] else: - paddle.distributed.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index 8508c3d043c93..f4ae715a349fc 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -25,31 +25,34 @@ from six import string_types import math import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -import paddle.fluid.unique_name as nameGen -from paddle.fluid import core -import unittest -from multiprocessing import Process -import paddle.fluid.layers as layers -from functools import reduce -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveSendRecvAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - paddle.distributed.send(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.cast("float32").numpy()] else: - paddle.distributed.recv(tindata, src=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": - runtime_main(TestCollectiveSendRecvAPI, "sendrecv") + test_base.runtime_main(TestCollectiveSendRecvAPI, "sendrecv") diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index a01a96a0d6b29..78ecf0816b67f 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -28,213 +28,55 @@ def _setup_config(self): pass def test_allgather_nccl(self): - self.check_with_place("collective_allgather_api.py", - 
"allgather", - "nccl", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype=dtype) def test_allgather_gloo(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype=dtype) def test_allgatther_nccl_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - 
dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype=dtype) def test_allgather_gloo_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index 2598606fc9cc2..65754989d3f7e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -43,9 +43,11 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", @@ -55,8 +57,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: 
self.check_with_place("collective_allreduce_api_dygraph.py", @@ -67,5 +69,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index e079e99efebf5..35e3bf323964d 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -32,9 +32,11 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", @@ -43,5 +45,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index fb1e5e9da22ef..23d2a998d8e32 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", @@ -35,5 +37,5 @@ def test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index 2d21be144a68b..e08930cefe9ca 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -37,9 +37,11 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", @@ -49,8 +51,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -61,5 +63,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index f9613abc24063..28e502821aa52 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", @@ -35,5 +37,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index 2fa84ea2ed7f1..cc6093a3f431c 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -40,9 +40,11 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", @@ -52,8 +54,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -64,5 +66,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index 1d25527407f45..283f73020f749 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", @@ -35,5 +37,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index 4093b8ed69093..82ef4bd80e2d8 100644 --- 
a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -36,9 +36,11 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", @@ -48,8 +50,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -60,5 +62,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index 940d6ec709bf1..c2fc98ed18e38 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -34,9 +34,11 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", @@ -45,5 +47,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 60a2c4af239ed..2cf632a29d0ac 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -7,27 +7,27 @@ test_c_split,linux,gpu;rocm,120,DIST,test_runner.py,2,,PYTHONPATH=..;http_proxy= test_collective_split_embedding,linux,rocm;gpu,300,DIST,../dist_test.sh,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_collective_allgather_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_allgather_object_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_allreduce_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_allreduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_alltoall_single,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_single_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., 
+test_collective_alltoall_single_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_barrier_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_batch_isend_irecv,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_broadcast_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_broadcast_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_cpu_barrier_with_gloo,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_gather,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_scatter,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_isend_irecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_isend_irecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_optimizer,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_process_group,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce_scatter,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_scatter_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_scatter,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_scatter_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_sendrecv,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_sendrecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_sendrecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 21c9b172e9822..7d1a237951110 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -29,6 +29,7 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core +from paddle_bfloat import bfloat16 def 
create_bool_test_data(shape=None, seed=None): @@ -82,6 +83,9 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) + elif dtype == "bfloat16": + # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace + return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -174,6 +178,15 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() + # NOTE: this is a hack to get int format nccl version, like 2134 + # if current platform is not linux, version number will be 0 + nccl_version_str = subprocess.check_output( + r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", + stderr=subprocess.DEVNULL, + shell=True).decode('utf-8') + self._nccl_version = int("".join( + nccl_version_str.split("."))) if nccl_version_str else 0 + def tearDown(self): self.temp_dir.cleanup() @@ -312,6 +325,10 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid1) + # cast bfloat16 to float32 for numeric comparison + if dtype == "bfloat16": + input1 = input1.astype("float32") + input2 = input2.astype("float32") if col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) @@ -328,7 +345,13 @@ def check_with_place(self, np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05) + # bfloat16 precision loss comes from truncating the last 16 bits of float32, + # which sums (\sum_{i=-23}^{-8}2^{i}) to about 0.0078 + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol) elif col_type == "scatter": need_result = input2 need_result1 = need_result[0:need_result.shape[0] // 2] @@ -339,18 +362,28 @@ def check_with_place(self, need_result = input1 + input2 need_result1 = need_result[0:need_result.shape[0] // 2] need_result2 = need_result[need_result.shape[0] // 2:] - np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05) - np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05) + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol) + np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol) elif col_type == "allreduce": need_result = input1 + input2 + if dtype == "bfloat16": + rtol = 8e-03 + atol = 8e-03 + else: + rtol = 1e-05 + atol = 1e-05 np.testing.assert_allclose(tr0_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) np.testing.assert_allclose(tr1_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020) From 64de943d027ade87842d1d7de521504517106512 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Fri, 14 Oct 2022 20:02:38 +0800 Subject: [PATCH 6/6] Fix collective APIs cannot be recognized when building docs (#46962) --- python/paddle/distributed/__init__.py | 6 +++--- 
.../distributed/communication/stream/__init__.py | 6 +++--- .../communication_stream_reduce_scatter_api_dygraph.py | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 6c7b2fa732969..658d942bec025 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -74,7 +74,7 @@ "gloo_release", "QueueDataset", "split", "CountFilterEntry", "ShowClickEntry", "get_world_size", "get_group", "all_gather", "all_gather_object", "InMemoryDataset", "barrier", "all_reduce", "alltoall", - "send", "reduce", "recv", "ReduceOp", "wait", "get_rank", - "ProbabilityEntry", "ParallelMode", "is_initialized", "isend", "irecv", - "reduce_scatter" + "alltoall_single", "send", "reduce", "recv", "ReduceOp", "wait", "get_rank", + "ProbabilityEntry", "ParallelMode", "is_initialized", + "destroy_process_group", "isend", "irecv", "reduce_scatter", "stream" ] diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index a1844decf9478..43952ce5541a3 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -18,12 +18,12 @@ from .alltoall_single import alltoall_single from .broadcast import broadcast from .reduce import reduce -from .reduce_scatter import _reduce_scatter_base, reduce_scatter +from .reduce_scatter import reduce_scatter from .recv import recv from .scatter import scatter from .send import send __all__ = [ - "_reduce_scatter_base", "all_reduce", "alltoall", "alltoall_single", - "broadcast", "reduce", "reduce_scatter", "recv", "scatter", "send" + "all_gather", "all_reduce", "alltoall", "alltoall_single", "broadcast", + "reduce", "reduce_scatter", "recv", "scatter", "send" ] diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py index 8f66d67e0d58c..effaf1cb6c99a 100644 --- a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py @@ -17,6 +17,7 @@ import paddle import paddle.distributed as dist import test_collective_api_base as test_collective_base +from paddle.distributed.communication.stream.reduce_scatter import _reduce_scatter_base class StreamReduceScatterTestCase(): @@ -77,11 +78,10 @@ def run_test_case(self): # case 3: test the legacy API result_tensor = paddle.empty_like(t1) - task = dist.stream._reduce_scatter_base( - result_tensor, - tensor, - sync_op=self._sync_op, - use_calc_stream=self._use_calc_stream) + task = _reduce_scatter_base(result_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) if not self._sync_op: task.wait() if rank == 0: