[code format check upgrade] step 2: clang-format #42840

Merged 1 commit on Jun 5, 2022

4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
@@ -28,6 +28,10 @@ repos:
        entry: bash ./tools/codestyle/clang_format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$
        exclude: |
            (?x)^(
                paddle/fluid/distributed/ps/thirdparty/round_robin.h
            )$
-   repo: local
    hooks:
    -   id: cpplint-cpp-source
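
The new exclude: entry uses a verbose-mode regex ((?x)), which tells the matcher to ignore the whitespace inside the pattern so that each excluded path can sit on its own line. A minimal sketch of how further vendored files could be appended to the same hook later — the second path below is a purely illustrative placeholder, not part of this PR:

        exclude: |
            (?x)^(
                paddle/fluid/distributed/ps/thirdparty/round_robin.h|
                some/other/thirdparty/vendored_header.h
            )$

Each alternative is joined with | inside the group, and the ^( ... )$ anchors keep the match restricted to exact repository-relative paths.
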
1 change: 1 addition & 0 deletions paddle/fluid/distributed/collective/HCCLTools.cc
@@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/distributed/collective/HCCLTools.h"

#include "paddle/fluid/distributed/collective/Types.h"

namespace paddle {
1 change: 1 addition & 0 deletions paddle/fluid/distributed/collective/HCCLTools.h
@@ -15,6 +15,7 @@
#pragma once

#include <error.h>

#include <string>

#include "boost/variant.hpp"
1 change: 1 addition & 0 deletions paddle/fluid/distributed/collective/NCCLTools.cc
@@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/distributed/collective/NCCLTools.h"

#include "paddle/fluid/distributed/collective/Types.h"

namespace paddle {
4 changes: 2 additions & 2 deletions paddle/fluid/distributed/collective/NCCLTools.h
@@ -16,18 +16,18 @@

#include <cuda_runtime.h>
#include <error.h>

#include <string>

#include "boost/variant.hpp"
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h"

#include "paddle/fluid/distributed/collective/Types.h"

namespace paddle {
namespace distributed {

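
The pattern running through these header and source hunks — a blank line inserted after a file's own include or after the system headers, and Types.h folded back into the sorted block of project includes — is what clang-format produces when include sorting and regrouping are enabled. A rough sketch of the kind of .clang-format options involved; the values below are assumptions for illustration, not a copy of Paddle's actual configuration:

    # Hypothetical sketch; Paddle's real .clang-format is not shown in this diff.
    BasedOnStyle: Google
    Standard: Cpp11
    SortIncludes: true
    IncludeBlocks: Regroup   # merge adjacent include blocks, then re-split by category

With IncludeBlocks: Regroup, the main include (for example NCCLTools.h inside NCCLTools.cc), system headers such as <error.h>, and project headers each land in their own block separated by a blank line, which matches the shape of the changes above.
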
1 change: 0 additions & 1 deletion paddle/fluid/distributed/collective/ProcessGroup.h
@@ -21,7 +21,6 @@

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
6 changes: 4 additions & 2 deletions paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -27,6 +27,7 @@
#include <gloo/broadcast.h>
#include <gloo/reduce.h>
#include <gloo/scatter.h>

#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
@@ -485,8 +486,9 @@ std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
std::array<char, HOST_NAME_MAX> hostname{};
auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
"Get hostname error for createDefaultDevice."));
PADDLE_ENFORCE_EQ(
ret, 0,
platform::errors::Fatal("Get hostname error for createDefaultDevice."));
::addrinfo* result;
result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
::addrinfo* cur;
20 changes: 11 additions & 9 deletions paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
@@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"

#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
@@ -216,15 +217,16 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const AllreduceOptions& opts) {
return Collective(in_tensors, out_tensors,
[&](phi::DenseTensor& input, phi::DenseTensor& output,
HcclComm comm, const aclrtStream& stream) {
return platform::dynload::HcclAllReduce(
input.data(), output.data(), input.numel(),
platform::ToHCCLDataType(input.dtype()),
ToHCCLRedType(opts.reduce_op), comm, stream);
},
CommType::ALLREDUCE);
return Collective(
in_tensors, out_tensors,
[&](phi::DenseTensor& input, phi::DenseTensor& output, HcclComm comm,
const aclrtStream& stream) {
return platform::dynload::HcclAllReduce(
input.data(), output.data(), input.numel(),
platform::ToHCCLDataType(input.dtype()),
ToHCCLRedType(opts.reduce_op), comm, stream);
},
CommType::ALLREDUCE);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
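
The re-wrapped PADDLE_ENFORCE_EQ(...) and Collective(...) calls in this file and the ones below show the other visible effect of the clang-format pass: when a call whose arguments include a lambda no longer fits in the column limit, the newer formatter breaks immediately after the opening parenthesis and indents every argument by the continuation width, rather than aligning arguments under the opening parenthesis as before. A sketch of the options that influence this wrapping; the option names are real clang-format options, but the values are illustrative assumptions rather than the repository's verified settings:

    # Hypothetical sketch; values are assumptions, not Paddle's checked-in config.
    ColumnLimit: 80
    ContinuationIndentWidth: 4
    BinPackArguments: true
    AllowAllArgumentsOnNextLine: true

The functional content of each call is unchanged; only line breaks and indentation differ between the old and new formatter output.
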
5 changes: 2 additions & 3 deletions paddle/fluid/distributed/collective/ProcessGroupHCCL.h
@@ -21,12 +21,11 @@
#include <unordered_map>
#include <vector>

#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"

#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
27 changes: 17 additions & 10 deletions paddle/fluid/distributed/collective/ProcessGroupHeter.cc
@@ -13,7 +13,9 @@
// limitations under the License.

#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"

#include <chrono>

#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
@@ -129,8 +131,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::AllReduce(
gid_, {dense_cpu_tensor.name()}, send_size, dense_cpu_tensor.data(),
dense_cpu_tensor.numel() *
framework::DataTypeSize(dense_cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"Send to the switch module error."));
PADDLE_ENFORCE_EQ(ret, 0,
platform::errors::PreconditionNotMet(
"Send to the switch module error."));
phi::DenseTensor cpu_tensor2;
cpu_tensor2.AllocateFrom(
std::make_unique<paddle::experimental::DefaultAllocator>(
@@ -140,8 +143,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::AllReduce(
ret = client_->Recv(
gid_, {dense_cpu_tensor.name()}, cpu_tensor2.data(),
cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype()));
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"Recv from the switch module error."));
PADDLE_ENFORCE_EQ(ret, 0,
platform::errors::PreconditionNotMet(
"Recv from the switch module error."));

switch (dense_cpu_tensor.dtype()) {
case DataType::FLOAT32:
@@ -226,8 +230,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Broadcast(
dense_cpu_tensor.data(),
dense_cpu_tensor.numel() *
framework::DataTypeSize(dense_cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"Send to the switch module error."));
PADDLE_ENFORCE_EQ(ret, 0,
platform::errors::PreconditionNotMet(
"Send to the switch module error."));
} else {
int ret = client_->Recv(
gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(),
@@ -286,8 +291,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Send(
VLOG(2) << "tensor_name:" << tensor_name;
int ret = client_->Send(gid_, {tensor_name}, send_size, cpu_tensor.data(),
tensor_size);
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"Send to the switch module error."));
PADDLE_ENFORCE_EQ(
ret, 0,
platform::errors::PreconditionNotMet("Send to the switch module error."));
return CreateTask(rank_, CommType::SEND, in_tensors);
}

@@ -319,8 +325,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv(
int ret = client_->Recv(
gid_, {tensor_name}, cpu_tensor.data(),
cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"receive to the switch module error."));
PADDLE_ENFORCE_EQ(ret, 0,
platform::errors::PreconditionNotMet(
"receive to the switch module error."));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
double goodput = cpu_tensor.numel() *
132 changes: 67 additions & 65 deletions paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"

#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
@@ -320,15 +321,16 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(in_tensors, out_tensors,
[&](const phi::DenseTensor& input, phi::DenseTensor& output,
ncclComm_t comm, const gpuStream_t& stream) {
return platform::dynload::ncclAllReduce(
input.data(), output.data(), input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op), comm, stream);
},
CommType::ALLREDUCE);
return Collective(
in_tensors, out_tensors,
[&](const phi::DenseTensor& input, phi::DenseTensor& output,
ncclComm_t comm, const gpuStream_t& stream) {
return platform::dynload::ncclAllReduce(
input.data(), output.data(), input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op), comm, stream);
},
CommType::ALLREDUCE);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
@@ -338,17 +340,17 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));

return Collective(in_tensors, out_tensors,
[&](phi::DenseTensor& input, phi::DenseTensor& output,
ncclComm_t comm, const gpuStream_t& stream) {
const auto root = opts.source_rank * in_tensors.size() +
opts.source_root;
return platform::dynload::ncclBroadcast(
input.data(), output.data(), input.numel(),
platform::ToNCCLDataType(input.type()), root, comm,
stream);
},
CommType::BROADCAST);
return Collective(
in_tensors, out_tensors,
[&](phi::DenseTensor& input, phi::DenseTensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
const auto root =
opts.source_rank * in_tensors.size() + opts.source_root;
return platform::dynload::ncclBroadcast(
input.data(), output.data(), input.numel(),
platform::ToNCCLDataType(input.type()), root, comm, stream);
},
CommType::BROADCAST);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
@@ -400,31 +402,31 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) {
CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

auto task = PointToPoint(tensors,
[&](phi::DenseTensor& input, ncclComm_t comm,
const gpuStream_t& stream, int dst_rank) {
return platform::dynload::ncclSend(
input.data(), input.numel(),
platform::ToNCCLDataType(input.dtype()),
dst_rank, comm, stream);
},
dst_rank, CommType::SEND);
auto task = PointToPoint(
tensors,
[&](phi::DenseTensor& input, ncclComm_t comm, const gpuStream_t& stream,
int dst_rank) {
return platform::dynload::ncclSend(
input.data(), input.numel(),
platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream);
},
dst_rank, CommType::SEND);
return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) {
CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

auto task = PointToPoint(tensors,
[&](phi::DenseTensor& output, ncclComm_t comm,
const gpuStream_t& stream, int src_rank) {
return platform::dynload::ncclRecv(
output.data(), output.numel(),
platform::ToNCCLDataType(output.dtype()),
src_rank, comm, stream);
},
src_rank, CommType::RECV);
auto task = PointToPoint(
tensors,
[&](phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream,
int src_rank) {
return platform::dynload::ncclRecv(
output.data(), output.numel(),
platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream);
},
src_rank, CommType::RECV);
return task;
}

@@ -440,15 +442,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
std::vector<phi::DenseTensor> shared_tensors;
shared_tensors.push_back(shared_input);

auto task = PointToPoint(shared_tensors,
[&](phi::DenseTensor& input, ncclComm_t comm,
const gpuStream_t& stream, int dst_rank) {
return platform::dynload::ncclSend(
input.data(), input.numel(),
platform::ToNCCLDataType(input.dtype()),
dst_rank, comm, stream);
},
dst_rank, CommType::SEND);
auto task = PointToPoint(
shared_tensors,
[&](phi::DenseTensor& input, ncclComm_t comm, const gpuStream_t& stream,
int dst_rank) {
return platform::dynload::ncclSend(
input.data(), input.numel(),
platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream);
},
dst_rank, CommType::SEND);
return task;
}

@@ -463,15 +465,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
std::vector<phi::DenseTensor> shared_tensors;
shared_tensors.push_back(shared_input);

auto task = PointToPoint(shared_tensors,
[&](phi::DenseTensor& output, ncclComm_t comm,
const gpuStream_t& stream, int src_rank) {
return platform::dynload::ncclRecv(
output.data(), output.numel(),
platform::ToNCCLDataType(output.dtype()),
src_rank, comm, stream);
},
src_rank, CommType::RECV);
auto task = PointToPoint(
shared_tensors,
[&](phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream,
int src_rank) {
return platform::dynload::ncclRecv(
output.data(), output.numel(),
platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream);
},
src_rank, CommType::RECV);
return task;
}

@@ -484,15 +486,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(in_tensors, out_tensors,
[&](const phi::DenseTensor& input, phi::DenseTensor& output,
ncclComm_t comm, const gpuStream_t& stream) {
return platform::dynload::ncclAllGather(
input.data(), output.data(), input.numel(),
platform::ToNCCLDataType(input.dtype()), comm,
stream);
},
CommType::ALLGATHER);
return Collective(
in_tensors, out_tensors,
[&](const phi::DenseTensor& input, phi::DenseTensor& output,
ncclComm_t comm, const gpuStream_t& stream) {
return platform::dynload::ncclAllGather(
input.data(), output.data(), input.numel(),
platform::ToNCCLDataType(input.dtype()), comm, stream);
},
CommType::ALLGATHER);
}

void* GetPointerByOffset(void* raw_pointer, size_t offset,