From 587e1c24cfd0ec4d288ddb580f82dede69a17ea4 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Fri, 16 Sep 2022 07:59:12 +0800 Subject: [PATCH 1/6] Support both use_calc_stream and sync_op in send recv APIs (#46023) --- .../distributed/collective/ProcessGroup.h | 50 +++- .../collective/ProcessGroupNCCL.cc | 226 +++++++++++++++++- .../distributed/collective/ProcessGroupNCCL.h | 49 +++- .../collective/ProcessGroupStream.cc | 84 +++++++ .../collective/ProcessGroupStream.h | 52 ++++ paddle/fluid/pybind/distributed_py.cc | 170 +++++++++++++ .../communication/stream/__init__.py | 4 +- .../communication/stream/all_reduce.py | 8 +- .../distributed/communication/stream/recv.py | 82 +++++++ .../distributed/communication/stream/send.py | 82 +++++++ .../tests/unittests/collective/CMakeLists.txt | 32 ++- ...mmunication_stream_sendrecv_api_dygraph.py | 68 ++++++ .../test_communication_stream_sendrecv_api.py | 50 ++++ .../tests/unittests/collective/testslist.csv | 5 +- 14 files changed, 922 insertions(+), 40 deletions(-) create mode 100644 python/paddle/distributed/communication/stream/recv.py create mode 100644 python/paddle/distributed/communication/stream/send.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 10b1686ddb85f..3db2464e59afd 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -134,24 +134,56 @@ class ProcessGroup { "ProcessGroup%s does not support send", GetBackendName())); } + virtual std::shared_ptr Send( + std::vector&, int, bool) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send with sync_op flag", + GetBackendName())); + } + virtual std::shared_ptr Recv( - std::vector& tensors, int) { // NOLINT + std::vector&, int) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support receive", GetBackendName())); + "ProcessGroup%s does not support recv", GetBackendName())); } - virtual std::shared_ptr Send_Partial(phi::DenseTensor&, - int, - int, - int) { // NOLINT + virtual std::shared_ptr Recv( + std::vector&, int, bool) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support send", GetBackendName())); + "ProcessGroup%s does not support recv with sync_op flag", + GetBackendName())); + } + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor&, // NOLINT + int, + int, + int) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send_partial", GetBackendName())); + } + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor&, int, int, int, bool) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send_partial with sync_op flag", + GetBackendName())); } virtual std::shared_ptr Recv_Partial( - phi::DenseTensor& tensors, int, int, int) { // NOLINT + phi::DenseTensor&, // NOLINT + int, + int, + int) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support receive", GetBackendName())); + "ProcessGroup%s does not support recv_partial", GetBackendName())); + } + + virtual std::shared_ptr Recv_Partial( + phi::DenseTensor&, int, int, int, bool) { // NOLINT + 
PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support recv_partial with sync_op flag", + GetBackendName())); } virtual std::shared_ptr AllGather( diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 239114ae6188c..368008d9cc0ce 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -51,6 +51,17 @@ std::shared_ptr ProcessGroupNCCL::CreateTask( places, rank, comm_type, inputs); } +std::shared_ptr ProcessGroupNCCL::CreateTask( + const std::vector& places, + int rank, + CommType comm_type, + const std::vector& inputs, + bool is_sync, + bool use_calc_stream) { + return std::make_shared( + places, rank, comm_type, inputs, is_sync, use_calc_stream); +} + ProcessGroupNCCL::NCCLTask::NCCLTask( const std::vector& places, int rank, @@ -264,10 +275,12 @@ std::shared_ptr ProcessGroupNCCL::Collective( auto& nccl_comms = places_to_ncclcomm_[key]; - SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + if (!use_calc_stream) { + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + } - auto task = std::make_shared( - places, rank_, comm_type, inputs, sync_op, use_calc_stream); + auto task = + CreateTask(places, rank_, comm_type, inputs, sync_op, use_calc_stream); platform::CUDADeviceGuard cuda_guard; @@ -406,6 +419,78 @@ void ProcessGroupNCCL::Collective(const phi::DenseTensor* in, cuda_guard.SetDevice(places[0]); } +template +std::shared_ptr ProcessGroupNCCL::PointToPoint( + std::vector& tensors, + Fn fn, + int dst_rank, + CommType op_type, + bool sync_op, + bool use_calc_stream) { + const auto& places = GetPlaceList(tensors); + const auto& key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + if (!use_calc_stream) { + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + } + + auto task = + CreateTask(places, rank_, op_type, tensors, sync_op, use_calc_stream); + + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + gpuStream_t nccl_stream; + if (use_calc_stream) { + nccl_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(places[i])) + ->stream(); + } else { + nccl_stream = places_to_ctx_[key][i]->stream(); + } + memory::RecordStream(tensors[i].Holder(), nccl_stream); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + gpuStream_t nccl_stream; + if (use_calc_stream) { + nccl_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(places[i])) + ->stream(); + } else { + nccl_stream = places_to_ctx_[key][i]->stream(); + } + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + } + } + + if (!use_calc_stream) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + } + + return task; +} + template std::shared_ptr ProcessGroupNCCL::PointToPoint( std::vector& tensors, @@ -617,6 +702,34 @@ std::shared_ptr ProcessGroupNCCL::Send( return task; } +std::shared_ptr ProcessGroupNCCL::Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool 
use_calc_stream) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& input, + ncclComm_t comm, + const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, + comm, + stream); + }, + dst_rank, + CommType::SEND, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Recv( std::vector& tensors, int src_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); @@ -640,6 +753,34 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } +std::shared_ptr ProcessGroupNCCL::Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), + output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, + comm, + stream); + }, + src_rank, + CommType::RECV, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Send_Partial( phi::DenseTensor& tensors, int dst_rank, int offset, int length) { // CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); @@ -647,10 +788,8 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( phi::DenseTensor flatten_tensor; flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); - - std::vector shared_tensors; - shared_tensors.push_back(shared_input); + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; auto task = PointToPoint( shared_tensors, @@ -671,16 +810,49 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( return task; } +std::shared_ptr ProcessGroupNCCL::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) { + phi::DenseTensor flatten_tensor; + flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; + + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& input, + ncclComm_t comm, + const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, + comm, + stream); + }, + dst_rank, + CommType::SEND, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Recv_Partial( phi::DenseTensor& tensors, int src_rank, int offset, int length) { // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length); phi::DenseTensor flatten_tensor; flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); - std::vector shared_tensors; - shared_tensors.push_back(shared_input); + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; auto task = PointToPoint( shared_tensors, @@ -701,6 +873,40 @@ std::shared_ptr ProcessGroupNCCL::Recv_Partial( return task; } +std::shared_ptr ProcessGroupNCCL::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) { + phi::DenseTensor flatten_tensor; + 
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; + + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), + output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, + comm, + stream); + }, + src_rank, + CommType::RECV, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::AllGather( std::vector& in_tensors, std::vector& out_tensors) { diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index e0e298e9113e9..0b8fa54cd337e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -60,7 +60,7 @@ class ProcessGroupNCCL : public ProcessGroupStream { int rank, CommType comm_type, const std::vector& inputs, - bool is_sync, + bool sync_op, bool use_calc_stream); bool IsCompleted(); @@ -122,19 +122,47 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::shared_ptr Send( std::vector& tensors, int dst_rank) override; + std::shared_ptr Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Recv( std::vector& tensors, int src_rank) override; + std::shared_ptr Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Send_Partial(phi::DenseTensor& tensors, int dst_rank, int offset, int length) override; + std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Recv_Partial(phi::DenseTensor& tensors, int src_rank, int offset, int length) override; + std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr AllGather( std::vector& in_tensors, std::vector& out_tensors) override; @@ -180,9 +208,17 @@ class ProcessGroupNCCL : public ProcessGroupStream { virtual std::shared_ptr CreateTask( std::vector places, int rank, - CommType opType, + CommType op_type, const std::vector& inputs); + virtual std::shared_ptr CreateTask( + const std::vector& places, + int rank, + CommType op_type, + const std::vector& inputs, + bool sync_op, + bool use_calc_stream); + protected: std::shared_ptr store_; std::shared_ptr nccl_comm_; @@ -233,6 +269,15 @@ class ProcessGroupNCCL : public ProcessGroupStream { int dst_rank, CommType op_type); + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, + int dst_rank, + CommType op_type, + bool sync_op, + bool use_calc_stream); + void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 9a20b8e6eaf79..51c8fe7bd9b1b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -45,5 +45,89 @@ std::shared_ptr ProcessGroupStream::AllReduce( "ProcessGroup%s does not support do allreduce", GetBackendName())); } +std::shared_ptr ProcessGroupStream::Send( + std::vector& tensors, int dst_rank, bool sync_op) { + return Send(tensors, + dst_rank, + sync_op, + 
/*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do send", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int offset, + int length, + bool sync_op) { + return Send_Partial(tensors, + dst_rank, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do send_partial", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Recv( + std::vector& tensors, int src_rank, bool sync_op) { + return Recv(tensors, + src_rank, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int offset, + int length, + bool sync_op) { + return Recv_Partial(tensors, + src_rank, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv_partial", GetBackendName())); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index 81a05ee2416e0..4cd17ac72562e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -66,6 +66,58 @@ class ProcessGroupStream : public ProcessGroup { const AllreduceOptions& options, bool sync_op, bool use_calc_stream); + + std::shared_ptr Send( + std::vector& tensors, // NOLINT + int dst_rank, + bool sync_op) override; + + virtual std::shared_ptr Send( + std::vector& tensors, // NOLINT + int dst_rank, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, // NOLINT + int dst_rank, + int offset, + int length, + bool sync_op) override; + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, // NOLINT + int dst_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Recv( + std::vector& tensors, // NOLINT + int src_rank, + bool sync_op) override; + + virtual std::shared_ptr Recv( + std::vector& tensors, // NOLINT + int src_rank, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, // NOLINT + int src_rank, + int offset, + int length, + bool sync_op) override; + + virtual std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, // NOLINT + int src_rank, + int offset, + int length, + bool sync_op, + bool use_calc_stream); }; } // namespace distributed diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 5a7e2355f64eb..8a434f42811a8 100644 --- a/paddle/fluid/pybind/distributed_py.cc 
+++ b/paddle/fluid/pybind/distributed_py.cc @@ -196,6 +196,23 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::call_guard()) + .def( + "send", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int dst, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, dst, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("sync_op"), + py::call_guard()) + .def( "send_partial", [](distributed::ProcessGroup &self, @@ -217,6 +234,30 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()) + .def( + "send_partial", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int dst_rank, + int nranks, + int rank_id, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int send_numel = numel / nranks; + int offset = send_numel * rank_id; + return self.Send_Partial( + *dense, dst_rank, offset, send_numel, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("num"), + py::arg("id"), + py::arg("sync_op"), + py::call_guard()) + .def( "recv", [](distributed::ProcessGroup &self, @@ -232,6 +273,23 @@ void BindDistributed(py::module *m) { py::arg("src"), py::call_guard()) + .def( + "recv", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, src, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + .def( "recv_partial", [](distributed::ProcessGroup &self, @@ -253,6 +311,30 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()) + .def( + "recv_partial", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src_rank, + int nranks, + int rank_id, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int recv_numel = numel / nranks; + int offset = recv_numel * rank_id; + return self.Recv_Partial( + *dense, src_rank, offset, recv_numel, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("num"), + py::arg("id"), + py::arg("sync_op"), + py::call_guard()) + .def( "all_gather", [](distributed::ProcessGroup &self, @@ -427,6 +509,94 @@ void BindDistributed(py::module *m) { }, py::arg("tensor"), py::arg("op"), + py::call_guard()) + + .def( + "send_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int dst) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, + dst, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::call_guard()) + + .def( + "send_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int dst_rank, + int nranks, + int rank_id) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int send_numel = numel / nranks; + int offset = send_numel * rank_id; + return self.Send_Partial(*dense, + dst_rank, + offset, + send_numel, + /*sync_op*/ true, + 
/*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("num"), + py::arg("id"), + py::call_guard()) + + .def( + "recv_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, + src, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::call_guard()) + + .def( + "recv_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src_rank, + int nranks, + int rank_id) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int recv_numel = numel / nranks; + int offset = recv_numel * rank_id; + return self.Recv_Partial(*dense, + src_rank, + offset, + recv_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("num"), + py::arg("id"), py::call_guard()); #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index 24194dd9fb1e2..3dd9f60b81295 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -13,5 +13,7 @@ # limitations under the License. from .all_reduce import all_reduce +from .send import send +from .recv import recv -__all__ = ["all_reduce"] +__all__ = ["all_reduce", "send", "recv"] diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 6a0b622cf0dfe..f94422f4bd0a6 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle.distributed.collective as collective import paddle.fluid.framework as framework -from ...collective import _get_default_group, _get_reduce_op, ReduceOp def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): - op_type = _get_reduce_op(op, "all_reduce") - group = _get_default_group() if group is None else group + op_type = collective._get_reduce_op(op, "all_reduce") + group = collective._get_default_group() if group is None else group if use_calc_stream: return group.process_group.allreduce_on_calc_stream(tensor, op_type) @@ -30,7 +30,7 @@ def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): def all_reduce(tensor, - op=ReduceOp.SUM, + op=collective.ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False): diff --git a/python/paddle/distributed/communication/stream/recv.py b/python/paddle/distributed/communication/stream/recv.py new file mode 100644 index 0000000000000..b225f64b8b4d2 --- /dev/null +++ b/python/paddle/distributed/communication/stream/recv.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.collective as collective +import paddle.fluid.framework as framework + + +def _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.recv_on_calc_stream(tensor, src) + + task = group.process_group.recv(tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Receive a tensor from the source device. + + Args: + tensor (Tensor): The tensor to receive. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + task = dist.stream.send(data, dst=1, sync_op=False) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.recv(data, src=0, sync_op=False) + task.wait() + out = data.numpy() + # [[4, 5, 6], [4, 5, 6] + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.recv is only supported in dygraph mode now.") diff --git a/python/paddle/distributed/communication/stream/send.py b/python/paddle/distributed/communication/stream/send.py new file mode 100644 index 0000000000000..fa052734c7ee7 --- /dev/null +++ b/python/paddle/distributed/communication/stream/send.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.collective as collective +import paddle.fluid.framework as framework + + +def _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.send_on_calc_stream(tensor, dst) + + task = group.process_group.send(tensor, dst, sync_op) + if sync_op: + task.wait() + + return task + + +def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Send a tensor to the destination device. + + Args: + tensor (Tensor): The tensor to send. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + dst (int, optional): Rank of the destination device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + task = dist.stream.send(data, dst=1, sync_op=False) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.recv(data, src=0, sync_op=False) + task.wait() + out = data.numpy() + # [[4, 5, 6], [4, 5, 6] + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." 
+ ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.send is only supported in dygraph mode now.") diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 5a1a6df2dd7ec..55f4453b1ab08 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -268,17 +268,26 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_eager_dist_api MODULES test_eager_dist_api ENVS - "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") - set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS - "RUN_TYPE=DIST") + test_communication_stream_allreduce_api MODULES + test_communication_stream_allreduce_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_allreduce_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_new_group_api MODULES test_new_group_api ENVS + test_communication_stream_sendrecv_api MODULES + test_communication_stream_sendrecv_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_sendrecv_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_eager_dist_api MODULES test_eager_dist_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") - set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS - "RUN_TYPE=DIST") + set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM @@ -298,11 +307,10 @@ if((WITH_GPU endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_communication_stream_allreduce_api MODULES - test_communication_stream_allreduce_api ENVS - "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") - set_tests_properties(test_communication_stream_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + test_new_group_api MODULES test_new_group_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=DIST") endif() if((WITH_ROCM OR WITH_GPU) AND (LINUX)) bash_test_modules( diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py new file mode 100644 index 0000000000000..175e24c3d0d86 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py @@ -0,0 +1,68 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.fluid as fluid +import test_collective_api_base as test_collective_base +import test_communication_api_base as test_base + + +class StreamSendRecvTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + if rank == 0: + task = dist.stream.send(tensor, + dst=1, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + else: + task = dist.stream.recv(tensor, + src=0, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + result = test_data_list[0] + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamSendRecvTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py new file mode 100644 index 0000000000000..9590519bc2e13 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import test_communication_api_base as test_base + + +class TestCommunicationStreamSendRecvAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamSendRecvAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_sendrecv_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_sendrecv_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamSendRecvAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 16eb200565f73..b4ba281f45420 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -32,8 +32,9 @@ test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_ test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=, From 0e4df2d338af092c1f7f1ab044a0a76e23c5838a Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Fri, 30 Sep 2022 20:52:21 +0800 Subject: [PATCH 2/6] Support both use_calc_stream and sync_op in allgather API (#46295) --- .../distributed/collective/ProcessGroup.h | 11 +- .../collective/ProcessGroupNCCL.cc | 56 ++++++- .../distributed/collective/ProcessGroupNCCL.h | 9 ++ .../collective/ProcessGroupStream.cc | 27 +++- .../collective/ProcessGroupStream.h | 14 ++ paddle/fluid/distributed/collective/Utils.h | 145 ++++++++++++++++++ paddle/fluid/pybind/distributed_py.cc | 106 +++++++++++++ .../communication/stream/__init__.py | 3 +- .../communication/stream/all_gather.py | 136 ++++++++++++++++ .../tests/unittests/collective/CMakeLists.txt | 8 + ...munication_stream_allgather_api_dygraph.py | 91 +++++++++++ ...test_communication_stream_allgather_api.py | 51 ++++++ .../tests/unittests/collective/testslist.csv | 1 + 13 files changed, 648 
insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/distributed/collective/Utils.h create mode 100644 python/paddle/distributed/communication/stream/all_gather.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 3db2464e59afd..40a27db7601f1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -190,7 +190,16 @@ class ProcessGroup { std::vector&, // NOLINT std::vector&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support AllGather", GetBackendName())); + "ProcessGroup%s does not support all_gather", GetBackendName())); + } + + virtual std::shared_ptr AllGather( + std::vector&, // NOLINT + std::vector&, // NOLINT + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support all_gather with sync_op flag", + GetBackendName())); } virtual std::shared_ptr AllGather_Partial( diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 368008d9cc0ce..12f60faf80053 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -936,6 +936,39 @@ std::shared_ptr ProcessGroupNCCL::AllGather( CommType::ALLGATHER); } +std::shared_ptr ProcessGroupNCCL::AllGather( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](const phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + comm, + stream); + }, + CommType::ALLGATHER, + sync_op, + use_calc_stream); +} + void* GetPointerByOffset(void* raw_pointer, size_t offset, experimental::DataType type) { @@ -1250,13 +1283,22 @@ ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( const Place& place) const { - std::vector places = {place}; - const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places)); - PADDLE_ENFORCE_NE(iter, - places_to_ctx_.end(), - platform::errors::InvalidArgument( - "Cannot find device context in process group.")); - return iter->second[0].get(); + return GetDeviceContext(place, /*use_calc_stream*/ false); +} + +phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( + const Place& place, bool use_calc_stream) const { + if (use_calc_stream) { + return platform::DeviceContextPool::Instance().Get(place); + } else { + std::vector places = {place}; + const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places)); + PADDLE_ENFORCE_NE(iter, + places_to_ctx_.end(), + platform::errors::InvalidArgument( + "Cannot find device context in process group.")); + return iter->second[0].get(); + } } } // namespace distributed diff --git 
a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 0b8fa54cd337e..24ba7c86b1838 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -98,6 +98,9 @@ class ProcessGroupNCCL : public ProcessGroupStream { phi::DeviceContext* GetDeviceContext(const Place& place) const override; + phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const override; + std::shared_ptr AllReduce( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT @@ -167,6 +170,12 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::vector& in_tensors, std::vector& out_tensors) override; + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 51c8fe7bd9b1b..43ca0bbb36d3f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -23,6 +23,31 @@ ProcessGroupStream::ProcessGroupStream(int rank, int gid) : ProcessGroup(rank, size, place, gid) {} +phi::DeviceContext* ProcessGroupStream::GetDeviceContext( + const Place& place, bool use_calc_stream) const { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support get device_context.", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::AllGather( + std::vector& input_tensors, // NOLINT + std::vector& output_tensors, // NOLINT + bool sync_op) { + return AllGather(input_tensors, + output_tensors, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllGather( + std::vector& input_tensors, // NOLINT + std::vector& output_tensors, // NOLINT + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do all_gather", GetBackendName())); +} + std::shared_ptr ProcessGroupStream::AllReduce( std::vector& input_tensors, // NOLINT std::vector& output_tensors, // NOLINT @@ -42,7 +67,7 @@ std::shared_ptr ProcessGroupStream::AllReduce( bool sync_op, bool use_calc_stream) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do allreduce", GetBackendName())); + "ProcessGroup%s does not support do all_reduce", GetBackendName())); } std::shared_ptr ProcessGroupStream::Send( diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index 4cd17ac72562e..f8ab562ad075c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -54,6 +54,20 @@ class ProcessGroupStream : public ProcessGroup { ProcessGroupStream(int rank, int size, const platform::Place& place, int gid); virtual ~ProcessGroupStream() = default; + virtual phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const; + + std::shared_ptr AllGather( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllGather( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op, + bool use_calc_stream); + std::shared_ptr AllReduce( std::vector& 
input_tensors, // NOLINT std::vector& output_tensors, // NOLINT diff --git a/paddle/fluid/distributed/collective/Utils.h b/paddle/fluid/distributed/collective/Utils.h new file mode 100644 index 0000000000000..79146febdf809 --- /dev/null +++ b/paddle/fluid/distributed/collective/Utils.h @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/device_manager.h" + +namespace paddle { +namespace distributed { + +template +struct SplitDenseTensor { + void operator()(const DeviceContext *context, + const phi::DenseTensor &in, + std::vector *out, + int axis = 0) { + std::vector shape_refer; + shape_refer.reserve(out->size()); + for (auto *p_tensor : *out) { + shape_refer.emplace_back(p_tensor); + } + operators::math::SplitFunctor split_functor_; + split_functor_(*context, in, shape_refer, axis, out); + } +}; + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +struct SplitDenseTensor { + void operator()(const platform::CustomDeviceContext *context, + const phi::DenseTensor &in, + std::vector *out) { + auto *in_data = in.data(); + auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); + size_t offset = 0; + for (auto *p_tensor : *out) { + auto *out_data = p_tensor->data(); + auto sz = p_tensor->numel() * sizeof(T); + device->MemoryCopyD2D(out_data, in_data + offset, sz, nullptr); + offset += sz; + } + } +}; +#endif + +template +void SplitDenseTensorWithType(const DeviceContext *dev_ctx, + const phi::DenseTensor &p_dense, + std::vector *p_list, + phi::DataType type) { + switch (type) { + case phi::DataType::BOOL: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::UINT8: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::INT8: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::INT32: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::INT64: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::FLOAT16: + SplitDenseTensor()( + dev_ctx, p_dense, p_list); + break; + case phi::DataType::FLOAT32: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + case phi::DataType::FLOAT64: + SplitDenseTensor()(dev_ctx, p_dense, p_list); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allgather.", + type)); + } +} + +void SplitTensor(const phi::DeviceContext *dev_ctx, + const phi::DenseTensor &tensor, + const std::vector *tensor_list) { + std::vector dense_list; + for (auto &tensor : *tensor_list) { + auto p_tensor = + std::dynamic_pointer_cast(tensor.impl()).get(); + dense_list.emplace_back(p_tensor); + } + + const auto &place = dev_ctx->GetPlace(); + if (platform::is_gpu_place(place)) { +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + SplitDenseTensorWithType(static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split tensor since it's not support NCCL/RCCL, please " + "recompile or reinstall Paddle with NCCL/RCCL support.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + SplitDenseTensorWithType( + static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split tensor since it's not compiled with CUSTOM_DEVICE, " + "please recompile or reinstall Paddle with CUSTOM_DEVICE support.")); +#endif + } else if (platform::is_cpu_place(place)) { + SplitDenseTensorWithType(static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split tensor not supported on place (%s)", place)); + } +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 8a434f42811a8..bec3c93cbd8b7 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/ProcessGroupStream.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/Utils.h" #include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -354,6 +355,57 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "allgather", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor_list, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + const auto *dev_ctx = self.GetDeviceContext(in_tensor.place()); + auto task = self.AllGather(in_wrapper, out_wrapper, sync_op); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "allgather_base", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllGather(in_wrapper, out_wrapper, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + .def( "all_gather_partial", [](distributed::ProcessGroup &self, @@ -490,6 +542,60 @@ void BindDistributed(py::module *m) { py::class_>( *m, "ProcessGroupStream", ProcessGroup) + .def( + "allgather_on_calc_stream", + [](distributed::ProcessGroupStream 
&self, + py::handle py_in_tensor, + py::handle py_out_tensor_list) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + const auto *dev_ctx = + self.GetDeviceContext(in_tensor.place(), true); + auto task = self.AllGather(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "allgather_base_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllGather(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + .def( "allreduce_on_calc_stream", [](distributed::ProcessGroupStream &self, diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index 3dd9f60b81295..deab1f97ea28e 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .all_gather import all_gather from .all_reduce import all_reduce from .send import send from .recv import recv -__all__ = ["all_reduce", "send", "recv"] +__all__ = ["all_gather", "all_reduce", "send", "recv"] diff --git a/python/paddle/distributed/communication/stream/all_gather.py b/python/paddle/distributed/communication/stream/all_gather.py new file mode 100644 index 0000000000000..dca2957309068 --- /dev/null +++ b/python/paddle/distributed/communication/stream/all_gather.py @@ -0,0 +1,136 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import paddle
+import paddle.fluid.framework as framework
+from paddle.distributed import collective
+
+
+def _check_tensor_shape(tensor, shape, nranks=1):
+    expect_shape = list(shape)
+    expect_shape[0] *= nranks
+    if list(tensor.shape) != expect_shape:
+        raise RuntimeError('The tensor for all_gather is not correctly-sized.')
+
+
+def _check_tensor_list_shape(tensor_list, shape, nranks=1):
+    if len(tensor_list) != nranks:
+        raise RuntimeError(
+            'The tensor_list for all_gather is not correctly-sized.')
+    for tensor in tensor_list:
+        if tensor.shape != shape:
+            raise RuntimeError(
+                'The tensor_list for all_gather is not correctly-sized.')
+
+
+def _all_gather_base_in_dygraph(out_tensor, in_tensor, group, sync_op,
+                                use_calc_stream):
+    group = collective._get_default_group() if group is None else group
+
+    _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks)
+
+    if use_calc_stream:
+        return group.process_group.allgather_base_on_calc_stream(
+            in_tensor, out_tensor)
+
+    task = group.process_group.allgather_base(in_tensor, out_tensor, sync_op)
+    if sync_op:
+        task.wait()
+
+    return task
+
+
+def _all_gather_in_dygraph(tensor_list, tensor, group, sync_op,
+                           use_calc_stream):
+    group = collective._get_default_group() if group is None else group
+
+    if len(tensor_list) == 0:
+        tensor_list += [paddle.empty_like(tensor) for _ in range(group.nranks)]
+    else:
+        _check_tensor_list_shape(tensor_list, tensor.shape, group.nranks)
+
+    if use_calc_stream:
+        return group.process_group.allgather_on_calc_stream(
+            tensor, tensor_list)
+
+    task = group.process_group.allgather(tensor, tensor_list, sync_op)
+    if sync_op:
+        task.wait()
+
+    return task
+
+
+def all_gather(tensor_or_tensor_list,
+               tensor,
+               group=None,
+               sync_op=True,
+               use_calc_stream=False):
+    """
+
+    Gather tensors across devices to a correctly-sized tensor or a tensor list.
+
+    Args:
+        tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The output. If it is a tensor, it should be correctly-sized.
+            If it is a list, it should be empty or contain correctly-sized tensors.
+        tensor (Tensor): The input tensor on each rank. The result will overwrite this tensor after communication. Support
+            float16, float32, float64, int32 or int64 as the input data type.
+        group (Group, optional): Communicate in which group. If none is given, use the global group as default.
+        sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
+        use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given,
+            use false as default. This option is designed for high performance demands; only turn it on when you clearly
+            understand what it does.
+
+    Returns:
+        Return a task object.
+
+    Warning:
+        This API only supports the dygraph mode now.
+
+    Examples:
+        .. code-block:: python
+
+            # required: distributed
+            import paddle
+            import paddle.distributed as dist
+
+            dist.init_parallel_env()
+            local_rank = dist.get_rank()
+            tensor_list = []
+            if local_rank == 0:
+                data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
+            else:
+                data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
+            task = dist.stream.all_gather(tensor_list, data, sync_op=False)
+            task.wait()
+            print(tensor_list)
+            # [[[4, 5, 6], [4, 5, 6]], [[1, 2, 3], [1, 2, 3]]] (2 GPUs)
+    """
+    if group is not None and not group.is_member():
+        raise RuntimeError(
+            "The group should not be None and all ranks which invoke this operation should be the member of this group."
+ ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _all_gather_base_in_dygraph(tensor_or_tensor_list, tensor, + group, sync_op, use_calc_stream) + else: + return _all_gather_in_dygraph(tensor_or_tensor_list, tensor, group, + sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.all_gather is only supported in dygraph mode now." + ) diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 55f4453b1ab08..d7ee67c10f435 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -266,6 +266,14 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) set_tests_properties(test_collective_wait PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_allgather_api MODULES + test_communication_stream_allgather_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_allgather_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_communication_stream_allreduce_api MODULES diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py new file mode 100644 index 0000000000000..d0e46600b8b50 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py @@ -0,0 +1,91 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
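+# A minimal, self-contained sketch of the call pattern exercised by the test
+# case below. It is illustrative only (the helper name and tensor values are
+# made up here) and assumes dist.init_parallel_env() has been called under a
+# multi-rank launch, e.g. via `python -m paddle.distributed.launch`.
+def _example_stream_all_gather():
+    import paddle
+    import paddle.distributed as dist
+
+    data = paddle.to_tensor([dist.get_rank()])
+    tensor_list = []
+    # With sync_op=False the call returns a task that must be waited on
+    # before tensor_list can be read.
+    task = dist.stream.all_gather(tensor_list, data, sync_op=False)
+    task.wait()
+    return tensor_list  # one gathered tensor per rank
+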
+ +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllgatherTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + # case 1: pass an empty tensor list + empty_tensor_list = [] + task = dist.stream.all_gather(empty_tensor_list, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(empty_tensor_list, + test_data_list, + rtol=1e-05, + atol=1e-05) + + # case 2: pass a pre-sized tensor list + full_tensor_list = [paddle.empty_like(tensor) for _ in test_data_list] + task = dist.stream.all_gather(full_tensor_list, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(full_tensor_list, + test_data_list, + rtol=1e-05, + atol=1e-05) + + # case 3: pass a pre-sized tensor + result_tensor = paddle.concat( + [paddle.to_tensor(data) for data in test_data_list]) + out_tensor = paddle.empty_like(result_tensor) + task = dist.stream.all_gather(out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(out_tensor, result_tensor, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamAllgatherTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py new file mode 100644 index 0000000000000..254b64907ce07 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
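+# Env-variable contract between this driver and
+# communication_stream_allgather_api_dygraph.py (summarized from both files):
+# the driver launches the dygraph script on 2 devices with backend, shape,
+# dtype and seeds fixed, while sync_op and use_calc_stream are swept over
+# True/False; the combination use_calc_stream=True with sync_op=False is
+# skipped, since use_calc_stream is only allowed for synchronous ops.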
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllgatherAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllgatherAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_allgather_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_allgather_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllgatherAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index b4ba281f45420..c6c7c13937f38 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -32,6 +32,7 @@ test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_ test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_communication_stream_allgather_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., From 9641b9342e52f459b04eb5c1a38a160f3a493506 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Tue, 11 Oct 2022 19:14:09 +0800 Subject: [PATCH 3/6] Support both use_calc_stream and sync_op in collective communication API (#46761) --- .../distributed/collective/ProcessGroup.h | 96 ++- .../collective/ProcessGroupCustom.cc | 4 +- .../collective/ProcessGroupCustom.h | 8 +- .../collective/ProcessGroupNCCL.cc | 323 +++++++++- .../distributed/collective/ProcessGroupNCCL.h | 76 ++- .../collective/ProcessGroupStream.cc | 173 +++++- .../collective/ProcessGroupStream.h | 109 +++- paddle/fluid/distributed/collective/Utils.h | 144 ++++- paddle/fluid/pybind/distributed_py.cc | 555 +++++++++++++++++- .../communication/stream/__init__.py | 13 +- .../communication/stream/all_gather.py | 16 +- .../communication/stream/all_reduce.py | 4 +- .../communication/stream/alltoall.py | 157 +++++ .../communication/stream/alltoall_single.py | 128 ++++ .../communication/stream/broadcast.py | 83 +++ .../distributed/communication/stream/recv.py | 2 +- .../communication/stream/reduce.py | 93 +++ .../communication/stream/reduce_scatter.py | 216 +++++++ .../communication/stream/scatter.py | 162 +++++ .../distributed/communication/stream/send.py | 2 +- .../tests/unittests/collective/CMakeLists.txt | 48 ++ ...mmunication_stream_alltoall_api_dygraph.py | 
113 ++++ ...tion_stream_alltoall_single_api_dygraph.py | 74 +++ ...munication_stream_broadcast_api_dygraph.py | 54 ++ ...communication_stream_reduce_api_dygraph.py | 66 +++ ...ation_stream_reduce_scatter_api_dygraph.py | 94 +++ ...ommunication_stream_scatter_api_dygraph.py | 84 +++ ...mmunication_stream_sendrecv_api_dygraph.py | 9 +- .../test_communication_stream_alltoall_api.py | 51 ++ ...ommunication_stream_alltoall_single_api.py | 53 ++ ...test_communication_stream_broadcast_api.py | 51 ++ .../test_communication_stream_reduce_api.py | 51 ++ ...communication_stream_reduce_scatter_api.py | 53 ++ .../test_communication_stream_scatter_api.py | 51 ++ .../tests/unittests/collective/testslist.csv | 6 + 35 files changed, 3108 insertions(+), 114 deletions(-) create mode 100644 python/paddle/distributed/communication/stream/alltoall.py create mode 100644 python/paddle/distributed/communication/stream/alltoall_single.py create mode 100644 python/paddle/distributed/communication/stream/broadcast.py create mode 100644 python/paddle/distributed/communication/stream/reduce.py create mode 100644 python/paddle/distributed/communication/stream/reduce_scatter.py create mode 100644 python/paddle/distributed/communication/stream/scatter.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py create mode 100644 python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 40a27db7601f1..fb4832442a469 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -122,6 +122,16 @@ class ProcessGroup { "ProcessGroup%s does not support broadcast", GetBackendName())); } + virtual std::shared_ptr Broadcast( + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT + const BroadcastOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support broadcast with sync_op flag", + GetBackendName())); + } + virtual std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -157,14 +167,14 @@ class ProcessGroup { virtual std::shared_ptr Send_Partial( phi::DenseTensor&, // NOLINT int, - int, - int) { + int64_t, + int64_t) { 
PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support send_partial", GetBackendName())); } virtual std::shared_ptr Send_Partial( - phi::DenseTensor&, int, int, int, bool) { // NOLINT + phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support send_partial with sync_op flag", GetBackendName())); @@ -173,14 +183,14 @@ class ProcessGroup { virtual std::shared_ptr Recv_Partial( phi::DenseTensor&, // NOLINT int, - int, - int) { + int64_t, + int64_t) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support recv_partial", GetBackendName())); } virtual std::shared_ptr Recv_Partial( - phi::DenseTensor&, int, int, int, bool) { // NOLINT + phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support recv_partial with sync_op flag", GetBackendName())); @@ -205,8 +215,18 @@ class ProcessGroup { virtual std::shared_ptr AllGather_Partial( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT - int offset, - int length) { // NOLINT + int64_t offset, + int64_t length) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllGather_Partial", GetBackendName())); + } + + virtual std::shared_ptr AllGather_Partial( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + int64_t offset, + int64_t length, + bool) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support AllGather_Partial", GetBackendName())); } @@ -218,6 +238,14 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll", GetBackendName())); } + virtual std::shared_ptr AllToAll( + std::vector&, // NOLINT + std::vector&, // NOLINT + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support alltoall", GetBackendName())); + } + virtual std::shared_ptr AllToAll_Single( std::vector&, // NOLINT std::vector&, // NOLINT @@ -227,26 +255,66 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll_Single", GetBackendName())); } + virtual std::shared_ptr AllToAllSingle( + std::vector&, // NOLINT + std::vector&, // NOLINT + std::vector&, + std::vector&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support alltoall_single", GetBackendName())); + } + virtual std::shared_ptr Reduce( std::vector&, // NOLINT std::vector&, // NOLINT const ReduceOptions& opts) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support Reduce", GetBackendName())); + "ProcessGroup%s does not support reduce", GetBackendName())); + } + + virtual std::shared_ptr Reduce( + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT + const ReduceOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support reduce with sync_op flag", + GetBackendName())); } virtual std::shared_ptr Scatter( std::vector&, // NOLINT std::vector&, // NOLINT - const ScatterOptions&) { // NOLINT + const ScatterOptions&) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support Scatter", GetBackendName())); + "ProcessGroup%s does not support scatter", GetBackendName())); + } + + virtual std::shared_ptr Scatter( + std::vector&, // NOLINT + std::vector&, // NOLINT + const ScatterOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support scatter 
with sync_op flag", + GetBackendName())); + } + + virtual std::shared_ptr ReduceScatter( + std::vector&, // NOLINT + std::vector&, // NOLINT + const ReduceScatterOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support reduce_scatter with sync_op flag", + GetBackendName())); } virtual std::shared_ptr _ReduceScatterBase( - phi::DenseTensor&, // NOLINT - phi::DenseTensor&, // NOLINT - const ReduceScatterOptions&) { // NOLINT + phi::DenseTensor&, // NOLINT + phi::DenseTensor&, // NOLINT + const ReduceScatterOptions&) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support ReduceScatter", GetBackendName())); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index ad9356b368ea2..f18765a05f619 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -267,8 +267,8 @@ void* XcclGetPointerByOffset(void* raw_pointer, std::shared_ptr ProcessGroupCustom::AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) { + int64_t offset, + int64_t length) { PADDLE_ENFORCE_EQ( CheckTensorsInCustomPlace(in_tensors, device_type_), true, diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index ccce66603afe6..ce3532bbb6f0e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -80,8 +80,8 @@ class ProcessGroupCustom : public ProcessGroup { std::shared_ptr AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) override; + int64_t offset, + int64_t length) override; std::shared_ptr AllReduce( std::vector& in_tensors, @@ -117,8 +117,8 @@ class ProcessGroupCustom : public ProcessGroup { std::set used_place_ids_; private: - void BcastCustomId(std::vector& ccl_ids, - int root, // NOLINT + void BcastCustomId(std::vector& ccl_ids, // NOLINT + int root, int server_fd); void BroadcastUniqueCustomID( diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 12f60faf80053..bc5eb4885ee52 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -628,6 +628,40 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CommType::BROADCAST); } +std::shared_ptr ProcessGroupNCCL::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + const auto root = + opts.source_rank * in_tensors.size() + opts.source_root; + return platform::dynload::ncclBroadcast( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.type()), + root, + comm, + stream); + }, + CommType::BROADCAST, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Barrier( const BarrierOptions& opts) { // Only support single card single process @@ -782,7 +816,7 @@ std::shared_ptr ProcessGroupNCCL::Recv( } std::shared_ptr 
ProcessGroupNCCL::Send_Partial( - phi::DenseTensor& tensors, int dst_rank, int offset, int length) { + phi::DenseTensor& tensors, int dst_rank, int64_t offset, int64_t length) { // CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); phi::DenseTensor flatten_tensor; @@ -813,8 +847,8 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( std::shared_ptr ProcessGroupNCCL::Send_Partial( phi::DenseTensor& tensors, int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) { phi::DenseTensor flatten_tensor; @@ -845,7 +879,7 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( } std::shared_ptr ProcessGroupNCCL::Recv_Partial( - phi::DenseTensor& tensors, int src_rank, int offset, int length) { + phi::DenseTensor& tensors, int src_rank, int64_t offset, int64_t length) { // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length); phi::DenseTensor flatten_tensor; @@ -876,8 +910,8 @@ std::shared_ptr ProcessGroupNCCL::Recv_Partial( std::shared_ptr ProcessGroupNCCL::Recv_Partial( phi::DenseTensor& tensors, int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) { phi::DenseTensor flatten_tensor; @@ -1006,8 +1040,8 @@ void* GetPointerByOffset(void* raw_pointer, std::shared_ptr ProcessGroupNCCL::AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) { + int64_t offset, + int64_t length) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, @@ -1034,6 +1068,41 @@ std::shared_ptr ProcessGroupNCCL::AllGather_Partial( CommType::ALLGATHER); } +std::shared_ptr ProcessGroupNCCL::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + GetPointerByOffset(input.data(), offset, input.dtype()), + output.data(), + length, + platform::ToNCCLDataType(input.dtype()), + comm, + stream); + }, + CommType::ALLGATHER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::AllToAll( std::vector& in_tensors, std::vector& out_tensors) { @@ -1076,6 +1145,52 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( CommType::ALLTOALL); } +std::shared_ptr ProcessGroupNCCL::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + size_t offset = 0; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), offset, input.dtype()), + 
input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output.data(), offset, input.dtype()), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + offset += input.numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::AllToAll_Single( std::vector& in_tensors, std::vector& out_tensors, @@ -1138,6 +1253,72 @@ std::shared_ptr ProcessGroupNCCL::AllToAll_Single( CommType::ALLTOALL_SINGLE); } +std::shared_ptr ProcessGroupNCCL::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(), + true, + platform::errors::InvalidArgument( + "The dtypes of input and output must be equal.")); + + std::vector in_dims = phi::vectorize(input.dims()); + std::vector out_dims = phi::vectorize(output.dims()); + CheckSplitSizes(&in_sizes, in_dims); + CheckSplitSizes(&out_sizes, out_dims); + + size_t in_offset = 0, out_offset = 0; + size_t in_length = 0, out_length = 0; + size_t in_row_size = input.numel() / in_dims[0]; + size_t out_row_size = output.numel() / out_dims[0]; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + in_length = in_sizes[i] * in_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), in_offset, input.dtype()), + in_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + in_offset += in_length; + + out_length = out_sizes[i] * out_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output.data(), out_offset, input.dtype()), + out_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + out_offset += out_length; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL_SINGLE, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Reduce( std::vector& in_tensors, std::vector& out_tensors, @@ -1166,6 +1347,70 @@ std::shared_ptr ProcessGroupNCCL::Reduce( CommType::REDUCE); } +std::shared_ptr ProcessGroupNCCL::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](const phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + opts.root_rank, + comm, + stream)); + }, + CommType::REDUCE, + 
sync_op, + use_calc_stream); +} + +std::shared_ptr ProcessGroupNCCL::ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator) { + platform::CUDADeviceGuard cuda_guard; + cuda_guard.SetDevice(output.place()); + memory::RecordStream(output.Holder(), stream); + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( + input.data(), + output.data(), + output.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + comm, + stream)); + }, + CommType::REDUCE_SCATTER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Scatter( std::vector& in_tensors, std::vector& out_tensors, @@ -1219,6 +1464,68 @@ std::shared_ptr ProcessGroupNCCL::Scatter( CommType::SCATTER); } +std::shared_ptr ProcessGroupNCCL::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + output.numel(), + input.numel() / size_, + platform::errors::InvalidArgument( + "Input and output tensors should have the same shape.")); + size_t offset = 0; + if (rank_ == opts.root_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), offset, input.dtype()), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + offset += input.numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); + } + }, + CommType::SCATTER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::_ReduceScatterBase( phi::DenseTensor& out_tensor, phi::DenseTensor& in_tensor, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 24ba7c86b1838..6427e9e3e2ab1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -119,6 +119,13 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::vector& out_tensors, const BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; @@ -142,27 +149,27 @@ class 
ProcessGroupNCCL : public ProcessGroupStream { std::shared_ptr Send_Partial(phi::DenseTensor& tensors, int dst_rank, - int offset, - int length) override; + int64_t offset, + int64_t length) override; std::shared_ptr Send_Partial( phi::DenseTensor& tensors, int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) override; std::shared_ptr Recv_Partial(phi::DenseTensor& tensors, int src_rank, - int offset, - int length) override; + int64_t offset, + int64_t length) override; std::shared_ptr Recv_Partial( phi::DenseTensor& tensors, int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) override; @@ -179,12 +186,26 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::shared_ptr AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) override; + int64_t offset, + int64_t length) override; + + std::shared_ptr AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr AllToAll( - std::vector& in, - std::vector& out) override; + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr AllToAll_Single( std::vector& in, @@ -192,15 +213,44 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::vector& in_sizes, std::vector& out_sizes) override; + std::shared_ptr AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Reduce( std::vector& tensors, std::vector& out_tensors, const ReduceOptions& opts) override; + std::shared_ptr Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) override; + + std::shared_ptr ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Scatter( std::vector& in_tensors, std::vector& out_tensors, - const ScatterOptions&) override; + const ScatterOptions& opts) override; + + std::shared_ptr Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr _ReduceScatterBase( phi::DenseTensor&, // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 43ca0bbb36d3f..b2cfae088b227 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -70,6 +70,138 @@ std::shared_ptr ProcessGroupStream::AllReduce( "ProcessGroup%s does not support do all_reduce", GetBackendName())); } +std::shared_ptr ProcessGroupStream::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op) { + return AllToAll(in_tensors, + out_tensors, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do alltoall", GetBackendName())); +} + +std::shared_ptr 
ProcessGroupStream::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op) { + return AllToAllSingle(in_tensors, + out_tensors, + in_sizes, + out_sizes, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do alltoall_single", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op) { + return Broadcast(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do broadcast", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op) { + return Reduce(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do reduce", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op) { + return ReduceScatter(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do reduce_scatter", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op) { + return Scatter(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do scatter", GetBackendName())); +} + std::shared_ptr ProcessGroupStream::Send( std::vector& tensors, int dst_rank, bool sync_op) { return Send(tensors, @@ -90,8 +222,8 @@ std::shared_ptr ProcessGroupStream::Send( std::shared_ptr ProcessGroupStream::Send_Partial( phi::DenseTensor& tensors, int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op) { return Send_Partial(tensors, dst_rank, @@ -104,8 +236,8 @@ std::shared_ptr ProcessGroupStream::Send_Partial( std::shared_ptr ProcessGroupStream::Send_Partial( phi::DenseTensor& tensors, int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -132,8 +264,8 @@ 
std::shared_ptr ProcessGroupStream::Recv( std::shared_ptr ProcessGroupStream::Recv_Partial( phi::DenseTensor& tensors, int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op) { return Recv_Partial(tensors, src_rank, @@ -146,8 +278,33 @@ std::shared_ptr ProcessGroupStream::Recv_Partial( std::shared_ptr ProcessGroupStream::Recv_Partial( phi::DenseTensor& tensors, int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv_partial", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op) { + return AllGather_Partial(in_tensors, + out_tensors, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream) { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index f8ab562ad075c..2f0aa139104e9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -81,6 +81,84 @@ class ProcessGroupStream : public ProcessGroup { bool sync_op, bool use_calc_stream); + std::shared_ptr AllToAll( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllToAll( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op, + bool use_calc_stream); + + std::shared_ptr AllToAllSingle( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + std::vector& in_sizes, // NOLINT + std::vector& out_sizes, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllToAllSingle( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + std::vector& in_sizes, // NOLINT + std::vector& out_sizes, // NOLINT + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Broadcast( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const BroadcastOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Broadcast( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Reduce( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Reduce( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr ReduceScatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceScatterOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr ReduceScatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Scatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ScatterOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Scatter( + 
std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream); + std::shared_ptr Send( std::vector& tensors, // NOLINT int dst_rank, @@ -95,15 +173,15 @@ class ProcessGroupStream : public ProcessGroup { std::shared_ptr Send_Partial( phi::DenseTensor& tensors, // NOLINT int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op) override; virtual std::shared_ptr Send_Partial( phi::DenseTensor& tensors, // NOLINT int dst_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream); @@ -121,15 +199,30 @@ class ProcessGroupStream : public ProcessGroup { std::shared_ptr Recv_Partial( phi::DenseTensor& tensors, // NOLINT int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, bool sync_op) override; virtual std::shared_ptr Recv_Partial( phi::DenseTensor& tensors, // NOLINT int src_rank, - int offset, - int length, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op) override; + + virtual std::shared_ptr AllGather_Partial( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + int64_t offset, + int64_t length, bool sync_op, bool use_calc_stream); }; diff --git a/paddle/fluid/distributed/collective/Utils.h b/paddle/fluid/distributed/collective/Utils.h index 79146febdf809..c06c0345163ed 100644 --- a/paddle/fluid/distributed/collective/Utils.h +++ b/paddle/fluid/distributed/collective/Utils.h @@ -14,14 +14,26 @@ #pragma once -#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace paddle { namespace distributed { +template +struct ConcatDenseTensor { + void operator()(const DeviceContext *context, + const std::vector &in, + phi::DenseTensor *out, + int axis = 0) { + phi::funcs::ConcatFunctor concat_functor; + concat_functor(*context, in, axis, out); + } +}; + template struct SplitDenseTensor { void operator()(const DeviceContext *context, @@ -33,17 +45,36 @@ struct SplitDenseTensor { for (auto *p_tensor : *out) { shape_refer.emplace_back(p_tensor); } - operators::math::SplitFunctor split_functor_; - split_functor_(*context, in, shape_refer, axis, out); + phi::funcs::SplitFunctor split_functor; + split_functor(*context, in, shape_refer, axis, out); } }; #ifdef PADDLE_WITH_CUSTOM_DEVICE +template +struct ConcatDenseTensor { + void operator()(const platform::CustomDeviceContext *context, + const std::vector &in, + phi::DenseTensor *out, + int axis = 0) { + auto *out_data = out->data(); + auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); + size_t offset = 0; + for (const auto &tensor : in) { + const auto *in_data = tensor.data(); + auto sz = tensor.numel() * sizeof(T); + device->MemoryCopyD2D(out_data + offset, in_data, sz, nullptr); + offset += sz; + } + } +}; + template struct SplitDenseTensor { void operator()(const platform::CustomDeviceContext *context, const phi::DenseTensor &in, - std::vector *out) { + std::vector *out, + int axis = 0) { auto *in_data = in.data(); auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); size_t 
offset = 0; @@ -57,42 +88,119 @@ struct SplitDenseTensor { }; #endif +template +void ConcatDenseTensorWithType(const DeviceContext *dev_ctx, + const std::vector &t_list, + phi::DenseTensor *p_out, + phi::DataType type) { + switch (type) { + case phi::DataType::BOOL: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::UINT8: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT8: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT32: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT64: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT16: + ConcatDenseTensor()( + dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT32: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT64: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors.", type)); + } +} + template void SplitDenseTensorWithType(const DeviceContext *dev_ctx, - const phi::DenseTensor &p_dense, + const phi::DenseTensor &t_in, std::vector *p_list, phi::DataType type) { switch (type) { case phi::DataType::BOOL: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::UINT8: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::INT8: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::INT32: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::INT64: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::FLOAT16: SplitDenseTensor()( - dev_ctx, p_dense, p_list); + dev_ctx, t_in, p_list); break; case phi::DataType::FLOAT32: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::FLOAT64: - SplitDenseTensor()(dev_ctx, p_dense, p_list); + SplitDenseTensor()(dev_ctx, t_in, p_list); break; default: PADDLE_THROW(platform::errors::Unimplemented( - "Data type (%s) is not supported when it splits tensors for " - "allgather.", - type)); + "Data type (%s) is not supported when it splits tensors.", type)); + } +} + +void ConcatTensor(const phi::DeviceContext *dev_ctx, + const std::vector &tensor_list, + const experimental::Tensor *tensor) { + auto *dense_tensor = + std::dynamic_pointer_cast(tensor->impl()).get(); + + const auto &place = dev_ctx->GetPlace(); + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + ConcatDenseTensorWithType(static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat tensor since it's not support GPU, please " + "recompile or reinstall Paddle with GPU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + ConcatDenseTensorWithType( + static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat tensor since it's not compiled with " + "CUSTOM_DEVICE, please recompile or reinstall Paddle with " + "CUSTOM_DEVICE support.")); +#endif + } else if 
(platform::is_cpu_place(place)) { + ConcatDenseTensorWithType(static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat tensor not supported on place (%s)", place)); } } @@ -115,8 +223,8 @@ void SplitTensor(const phi::DeviceContext *dev_ctx, tensor.dtype()); #else PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't split tensor since it's not support NCCL/RCCL, please " - "recompile or reinstall Paddle with NCCL/RCCL support.")); + "Paddle can't split tensor since it's not support GPU, please " + "recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index bec3c93cbd8b7..29e6e9e5d1e79 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -172,6 +172,24 @@ void BindDistributed(py::module *m) { py::arg("source_rank"), py::call_guard()) + .def( + "broadcast", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts{src}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, tensors, opts, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + .def( "barrier", [](distributed::ProcessGroup &self, std::vector place_ids) { @@ -224,9 +242,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.Send_Partial(*dense, dst_rank, offset, send_numel); }, py::arg("tensor"), @@ -246,9 +264,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.Send_Partial( *dense, dst_rank, offset, send_numel, sync_op); }, @@ -301,9 +319,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int recv_numel = numel / nranks; - int offset = recv_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; return self.Recv_Partial(*dense, src_rank, offset, recv_numel); }, py::arg("tensor"), @@ -323,9 +341,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int recv_numel = numel / nranks; - int offset = recv_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; return self.Recv_Partial( *dense, src_rank, offset, recv_numel, sync_op); }, @@ -384,7 +402,7 @@ void BindDistributed(py::module *m) { py::call_guard()) .def( - 
"allgather_base", + "allgather_into_tensor", [](distributed::ProcessGroup &self, py::handle py_in_tensor, py::handle py_out_tensor, @@ -421,9 +439,9 @@ void BindDistributed(py::module *m) { out_tensor.impl()); std::vector in_tensors = {*in_dense}; std::vector out_tensors = {*out_dense}; - int numel = (*in_dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*in_dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.AllGather_Partial( in_tensors, out_tensors, offset, send_numel); }, @@ -452,6 +470,61 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "alltoall", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor_list, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + // in_tensor_list should not be empty + const auto *dev_ctx = + self.GetDeviceContext(in_tensor_list.back().place()); + auto task = self.AllToAll(in_wrapper, out_wrapper, sync_op); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "alltoall_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAll(in_wrapper, out_wrapper, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + .def( "alltoall_single", [](distributed::ProcessGroup &self, @@ -476,6 +549,34 @@ void BindDistributed(py::module *m) { py::arg("out_sizes"), py::call_guard()) + .def( + "alltoall_single", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + std::vector &in_sizes, + std::vector &out_sizes, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAllSingle( + in_wrapper, out_wrapper, in_sizes, out_sizes, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("in_sizes"), + py::arg("out_sizes"), + py::arg("sync_op"), + py::call_guard()) + .def( "reduce", [](distributed::ProcessGroup &self, @@ -495,6 +596,83 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::arg("op") = distributed::ReduceOp::SUM, py::call_guard()) + + .def( + "reduce", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + int dst, + distributed::ReduceOp op, 
+ bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts{op, dst}; + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, tensors, opts, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "reduce_scatter", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + distributed::ReduceOp op, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter( + in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "reduce_scatter_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + distributed::ReduceOp op, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter( + in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + .def( "scatter", [](distributed::ProcessGroup &self, @@ -517,6 +695,61 @@ void BindDistributed(py::module *m) { py::arg("out"), py::arg("src"), py::call_guard()) + + .def( + "scatter", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + int src, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "scatter_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int src, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + 
py::arg("sync_op"), + py::call_guard()) + .def( "_reduce_scatter_base", [](distributed::ProcessGroup &self, @@ -573,7 +806,7 @@ void BindDistributed(py::module *m) { py::call_guard()) .def( - "allgather_base_on_calc_stream", + "allgather_into_tensor_on_calc_stream", [](distributed::ProcessGroupStream &self, py::handle py_in_tensor, py::handle py_out_tensor) { @@ -596,6 +829,37 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "all_gather_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int nranks, + int rank_id) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + int64_t numel = (*in_dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; + return self.AllGather_Partial(in_tensors, + out_tensors, + offset, + send_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("num"), + py::arg("id"), + py::call_guard()) + .def( "allreduce_on_calc_stream", [](distributed::ProcessGroupStream &self, @@ -617,6 +881,251 @@ void BindDistributed(py::module *m) { py::arg("op"), py::call_guard()) + .def( + "alltoall_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor_list) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + // in_tensor_list must not be empty + const auto *dev_ctx = self.GetDeviceContext( + in_tensor_list.back().place(), /*use_calc_stream*/ true); + auto task = self.AllToAll(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "alltoall_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAll(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "alltoall_single_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + std::vector &in_sizes, + std::vector &out_sizes) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = 
{*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAllSingle(in_wrapper, + out_wrapper, + in_sizes, + out_sizes, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("in_sizes"), + py::arg("out_sizes"), + py::call_guard()) + + .def( + "broadcast_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts{src}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, + tensors, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::call_guard()) + + .def( + "reduce_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + int dst, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts{op, dst}; + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, + tensors, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("op"), + py::call_guard()) + + .def( + "reduce_scatter_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + distributed::ReduceOp op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::call_guard()) + + .def( + "reduce_scatter_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::call_guard()) + + .def( + "scatter_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + int src) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = 
std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::call_guard()) + + .def( + "scatter_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int src) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::call_guard()) + .def( "send_on_calc_stream", [](distributed::ProcessGroupStream &self, @@ -645,9 +1154,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.Send_Partial(*dense, dst_rank, offset, @@ -689,9 +1198,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int recv_numel = numel / nranks; - int offset = recv_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; return self.Recv_Partial(*dense, src_rank, offset, diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index deab1f97ea28e..a1844decf9478 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -14,7 +14,16 @@ from .all_gather import all_gather from .all_reduce import all_reduce -from .send import send +from .alltoall import alltoall +from .alltoall_single import alltoall_single +from .broadcast import broadcast +from .reduce import reduce +from .reduce_scatter import _reduce_scatter_base, reduce_scatter from .recv import recv +from .scatter import scatter +from .send import send -__all__ = ["all_gather", "all_reduce", "send", "recv"] +__all__ = [ + "_reduce_scatter_base", "all_reduce", "alltoall", "alltoall_single", + "broadcast", "reduce", "reduce_scatter", "recv", "scatter", "send" +] diff --git a/python/paddle/distributed/communication/stream/all_gather.py b/python/paddle/distributed/communication/stream/all_gather.py index dca2957309068..9eb961cda171d 100644 --- a/python/paddle/distributed/communication/stream/all_gather.py +++ b/python/paddle/distributed/communication/stream/all_gather.py @@ -34,17 +34,18 @@ def _check_tensor_list_shape(tensor_list, shape, nranks=1): 'The tensor_list for all_gather is not correctly-sized.') -def _all_gather_base_in_dygraph(out_tensor, in_tensor, group, sync_op, - use_calc_stream): +def _all_gather_into_tensor_in_dygraph(out_tensor, in_tensor, group, sync_op, + use_calc_stream): group = 
collective._get_default_group() if group is None else group _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) if use_calc_stream: - return group.process_group.allgather_base_on_calc_stream( + return group.process_group.allgather_into_tensor_on_calc_stream( in_tensor, out_tensor) - task = group.process_group.allgather_base(in_tensor, out_tensor, sync_op) + task = group.process_group.allgather_into_tensor(in_tensor, out_tensor, + sync_op) if sync_op: task.wait() @@ -83,7 +84,7 @@ def all_gather(tensor_or_tensor_list, tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The output. If it is a tensor, it should be correctly-sized. If it is a list, it should be empty or contain correctly-sized tensors. tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support - float16, float32, float64, int32 or int64 as the input data type. + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. group (Group, optional): Communicate in which group. If none is given, use the global group as default. sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This @@ -125,8 +126,9 @@ def all_gather(tensor_or_tensor_list, if framework.in_dygraph_mode(): if paddle.is_tensor(tensor_or_tensor_list): - return _all_gather_base_in_dygraph(tensor_or_tensor_list, tensor, - group, sync_op, use_calc_stream) + return _all_gather_into_tensor_in_dygraph(tensor_or_tensor_list, + tensor, group, sync_op, + use_calc_stream) else: return _all_gather_in_dygraph(tensor_or_tensor_list, tensor, group, sync_op, use_calc_stream) diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index f94422f4bd0a6..67fc4c8b63a0c 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -40,8 +40,8 @@ def all_reduce(tensor, Args: tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support - float16, float32, float64, int32 or int64 as the input data type. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. group (Group, optional): Communicate in which group. If none is given, use the global group as default. sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This diff --git a/python/paddle/distributed/communication/stream/alltoall.py b/python/paddle/distributed/communication/stream/alltoall.py new file mode 100644 index 0000000000000..b216906d04568 --- /dev/null +++ b/python/paddle/distributed/communication/stream/alltoall.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _check_tensor_shape(tensor, shape, nranks=1): + if tensor.shape != shape: + raise RuntimeError('The tensor for alltoall is not correctly-sized.') + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + 'The tensor_list for alltoall is not correctly-sized.') + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + 'The tensor_list for alltoall is not correctly-sized.') + + +def _alltoall_tensor_in_dygraph(out_tensor, in_tensor, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.alltoall_tensor_on_calc_stream( + in_tensor, out_tensor) + + task = group.process_group.alltoall_tensor(in_tensor, out_tensor, sync_op) + if sync_op: + task.wait() + + return task + + +def _alltoall_in_dygraph(out_tensor_list, in_tensor_list, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + if len(in_tensor_list) == 0: + raise RuntimeError("The input tensor_list should not be empty.") + + if len(out_tensor_list) == 0: + out_tensor_list += [ + paddle.empty_like(tensor) for tensor in in_tensor_list + ] + else: + _check_tensor_list_shape(out_tensor_list, in_tensor_list[0].shape, + group.nranks) + + if use_calc_stream: + return group.process_group.alltoall_on_calc_stream( + in_tensor_list, out_tensor_list) + + task = group.process_group.alltoall(in_tensor_list, out_tensor_list, + sync_op) + if sync_op: + task.wait() + + return task + + +def alltoall(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Scatter a tensor (or a tensor list) across devices and gather outputs to another tensor (or a tensor list, respectively). + + Args: + out_tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The output. If it is a tensor, it should be correctly-sized. + If it is a list, it should be empty or contain correctly-sized tensors. Its data type should be the same as the input. + in_tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The input to scatter (must be specified on the source rank). + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. 
This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + out_tensor_list = [] + if dist.get_rank() == 0: + data1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + data2 = paddle.to_tensor([[7, 8, 9], [10, 11, 12]]) + else: + data1 = paddle.to_tensor([[13, 14, 15], [16, 17, 18]]) + data2 = paddle.to_tensor([[19, 20, 21], [22, 23, 24]]) + task = dist.stream.alltoall(out_tensor_list, [data1, data2], sync_op=False) + task.wait() + print(out_tensor_list) + # [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0) + # [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if out_tensor_or_tensor_list is None: + raise RuntimeError("The output should be specified.") + if in_tensor_or_tensor_list is None: + raise RuntimeError("The input should be specified.") + + if framework.in_dygraph_mode(): + out_is_tensor = paddle.is_tensor(out_tensor_or_tensor_list) + in_is_tensor = paddle.is_tensor(in_tensor_or_tensor_list) + if out_is_tensor and in_is_tensor: + return _alltoall_tensor_in_dygraph(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, group, + sync_op, use_calc_stream) + elif not out_is_tensor and not in_is_tensor: + return _alltoall_in_dygraph(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, group, + sync_op, use_calc_stream) + else: + raise RuntimeError( + "The output and input should be both tensor or tensor list.") + + raise RuntimeError( + "paddle.distributed.stream.alltoall is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/alltoall_single.py b/python/paddle/distributed/communication/stream/alltoall_single.py new file mode 100644 index 0000000000000..b2187cc06e343 --- /dev/null +++ b/python/paddle/distributed/communication/stream/alltoall_single.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _alltoall_single_in_dygraph(out_tensor, in_tensor, out_split_sizes, + in_split_sizes, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + if out_split_sizes is None: + out_split_sizes = [] + if in_split_sizes is None: + in_split_sizes = [] + + if use_calc_stream: + return group.process_group.alltoall_single_on_calc_stream( + in_tensor, out_tensor, in_split_sizes, out_split_sizes) + + task = group.process_group.alltoall_single(in_tensor, out_tensor, + in_split_sizes, out_split_sizes, + sync_op) + if sync_op: + task.wait() + + return task + + +def alltoall_single(out_tensor, + in_tensor, + out_split_sizes=None, + in_split_sizes=None, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Split the input tensor and scatter the split pieces to the output tensor across devices. + + Args: + out_tensor(Tensor): The output tensor. Its data type should be the same as the input. + in_tensor (Tensor): The input tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + out_split_sizes (List[int], optional): Split sizes of out_tensor for dim[0]. If not given, dim[0] of out_tensor must be divisible + by group size and out_tensor will be gathered evenly from all participants. If none is given, use an empty list as default. + in_split_sizes (List[int], optional): Split sizes of in_tensor for dim[0]. If not given, dim[0] of in_tensor must be divisible + by group size and in_tensor will be scattered evenly to all participants. If none is given, use an empty list as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand; only turn it on when you clearly understand its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + ..
code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + + # case 1 + output = paddle.empty([2], dtype="int64") + if local_rank == 0: + data = paddle.to_tensor([0, 1]) + else: + data = paddle.to_tensor([2, 3]) + task = dist.stream.alltoall_single(output, data, sync_op=False) + task.wait() + out = output.numpy() + # [0, 2] (2 GPUs, out for rank 0) + # [1, 3] (2 GPUs, out for rank 1) + + # case 2 + size = dist.get_world_size() + output = paddle.empty([(local_rank + 1) * size, size], dtype='float32') + if local_rank == 0: + data = paddle.to_tensor([[0., 0.], [0., 0.], [0., 0.]]) + else: + data = paddle.to_tensor([[1., 1.], [1., 1.], [1., 1.]]) + out_split_sizes = [local_rank + 1 for i in range(size)] + in_split_sizes = [i + 1 for i in range(size)] + task = dist.stream.alltoall_single(output, + data, + out_split_sizes, + in_split_sizes, + sync_op=False) + task.wait() + out = output.numpy() + # [[0., 0.], [1., 1.]] (2 GPUs, out for rank 0) + # [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _alltoall_single_in_dygraph(out_tensor, in_tensor, + out_split_sizes, in_split_sizes, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.alltoall_single is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/broadcast.py b/python/paddle/distributed/communication/stream/broadcast.py new file mode 100644 index 0000000000000..06bde316937a9 --- /dev/null +++ b/python/paddle/distributed/communication/stream/broadcast.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _broadcast_in_dygraph(tensor, src, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.broadcast_on_calc_stream(tensor, src) + + task = group.process_group.broadcast(tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Broadcast a tensor to all devices. + + Args: + tensor (Tensor): The tensor to broadcast. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. 
+ sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.broadcast(data, src=1, sync_op=False) + task.wait() + out = data.numpy() + # [[1, 2, 3], [1, 2, 3]] (2 GPUs) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _broadcast_in_dygraph(tensor, src, group, sync_op, + use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.broadcast is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/recv.py b/python/paddle/distributed/communication/stream/recv.py index b225f64b8b4d2..d572dd44622b8 100644 --- a/python/paddle/distributed/communication/stream/recv.py +++ b/python/paddle/distributed/communication/stream/recv.py @@ -64,7 +64,7 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): task = dist.stream.recv(data, src=0, sync_op=False) task.wait() out = data.numpy() - # [[4, 5, 6], [4, 5, 6] + # [[4, 5, 6], [4, 5, 6]] (2 GPUs) """ if group is not None and not group.is_member(): raise RuntimeError( diff --git a/python/paddle/distributed/communication/stream/reduce.py b/python/paddle/distributed/communication/stream/reduce.py new file mode 100644 index 0000000000000..b0f7f5c884743 --- /dev/null +++ b/python/paddle/distributed/communication/stream/reduce.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid.framework as framework +from paddle.distributed.communication.group import _get_global_group +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp + + +def _reduce_in_dygraph(tensor, dst, op, group, sync_op, use_calc_stream): + op_type = _get_reduce_op(op, "reduce") + group = _get_global_group() if group is None else group + if use_calc_stream: + return group.process_group.reduce_on_calc_stream(tensor, dst, op_type) + + task = group.process_group.reduce(tensor, dst, op_type, sync_op) + if sync_op: + task.wait() + + return task + + +def reduce(tensor, + dst=0, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Perform specific reduction (for example, sum, max) on a tensor across devices and send to the destintion device. + + Args: + tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + dst (int, optional): Rank of the destination device. If none is given, use `0` as default. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.reduce(data, dst=0, sync_op=False) + task.wait() + out = data.numpy() + # [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0) + # [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _reduce_in_dygraph(tensor, dst, op, group, sync_op, + use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.reduce is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/reduce_scatter.py b/python/paddle/distributed/communication/stream/reduce_scatter.py new file mode 100644 index 0000000000000..a4aeae6312a30 --- /dev/null +++ b/python/paddle/distributed/communication/stream/reduce_scatter.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.distributed as dist +import paddle.fluid.framework as framework +from paddle.distributed.communication.group import _get_global_group +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp + + +def _check_tensor_shape(tensor, shape, nranks=1): + expect_shape = list(shape) + expect_shape[0] //= nranks + if list(tensor.shape) != expect_shape: + raise RuntimeError( + "The in_tensor for reduce_scatter is not correctly-sized.") + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + f"The tensor_list for reduce_scatter is not correctly-sized.") + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + f"The tensor_list for reduce_scatter is not correctly-sized.") + + +def _reduce_scatter_tensor_in_dygraph(out_tensor, + in_tensor, + op, + group, + sync_op, + use_calc_stream, + caller="reduce_scatter"): + op_type = _get_reduce_op(op, caller) + group = _get_global_group() if group is None else group + + _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.reduce_scatter_tensor_on_calc_stream( + in_tensor, out_tensor, op_type) + + task = group.process_group.reduce_scatter_tensor(in_tensor, out_tensor, + op_type, sync_op) + if sync_op: + task.wait() + + return task + + +def _reduce_scatter_in_dygraph(tensor, tensor_list, op, group, sync_op, + use_calc_stream): + op_type = _get_reduce_op(op, "reduce_scatter") + group = _get_global_group() if group is None else group + + _check_tensor_list_shape(tensor_list, tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.reduce_scatter_on_calc_stream( + tensor_list, tensor, op_type) + + task = group.process_group.reduce_scatter(tensor_list, tensor, op_type, + sync_op) + if sync_op: + task.wait() + + return task + + +def reduce_scatter(tensor, + tensor_or_tensor_list, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Reduce, then scatter a tensor (or a tensor list) across devices. + + Args: + tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + tensor_list (List[Tensor]]): The input to scatter. + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. 
+ + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data1 = paddle.to_tensor([0, 1]) + data2 = paddle.to_tensor([2, 3]) + else: + data1 = paddle.to_tensor([4, 5]) + data2 = paddle.to_tensor([6, 7]) + dist.stream.reduce_scatter(data1, [data1, data2]) + out = data1.numpy() + # [4, 6] (2 GPUs, out for rank 0) + # [8, 10] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _reduce_scatter_tensor_in_dygraph(tensor, + tensor_or_tensor_list, op, + group, sync_op, + use_calc_stream) + else: + return _reduce_scatter_in_dygraph(tensor, tensor_or_tensor_list, op, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.reduce_scatter is only supported in dygraph mode now." + ) + + +def _reduce_scatter_base(out_tensor, + in_tensor, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Reduce, then scatter a flattened tensor across devices. + + Args: + out_tensor (Tensor): The output tensor on each rank. The result will overwrite this tensor after communication. Support + float16, float32, float64, int32 or int64 as the input data type. + in_tensor (Tensor): The input tensor to reduce and scatter. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand; only turn it on when you clearly understand its meaning. + + Returns: + Return a task object. + + Warning: + This API will be deprecated in the future, and only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data = paddle.to_tensor([0, 1, 2, 3]) + else: + data = paddle.to_tensor([4, 5, 6, 7]) + output = paddle.empty([2], dtype="int64") + task = dist.stream._reduce_scatter_base(output, data, sync_op=False) + task.wait() + out = output.numpy() + # [4, 6] (2 GPUs, out for rank 0) + # [8, 10] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group."
+ ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _reduce_scatter_tensor_in_dygraph(out_tensor, in_tensor, op, + group, sync_op, + use_calc_stream, + "_reduce_scatter_base") + + raise RuntimeError( + "paddle.distributed.stream._reduce_scatter_base is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/scatter.py b/python/paddle/distributed/communication/stream/scatter.py new file mode 100644 index 0000000000000..3a3fb00534783 --- /dev/null +++ b/python/paddle/distributed/communication/stream/scatter.py @@ -0,0 +1,162 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.distributed as dist +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _check_tensor_shape(tensor, shape, nranks=1): + expect_shape = list(shape) + expect_shape[0] //= nranks + if list(tensor.shape) != expect_shape: + raise RuntimeError("The in_tensor for scatter is not correctly-sized.") + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + f"The tensor_list for scatter is not correctly-sized.") + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + f"The tensor_list for scatter is not correctly-sized.") + + +def _scatter_tensor_in_dygraph(out_tensor, in_tensor, src, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + src_rank = group.get_group_rank(src) + if src_rank == -1: + raise RuntimeError("Src rank out of group.") + + nranks = group.nranks + rank = dist.get_rank() + if rank == src_rank: + _check_tensor_shape(out_tensor, in_tensor.shape, nranks) + + if use_calc_stream: + return group.process_group.scatter_tensor_on_calc_stream( + in_tensor, out_tensor, src) + + task = group.process_group.scatter_tensor(in_tensor, out_tensor, src, + sync_op) + if sync_op: + task.wait() + + return task + + +def _scatter_in_dygraph(tensor, tensor_list, src, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + src_rank = group.get_group_rank(src) + if src_rank == -1: + raise RuntimeError("Src rank out of group.") + + nranks = group.nranks + rank = dist.get_rank() + if rank == src_rank: + if len(tensor_list) == 0: + raise RuntimeError( + "The tensor_list should not be empty on src rank.") + _check_tensor_list_shape(tensor_list, tensor.shape, nranks) + else: + tensor_list = [tensor for _ in range(nranks)] + + if use_calc_stream: + return group.process_group.scatter_on_calc_stream( + tensor_list, tensor, src) + + task = group.process_group.scatter(tensor_list, tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def scatter(tensor, + tensor_or_tensor_list=None, + src=0, + group=None, + sync_op=True, + 
use_calc_stream=False): + """ + + Scatter a tensor (or a tensor list) across devices. + + Args: + tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The input to scatter (default is `None`, must be specified on the source rank). + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data1 = paddle.to_tensor([7, 8, 9]) + data2 = paddle.to_tensor([10, 11, 12]) + dist.stream.scatter(data1, src=1) + else: + data1 = paddle.to_tensor([1, 2, 3]) + data2 = paddle.to_tensor([4, 5, 6]) + dist.stream.scatter(data1, [data1, data2], src=1) + out = data1.numpy() + # [1, 2, 3] (2 GPUs, out for rank 0) + # [4, 5, 6] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if tensor_or_tensor_list is None: + raise RuntimeError("The input should be specified.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _scatter_tensor_in_dygraph(tensor, tensor_or_tensor_list, + src, group, sync_op, + use_calc_stream) + else: + return _scatter_in_dygraph(tensor, tensor_or_tensor_list, src, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.scatter is only supported in dygraph mode now." 
+ ) diff --git a/python/paddle/distributed/communication/stream/send.py b/python/paddle/distributed/communication/stream/send.py index fa052734c7ee7..49eceed55e31f 100644 --- a/python/paddle/distributed/communication/stream/send.py +++ b/python/paddle/distributed/communication/stream/send.py @@ -64,7 +64,7 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False): task = dist.stream.recv(data, src=0, sync_op=False) task.wait() out = data.numpy() - # [[4, 5, 6], [4, 5, 6] + # [[4, 5, 6], [4, 5, 6]] (2 GPUs) """ if group is not None and not group.is_member(): raise RuntimeError( diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index d7ee67c10f435..a9db1e0bc7eec 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -282,6 +282,54 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) set_tests_properties(test_communication_stream_allreduce_api PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_alltoall_api MODULES + test_communication_stream_alltoall_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_alltoall_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_alltoall_single_api MODULES + test_communication_stream_alltoall_single_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_alltoall_single_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_broadcast_api MODULES + test_communication_stream_broadcast_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_broadcast_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_reduce_api MODULES + test_communication_stream_reduce_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_reduce_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_reduce_scatter_api MODULES + test_communication_stream_reduce_scatter_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_reduce_scatter_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_scatter_api MODULES + test_communication_stream_scatter_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_scatter_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_communication_stream_sendrecv_api MODULES diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py new file mode 100644 index 
0000000000000..8e65ea8d8aee5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py @@ -0,0 +1,113 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllToAllTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + nranks = len(test_data_list) + data1 = test_data_list[0] + data2 = test_data_list[1] + result1 = np.vstack( + [data1[0:data1.shape[0] // 2, :], data2[0:data2.shape[0] // 2, :]]) + result2 = np.vstack( + [data1[data1.shape[0] // 2:, :], data2[data2.shape[0] // 2:, :]]) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + t1, t2 = paddle.split(tensor, nranks, axis=0) + + # case 1: pass an empty tensor list + empty_tensor_list = [] + task = dist.stream.alltoall(empty_tensor_list, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + result_tensor_list = np.vstack(empty_tensor_list) + if rank == 0: + assert np.allclose(result_tensor_list, + result1, + rtol=1e-05, + atol=1e-05) + else: + assert np.allclose(result_tensor_list, + result2, + rtol=1e-05, + atol=1e-05) + + # case 2: pass a pre-sized tensor list + full_tensor_list = [paddle.empty_like(t1) for _ in test_data_list] + task = dist.stream.alltoall(full_tensor_list, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + result_tensor_list = np.vstack(full_tensor_list) + if rank == 0: + assert np.allclose(result_tensor_list, + result1, + rtol=1e-05, + atol=1e-05) + else: + assert np.allclose(result_tensor_list, + result2, + rtol=1e-05, + atol=1e-05) + + # case 3: pass a pre-sized tensor + out_tensor = paddle.empty_like(tensor) + task = dist.stream.alltoall(out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(out_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(out_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + 
StreamAllToAllTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py new file mode 100644 index 0000000000000..9bdfe124b0b49 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllToAllSingleTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + nranks = len(test_data_list) + data1 = paddle.to_tensor(test_data_list[0]) + data2 = paddle.to_tensor(test_data_list[1]) + result1 = np.vstack( + (data1[0:data1.shape[0] // 2, :], data2[0:data2.shape[0] // 2, :])) + result2 = np.vstack( + (data1[data1.shape[0] // 2:, :], data2[data2.shape[0] // 2:, :])) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + out_tensor = paddle.empty_like(tensor) + task = dist.stream.alltoall_single( + out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(out_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(out_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamAllToAllSingleTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py new file mode 100644 index 0000000000000..487dfd6ae6894 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamBroadcastTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + src_rank = 1 + result = test_collective_base.create_test_data( + shape=self._shape, dtype=self._dtype, seed=self._seeds[src_rank]) + tensor = paddle.to_tensor(result) + task = dist.stream.broadcast(tensor, + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamBroadcastTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py new file mode 100644 index 0000000000000..a487eac566ab5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py @@ -0,0 +1,66 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
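For reference, a minimal dygraph sketch of how the stream broadcast API exercised by the test above is driven; it assumes two processes started via paddle.distributed.launch, and the tensor values are illustrative only:

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    # every rank holds a buffer; rank 1 acts as the source, mirroring the test above
    data = paddle.to_tensor([1.0, 2.0, 3.0]) * (dist.get_rank() + 1)
    task = dist.stream.broadcast(data, src=1, sync_op=False, use_calc_stream=False)
    task.wait()  # after wait(), every rank holds rank 1's buffer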
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamReduceTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + task = dist.stream.reduce(tensor, + dst=1, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + result = sum(test_data_list) + if rank == 1: + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(tensor, + test_data_list[rank], + rtol=1e-05, + atol=1e-05) + + +if __name__ == "__main__": + StreamReduceTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py new file mode 100644 index 0000000000000..8f66d67e0d58c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
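Along the same lines, a minimal sketch of the stream reduce path covered by the test above (two ranks assumed; the destination rank and values are illustrative):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    data = paddle.to_tensor([1.0, 2.0, 3.0]) * (dist.get_rank() + 1)
    # the reduced sum lands on rank 1; the other rank keeps its local buffer
    task = dist.stream.reduce(data, dst=1, sync_op=False, use_calc_stream=False)
    task.wait()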
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamReduceScatterTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + reduce_result = sum(test_data_list) + result1 = reduce_result[0:reduce_result.shape[0] // 2] + result2 = reduce_result[reduce_result.shape[0] // 2:] + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + # case 1: pass a pre-sized tensor list + t1, t2 = paddle.split(tensor, 2, axis=0) + result_tensor = paddle.empty_like(t1) + task = dist.stream.reduce_scatter(result_tensor, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + # case 2: pass a pre-sized tensor + result_tensor = paddle.empty_like(t1) + task = dist.stream.reduce_scatter(result_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + # case 3: test the legacy API + result_tensor = paddle.empty_like(t1) + task = dist.stream._reduce_scatter_base( + result_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamReduceScatterTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py new file mode 100644 index 0000000000000..6060e5050ca09 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
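The reduce_scatter test above exercises three call shapes; a condensed sketch under the same two-rank assumption (keyword defaults are assumed to match the tests, and sync_op=True avoids an explicit wait):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    full = paddle.arange(4, dtype="float32") + dist.get_rank()
    t1, t2 = paddle.split(full, 2, axis=0)
    out = paddle.empty_like(t1)

    dist.stream.reduce_scatter(out, [t1, t2], sync_op=True)    # list of shards
    dist.stream.reduce_scatter(out, full, sync_op=True)        # single concatenated tensor
    dist.stream._reduce_scatter_base(out, full, sync_op=True)  # legacy flattened form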
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamScatterTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + src_rank = 1 + src_data = test_data_list[src_rank] + result1 = src_data[0:src_data.shape[0] // 2] + result2 = src_data[src_data.shape[0] // 2:] + + rank = dist.get_rank() + + # case 1: pass a pre-sized tensor list + tensor = paddle.to_tensor(test_data_list[rank]) + t1, t2 = paddle.split(tensor, 2, axis=0) + task = dist.stream.scatter(t1, [t1, t2], + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == src_rank: + assert np.allclose(t1, result2, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(t1, result1, rtol=1e-05, atol=1e-05) + + # case 2: pass a pre-sized tensor + tensor = paddle.to_tensor(src_data) + t1 = paddle.empty_like(t1) + task = dist.stream.scatter(t1, + tensor, + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == src_rank: + assert np.allclose(t1, result2, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(t1, result1, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamScatterTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py index 175e24c3d0d86..c22e734adf2fb 100644 --- a/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py @@ -45,22 +45,25 @@ def run_test_case(self): dtype=self._dtype, seed=seed)) + src_rank = 0 + dst_rank = 1 + rank = dist.get_rank() tensor = paddle.to_tensor(test_data_list[rank]) if rank == 0: task = dist.stream.send(tensor, - dst=1, + dst=dst_rank, sync_op=self._sync_op, use_calc_stream=self._use_calc_stream) else: task = dist.stream.recv(tensor, - src=0, + src=src_rank, sync_op=self._sync_op, use_calc_stream=self._use_calc_stream) if not self._sync_op: task.wait() - result = test_data_list[0] + result = test_data_list[src_rank] assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py new file mode 100644 index 0000000000000..4fa55d86840bc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllToAllAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllToAllAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_alltoall_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_alltoall_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllToAllAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py new file mode 100644 index 0000000000000..f1f099b9571f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
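A compact sketch combining the scatter and send/recv paths added above (two ranks; rank 1 scatters its shards and rank 0 sends; shapes and values are illustrative):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    rank = dist.get_rank()

    # scatter: the source rank's two shards are handed out, one per rank
    buf = paddle.arange(4, dtype="float32") + rank
    shard0, shard1 = paddle.split(buf, 2, axis=0)
    dist.stream.scatter(shard0, [shard0, shard1], src=1, sync_op=True)

    # point-to-point: rank 0 sends its buffer, rank 1 receives into its own
    if rank == 0:
        dist.stream.send(buf, dst=1, sync_op=True)
    else:
        dist.stream.recv(buf, src=0, sync_op=True)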
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllToAllSingleAPI( + test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllToAllSingleAPI, + self).setUp(num_of_devices=2, timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_alltoall_single_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case( + "communication_stream_alltoall_single_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllToAllSingleAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py new file mode 100644 index 0000000000000..07537a480e851 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamBroadcastAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamBroadcastAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_broadcast_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_broadcast_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamBroadcastAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py new file mode 100644 index 0000000000000..c8a04c8d893e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamReduceAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamReduceAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_reduce_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamReduceAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py new file mode 100644 index 0000000000000..a90e634860d95 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
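Each driver above sweeps the Cartesian product of the changeable settings and skips the one unsupported combination; a trimmed sketch of that loop, where the script name is a placeholder and the test utilities are assumed to be importable from the test directory:

    import test_communication_api_base as test_base

    default_envs = {"backend": "nccl", "shape": "(100, 200)", "dtype": "float32"}
    changeable_envs = {"sync_op": ["True", "False"],
                       "use_calc_stream": ["True", "False"]}

    for envs in test_base.gen_product_envs_list(default_envs, changeable_envs):
        # use_calc_stream=True is only valid when the op is also a sync op
        if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]):
            continue
        print("would launch communication_stream_<op>_api_dygraph.py with", envs)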
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamReduceScatterAPI( + test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamReduceScatterAPI, + self).setUp(num_of_devices=2, timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_scatter_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case( + "communication_stream_reduce_scatter_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamReduceScatterAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py new file mode 100644 index 0000000000000..d96d931f43fbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamScatterAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamScatterAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_scatter_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamScatterAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index c6c7c13937f38..60a2c4af239ed 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -34,6 +34,12 @@ test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_ test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_communication_stream_allgather_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_alltoall_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_alltoall_single_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_broadcast_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_reduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_reduce_scatter_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_scatter_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., From 868b1e1c59bcc052af91c53ee55603dbcbadf102 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 10 Oct 2022 11:32:01 +0800 Subject: [PATCH 4/6] Move group and all reduce from collective to communication (#45848) --- .../collective/ProcessGroupGloo.cc | 8 + .../distributed/collective/ProcessGroupGloo.h | 6 + python/paddle/distributed/collective.py | 173 ++---------------- .../distributed/communication/all_reduce.py | 87 +++++++++ .../paddle/distributed/communication/group.py | 94 ++++++++++ .../{comm_utils.py => reduce.py} | 28 ++- .../communication/stream/all_reduce.py | 48 +++-- .../paddle/distributed/fleet/base/topology.py | 4 +- .../distributed/fleet/layers/mpu/mp_ops.py | 2 +- python/paddle/distributed/parallel.py | 9 +- .../distributed/models/moe/moe_layer.py | 1 - 11 files changed, 276 insertions(+), 184 
deletions(-) create mode 100644 python/paddle/distributed/communication/all_reduce.py create mode 100644 python/paddle/distributed/communication/group.py rename python/paddle/distributed/communication/{comm_utils.py => reduce.py} (59%) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index b23942b114f3b..097c9799b70f2 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -293,6 +293,14 @@ std::shared_ptr ProcessGroupGloo::AllReduce( std::vector& inputs, std::vector& outputs, const AllreduceOptions& opts) { + return AllReduce(inputs, outputs, opts, true); +} + +std::shared_ptr ProcessGroupGloo::AllReduce( + std::vector& inputs, + std::vector& outputs, + const AllreduceOptions& opts, + bool sync_op) { auto tag = next_tag(); std::shared_ptr task; auto context = get_context(); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 95ce18c1d8217..d911da91eb1a3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -120,6 +120,12 @@ class ProcessGroupGloo : public ProcessGroup { std::vector& outputs, const AllreduceOptions& opts = AllreduceOptions()) override; + std::shared_ptr AllReduce( + std::vector& inputs, + std::vector& outputs, + const AllreduceOptions& opts, + bool sync_op) override; + std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index e1ee362cadfd7..2af34d942e0b1 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -52,54 +52,12 @@ from .fleet.layers.mpu.mp_ops import _linear from .fleet.layers.mpu.mp_ops import _parallel_linear from .fleet.layers.mpu.mp_ops import _parallel_embedding -from .communication.comm_utils import ReduceOp +from .communication.group import Group, _add_new_group +from .communication.all_reduce import all_reduce +from .communication.reduce import _get_reduce_op, ReduceOp __all__ = [] - -class Group(): - """ - The abstract representation of group. 
- """ - - def __init__(self, rank, rank_num, id=0, ranks=[], pg=None, name=None): - self.rank = rank - self.nranks = rank_num - self.id = id - self.ranks = ranks - self.pg = pg - self.name = name - - def is_member(self): - if self.rank < 0: - return False - if self.nranks < 2: - return False - return True - - def get_group_rank(self, rank): - if self.is_member() and rank in self.ranks: - return self.ranks.index(rank) - else: - return -1 - - @property - def process_group(self): - return self.pg - - @property - def world_size(self): - return self.nranks if self.rank >= 0 else -1 - - def __repr__(self): - debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( - self.rank, self.nranks, self.id) - debug_str += ", ".join(map(str, self.ranks)) - debug_str += "; name: " - debug_str += self.name if self.name else "None" - return debug_str - - _global_env = None @@ -147,9 +105,8 @@ def _get_group_map(): global _group_map if _global_env_gid not in _group_map: genv = _get_global_env() - _group_map[_global_env_gid] = Group(genv.rank, - genv.world_size, - ranks=list(range(genv.world_size))) + _group_map[_global_env_gid] = Group(genv.rank, 0, + list(range(genv.world_size))) return _group_map @@ -197,19 +154,6 @@ def _new_ring_id(): return len(_get_group_map()) + max(_get_global_env().nrings, 9) -def _get_reduce_op(reduce_op, func_name): - if reduce_op == ReduceOp.SUM: - return core.ReduceOp.SUM - elif reduce_op == ReduceOp.MAX: - return core.ReduceOp.MAX - elif reduce_op == ReduceOp.MIN: - return core.ReduceOp.MIN - elif reduce_op == ReduceOp.PROD: - return core.ReduceOp.PRODUCT - else: - raise ValueError("Unknown reduce_op type for {}.".format(func_name)) - - def get_group(id=0): """ @@ -451,10 +395,13 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): else: rank = -1 pg = None - group = Group(rank, size, id=gid, ranks=ranks, pg=pg, name=group_name) + group = Group(rank, gid, ranks, pg=pg, name=group_name) _group_map_by_name[group_name] = group _group_map[gid] = group _group_map_backend[group] = backend + #TODO: The method below is a new method for group management, will replace the previous + # three in the future. + _add_new_group(group) # TODO(shenliang03): This is a temporary solution to solve the problem of # hang caused by tcp @@ -476,13 +423,13 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): ring_id = _new_ring_id() if global_rank not in ranks: - gp = Group(-1, -1, ring_id, ranks) + gp = Group(-1, ring_id, ranks) _group_map[ring_id] = gp else: ranks = sorted(ranks) group_rank = ranks.index(global_rank) group_size = len(ranks) - gp = Group(group_rank, group_size, ring_id, ranks) + gp = Group(group_rank, ring_id, ranks) _group_map[ring_id] = gp if group_size >= 2: @@ -748,104 +695,6 @@ def broadcast(tensor, src, group=None, sync_op=True): }) -def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True): - """ - - Reduce a tensor over all ranks so that all get the result. - As shown below, one process is started with a GPU and the data of this process is represented - by its group rank. The reduce operator is sum. Through all_reduce operator, - each GPU will have the sum of the data from all GPUs. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png - :width: 800 - :alt: all_reduce - :align: center - - Args: - tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. 
- op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. - group (Group, optional): The group instance return by new_group or None for global default group. - sync_op (bool, optional): Wether this op is a sync op. Default value is True. - - Returns: - None. - - Examples: - .. code-block:: python - - # required: distributed - import paddle - import paddle.distributed as dist - - dist.init_parallel_env() - if dist.get_rank() == 0: - data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) - else: - data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) - dist.all_reduce(data) - print(data) - # [[5, 7, 9], [5, 7, 9]] (2 GPUs) - """ - if group is not None and not group.is_member(): - return - - if in_dygraph_mode(): - op_type = _get_reduce_op(op, "all_reduce") - group = _get_default_group() if group is None else group - task = group.process_group.allreduce(tensor, op_type) - if sync_op: - task.wait() - return None - else: - return task - - use_calc_stream = sync_op - ring_id = 0 if group is None else group.id - if _non_static_mode(): - if op == ReduceOp.SUM: - return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.MAX: - return _legacy_C_ops.c_allreduce_max_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.MIN: - return _legacy_C_ops.c_allreduce_min_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.PROD: - return _legacy_C_ops.c_allreduce_prod_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - else: - raise ValueError("Unknown parameter: {}.".format(op)) - - check_variable_and_dtype(tensor, 'tensor', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' - ], 'all_reduce') - if op == ReduceOp.SUM: - op_type = 'c_allreduce_sum' - elif op == ReduceOp.MAX: - op_type = 'c_allreduce_max' - elif op == ReduceOp.MIN: - op_type = 'c_allreduce_min' - elif op == ReduceOp.PROD: - op_type = 'c_allreduce_prod' - if not isinstance(ring_id, int): - raise ValueError("The type of 'ring_id' for all_reduce should be int.") - helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream - }) - - def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): """ diff --git a/python/paddle/distributed/communication/all_reduce.py b/python/paddle/distributed/communication/all_reduce.py new file mode 100644 index 0000000000000..737e0cbbfb56c --- /dev/null +++ b/python/paddle/distributed/communication/all_reduce.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
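The relocated helper defined below keeps the public signature; in eager mode it simply forwards to the stream API, as the comment in this sketch notes (two ranks assumed, values illustrative):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
    # in eager mode this forwards to the stream API, roughly:
    #   dist.stream.all_reduce(data, op=dist.ReduceOp.SUM, sync_op=True, use_calc_stream=False)
    dist.all_reduce(data)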
+ +import paddle +import paddle.fluid.framework as framework +from paddle.distributed.communication import stream as stream +from paddle.distributed.communication.reduce import ReduceOp + + +def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True): + """ + + Reduce a tensor over all ranks so that all get the result. + As shown below, one process is started with a GPU and the data of this process is represented + by its group rank. The reduce operator is sum. Through all_reduce operator, + each GPU will have the sum of the data from all GPUs. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png + :width: 800 + :alt: all_reduce + :align: center + + Args: + tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type + should be float16, float32, float64, int32, int64, int8, uint8 or bool. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. + group (Group, optional): The group instance return by new_group or None for global default group. + sync_op (bool, optional): Wether this op is a sync op. Default value is True. + + Returns: + Return a task object. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + dist.all_reduce(data) + print(data) + # [[5, 7, 9], [5, 7, 9]] (2 GPUs) + """ + if not framework._in_legacy_dygraph(): + return stream.all_reduce(tensor, + op=op, + group=group, + sync_op=sync_op, + use_calc_stream=False) + + # code below will be removed after we remove the old dygraph + use_calc_stream = sync_op + ring_id = 0 if group is None else group.id + if op == ReduceOp.SUM: + return paddle._legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.MAX: + return paddle._legacy_C_ops.c_allreduce_max_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.MIN: + return paddle._legacy_C_ops.c_allreduce_min_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.PROD: + return paddle._legacy_C_ops.c_allreduce_prod_(tensor, 'use_calc_stream', + use_calc_stream, + 'ring_id', ring_id) + else: + raise ValueError("Unknown parameter: {}.".format(op)) diff --git a/python/paddle/distributed/communication/group.py b/python/paddle/distributed/communication/group.py new file mode 100644 index 0000000000000..6b4e545b245d1 --- /dev/null +++ b/python/paddle/distributed/communication/group.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Group(): + """ + The abstract representation of group. 
+ """ + + def __init__(self, rank_in_group, id, ranks, pg=None, name=None): + self._rank_in_group = rank_in_group + self._world_size = len(ranks) if rank_in_group >= 0 else -1 + self._id = id + self._ranks = ranks + self._pg = pg + self._name = name + + @property + def rank(self): + return self._rank_in_group + + @property + def ranks(self): + return self._ranks + + @property + def nranks(self): + return len(self._ranks) + + @property + def name(self): + return self._name + + @property + def process_group(self): + return self._pg + + @property + def world_size(self): + return self._world_size + + @property + def id(self): + return self._id + + def is_member(self): + if self.rank < 0: + return False + if self.nranks < 2: + return False + return True + + def get_group_rank(self, rank): + if self.is_member(): + return self.ranks.index(rank) + else: + return -1 + + def __repr__(self): + debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( + self.rank, self.nranks, self.id) + debug_str += ", ".join(map(str, self.ranks)) + debug_str += "; name: " + debug_str += self.name if self.name else "None" + return debug_str + + +class _GroupManager(): + global_group_id = 0 + group_map_by_id = {} + + +def _get_global_group(): + if _GroupManager.global_group_id not in _GroupManager.group_map_by_id: + raise RuntimeError("The global group is not initialized.") + return _GroupManager.group_map_by_id[_GroupManager.global_group_id] + + +def _add_new_group(group): + if group.id in _GroupManager.group_map_by_id: + raise RuntimeError("The group with id {} already exist.".format( + group.id)) + _GroupManager.group_map_by_id[group.id] = group diff --git a/python/paddle/distributed/communication/comm_utils.py b/python/paddle/distributed/communication/reduce.py similarity index 59% rename from python/paddle/distributed/communication/comm_utils.py rename to python/paddle/distributed/communication/reduce.py index 62e1bcb4cca94..5caa5bebedfd8 100644 --- a/python/paddle/distributed/communication/comm_utils.py +++ b/python/paddle/distributed/communication/reduce.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle.fluid.framework as framework +import paddle.fluid.core as core + class ReduceOp: """ @@ -48,3 +51,26 @@ class ReduceOp: MIN = 2 PROD = 3 AVG = 4 + + +def _get_reduce_op(reduce_op, func_name): + if framework.in_dygraph_mode(): + if reduce_op == ReduceOp.SUM: + return core.ReduceOp.SUM + elif reduce_op == ReduceOp.MAX: + return core.ReduceOp.MAX + elif reduce_op == ReduceOp.MIN: + return core.ReduceOp.MIN + elif reduce_op == ReduceOp.PROD: + return core.ReduceOp.PRODUCT + else: + if reduce_op == ReduceOp.SUM: + return 'c_allreduce_sum' + elif reduce_op == ReduceOp.MAX: + return 'c_allreduce_max' + elif reduce_op == ReduceOp.MIN: + return 'c_allreduce_min' + elif reduce_op == ReduceOp.PROD: + return 'c_allreduce_prod' + + raise ValueError("Unknown reduce_op type for {}.".format(func_name)) diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 67fc4c8b63a0c..0ba161a078ab8 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -12,13 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.distributed.collective as collective import paddle.fluid.framework as framework +import paddle.fluid.data_feeder as data_feeder +import paddle.fluid.layer_helper as layer_helper +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp +from paddle.distributed.communication.group import _get_global_group def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): - op_type = collective._get_reduce_op(op, "all_reduce") - group = collective._get_default_group() if group is None else group + op_type = _get_reduce_op(op, "all_reduce") + + group = _get_global_group() if group is None else group if use_calc_stream: return group.process_group.allreduce_on_calc_stream(tensor, op_type) @@ -29,8 +33,34 @@ def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): return task +def _all_reduce_in_static_mode(tensor, op, group, sync_op, use_calc_stream): + data_feeder.check_variable_and_dtype(tensor, 'tensor', [ + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' + ], 'all_reduce') + + op_type = _get_reduce_op(op, "all_reduce") + ring_id = 0 if group is None else group.id + + if not isinstance(ring_id, int): + raise ValueError("The type of 'ring_id' for all_reduce should be int.") + + # TODO: Support task and use task.wait in static mode + # Use use_calc_stream rather than sync_op + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op(type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': sync_op + }) + + return None + + def all_reduce(tensor, - op=collective.ReduceOp.SUM, + op=ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False): @@ -50,9 +80,6 @@ def all_reduce(tensor, Returns: Return a task object. - Warning: - This API only supports the dygraph mode now. - Examples: .. code-block:: python @@ -84,7 +111,6 @@ def all_reduce(tensor, if framework.in_dygraph_mode(): return _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream) - - raise RuntimeError( - "paddle.distributed.stream.all_reduce is only supported in dygraph mode now." 
- ) + else: + return _all_reduce_in_static_mode(tensor, op, group, sync_op, + use_calc_stream) diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index bbaca8951205b..305452e99f380 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -378,8 +378,8 @@ def __init__(self): def set_comm_group(self, group_name, group_rank, group_size, ring_id, group_ranks): - group = paddle.distributed.collective.Group(group_rank, group_size, - ring_id, group_ranks) + group = paddle.distributed.collective.Group(group_rank, ring_id, + group_ranks) self.groups[group_name] = group def get_group(self, group_name): diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py index dc4dc05c7ba41..a2f3bde6cfc64 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -22,7 +22,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.dygraph import layers from paddle.distributed import collective -from ....communication.comm_utils import ReduceOp +from ....communication.reduce import ReduceOp from paddle.fluid.data_feeder import check_dtype import paddle.fluid.dygraph_utils as dygraph_utils diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 6cda451a266e8..507a765d0c550 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -43,6 +43,7 @@ from paddle.distributed.collective import _new_process_group_impl from paddle.distributed.collective import Group from paddle.distributed.collective import _set_group_map_backend +from paddle.distributed.communication.group import _add_new_group __all__ = [] @@ -258,15 +259,11 @@ def train(): _default_group_name, pg_options=None) ranks = list(range(world_size)) - group = Group(rank, - world_size, - id=0, - ranks=ranks, - pg=pg, - name=_default_group_name) + group = Group(rank, 0, ranks, pg=pg, name=_default_group_name) _set_group_map_by_name(_default_group_name, group) _set_group_map(0, group) _set_group_map_backend(group, backend) + _add_new_group(group) parallel_helper._set_parallel_ctx(True) paddle.distributed.barrier(group=group) diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index e74a712f09682..0a0fe32a8e918 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -265,7 +265,6 @@ class MoELayer(nn.Layer): from paddle.distributed import fleet moe_group = Group(fleet.worker_index(), - fleet.worker_num(), 0, list(range(fleet.worker_num()))) mp_group = None From ae3d48deb6ab6052a2dd9624e3aec99bbf85c5f7 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Tue, 11 Oct 2022 19:08:57 +0800 Subject: [PATCH 5/6] Completes bfloat16 dtype for collective api in eager mode (#45844) --- .../collective/ProcessGroupGloo.cc | 3 + .../collective/ProcessGroupNCCL.cc | 3 + .../fluid/platform/device/gpu/nccl_helper.h | 4 +- python/paddle/distributed/collective.py | 37 +-- .../tests/unittests/collective/CMakeLists.txt | 18 +- .../collective_allgather_api_dygraph.py | 15 +- .../collective_allreduce_api_dygraph.py | 13 +- .../collective_alltoall_api_dygraph.py | 29 ++- .../collective_alltoall_single_api_dygraph.py | 16 +- 
.../collective_broadcast_api_dygraph.py | 13 +- .../collective_isend_irecv_api_dygraph.py | 23 +- .../collective_reduce_api_dygraph.py | 13 +- .../collective_reduce_scatter_api_dygraph.py | 16 +- .../collective_scatter_api_dygraph.py | 29 ++- .../collective_sendrecv_api_dygraph.py | 33 +-- .../test_collective_allgather_api.py | 244 +++--------------- .../test_collective_allreduce_api.py | 12 +- .../test_collective_alltoall_api.py | 8 +- .../test_collective_alltoall_single_api.py | 8 +- .../test_collective_broadcast_api.py | 12 +- .../test_collective_isend_irecv_api.py | 8 +- .../collective/test_collective_reduce_api.py | 12 +- .../test_collective_reduce_scatter_api.py | 8 +- .../collective/test_collective_scatter_api.py | 12 +- .../test_collective_sendrecv_api.py | 8 +- .../tests/unittests/collective/testslist.csv | 18 +- .../unittests/test_collective_api_base.py | 47 +++- 27 files changed, 318 insertions(+), 344 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 097c9799b70f2..07065ac908e4e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -88,6 +88,9 @@ namespace distributed { case experimental::DataType::BOOL: \ func(args); \ break; \ + case experimental::DataType::BFLOAT16: \ + func(args); \ + break; \ default: \ VLOG(0) << "Error: Unknown DataType."; \ exit(-1); \ diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index bc5eb4885ee52..2e18dfcc3ba12 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -1030,6 +1030,9 @@ void* GetPointerByOffset(void* raw_pointer, } else if (type == experimental::DataType::BOOL) { return reinterpret_cast(reinterpret_cast(raw_pointer) + offset); + } else if (type == experimental::DataType::BFLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index a5d89f6001fa1..5d89da86efa6c 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -59,7 +59,7 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclUint8; } else if (type == framework::proto::VarType::BOOL) { return ncclUint8; -#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 } else if (type == framework::proto::VarType::BF16) { return ncclBfloat16; #endif @@ -86,7 +86,7 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) { return ncclInt8; } else if (type == experimental::DataType::BOOL) { return ncclUint8; -#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 } else if (type == experimental::DataType::BFLOAT16) { return ncclBfloat16; #endif diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 2af34d942e0b1..95b63cb0518a5 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -478,7 +478,8 @@ def is_initialized(): Check whether the distributed environment has been initialized - Returns (bool): `True` if distributed environment has been initialized, otherwise 
`False`. + Returns: + `True` if distributed environment has been initialized, otherwise `False`. Examples: .. code-block:: python @@ -626,7 +627,7 @@ def broadcast(tensor, src, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send if current rank is the source, or the Tensor to receive otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -709,7 +710,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. @@ -817,7 +818,7 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True): Args: tensor_list (list): A list of output Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. + should be float16, float32, float64, int32, int64, int8, uint8, bool, bfloat16, complex64 or complex128. tensor (Tensor): The Tensor to send. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. group (Group, optional): The group instance return by new_group or None for global default group. @@ -999,9 +1000,9 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. Default value is None. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None. src (int): The source rank id. Default value is 0. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1096,7 +1097,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True): Args: in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor_list (list): A list of output Tensors. The data type of its elements should be the same as the data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1197,7 +1198,7 @@ def alltoall_single(in_tensor, ``alltoall_single`` is only supported in eager mode. 
Args: - in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor. in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. @@ -1286,7 +1287,7 @@ def send(tensor, dst=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1352,7 +1353,7 @@ def recv(tensor, src=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1435,7 +1436,7 @@ def isend(tensor, dst, group=None): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1485,7 +1486,7 @@ def irecv(tensor, src=None, group=None): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1594,7 +1595,7 @@ def batch_isend_irecv(p2p_op_list): corresponding tasks. NCCL are currently supported. Args: - p2p_op_list: A list of point-to-point operations(type of each operator is + p2p_op_list (List[P2POp]): A list of point-to-point operations(type of each operator is ``paddle.distributed.P2POp``). The order of the isend/irecv in the list matters and it needs to match with corresponding isend/irecv on the remote end. @@ -1668,9 +1669,9 @@ def reduce_scatter(tensor, Reduces, then scatters a list of tensors to all processes in a group Args: - tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. 
+ should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1736,9 +1737,9 @@ def _reduce_scatter_base(output, Reduces, then scatters a flattened tensor to all processes in a group. Args: - output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. - input (Tensor): Input tensor that is of size output tensor size times world size. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. + input (Tensor): Input tensor that is of size output tensor size times world size. Its data type + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index a9db1e0bc7eec..69cfef8e58fba 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -71,14 +71,14 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_allreduce_api MODULES test_collective_allreduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_collective_alltoall_api MODULES test_collective_alltoall_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -98,7 +98,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_alltoall_single_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_single_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -125,7 +125,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_broadcast_api MODULES test_collective_broadcast_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_broadcast_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -154,7 +154,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_isend_irecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() 
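For reference, a minimal sketch of the blocking send/recv pattern that the isend/irecv and sendrecv API tests configured in this CMake file exercise, based on the ``send``/``recv`` signatures documented earlier in this patch; the tensor shape, dtype, rank layout and launch command below are illustrative assumptions, not taken from the tests themselves:

.. code-block:: python

    # two-rank sketch (assumed setup): rank 0 sends a small float32 tensor, rank 1 receives it
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    if dist.get_rank() == 0:
        data = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
        dist.send(data, dst=1, sync_op=True)   # sync_op=True blocks until the send has finished
    else:
        data = paddle.zeros([2, 2], dtype="float32")
        dist.recv(data, src=0, sync_op=True)   # sync_op=True blocks until the tensor has arrived
    print(data.numpy())

    # assumed launch command, e.g.:
    #   python -m paddle.distributed.launch --gpus "0,1" send_recv_demo.py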
if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -187,7 +187,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_api MODULES test_collective_reduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -207,7 +207,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -221,7 +221,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_scatter_api MODULES test_collective_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -235,7 +235,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_sendrecv_api MODULES test_collective_sendrecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_sendrecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index d485fd23d9571..38e1cc555da75 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,10 +28,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) tensor_list = [] - paddle.distributed.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_gather(tensor_list, tindata) + return [ + tensor.cast("float32").numpy() for tensor in tensor_list + ] + else: + tindata = paddle.to_tensor(indata) + dist.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 83588d450a7c9..92cc5a1623d9f 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import 
print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.all_reduce(tindata) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_reduce(tindata) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index fcabaffd614d0..da6c5ec1b3ad2 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -25,30 +25,31 @@ from six import string_types import math import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -import paddle.fluid.unique_name as nameGen -from paddle.fluid import core -import unittest -from multiprocessing import Process -import paddle.fluid.layers as layers -from functools import reduce -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveAllToAllAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - paddle.distributed.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.cast("float32").numpy() for data in toutdata] + else: + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": - runtime_main(TestCollectiveAllToAllAPI, "alltoall") + test_base.runtime_main(TestCollectiveAllToAllAPI, "alltoall") diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index 5fac73989a606..1100a4a481b5a 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -26,10 +27,17 @@ def __init__(self): def get_model(self, main_prog, 
startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - paddle.distributed.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") + dist.alltoall_single(tindata, toutdata) + return [toutdata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + dist.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index 29f0b74bb405b..de80e3b99a2d5 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.broadcast(tindata, src=1) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.broadcast(tindata, src=1) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 70437216a8f85..da3d4c064300c 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,13 +28,23 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - task = paddle.distributed.isend(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.cast("float32").numpy()] else: - task = paddle.distributed.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return 
[tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index 257fc27ceee9f..6e2b1e86bcd14 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.reduce(tindata, dst=0) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.reduce(tindata, dst=0) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index 1b0eb6aef9d47..c5b5756ac74ba 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,10 +28,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index f37f5653806ec..255b4e7e0ac59 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,15 +28,27 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with 
fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - paddle.distributed.scatter(subdata1, src=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.cast("float32").numpy()] else: - paddle.distributed.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index 8508c3d043c93..f4ae715a349fc 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -25,31 +25,34 @@ from six import string_types import math import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -import paddle.fluid.unique_name as nameGen -from paddle.fluid import core -import unittest -from multiprocessing import Process -import paddle.fluid.layers as layers -from functools import reduce -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveSendRecvAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - paddle.distributed.send(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.cast("float32").numpy()] else: - paddle.distributed.recv(tindata, src=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": - runtime_main(TestCollectiveSendRecvAPI, "sendrecv") + test_base.runtime_main(TestCollectiveSendRecvAPI, "sendrecv") diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index a01a96a0d6b29..78ecf0816b67f 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -28,213 +28,55 @@ def _setup_config(self): pass def test_allgather_nccl(self): - self.check_with_place("collective_allgather_api.py", - 
"allgather", - "nccl", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype=dtype) def test_allgather_gloo(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype=dtype) def test_allgatther_nccl_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - 
dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype=dtype) def test_allgather_gloo_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index 2598606fc9cc2..65754989d3f7e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -43,9 +43,11 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", @@ -55,8 +57,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: 
self.check_with_place("collective_allreduce_api_dygraph.py", @@ -67,5 +69,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index e079e99efebf5..35e3bf323964d 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -32,9 +32,11 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", @@ -43,5 +45,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index fb1e5e9da22ef..23d2a998d8e32 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", @@ -35,5 +37,5 @@ def test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index 2d21be144a68b..e08930cefe9ca 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -37,9 +37,11 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", @@ -49,8 +51,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -61,5 +63,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index f9613abc24063..28e502821aa52 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", @@ -35,5 +37,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index 2fa84ea2ed7f1..cc6093a3f431c 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -40,9 +40,11 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", @@ -52,8 +54,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -64,5 +66,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index 1d25527407f45..283f73020f749 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", @@ -35,5 +37,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index 4093b8ed69093..82ef4bd80e2d8 100644 --- 
a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -36,9 +36,11 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", @@ -48,8 +50,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -60,5 +62,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index 940d6ec709bf1..c2fc98ed18e38 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -34,9 +34,11 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", @@ -45,5 +47,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 60a2c4af239ed..2cf632a29d0ac 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -7,27 +7,27 @@ test_c_split,linux,gpu;rocm,120,DIST,test_runner.py,2,,PYTHONPATH=..;http_proxy= test_collective_split_embedding,linux,rocm;gpu,300,DIST,../dist_test.sh,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_collective_allgather_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_allgather_object_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_allreduce_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_allreduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_alltoall_single,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_single_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., 
+test_collective_alltoall_single_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_barrier_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_batch_isend_irecv,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_broadcast_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_broadcast_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_cpu_barrier_with_gloo,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_gather,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_scatter,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_isend_irecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_isend_irecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_optimizer,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_process_group,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce_scatter,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_scatter_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_scatter,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_scatter_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_sendrecv,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_sendrecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_sendrecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 21c9b172e9822..7d1a237951110 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -29,6 +29,7 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core +from paddle_bfloat import bfloat16 def 
create_bool_test_data(shape=None, seed=None): @@ -82,6 +83,9 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) + elif dtype == "bfloat16": + # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace + return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -174,6 +178,15 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() + # NOTE: this is a hack to get int format nccl version, like 2134 + # if current platform is not linux, version number will be 0 + nccl_version_str = subprocess.check_output( + r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", + stderr=subprocess.DEVNULL, + shell=True).decode('utf-8') + self._nccl_version = int("".join( + nccl_version_str.split("."))) if nccl_version_str else 0 + def tearDown(self): self.temp_dir.cleanup() @@ -312,6 +325,10 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid1) + # cast bfloat16 to float32 for numeric comparison + if dtype == "bfloat16": + input1 = input1.astype("float32") + input2 = input2.astype("float32") if col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) @@ -328,7 +345,13 @@ def check_with_place(self, np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05) + # bfloat16 precision loss comes from truncating the last 16 bits of float32, + # which sums (\sum_{i=-23}^{-8}2^{i}) to about 0.0078 + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol) elif col_type == "scatter": need_result = input2 need_result1 = need_result[0:need_result.shape[0] // 2] @@ -339,18 +362,28 @@ def check_with_place(self, need_result = input1 + input2 need_result1 = need_result[0:need_result.shape[0] // 2] need_result2 = need_result[need_result.shape[0] // 2:] - np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05) - np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05) + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol) + np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol) elif col_type == "allreduce": need_result = input1 + input2 + if dtype == "bfloat16": + rtol = 8e-03 + atol = 8e-03 + else: + rtol = 1e-05 + atol = 1e-05 np.testing.assert_allclose(tr0_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) np.testing.assert_allclose(tr1_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020) From 64de943d027ade87842d1d7de521504517106512 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Fri, 14 Oct 2022 20:02:38 +0800 Subject: [PATCH 6/6] Fix collective APIs cannot be recognized when building docs (#46962) --- python/paddle/distributed/__init__.py | 6 +++--- 
.../distributed/communication/stream/__init__.py | 6 +++--- .../communication_stream_reduce_scatter_api_dygraph.py | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 6c7b2fa732969..658d942bec025 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -74,7 +74,7 @@ "gloo_release", "QueueDataset", "split", "CountFilterEntry", "ShowClickEntry", "get_world_size", "get_group", "all_gather", "all_gather_object", "InMemoryDataset", "barrier", "all_reduce", "alltoall", - "send", "reduce", "recv", "ReduceOp", "wait", "get_rank", - "ProbabilityEntry", "ParallelMode", "is_initialized", "isend", "irecv", - "reduce_scatter" + "alltoall_single", "send", "reduce", "recv", "ReduceOp", "wait", "get_rank", + "ProbabilityEntry", "ParallelMode", "is_initialized", + "destroy_process_group", "isend", "irecv", "reduce_scatter", "stream" ] diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index a1844decf9478..43952ce5541a3 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -18,12 +18,12 @@ from .alltoall_single import alltoall_single from .broadcast import broadcast from .reduce import reduce -from .reduce_scatter import _reduce_scatter_base, reduce_scatter +from .reduce_scatter import reduce_scatter from .recv import recv from .scatter import scatter from .send import send __all__ = [ - "_reduce_scatter_base", "all_reduce", "alltoall", "alltoall_single", - "broadcast", "reduce", "reduce_scatter", "recv", "scatter", "send" + "all_gather", "all_reduce", "alltoall", "alltoall_single", "broadcast", + "reduce", "reduce_scatter", "recv", "scatter", "send" ] diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py index 8f66d67e0d58c..effaf1cb6c99a 100644 --- a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py @@ -17,6 +17,7 @@ import paddle import paddle.distributed as dist import test_collective_api_base as test_collective_base +from paddle.distributed.communication.stream.reduce_scatter import _reduce_scatter_base class StreamReduceScatterTestCase(): @@ -77,11 +78,10 @@ def run_test_case(self): # case 3: test the legacy API result_tensor = paddle.empty_like(t1) - task = dist.stream._reduce_scatter_base( - result_tensor, - tensor, - sync_op=self._sync_op, - use_calc_stream=self._use_calc_stream) + task = _reduce_scatter_base(result_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) if not self._sync_op: task.wait() if rank == 0: