From 6aed20dfb3fd7889dbf62239c80ecac02140bf89 Mon Sep 17 00:00:00 2001
From: Yulong Ao
Date: Sun, 7 Aug 2022 09:36:53 +0000
Subject: [PATCH 1/3] [Auto Parallel] Add ProcessMesh, DeviceMesh and
 DistributedMapper

---
 paddle/fluid/distributed/CMakeLists.txt       |   1 +
 .../distributed/auto_parallel/CMakeLists.txt  |  40 ++
 .../auto_parallel/auto_parallel.proto         | 173 ++++++++
 .../distributed/auto_parallel/device_mesh.cc  | 398 ++++++++++++++++++
 .../distributed/auto_parallel/device_mesh.h   | 273 ++++++++++++
 .../auto_parallel/device_mesh_test.cc         |  93 ++++
 .../distributed/auto_parallel/dist_mapper.cc  | 146 +++++++
 .../distributed/auto_parallel/dist_mapper.h   |  73 ++++
 .../auto_parallel/dist_mapper_test.cc         |  72 ++++
 .../distributed/auto_parallel/process_mesh.cc | 134 ++++++
 .../distributed/auto_parallel/process_mesh.h  |  94 +++++
 .../auto_parallel/process_mesh_test.cc        |  54 +++
 .../fluid/distributed/auto_parallel/utils.h   | 105 +++++
 13 files changed, 1656 insertions(+)
 create mode 100644 paddle/fluid/distributed/auto_parallel/CMakeLists.txt
 create mode 100644 paddle/fluid/distributed/auto_parallel/auto_parallel.proto
 create mode 100644 paddle/fluid/distributed/auto_parallel/device_mesh.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/device_mesh.h
 create mode 100644 paddle/fluid/distributed/auto_parallel/device_mesh_test.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper.h
 create mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh.h
 create mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh_test.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/utils.h

diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt
index 24e0a8c7a5d9f..b18ed421fcd78 100755
--- a/paddle/fluid/distributed/CMakeLists.txt
+++ b/paddle/fluid/distributed/CMakeLists.txt
@@ -47,3 +47,4 @@ add_subdirectory(ps)
 add_subdirectory(test)
 add_subdirectory(index_dataset)
 add_subdirectory(fleet_executor)
+add_subdirectory(auto_parallel)
diff --git a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
new file mode 100644
index 0000000000000..cfcc12515da91
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
@@ -0,0 +1,40 @@
+cc_library(
+  device_mesh
+  SRCS device_mesh.cc
+  DEPS auto_parallel_proto)
+cc_test(
+  device_mesh_test
+  SRCS device_mesh_test.cc
+  DEPS device_mesh)
+
+cc_library(
+  process_mesh
+  SRCS process_mesh.cc
+  DEPS auto_parallel_proto)
+cc_test(
+  process_mesh_test
+  SRCS process_mesh_test.cc
+  DEPS process_mesh)
+
+# cc_library(
+#   dist_attr
+#   SRCS dist_attr.cc
+#   DEPS process_mesh auto_parallel_proto proto_desc)
+# cc_test(
+#   dist_attr_test
+#   SRCS dist_attr_test.cc
+#   DEPS dist_attr)
+
+cc_library(
+  dist_mapper
+  SRCS dist_mapper.cc
+  DEPS device_mesh auto_parallel_proto)
+cc_test(
+  dist_mapper_test
+  SRCS dist_mapper_test.cc
+  DEPS dist_mapper)
+
+proto_library(auto_parallel_proto SRCS auto_parallel.proto)
+
+# cc_library(auto_parallel DEPS process_mesh device_mesh dist_attr dist_mapper
+# auto_parallel_proto)
diff --git a/paddle/fluid/distributed/auto_parallel/auto_parallel.proto b/paddle/fluid/distributed/auto_parallel/auto_parallel.proto
new file mode 100644
index 0000000000000..eae33e929a2d3
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/auto_parallel.proto
@@ -0,0 +1,173 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+
+package paddle.distributed.auto_parallel;
+
+// ProcessMesh is used to organize processes like an n-dimensional array.
+message ProcessMeshProto {
+  // The size of each dimension.
+  repeated int64 shape = 1;
+
+  // These process ids are stored in a row-major way.
+  // There are no duplicate process ids within one process mesh.
+  repeated int64 process_ids = 2;
+
+  // The name of each dimension.
+  repeated string dim_names = 3;
+
+}
+
+// // This distributed attribute describes how to distribute the corresponding tensor,
+// // and stores any other information needed by auto parallel.
+// message TensorDistAttrProto {
+//   // The process mesh where a tensor is distributed.
+//   optional ProcessMeshProto process_mesh = 1;
+//
+//   // The length of dims_mapping is the same as the length of the tensor shape.
+//   // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension
+//   // of the above process mesh. If dims_mapping[i] is -1, the i-th dimension of the tensor
+//   // will not be sharded. For example, given a tensor shape [2, 6, 12], a process mesh
+//   // shape [2, 3] and a dims_mapping [-1, 1, 0], each sharded tensor will have a shape [2, 2, 6].
+//   repeated int64 dims_mapping = 2;
+//
+//   // The batch dimension of the corresponding tensor.
+//   optional int64 batch_dim = 3;
+//
+//   // If the dynamic_dims[i] is True, the i-th dimension of the corresponding tensor
+//   // is dynamically changed. Otherwise, the i-th dimension of the tensor is statically determined.
+//   repeated bool dynamic_dims = 4;
+// }
+//
+// // This distributed attribute describes how to distribute the corresponding operator,
+// // and stores any other information needed by auto parallel.
+// message OperatorDistAttrProto {
+//   message TensorDistAttrMappingEntryProto {
+//     optional string name = 1;
+//     optional TensorDistAttrProto tensor_dist_attr = 2;
+//   }
+//   // The key of this map is the input tensor name and the value is the distributed attribute
+//   // of the input tensor required by this corresponding operator.
+//   // The distributed attribute of the actual tensor may not be the same as that within
+//   // the distributed attribute of the operator.
+//   repeated TensorDistAttrMappingEntryProto input_dist_attrs = 1;
+//
+//   // The key of this map is the output tensor name and the value is the distributed attribute
+//   // of the output tensor required by this corresponding operator.
+//   // The distributed attribute of the actual tensor may not be the same as that within
+//   // the distributed attribute of the operator.
+//   repeated TensorDistAttrMappingEntryProto output_dist_attrs = 2;
+//
+//   // The process mesh where an op is distributed.
+//   optional ProcessMeshProto process_mesh = 3;
+//
+//   // An operator ideally has a distributed operator which may have multiple distributed implementations.
+//   // This field is usually the same as the operator type. However, some operators such as the element-wise operators
+//   // may share the same distributed operator, so this field is used for this scenario.
+//   optional string impl_type = 4;
+//
+//   // This field tells which distributed implementations of this corresponding operator
+//   // will be selected for the actual computation.
+//   optional int64 impl_idx = 5;
+// }
+
+// This proto describes the capability of one device, such as its computation and memory.
+message DeviceCapabilityProto {
+  optional double single_precision_flops = 1;
+
+  optional double double_precision_flops = 2;
+
+  optional double memory_size_in_bytes = 3;
+
+  optional double clock_rate_in_ghz = 4;
+}
+
+// This proto represents a device.
+message DeviceProto {
+  // The global id of this device within the cluster.
+  optional int64 global_id = 1;
+
+  // The local id of this device within the machine.
+  optional int64 local_id = 2;
+
+  // The id of the machine that owns this device.
+  optional int64 machine_id = 3;
+
+  // The type of this device.
+  optional string type = 4;
+
+  // The capability of this device.
+  optional DeviceCapabilityProto capability = 5;
+}
+
+// This proto describes the capability of the link between two devices.
+message LinkCapabilityProto {
+  optional int64 bandwidth = 1;  // Bytes/s
+  optional int64 latency = 2;
+}
+
+message LinkProto {
+  // The global id of the source device.
+  optional int64 source_id = 1;
+
+  // The global id of the target device.
+  optional int64 target_id = 2;
+
+  // Represents the link type.
+  optional string type = 3;
+
+  // The capability of this link.
+  optional LinkCapabilityProto capability = 4;
+}
+
+// DeviceMesh is used to organize devices like an n-dimensional array.
+message DeviceMeshProto {
+  // The unique name of this mesh.
+  optional string name = 1;
+
+  // The size of each dimension.
+  repeated int64 shape = 2;
+
+  // These device ids are stored in a row-major way.
+  // There are no duplicate device ids within one device mesh.
+  repeated int64 device_ids = 3;
+
+  // The name of each dimension.
+  repeated string dim_names = 4;
+
+  // The devices of this mesh.
+  repeated DeviceProto devices = 5;
+
+  // The links between the devices of this mesh.
+  repeated LinkProto links = 6;
+}
+
+// Record the mapping between the logical processes and the physical devices.
+message DistributedMapperProto {
+  // The device meshes used by this distributed computation,
+  // which may be shared by multiple process meshes.
+  repeated DeviceMeshProto device_meshes = 1;
+
+  message MapperEntryProto {
+    optional int64 process_id = 1;
+    optional string device_mesh_name = 2;
+    repeated int64 device_ids = 3;
+  }
+
+  // The mapping from process ids to device ids.
+  // It is also possible for one process to use multiple devices.
+  // It is also possible for one device to be shared by multiple processes.
+  repeated MapperEntryProto process_id_to_device_ids = 2;
+}
diff --git a/paddle/fluid/distributed/auto_parallel/device_mesh.cc b/paddle/fluid/distributed/auto_parallel/device_mesh.cc
new file mode 100644
index 0000000000000..6bf26ad6f74e4
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/device_mesh.cc
@@ -0,0 +1,398 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <iterator>
+
+#include "paddle/fluid/distributed/auto_parallel/device_mesh.h"
+#include "paddle/fluid/distributed/auto_parallel/utils.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+std::string DeviceCapability::to_string() const {
+  std::string str;
+  str += "{sflops: " + to_string_with_precision(single_precision_flops) + ", ";
+  str += "dflops: " + to_string_with_precision(double_precision_flops) + ", ";
+  str += "memory: " + to_string_with_precision(memory_size_in_bytes) + ", ";
+  str += "rate: " + to_string_with_precision(clock_rate_in_ghz) + "}";
+  return str;
+}
+
+DeviceCapability DeviceCapability::from_proto(
+    const DeviceCapabilityProto &proto) {
+  DeviceCapability capability;
+  capability.single_precision_flops = proto.single_precision_flops();
+  capability.double_precision_flops = proto.double_precision_flops();
+  capability.memory_size_in_bytes = proto.memory_size_in_bytes();
+  capability.clock_rate_in_ghz = proto.clock_rate_in_ghz();
+  return capability;
+}
+
+DeviceCapabilityProto DeviceCapability::to_proto() const {
+  DeviceCapabilityProto proto;
+  proto.set_single_precision_flops(single_precision_flops);
+  proto.set_double_precision_flops(double_precision_flops);
+  proto.set_memory_size_in_bytes(memory_size_in_bytes);
+  proto.set_clock_rate_in_ghz(clock_rate_in_ghz);
+  return proto;
+}
+
+std::string Device::to_string() const {
+  std::string str = "{global_id: " + std::to_string(global_id_) + ", ";
+  str += "local_id: " + std::to_string(local_id_) + ", ";
+  str += "machine_id: " + std::to_string(machine_id_) + ", ";
+  str += "type: " + type_ + ", ";
+  str += "capability: " + capability_.to_string() + "}";
+  return str;
+}
+
+Device Device::from_proto(const DeviceProto &proto) {
+  Device device;
+  device.global_id_ = proto.global_id();
+  device.local_id_ = proto.local_id();
+  device.machine_id_ = proto.machine_id();
+  device.type_ = proto.type();
+  device.capability_ = DeviceCapability::from_proto(proto.capability());
+  return device;
+}
+
+DeviceProto Device::to_proto() const {
+  DeviceProto proto;
+  proto.set_global_id(global_id_);
+  proto.set_local_id(local_id_);
+  proto.set_machine_id(machine_id_);
+  proto.set_type(type_);
+  proto.mutable_capability()->CopyFrom(capability_.to_proto());
+  return proto;
+}
+
+bool operator==(const Device &lhs, const Device &rhs) {
+  if (lhs.global_id() != rhs.global_id()) {
+    return false;
+  }
+  if (lhs.local_id() != rhs.local_id()) {
+    return false;
+  }
+  if (lhs.machine_id() != rhs.machine_id()) {
+    return false;
+  }
+  if (lhs.type() != rhs.type()) {
+    return false;
+  }
+  return true;
+}
+
+std::string LinkCapability::to_string() const {
+  std::string str;
+  str += "{bandwidth: " + to_string_with_precision(bandwidth) + ",";
+  str += "latency: " + to_string_with_precision(latency) + "}";
+  return str;
+}
+
+LinkCapability LinkCapability::from_proto(const LinkCapabilityProto &proto) {
+  LinkCapability capability;
+  capability.bandwidth = proto.bandwidth();
+  capability.latency = proto.latency();
+  return capability;
+}
+
+LinkCapabilityProto LinkCapability::to_proto() const {
+  LinkCapabilityProto proto;
+  proto.set_bandwidth(bandwidth);
+  proto.set_latency(latency);
+  return proto;
+}
+
+std::string Link::to_string() const {
+  std::string str = "{source_id:" + std::to_string(source_id_) + ",";
+  str += "target_id:" + std::to_string(target_id_) + ",";
+  str += "type:" + type_ + ",";
+  str += "capability:" + capability_.to_string() + "}";
+  return str;
+}
+
+Link Link::from_proto(const LinkProto &proto) {
+  Link link;
+  link.source_id_ = proto.source_id();
+  link.target_id_ = proto.target_id();
+  link.type_ = proto.type();
+  link.capability_ = LinkCapability::from_proto(proto.capability());
+  return link;
+}
+
+LinkProto Link::to_proto() const {
+  LinkProto proto;
+  proto.set_source_id(source_id_);
+  proto.set_target_id(target_id_);
+  proto.set_type(type_);
+  proto.mutable_capability()->CopyFrom(capability_.to_proto());
+  return proto;
+}
+
+bool operator==(const Link &lhs, const Link &rhs) {
+  if (lhs.source_id() != rhs.source_id()) {
+    return false;
+  }
+  if (lhs.target_id() != rhs.target_id()) {
+    return false;
+  }
+  if (lhs.type() != rhs.type()) {
+    return false;
+  }
+  return true;
+}
+
+bool Machine::contains(int64_t device_id) const {
+  return devices_.count(device_id) == 1;
+}
+
+void Machine::add_device(const Device &device) {
+  if (id() == -1) {
+    set_id(device.machine_id());
+  } else {
+    PADDLE_ENFORCE_EQ(device.machine_id(),
+                      id(),
+                      platform::errors::InvalidArgument(
+                          "The machine id [%d] of the device should be equal "
+                          "to this machine id [%d].",
+                          device.machine_id(),
+                          id_));
+  }
+  devices_[device.global_id()] = &device;
+}
+
+void Machine::add_link(const Link &link) {
+  PADDLE_ENFORCE_EQ(contains(link.source_id()),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The source device id of the added link [%s] "
+                        "cannot be found in the device_ids. Please add the "
+                        "source device before adding this link.",
+                        std::to_string(link.source_id())));
+  links_[link.source_id()][link.target_id()] = &link;
+}
+
+std::string Machine::to_string() const {
+  std::string str = "{devices: [";
+  for (const auto &device : devices_) {
+    str += device.second->to_string() + ", ";
+  }
+  str.replace(str.size() - 2, 2, "], ");
+
+  str += "links: [";
+  for (const auto &item : links_) {
+    str += "{";
+    str += "source_id: " + std::to_string(item.first) + ", neighbors: [";
+    for (const auto &link : item.second) {
+      str += link.second->to_string() + ", ";
+    }
+    str.replace(str.size() - 2, 2, "]}, ");
+  }
+  str.replace(str.size() - 4, 4, "]}");
+  return str;
+}
+
+DeviceMesh::DeviceMesh(const std::string &name,
+                       const std::vector<int64_t> &shape,
+                       const std::vector<int64_t> &device_ids,
+                       const std::vector<std::string> &dim_names) {
+  name_ = name;
+  shape_ = shape;
+  int64_t size = this->size();
+
+  PADDLE_ENFORCE_EQ(size,
+                    device_ids.size(),
+                    platform::errors::InvalidArgument(
+                        "The size %d of this device mesh must be "
+                        "equal to the size %d of its device ids.",
+                        size,
+                        device_ids.size()));
+  PADDLE_ENFORCE_EQ(
+      has_duplicates(device_ids),
+      false,
+      platform::errors::InvalidArgument("The device ids [%s] must be unique.",
+                                        str_join(device_ids)));
+  device_ids_ = device_ids;
+
+  PADDLE_ENFORCE_EQ(
+      shape_.size(),
+      dim_names.size(),
+      platform::errors::InvalidArgument(
+          "The size %d of mesh shape must be equal to the size %d "
+          "of the dimension names.",
+          shape_.size(),
+          dim_names.size()));
+  PADDLE_ENFORCE_EQ(has_duplicates(dim_names),
+                    false,
+                    platform::errors::InvalidArgument(
+                        "The names [%s] of each dimension must be unique.",
+                        str_join(dim_names)));
+  dim_names_ = dim_names;
+}
+
+int64_t DeviceMesh::size() const {
+  if (shape_.empty()) return 0;
+  int64_t size = 1;
+  for (const int64_t dim_size : shape_) size *= dim_size;
+  return size;
+}
+
+bool DeviceMesh::contains(int64_t device_id) const {
+  auto result =
+      std::find(std::begin(device_ids_), std::end(device_ids_), device_id);
+  return result != std::end(device_ids_);
+}
+
+void DeviceMesh::add_device(const Device &device) {
+  PADDLE_ENFORCE_EQ(
+      contains(device.global_id()),
+      true,
+      platform::errors::InvalidArgument(
+          "The added device id [%s] cannot be found in the device_ids.",
+          std::to_string(device.global_id())));
+  // Operator [] will create a new object if it cannot find one.
+  // So we add the default constructor for Device and Machine
+  // to make sure the new object can be created.
+  devices_[device.global_id()] = device;
+  machines_[device.machine_id()].add_device(devices_[device.global_id()]);
+}
+
+void DeviceMesh::add_link(const Link &link) {
+  PADDLE_ENFORCE_EQ(
+      contains(link.source_id()),
+      true,
+      platform::errors::InvalidArgument("The source id of the added link [%s] "
+                                        "cannot be found in the device_ids.",
+                                        std::to_string(link.source_id())));
+  PADDLE_ENFORCE_EQ(
+      contains(link.target_id()),
+      true,
+      platform::errors::InvalidArgument("The target id of the added link [%s] "
+                                        "cannot be found in the device_ids.",
+                                        std::to_string(link.target_id())));
+  // Operator [] will create a new object if it cannot find one.
+  // So we add the default constructor for Device and Machine
+  // to make sure the new object can be created.
+  links_[link.source_id()][link.target_id()] = link;
+  const Device &source_device = devices_[link.source_id()];
+  machines_[source_device.machine_id()].add_link(
+      links_[link.source_id()][link.target_id()]);
+}
+
+std::string DeviceMesh::to_string() const {
+  std::string mesh_str = "{name: " + name_ + ", ";
+  mesh_str += "shape: [" + str_join(shape_) + "], ";
+  mesh_str += "device_ids: [" + str_join(device_ids_) + "], ";
+  mesh_str += "dim_names: [" + str_join(dim_names_) + "], ";
+  mesh_str += "\ndevices: [\n";
+  for (const auto &device : devices_) {
+    mesh_str += "  " + device.second.to_string() + ",\n";
+  }
+  mesh_str.replace(mesh_str.size() - 2, 2, "],");
+
+  mesh_str += "\nlinks: [\n";
+  for (const auto &item : links_) {
+    mesh_str += "  {";
+    mesh_str += "source_id: " + std::to_string(item.first) + ", neighbors: [";
+    for (const auto &link : item.second) {
+      mesh_str += link.second.to_string() + ", ";
+    }
+    mesh_str.replace(mesh_str.size() - 2, 2, "]},\n");
+  }
+  mesh_str.replace(mesh_str.size() - 4, 4, "]}");
+  return mesh_str;
+}
+
+DeviceMesh DeviceMesh::from_proto(const DeviceMeshProto &proto) {
+  DeviceMesh mesh;
+
+  mesh.name_ = proto.name();
+
+  mesh.shape_.resize(proto.shape_size());
+  for (int64_t i = 0; i < proto.shape_size(); ++i) {
+    mesh.shape_[i] = proto.shape(i);
+  }
+
+  mesh.device_ids_.resize(proto.device_ids_size());
+  for (int64_t i = 0; i < proto.device_ids_size(); ++i) {
+    mesh.device_ids_[i] = proto.device_ids(i);
+  }
+
+  mesh.dim_names_.resize(proto.dim_names_size());
+  for (int64_t i = 0; i < proto.dim_names_size(); ++i) {
+    mesh.dim_names_[i] = proto.dim_names(i);
+  }
+
+  for (int64_t i = 0; i < proto.devices_size(); ++i) {
+    mesh.add_device(Device::from_proto(proto.devices(i)));
+  }
+
+  for (int64_t i = 0; i < proto.links_size(); ++i) {
+    mesh.add_link(Link::from_proto(proto.links(i)));
+  }
+
+  return mesh;
+}
+
+DeviceMeshProto DeviceMesh::to_proto() const {
+  DeviceMeshProto proto;
+
+  proto.set_name(name_);
+
+  for (const auto &i : shape_) {
+    proto.add_shape(i);
+  }
+
+  for (const auto &i : device_ids_) {
+    proto.add_device_ids(i);
+  }
+
+  for (const auto &i : dim_names_) {
+    proto.add_dim_names(i);
+  }
+
+  for (const auto &device : devices_) {
+    proto.mutable_devices()->Add()->CopyFrom(device.second.to_proto());
+  }
+
+  for (const auto &neighbors : links_) {
+    for (const auto &link : neighbors.second) {
+      proto.mutable_links()->Add()->CopyFrom(link.second.to_proto());
+    }
+  }
+
+  return proto;
+}
+
+bool operator==(const DeviceMesh &lhs, const DeviceMesh &rhs) {
+  // Use the unique name to do the fast comparison
+  if (lhs.name() != rhs.name()) {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/device_mesh.h b/paddle/fluid/distributed/auto_parallel/device_mesh.h
new file mode 100644
index 0000000000000..15ec50f546d30
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/device_mesh.h
@@ -0,0 +1,273 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h"
+#include "paddle/fluid/distributed/auto_parallel/utils.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+struct DeviceCapability {
+  double single_precision_flops = 0.0;
+  double double_precision_flops = 0.0;
+  double memory_size_in_bytes = 0.0;
+  double clock_rate_in_ghz = 0.0;
+
+  // DeviceCapability from_string(const std::string& str);
+  std::string to_string() const;
+
+  static DeviceCapability from_proto(const DeviceCapabilityProto& proto);
+  DeviceCapabilityProto to_proto() const;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const DeviceCapability& obj) {
+  os << obj.to_string();
+  return os;
+}
+
+class Device {
+ public:
+  Device() = default;
+  Device(int64_t global_id,
+         int64_t local_id,
+         int64_t machine_id,
+         const std::string& type)
+      : global_id_(global_id),
+        local_id_(local_id),
+        machine_id_(machine_id),
+        type_(type) {}
+
+  int64_t global_id() const { return global_id_; }
+  int64_t local_id() const { return local_id_; }
+  int64_t machine_id() const { return machine_id_; }
+  const std::string& type() const { return type_; }
+
+  const DeviceCapability& capability() const { return capability_; }
+  void set_capability(const DeviceCapability& capability) {
+    capability_ = capability;
+  }
+
+  // Device from_string(const std::string& mesh_str);
+  std::string to_string() const;
+
+  static Device from_proto(const DeviceProto& proto);
+  DeviceProto to_proto() const;
+
+ private:
+  int64_t global_id_;
+  int64_t local_id_;
+  int64_t machine_id_;
+  std::string type_;
+  DeviceCapability capability_;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Device& obj) {
+  os << obj.to_string();
+  return os;
+}
+
+bool operator==(const Device& lhs, const Device& rhs);
+
+inline bool operator!=(const Device& lhs, const Device& rhs) {
+  return !operator==(lhs, rhs);
+}
+
+struct LinkCapability {
+  double bandwidth = 0.0;  // Bytes/s
+  double latency = 0.0;
+
+  // LinkCapability from_string(const std::string& str);
+  std::string to_string() const;
+
+  static LinkCapability from_proto(const LinkCapabilityProto& proto);
+  LinkCapabilityProto to_proto() const;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const LinkCapability& obj) {
+  os << obj.to_string();
+  return os;
+}
+
+class Link {
+ public:
+  Link() = default;
+
+  Link(int64_t source_id, int64_t target_id, const std::string& type)
+      : source_id_(source_id), target_id_(target_id), type_(type) {}
+
+  int64_t source_id() const { return source_id_; }
+  int64_t target_id() const { return target_id_; }
+  const std::string& type() const { return type_; }
+
+  const LinkCapability& capability() const { return capability_; }
+  void set_capability(const LinkCapability& capability) {
+    capability_ = capability;
+  }
+
+  // Link from_string(const std::string& str);
+  std::string to_string() const;
+
+  static Link from_proto(const LinkProto& proto);
+  LinkProto to_proto() const;
+
+ private:
+  int64_t source_id_;
+  int64_t target_id_;
+  std::string type_;
+  LinkCapability capability_;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Link& obj) {
+  os << obj.to_string();
+  return os;
+}
+
+bool operator==(const Link& lhs, const Link& rhs);
+
+inline bool operator!=(const Link& lhs, const Link& rhs) {
+  return !operator==(lhs, rhs);
+}
+
+class Machine {
+ public:
+  Machine() = default;
+
+  explicit Machine(int64_t id) : id_(id) {}
+
+  int64_t id() const { return id_; }
+
+  void set_id(int64_t id) { id_ = id; }
+
+  bool contains(int64_t device_id) const;
+
+  void add_device(const Device& device);
+
+  void add_link(const Link& link);
+
+  // Machine from_string(const std::string& str);
+  std::string to_string() const;
+
+ private:
+  int64_t id_ = -1;
+  std::unordered_map<int64_t, const Device*> devices_;
+  std::unordered_map<int64_t, std::unordered_map<int64_t, const Link*>>
+      links_;
+};
+
+class DeviceMesh {
+ public:
+  DeviceMesh() = default;
+
+  DeviceMesh(const std::string& name,
+             const std::vector<int64_t>& shape,
+             const std::vector<int64_t>& device_ids,
+             const std::vector<std::string>& dim_names);
+
+  const std::string& name() const { return name_; }
+
+  void set_name(const std::string& name) { name_ = name; }
+
+  const std::vector<int64_t>& shape() const { return shape_; }
+
+  const std::vector<int64_t>& device_ids() const { return device_ids_; }
+
+  const std::vector<std::string>& dim_names() const { return dim_names_; }
+
+  std::string device_type() const {
+    if (empty()) return std::string();
+    return std::begin(devices_)->second.type();
+  }
+
+  const std::unordered_map<int64_t, Device>& devices() const {
+    return devices_;
+  }
+
+  const std::unordered_map<int64_t, std::unordered_map<int64_t, Link>>& links()
+      const {
+    return links_;
+  }
+
+  const Device& device(int64_t global_id) const {
+    return devices_.at(global_id);
+  }
+
+  const Link& link(int64_t source_id, int64_t target_id) const {
+    return links_.at(source_id).at(target_id);
+  }
+
+  int64_t size() const;
+  int64_t ndim() const { return shape_.size(); }
+
+  int64_t dim_size(int64_t dim) const {
+    int64_t cdim = canonical_dim(dim, shape_.size());
+    return shape_[cdim];
+  }
+
+  int64_t dim_size(const std::string& dim_name) const {
+    for (std::size_t i = 0; i < dim_names_.size(); ++i) {
+      if (dim_names_[i] == dim_name) {
+        return shape_[i];
+      }
+    }
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Cannot find the dimension of %s in this device mesh.", dim_name));
+  }
+
+  bool empty() const { return (shape_.empty() || device_ids_.empty()); }
+  bool contains(int64_t device_id) const;
+
+  void add_device(const Device& device);
+  void add_link(const Link& link);
+
+  // DeviceMesh from_string(const std::string& mesh_str);
+  std::string to_string() const;
+
+  static DeviceMesh from_proto(const DeviceMeshProto& proto);
+  DeviceMeshProto to_proto() const;
+
+ private:
+  std::string name_;
+  std::vector<int64_t> shape_;
+  std::vector<int64_t> device_ids_;
+  std::vector<std::string> dim_names_;
+  std::unordered_map<int64_t, Device> devices_;
+  std::unordered_map<int64_t, std::unordered_map<int64_t, Link>> links_;
+  std::unordered_map<int64_t, Machine> machines_;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const DeviceMesh& obj) {
+  os << obj.to_string();
+  return os;
+}
+
+bool operator==(const DeviceMesh& lhs, const DeviceMesh& rhs);
+
+inline bool operator!=(const DeviceMesh& lhs, const DeviceMesh& rhs) {
+  return !operator==(lhs, rhs);
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/device_mesh_test.cc b/paddle/fluid/distributed/auto_parallel/device_mesh_test.cc
new file mode 100644
index 0000000000000..bdfc13baa424d
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/device_mesh_test.cc
@@ -0,0 +1,93 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/auto_parallel/device_mesh.h"
+#include <map>
+#include <sstream>
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+TEST(DeviceMesh, Ctor) {
+  std::vector<int64_t> shape = {2, 3};
+  std::vector<int64_t> device_ids = {0, 1, 2, 3, 4, 5};
+  std::vector<std::string> dim_names = {"x", "y"};
+  std::string device_type = "GPU";
+  int64_t size = shape[0] * shape[1];
+
+  DeviceMesh device_mesh("mesh", shape, device_ids, dim_names);
+  for (int64_t i = 0; i < shape[0]; ++i) {
+    for (int64_t j = 0; j < shape[1]; ++j) {
+      int64_t global_id = i * shape[1] + j;
+      int64_t local_id = j;
+      int64_t machine_id = i;
+      device_mesh.add_device(
+          Device(global_id, local_id, machine_id, device_type));
+    }
+  }
+  for (int64_t i = 0; i < size; ++i) {
+    for (int64_t j = 0; j < size; ++j) {
+      device_mesh.add_link(Link(i, j, "NVL"));
+    }
+  }
+
+  EXPECT_EQ(device_mesh.name(), "mesh");
+  EXPECT_EQ(device_mesh.shape(), shape);
+  EXPECT_EQ(device_mesh.device_ids(), device_ids);
+  EXPECT_EQ(device_mesh.dim_names()[0], "x");
+  EXPECT_EQ(device_mesh.dim_names()[1], "y");
+  EXPECT_EQ(device_mesh.device_type(), device_type);
+  EXPECT_EQ(device_mesh.size(), size);
+  EXPECT_EQ(device_mesh.ndim(), static_cast<int64_t>(shape.size()));
+  EXPECT_EQ(device_mesh.dim_size(0), shape[0]);
+  EXPECT_EQ(device_mesh.dim_size(-1), shape[1]);
+  EXPECT_EQ(device_mesh.dim_size("x"), shape[0]);
+  EXPECT_EQ(device_mesh.dim_size("y"), shape[1]);
+  EXPECT_EQ(device_mesh.empty(), false);
+  EXPECT_EQ(device_mesh.contains(0), true);
+  EXPECT_EQ(device_mesh.contains(6), false);
+  EXPECT_EQ(device_mesh.device(3).global_id(), 3);
+  EXPECT_EQ(device_mesh.device(3).local_id(), 0);
+  EXPECT_EQ(device_mesh.device(3).machine_id(), 1);
+  EXPECT_EQ(device_mesh.device(3).type(), "GPU");
+  EXPECT_EQ(device_mesh.link(3, 4).source_id(), 3);
+  EXPECT_EQ(device_mesh.link(3, 4).target_id(), 4);
+  EXPECT_EQ(device_mesh.link(3, 4).type(), "NVL");
+  for (int64_t i = 0; i < shape[0]; ++i) {
+    for (int64_t j = 0; j < shape[1]; ++j) {
+      int64_t global_id = i * shape[1] + j;
+      int64_t local_id = j;
+      int64_t machine_id = i;
+      auto device = device_mesh.devices().at(global_id);
+      EXPECT_EQ(device, Device(global_id, local_id, machine_id, device_type));
+    }
+  }
+  for (int64_t i = 0; i < size; ++i) {
+    for (int64_t j = 0; j < size; ++j) {
+      EXPECT_EQ(device_mesh.links().at(i).at(j), Link(i, j, "NVL"));
+    }
+  }
+  std::stringstream sstream;
+  sstream << device_mesh;
+  EXPECT_EQ(sstream.str(), device_mesh.to_string());
+  auto proto = device_mesh.to_proto();
+  DeviceMesh new_device_mesh = DeviceMesh::from_proto(proto);
+  EXPECT_EQ(device_mesh, new_device_mesh);
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper.cc b/paddle/fluid/distributed/auto_parallel/dist_mapper.cc
new file mode 100644
index 0000000000000..d0995604522e5
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/dist_mapper.cc
@@ -0,0 +1,146 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+
+#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h"
+#include "paddle/fluid/distributed/auto_parallel/utils.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+void DistributedMapper::set_process_id_to_device_ids(
+    const std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>&
+        process_id_to_device_ids) {
+  std::vector<std::string> device_mesh_names;
+  for (const auto& item : device_meshes_) {
+    device_mesh_names.push_back(item.first);
+  }
+  for (const auto& item : process_id_to_device_ids) {
+    PADDLE_ENFORCE_GE(
+        item.first,
+        0,
+        platform::errors::InvalidArgument(
+            "The process id %d must be greater than or equal to 0.",
+            item.first));
+    std::string device_mesh_name = item.second.first;
+    const std::vector<int64_t>& device_ids = item.second.second;
+    PADDLE_ENFORCE_EQ(
+        device_meshes_.count(device_mesh_name),
+        1,
+        platform::errors::InvalidArgument(
+            "Cannot find the device mesh %s in the device mesh names [%s].",
+            device_mesh_name,
+            str_join(device_mesh_names)));
+    PADDLE_ENFORCE_EQ(
+        has_duplicates(device_ids),
+        false,
+        platform::errors::InvalidArgument(
+            "The mapped device ids [%s] of process %d must be unique.",
+            str_join(device_ids),
+            item.first));
+    const DeviceMesh& device_mesh = device_meshes_[device_mesh_name];
+    const std::vector<int64_t> cur_device_ids = device_mesh.device_ids();
+    for (int64_t device_id : device_ids) {
+      bool found =
+          std::find(cur_device_ids.begin(), cur_device_ids.end(), device_id) !=
+          cur_device_ids.end();
+      PADDLE_ENFORCE_EQ(
+          found,
+          true,
+          platform::errors::InvalidArgument(
+              "The device id %d cannot be found in the device mesh [%s].",
+              device_id,
+              str_join(cur_device_ids)));
+    }
+  }
+  process_id_to_device_ids_ = process_id_to_device_ids;
+}
+
+DistributedMapper DistributedMapper::from_proto(
+    const DistributedMapperProto& proto) {
+  DistributedMapper dist_mapper;
+  for (int64_t i = 0; i < proto.device_meshes_size(); ++i) {
+    dist_mapper.device_meshes_[proto.device_meshes(i).name()] =
+        DeviceMesh::from_proto(proto.device_meshes(i));
+  }
+  for (int64_t i = 0; i < proto.process_id_to_device_ids_size(); ++i) {
+    int64_t process_id = proto.process_id_to_device_ids(i).process_id();
+    std::string device_mesh_name =
+        proto.process_id_to_device_ids(i).device_mesh_name();
+    std::vector<int64_t> device_ids;
+    int64_t num_devices = proto.process_id_to_device_ids(i).device_ids_size();
+    for (int64_t j = 0; j < num_devices; ++j) {
+      device_ids.push_back(proto.process_id_to_device_ids(i).device_ids(j));
+    }
+    dist_mapper.process_id_to_device_ids_[process_id].first = device_mesh_name;
+    dist_mapper.process_id_to_device_ids_[process_id].second = device_ids;
+  }
+  return dist_mapper;
+}
+
+DistributedMapperProto DistributedMapper::to_proto() const {
+  DistributedMapperProto proto;
+  for (const auto& item : device_meshes_) {
+    proto.mutable_device_meshes()->Add()->CopyFrom(item.second.to_proto());
+  }
+  for (const auto& outer : process_id_to_device_ids_) {
+    auto proto_item = proto.mutable_process_id_to_device_ids()->Add();
+    proto_item->set_process_id(outer.first);
+    proto_item->set_device_mesh_name(outer.second.first);
+    for (const auto& inner : outer.second.second) {
+      proto_item->add_device_ids(inner);
+    }
+  }
+  return proto;
+}
+
+std::string DistributedMapper::to_string() const {
+  std::string mapper_str = "{device_meshes: [";
+  for (const auto& item : device_meshes_) {
+    mapper_str += item.second.to_string() + ", ";
+  }
+  mapper_str.replace(mapper_str.size() - 2, 2, "]");
+
+  mapper_str += "\nprocess_id_to_device_ids: [";
+  for (const auto& item : process_id_to_device_ids_) {
+    mapper_str += "{";
+    mapper_str +=
+        "process_id: " + std::to_string(item.first) + ", device_ids: [";
+    for (const auto& device_id : item.second.second) {
+      mapper_str +=
+          "{" + item.second.first + ", " + std::to_string(device_id) + "}, ";
+    }
+    mapper_str.replace(mapper_str.size() - 2, 2, "]");
+    mapper_str += "}, ";
+  }
+  mapper_str.replace(mapper_str.size() - 2, 2, "]");
+  mapper_str += "}";
+  return mapper_str;
+}
+
+bool operator==(const DistributedMapper& lhs, const DistributedMapper& rhs) {
+  if (lhs.device_meshes() != rhs.device_meshes()) {
+    return false;
+  }
+  if (lhs.process_id_to_device_ids() != rhs.process_id_to_device_ids()) {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper.h b/paddle/fluid/distributed/auto_parallel/dist_mapper.h
new file mode 100644
index 0000000000000..bd7f9790ad69f
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/dist_mapper.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <map>
+
+#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h"
+#include "paddle/fluid/distributed/auto_parallel/device_mesh.h"
+#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+class DistributedMapper {
+ public:
+  DistributedMapper() = default;
+
+  const std::map<std::string, DeviceMesh>& device_meshes() const {
+    return device_meshes_;
+  }
+
+  const DeviceMesh& device_mesh(const std::string& name) const {
+    return device_meshes_.at(name);
+  }
+
+  void add_device_mesh(const DeviceMesh& device_mesh) {
+    device_meshes_[device_mesh.name()] = device_mesh;
+  }
+
+  const std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>&
+  process_id_to_device_ids() const {
+    return process_id_to_device_ids_;
+  }
+
+  void set_process_id_to_device_ids(
+      const std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>&
+          process_id_to_device_ids);
+
+  // DistributedMapper from_string(const std::string& mapper_str);
+  std::string to_string() const;
+
+  static DistributedMapper from_proto(const DistributedMapperProto& proto);
+  DistributedMapperProto to_proto() const;
+
+ private:
+  std::map<std::string, DeviceMesh> device_meshes_;
+  std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>
+      process_id_to_device_ids_;
+};
+
+bool operator==(const DistributedMapper& lhs, const DistributedMapper& rhs);
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const DistributedMapper& obj) {
+  os << obj.to_string();
+  return os;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc b/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc
new file mode 100644
index 0000000000000..d427b9cbb09ed
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h"
+#include <map>
+#include <sstream>
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+TEST(DistributedMapper, Ctor) {
+  std::vector<int64_t> shape = {2, 3};
+  std::vector<int64_t> device_ids = {0, 1, 2, 3, 4, 5};
+  std::vector<std::string> dim_names = {"x", "y"};
+  std::string device_type = "GPU";
+  int64_t size = shape[0] * shape[1];
+
+  DeviceMesh device_mesh("device_mesh", shape, device_ids, dim_names);
+  for (int64_t i = 0; i < shape[0]; ++i) {
+    for (int64_t j = 0; j < shape[1]; ++j) {
+      int64_t global_id = i * shape[1] + j;
+      int64_t local_id = j;
+      int64_t machine_id = i;
+      device_mesh.add_device(
+          Device(global_id, local_id, machine_id, device_type));
+    }
+  }
+  for (int64_t i = 0; i < size; ++i) {
+    for (int64_t j = 0; j < size; ++j) {
+      device_mesh.add_link(Link(i, j, "NVL"));
+    }
+  }
+
+  DistributedMapper dist_mapper;
+  dist_mapper.add_device_mesh(device_mesh);
+  std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>
+      process_id_to_device_ids;
+  process_id_to_device_ids[0] = {"device_mesh", {5}};
+  process_id_to_device_ids[1] = {"device_mesh", {4}};
+  process_id_to_device_ids[2] = {"device_mesh", {3}};
+  process_id_to_device_ids[3] = {"device_mesh", {2}};
+  process_id_to_device_ids[4] = {"device_mesh", {1}};
+  process_id_to_device_ids[5] = {"device_mesh", {0}};
+  dist_mapper.set_process_id_to_device_ids(process_id_to_device_ids);
+
+  EXPECT_EQ(dist_mapper.device_meshes().at("device_mesh"), device_mesh);
+  EXPECT_EQ(dist_mapper.device_mesh("device_mesh"), device_mesh);
+  EXPECT_EQ(dist_mapper.process_id_to_device_ids(), process_id_to_device_ids);
+  std::stringstream sstream;
+  sstream << dist_mapper;
+  EXPECT_EQ(sstream.str(), dist_mapper.to_string());
+  auto proto = dist_mapper.to_proto();
+  DistributedMapper new_dist_mapper = DistributedMapper::from_proto(proto);
+  EXPECT_EQ(dist_mapper, new_dist_mapper);
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh.cc b/paddle/fluid/distributed/auto_parallel/process_mesh.cc
new file mode 100644
index 0000000000000..dda2873768997
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/process_mesh.cc
@@ -0,0 +1,134 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <iterator>
+
+#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
+#include "paddle/fluid/distributed/auto_parallel/utils.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+ProcessMesh::ProcessMesh(const std::vector<int64_t> &shape,
+                         const std::vector<int64_t> &process_ids,
+                         const std::vector<std::string> &dim_names) {
+  shape_ = shape;
+  int64_t size = this->size();
+  PADDLE_ENFORCE_EQ(
+      size,
+      process_ids.size(),
+      platform::errors::InvalidArgument("The size %d of this process mesh "
+                                        "must be equal to the size %d of its "
+                                        "process ids.",
+                                        size,
+                                        process_ids.size()));
+  PADDLE_ENFORCE_EQ(
+      has_duplicates(process_ids),
+      false,
+      platform::errors::InvalidArgument("The process ids [%s] must be unique.",
+                                        str_join(process_ids)));
+  process_ids_ = process_ids;
+
+  PADDLE_ENFORCE_EQ(shape_.size(),
+                    dim_names.size(),
+                    platform::errors::InvalidArgument(
+                        "The size %d of mesh shape must be equal to the size "
+                        "%d of the dimension names.",
+                        shape_.size(),
+                        dim_names.size()));
+  PADDLE_ENFORCE_EQ(has_duplicates(dim_names),
+                    false,
+                    platform::errors::InvalidArgument(
+                        "The names [%s] of each dimension must be unique.",
+                        str_join(dim_names)));
+  dim_names_ = dim_names;
+}
+
+int64_t ProcessMesh::size() const {
+  if (shape_.empty()) return 0;
+  int64_t size = 1;
+  for (const int64_t dim_size : shape_) size *= dim_size;
+  return size;
+}
+
+bool ProcessMesh::contains(int64_t process_id) const {
+  auto result =
+      std::find(std::begin(process_ids_), std::end(process_ids_), process_id);
+  return result != std::end(process_ids_);
+}
+
+std::string ProcessMesh::to_string() const {
+  std::string mesh_str = "{shape: [" + str_join(shape_) + "], ";
+  mesh_str += "process_ids: [" + str_join(process_ids_) + "], ";
+  mesh_str += "dim_names: [" + str_join(dim_names_) + "]}";
+  return mesh_str;
+}
+
+ProcessMesh ProcessMesh::from_proto(const ProcessMeshProto &proto) {
+  ProcessMesh mesh;
+
+  mesh.shape_.resize(proto.shape_size());
+  for (int64_t i = 0; i < proto.shape_size(); ++i) {
+    mesh.shape_[i] = proto.shape(i);
+  }
+
+  mesh.process_ids_.resize(proto.process_ids_size());
+  for (int64_t i = 0; i < proto.process_ids_size(); ++i) {
+    mesh.process_ids_[i] = proto.process_ids(i);
+  }
+
+  mesh.dim_names_.resize(proto.dim_names_size());
+  for (int64_t i = 0; i < proto.dim_names_size(); ++i) {
+    mesh.dim_names_[i] = proto.dim_names(i);
+  }
+
+  return mesh;
+}
+
+ProcessMeshProto ProcessMesh::to_proto() const {
+  ProcessMeshProto proto;
+
+  for (const auto &i : shape_) {
+    proto.add_shape(i);
+  }
+
+  for (const auto &i : process_ids_) {
+    proto.add_process_ids(i);
+  }
+
+  for (const auto &i : dim_names_) {
+    proto.add_dim_names(i);
+  }
+
+  return proto;
+}
+
+bool operator==(const ProcessMesh &lhs, const ProcessMesh &rhs) {
+  if (lhs.shape() != rhs.shape()) {
+    return false;
+  }
+  if (lhs.process_ids() != rhs.process_ids()) {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh.h b/paddle/fluid/distributed/auto_parallel/process_mesh.h
new file mode 100644
index 0000000000000..2652a8f606216
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/process_mesh.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h"
+#include "paddle/fluid/distributed/auto_parallel/device_mesh.h"
+#include "paddle/fluid/distributed/auto_parallel/utils.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+class ProcessMesh {
+ public:
+  ProcessMesh() = default;
+
+  ProcessMesh(const std::vector<int64_t>& shape,
+              const std::vector<int64_t>& process_ids,
+              const std::vector<std::string>& dim_names);
+
+  const std::vector<int64_t>& shape() const { return shape_; }
+
+  const std::vector<int64_t>& process_ids() const { return process_ids_; }
+
+  const std::vector<std::string>& dim_names() const { return dim_names_; }
+
+  int64_t size() const;
+
+  int64_t ndim() const { return shape_.size(); }
+
+  int64_t dim_size(int64_t dim) const {
+    int64_t cdim = canonical_dim(dim, shape_.size());
+    return shape_[cdim];
+  }
+
+  int64_t dim_size(const std::string& dim_name) const {
+    for (std::size_t i = 0; i < dim_names_.size(); ++i) {
+      if (dim_names_[i] == dim_name) {
+        return shape_[i];
+      }
+    }
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Cannot find the dimension of %s in this process mesh.", dim_name));
+  }
+
+  bool empty() const { return (shape_.empty() || process_ids_.empty()); }
+  bool contains(int64_t process_id) const;
+
+  // ProcessMesh from_string(const std::string& mesh_str);
+  std::string to_string() const;
+
+  static ProcessMesh from_proto(const ProcessMeshProto& proto);
+  ProcessMeshProto to_proto() const;
+
+ private:
+  std::vector<int64_t> shape_;
+  std::vector<int64_t> process_ids_;
+  std::vector<std::string> dim_names_;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const ProcessMesh& obj) {
+  os << obj.to_string();
+  return os;
+}
+
+bool operator==(const ProcessMesh& lhs, const ProcessMesh& rhs);
+
+inline bool operator!=(const ProcessMesh& lhs, const ProcessMesh& rhs) {
+  return !operator==(lhs, rhs);
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc b/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc
new file mode 100644
index 0000000000000..d0f3c5b510b43
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
+#include <iostream>
+#include <sstream>
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+TEST(ProcessMesh, Ctor) {
+  std::vector<int64_t> shape = {2, 3};
+  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5};
+  std::vector<std::string> dim_names = {"x", "y"};
+  int64_t size = shape[0] * shape[1];
+  ProcessMesh process_mesh(shape, process_ids, dim_names);
+  EXPECT_EQ(process_mesh.shape(), shape);
+  EXPECT_EQ(process_mesh.process_ids(), process_ids);
+  EXPECT_EQ(process_mesh.dim_names()[0], "x");
+  EXPECT_EQ(process_mesh.dim_names()[1], "y");
+  EXPECT_EQ(process_mesh.size(), size);
+  EXPECT_EQ(process_mesh.ndim(), static_cast<int64_t>(shape.size()));
+  EXPECT_EQ(process_mesh.dim_size(0), shape[0]);
+  EXPECT_EQ(process_mesh.dim_size(-1), shape[1]);
+  EXPECT_EQ(process_mesh.dim_size("x"), shape[0]);
+  EXPECT_EQ(process_mesh.dim_size("y"), shape[1]);
+  EXPECT_EQ(process_mesh.empty(), false);
+  EXPECT_EQ(process_mesh.contains(0), true);
+  EXPECT_EQ(process_mesh.contains(6), false);
+  std::stringstream sstream;
+  sstream << process_mesh;
+  EXPECT_EQ(sstream.str(), process_mesh.to_string());
+  auto proto = process_mesh.to_proto();
+  ProcessMesh new_process_mesh = ProcessMesh::from_proto(proto);
+  EXPECT_EQ(process_mesh, new_process_mesh);
+  std::cout << new_process_mesh << std::endl;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/utils.h b/paddle/fluid/distributed/auto_parallel/utils.h
new file mode 100644
index 0000000000000..106cbea5e2d6f
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/utils.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+// struct Indent {
+//   Indent(int &level) : level(level) { ++level; }
+//   ~Indent() { --level; }
+//   int &level;
+// };
+
+// inline std::string str_indent(std::string& str, cur_indent) {
+//   string spaces(cur_indent, " ");
+//   return str + std::string(cur_indent, " ");
+// }
+
+template <typename T>
+bool has_duplicates(const std::vector<T>& vec) {
+  std::unordered_map<T, int> map;
+  for (const auto& i : vec) {
+    ++map[i];
+    if (map[i] > 1) return true;
+  }
+  return false;
+}
+
+inline int64_t canonical_dim(int dim, int ndim) {
+  PADDLE_ENFORCE_EQ(
+      dim >= -ndim && dim < ndim,
+      true,
+      platform::errors::InvalidArgument(
+          "Dimension %d is outside of [-%d, %d).", dim, ndim, ndim));
+  if (dim < 0) {
+    return dim + ndim;
+  }
+  return dim;
+}
+
+// Refer to https://stackoverflow.com/a/5289170
+template <typename Range>
+std::string str_join(Range const& elements,
+                     const std::string& delimiter = ",") {
+  std::ostringstream os;
+  auto b = std::begin(elements), e = std::end(elements);
+
+  if (b != e) {
+    std::copy(b,
+              std::prev(e),
+              std::ostream_iterator<typename Range::value_type>(
+                  os, delimiter.c_str()));
+    b = std::prev(e);
+  }
+  if (b != e) {
+    os << *b;
+  }
+
+  return os.str();
+}
+
+// Refer to https://stackoverflow.com/a/46931770
+inline std::vector<std::string> str_split(std::string const& input,
+                                          const std::string& delimiter = ",") {
+  size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+  std::string token;
+  std::vector<std::string> output;
+  while ((pos_end = input.find(delimiter, pos_start)) != std::string::npos) {
+    token = input.substr(pos_start, pos_end - pos_start);
+    pos_start = pos_end + delim_len;
+    output.push_back(token);
+  }
+  output.push_back(input.substr(pos_start));
+  return output;
+}
+
+// Refer to https://stackoverflow.com/a/29200671/2358969
+template <typename T>
+std::string to_string_with_precision(const T a_value, const int n = 2) {
+  std::ostringstream out;
+  out.precision(n);
+  out << std::fixed << a_value;
+  return out.str();
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle

From d9946c570701618f4e3b204c2eea81a419f532c8 Mon Sep 17 00:00:00 2001
From: Yulong Ao
Date: Sun, 7 Aug 2022 09:48:06 +0000
Subject: [PATCH 2/3] [Auto Parallel] Remove unnecessary code

---
 .../auto_parallel/auto_parallel.proto         |  85 ----------
 .../distributed/auto_parallel/dist_mapper.cc  | 146 ------------------
 .../distributed/auto_parallel/dist_mapper.h   |  73 ---------
 .../auto_parallel/dist_mapper_test.cc         |  72 ---------
 .../distributed/auto_parallel/process_mesh.cc | 134 ----------------
 .../distributed/auto_parallel/process_mesh.h  |  94 -----------
 .../auto_parallel/process_mesh_test.cc        |  54 -------
 7 files changed, 658 deletions(-)
 delete mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper.cc
 delete mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper.h
 delete mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc
 delete mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh.cc
 delete mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh.h
 delete mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh_test.cc

diff --git a/paddle/fluid/distributed/auto_parallel/auto_parallel.proto b/paddle/fluid/distributed/auto_parallel/auto_parallel.proto
index eae33e929a2d3..5625737c4426a 100644
--- a/paddle/fluid/distributed/auto_parallel/auto_parallel.proto
+++ b/paddle/fluid/distributed/auto_parallel/auto_parallel.proto
b/paddle/fluid/distributed/auto_parallel/auto_parallel.proto @@ -16,73 +16,6 @@ syntax = "proto2"; package paddle.distributed.auto_parallel; -// ProcessMesh is used to organize processes and like n-dimension array. -message ProcessMeshProto { - // The size of each dimension. - repeated int64 shape = 1; - - // These process ids are stored by a row-major way. - // There are no duplicate process ids within one process mesh. - repeated int64 process_ids = 2; - - // The name of each dimension. - repeated string dim_names = 3; - -} - -// // This distributed attribute describes how to distribute the corresponding tensor, -// // and store any other information needed by auto parallel. -// message TensorDistAttrProto { -// // The process mesh where a tensor is distributed. -// optional ProcessMeshProto process_mesh = 1; -// -// // The length of dims_mapping is same as the length of the tensor shape. -// // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension -// // of the above process mesh. If dims_mapping[i] is -1, the i-th dimension of the tensor -// // will not be sharded. For example, given a tensor shape [2, 6, 12], a process mesh -// // shape [2, 3] and a dims_mapping [-1, 1, 0], each sharded tensor will have a shape [2, 2, 6]. -// repeated int64 dims_mapping = 2; -// -// // The batch dimension of the corresponding tensor. -// optional int64 batch_dim = 3; -// -// // If the dynamic_dims[i] is True, the i-th dimension of the corresponding tensor -// // is dynamic changed. Otherwise, the i-th dimension of the tensor is static determined. -// repeated bool dynamic_dims = 4; -// } -// -// // This distributed attribute describes how to distribute the corresponding operator, -// // and store any other information needed by auto parallel. -// message OperatorDistAttrProto { -// message TensorDistAttrMappingEntryProto { -// optional string name = 1; -// optional TensorDistAttrProto tensor_dist_attr = 2; -// } -// // The key of this map is the input tensor name and the value is the distributed attribute -// // of the input tensor required by this corresponding operator. -// // The distributed attribute of the actual tensor may be not the same as that within -// // the distributed attribute of the operator. -// repeated TensorDistAttrMappingEntryProto input_dist_attrs = 1; -// -// // The key of this map is the output tensor name and the value is the distributed attribute -// // of the output tensor required by this corresponding operator. -// // The distributed attribute of the actual tensor may be not the same as that within -// // the distributed attribute of the operator. -// repeated TensorDistAttrMappingEntryProto output_dist_attrs = 2; -// -// // The process mesh where a op is distributed. -// optional ProcessMeshProto process_mesh = 3; -// -// // A operator ideally has a distributed operator which may have multiple distributed implementations. -// // This filed is usually same as the operator type. However, some operators such as the element-wise operators -// // may shared the same distributed operator, the field is use for this scenario. -// optional string impl_type = 4; -// -// // This field tells which distributed implementations of this corresponding operator -// // will be selected for the actual computation. -// optional int64 impl_idx = 5; -// } - // This proto describes the capability of one device such as the computation and memory. 
diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper.cc b/paddle/fluid/distributed/auto_parallel/dist_mapper.cc
deleted file mode 100644
index d0995604522e5..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/dist_mapper.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-
-#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h"
-#include "paddle/fluid/distributed/auto_parallel/utils.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-void DistributedMapper::set_process_id_to_device_ids(
-    const std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>&
-        process_id_to_device_ids) {
-  std::vector<std::string> device_mesh_names;
-  for (const auto& item : device_meshes_) {
-    device_mesh_names.push_back(item.first);
-  }
-  for (const auto& item : process_id_to_device_ids) {
-    PADDLE_ENFORCE_GE(
-        item.first,
-        0,
-        platform::errors::InvalidArgument(
-            "The process id %d must be greater than or equal to 0.",
-            item.first));
-    std::string device_mesh_name = item.second.first;
-    const std::vector<int64_t>& device_ids = item.second.second;
-    PADDLE_ENFORCE_EQ(
-        device_meshes_.count(device_mesh_name),
-        1,
-        platform::errors::InvalidArgument(
-            "Cannot find the device mesh %d in device_mesh ids [%s].",
-            device_mesh_name,
-            str_join(device_mesh_names)));
-    PADDLE_ENFORCE_EQ(
-        has_duplicates(device_ids),
-        false,
-        platform::errors::InvalidArgument(
-            "The mapped device ids [%s] of process_mesh %d must be unique.",
-            str_join(device_ids),
-            item.first));
-    const DeviceMesh& device_mesh = device_meshes_[device_mesh_name];
-    const std::vector<int64_t> cur_device_ids = device_mesh.device_ids();
-    for (int64_t device_id : device_ids) {
-      bool found =
-          std::find(cur_device_ids.begin(), cur_device_ids.end(), device_id) !=
-          cur_device_ids.end();
-      PADDLE_ENFORCE_EQ(
-          found,
-          true,
-          platform::errors::InvalidArgument(
-              "The device id %d cannot be find in the device mesh [%s].",
-              device_id,
-              str_join(cur_device_ids)));
-    }
-  }
-  process_id_to_device_ids_ = process_id_to_device_ids;
-}
-
-DistributedMapper DistributedMapper::from_proto(
-    const DistributedMapperProto& proto) {
-  DistributedMapper dist_mapper;
-  for (int64_t i = 0; i < proto.device_meshes_size(); ++i) {
-    dist_mapper.device_meshes_[proto.device_meshes(i).name()] =
-        DeviceMesh::from_proto(proto.device_meshes(i));
-  }
-  for (int64_t i = 0; i < proto.process_id_to_device_ids_size(); ++i) {
-    int64_t process_id = proto.process_id_to_device_ids(i).process_id();
-    std::string device_mesh_name =
-        proto.process_id_to_device_ids(i).device_mesh_name();
-    std::vector<int64_t> device_ids;
-    int64_t num_devices = proto.process_id_to_device_ids(i).device_ids_size();
-    for (int64_t j = 0; j < num_devices; ++j) {
-      device_ids.push_back(proto.process_id_to_device_ids(i).device_ids(j));
-    }
-    dist_mapper.process_id_to_device_ids_[process_id].first = device_mesh_name;
-    dist_mapper.process_id_to_device_ids_[process_id].second = device_ids;
-  }
-  return dist_mapper;
-}
-
-DistributedMapperProto DistributedMapper::to_proto() const {
-  DistributedMapperProto proto;
-  for (const auto& item : device_meshes_) {
-    proto.mutable_device_meshes()->Add()->CopyFrom(item.second.to_proto());
-  }
-  for (const auto& outer : process_id_to_device_ids_) {
-    auto proto_item = proto.mutable_process_id_to_device_ids()->Add();
-    proto_item->set_process_id(outer.first);
-    proto_item->set_device_mesh_name(outer.second.first);
-    for (const auto& inner : outer.second.second) {
-      proto_item->add_device_ids(inner);
-    }
-  }
-  return proto;
-}
-
-std::string DistributedMapper::to_string() const {
-  std::string mapper_str = "{device_meshes: [";
-  for (const auto& item : device_meshes_) {
-    mapper_str += item.second.to_string() + ", ";
-  }
-  mapper_str.replace(mapper_str.size() - 2, 2, "]");
-
-  mapper_str += "\nprocess_id_to_device_ids: [";
-  for (const auto& item : process_id_to_device_ids_) {
-    mapper_str += "{";
-    mapper_str +=
-        "process_id: " + std::to_string(item.first) + ", device_ids: [";
-    for (const auto& device_id : item.second.second) {
-      mapper_str +=
-          "{" + item.second.first + ", " + std::to_string(device_id) + "}, ";
-    }
-    mapper_str.replace(mapper_str.size() - 2, 2, "]");
-    mapper_str += "}, ";
-  }
-  mapper_str.replace(mapper_str.size() - 2, 2, "]");
-  mapper_str += "}";
-  return mapper_str;
-}
-
-bool operator==(const DistributedMapper& lhs, const DistributedMapper& rhs) {
-  if (lhs.device_meshes() != rhs.device_meshes()) {
-    return false;
-  }
-  if (lhs.process_id_to_device_ids() != rhs.process_id_to_device_ids()) {
-    return false;
-  }
-  return true;
-}
-
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
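The nested map validated by set_process_id_to_device_ids above is easier to read spelled out. A sketch of its shape; the mesh name and ids below are invented for illustration:

    #include <cstdint>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    // process id -> (device mesh name, device ids on that mesh); values invented.
    std::map<int64_t, std::pair<std::string, std::vector<int64_t>>> p2d = {
        {0, {"mesh0", {0, 1}}},  // one process may drive several devices
        {1, {"mesh0", {1}}},     // and one device may be shared by processes
    };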
diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper.h b/paddle/fluid/distributed/auto_parallel/dist_mapper.h
deleted file mode 100644
index bd7f9790ad69f..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/dist_mapper.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <map>
-
-#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h"
-#include "paddle/fluid/distributed/auto_parallel/device_mesh.h"
-#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-class DistributedMapper {
- public:
-  DistributedMapper() = default;
-
-  const std::map<std::string, DeviceMesh>& device_meshes() const {
-    return device_meshes_;
-  }
-
-  const DeviceMesh& device_mesh(const std::string& name) const {
-    return device_meshes_.at(name);
-  }
-
-  void add_device_mesh(const DeviceMesh& device_mesh) {
-    device_meshes_[device_mesh.name()] = device_mesh;
-  }
-
-  const std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>&
-  process_id_to_device_ids() const {
-    return process_id_to_device_ids_;
-  }
-
-  void set_process_id_to_device_ids(
-      const std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>&
-          process_id_to_device_ids);
-
-  // DistributedMapper from_string(const std::string& mapper_str);
-  std::string to_string() const;
-
-  static DistributedMapper from_proto(const DistributedMapperProto& proto);
-  DistributedMapperProto to_proto() const;
-
- private:
-  std::map<std::string, DeviceMesh> device_meshes_;
-  std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>
-      process_id_to_device_ids_;
-};
-
-bool operator==(const DistributedMapper& lhs, const DistributedMapper& rhs);
-
-inline std::ostream& operator<<(std::ostream& os,
-                                const DistributedMapper& obj) {
-  os << obj.to_string();
-  return os;
-}
-
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
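Condensed from the declaration above, the intended call sequence looks roughly like this. A sketch only: mesh stands for a DeviceMesh built elsewhere, and it is the same flow the deleted dist_mapper_test.cc below walks through:

    // Assumes dist_mapper.h and a DeviceMesh named `mesh` built elsewhere.
    DistributedMapper mapper;
    mapper.add_device_mesh(mesh);  // register a DeviceMesh under its name
    std::map<int64_t, std::pair<std::string, std::vector<int64_t>>> p2d;
    p2d[0] = {mesh.name(), {0}};   // process 0 runs on device 0 of that mesh
    mapper.set_process_id_to_device_ids(p2d);  // validated, see dist_mapper.cc
    auto proto = mapper.to_proto();                    // serialize
    auto copy = DistributedMapper::from_proto(proto);  // round-trip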
diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc b/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc
deleted file mode 100644
index d427b9cbb09ed..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h"
-#include <map>
-#include <sstream>
-#include "gtest/gtest.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-TEST(DistributedMapper, Ctor) {
-  std::vector<int64_t> shape = {2, 3};
-  std::vector<int64_t> device_ids = {0, 1, 2, 3, 4, 5};
-  std::vector<std::string> dim_names = {"x", "y"};
-  std::string device_type = "GPU";
-  int64_t size = shape[0] * shape[1];
-
-  DeviceMesh device_mesh("device_mesh", shape, device_ids, dim_names);
-  for (int64_t i = 0; i < shape[0]; ++i) {
-    for (int64_t j = 0; j < shape[1]; ++j) {
-      int64_t global_id = i * shape[1] + j;
-      int64_t local_id = j;
-      int64_t machine_id = i;
-      device_mesh.add_device(
-          Device(global_id, local_id, machine_id, device_type));
-    }
-  }
-  for (int64_t i = 0; i < size; ++i) {
-    for (int64_t j = 0; j < size; ++j) {
-      device_mesh.add_link(Link(i, j, "NVL"));
-    }
-  }
-
-  DistributedMapper dist_mapper;
-  dist_mapper.add_device_mesh(device_mesh);
-  std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>
-      process_id_to_device_ids;
-  process_id_to_device_ids[0] = {"device_mesh", {5}};
-  process_id_to_device_ids[1] = {"device_mesh", {4}};
-  process_id_to_device_ids[2] = {"device_mesh", {3}};
-  process_id_to_device_ids[3] = {"device_mesh", {2}};
-  process_id_to_device_ids[4] = {"device_mesh", {1}};
-  process_id_to_device_ids[5] = {"device_mesh", {0}};
-  dist_mapper.set_process_id_to_device_ids(process_id_to_device_ids);
-
-  EXPECT_EQ(dist_mapper.device_meshes().at("device_mesh"), device_mesh);
-  EXPECT_EQ(dist_mapper.device_mesh("device_mesh"), device_mesh);
-  EXPECT_EQ(dist_mapper.process_id_to_device_ids(), process_id_to_device_ids);
-  std::stringstream sstream;
-  sstream << dist_mapper;
-  EXPECT_EQ(sstream.str(), dist_mapper.to_string());
-  auto proto = dist_mapper.to_proto();
-  DistributedMapper new_dist_mapper = DistributedMapper::from_proto(proto);
-  EXPECT_EQ(dist_mapper, new_dist_mapper);
-}
-
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh.cc b/paddle/fluid/distributed/auto_parallel/process_mesh.cc
deleted file mode 100644
index dda2873768997..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/process_mesh.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <iterator>
-
-#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
-#include "paddle/fluid/distributed/auto_parallel/utils.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-ProcessMesh::ProcessMesh(const std::vector<int64_t> &shape,
-                         const std::vector<int64_t> &process_ids,
-                         const std::vector<std::string> &dim_names) {
-  shape_ = shape;
-  int64_t size = this->size();
-  PADDLE_ENFORCE_EQ(
-      size,
-      process_ids.size(),
-      platform::errors::InvalidArgument("The size of this process mesh must be "
-                                        "equal to the size of its process ids.",
-                                        size,
-                                        process_ids.size()));
-  PADDLE_ENFORCE_EQ(
-      has_duplicates(process_ids),
-      false,
-      platform::errors::InvalidArgument("The process ids [%s] must be unique.",
-                                        str_join(process_ids_)));
-  process_ids_ = process_ids;
-
-  PADDLE_ENFORCE_EQ(shape_.size(),
-                    dim_names.size(),
-                    platform::errors::InvalidArgument(
-                        "The size of mesh shape must be equal to the size "
-                        "of the dimension names.",
-                        shape_.size(),
-                        dim_names_.size()));
-  PADDLE_ENFORCE_EQ(has_duplicates(dim_names),
-                    false,
-                    platform::errors::InvalidArgument(
-                        "The names [%s] of each dimension must be unique.",
-                        str_join(dim_names)));
-  dim_names_ = dim_names;
-}
-
-int64_t ProcessMesh::size() const {
-  if (shape_.empty()) return 0;
-  int64_t size = 1;
-  for (const int64_t dim_size : shape_) size *= dim_size;
-  return size;
-}
-
-bool ProcessMesh::contains(int64_t process_id) const {
-  auto result =
-      std::find(std::begin(process_ids_), std::end(process_ids_), process_id);
-  if (result != std::end(process_ids_)) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-std::string ProcessMesh::to_string() const {
-  std::string mesh_str = "{shape: [" + str_join(shape_) + "], ";
-  mesh_str += "process_ids: [" + str_join(process_ids_) + "], ";
-  mesh_str += "dim_names: [" + str_join(dim_names_) + "]}";
-  return mesh_str;
-}
-
-ProcessMesh ProcessMesh::from_proto(const ProcessMeshProto &proto) {
-  ProcessMesh mesh;
-
-  mesh.shape_.resize(proto.shape_size());
-  for (int64_t i = 0; i < proto.shape_size(); ++i) {
-    mesh.shape_[i] = proto.shape(i);
-  }
-
-  mesh.process_ids_.resize(proto.process_ids_size());
-  for (int64_t i = 0; i < proto.process_ids_size(); ++i) {
-    mesh.process_ids_[i] = proto.process_ids(i);
-  }
-
-  mesh.dim_names_.resize(proto.dim_names_size());
-  for (int64_t i = 0; i < proto.dim_names_size(); ++i) {
-    mesh.dim_names_[i] = proto.dim_names(i);
-  }
-
-  return mesh;
-}
-
-ProcessMeshProto ProcessMesh::to_proto() const {
-  ProcessMeshProto proto;
-
-  for (const auto &i : shape_) {
-    proto.add_shape(i);
-  }
-
-  for (const auto &i : process_ids_) {
-    proto.add_process_ids(i);
-  }
-
-  for (const auto &i : dim_names_) {
-    proto.add_dim_names(i);
-  }
-
-  return proto;
-}
-
-bool operator==(const ProcessMesh &lhs, const ProcessMesh &rhs) {
-  if (lhs.shape() != rhs.shape()) {
-    return false;
-  }
-  if (lhs.process_ids() != rhs.process_ids()) {
-    return false;
-  }
-  return true;
-}
-
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
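The "row-major" layout that both the proto comments and the ProcessMesh code above rely on comes down to one line of index arithmetic. A hypothetical helper, for illustration only:

    // Maps a mesh coordinate to its position in process_ids (row-major:
    // the last dimension varies fastest). Illustrative helper only.
    #include <cstdint>
    #include <vector>

    int64_t linear_index(const std::vector<int64_t>& shape,
                         const std::vector<int64_t>& coord) {
      int64_t idx = 0;
      for (std::size_t i = 0; i < shape.size(); ++i) {
        idx = idx * shape[i] + coord[i];
      }
      return idx;
    }

    // For shape {2, 3}, coord {1, 2} -> 1 * 3 + 2 = 5: the mesh used in the
    // tests here places process id 5 at coordinate (1, 2).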
diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh.h b/paddle/fluid/distributed/auto_parallel/process_mesh.h
deleted file mode 100644
index 2652a8f606216..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/process_mesh.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <iostream>
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h"
-#include "paddle/fluid/distributed/auto_parallel/device_mesh.h"
-#include "paddle/fluid/distributed/auto_parallel/utils.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-class ProcessMesh {
- public:
-  ProcessMesh() = default;
-
-  ProcessMesh(const std::vector<int64_t>& shape,
-              const std::vector<int64_t>& process_ids,
-              const std::vector<std::string>& dim_names);
-
-  const std::vector<int64_t>& shape() const { return shape_; }
-
-  const std::vector<int64_t>& process_ids() const { return process_ids_; }
-
-  const std::vector<std::string>& dim_names() const { return dim_names_; }
-
-  int64_t size() const;
-
-  int64_t ndim() const { return shape_.size(); }
-
-  int64_t dim_size(int64_t dim) const {
-    int64_t cdim = canonical_dim(dim, shape_.size());
-    return shape_[cdim];
-  }
-
-  int64_t dim_size(const std::string& dim_name) const {
-    for (std::size_t i = 0; i < dim_names_.size(); ++i) {
-      if (dim_names_[i] == dim_name) {
-        return shape_[i];
-      }
-    }
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "Cannot find the dimension of %s in this process mesh.", dim_name));
-  }
-
-  bool empty() const { return (shape_.empty() || process_ids_.empty()); }
-  bool contains(int64_t process_id) const;
-
-  // ProcessMesh from_string(const std::string& mesh_str);
-  std::string to_string() const;
-
-  static ProcessMesh from_proto(const ProcessMeshProto& proto);
-  ProcessMeshProto to_proto() const;
-
- private:
-  std::vector<int64_t> shape_;
-  std::vector<int64_t> process_ids_;
-  std::vector<std::string> dim_names_;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const ProcessMesh& obj) {
-  os << obj.to_string();
-  return os;
-}
-
-bool operator==(const ProcessMesh& lhs, const ProcessMesh& rhs);
-
-inline bool operator!=(const ProcessMesh& lhs, const ProcessMesh& rhs) {
-  return !operator==(lhs, rhs);
-}
-
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc b/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc
deleted file mode 100644
index d0f3c5b510b43..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
-#include <iostream>
-#include <sstream>
-#include "gtest/gtest.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-TEST(ProcessMesh, Ctor) {
-  std::vector<int64_t> shape = {2, 3};
-  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5};
-  std::vector<std::string> dim_names = {"x", "y"};
-  int64_t size = shape[0] * shape[1];
-  ProcessMesh process_mesh(shape, process_ids, dim_names);
-  EXPECT_EQ(process_mesh.shape(), shape);
-  EXPECT_EQ(process_mesh.process_ids(), process_ids);
-  EXPECT_EQ(process_mesh.dim_names()[0], "x");
-  EXPECT_EQ(process_mesh.dim_names()[1], "y");
-  EXPECT_EQ(process_mesh.size(), size);
-  EXPECT_EQ(process_mesh.ndim(), static_cast<int64_t>(shape.size()));
-  EXPECT_EQ(process_mesh.dim_size(0), shape[0]);
-  EXPECT_EQ(process_mesh.dim_size(-1), shape[1]);
-  EXPECT_EQ(process_mesh.dim_size("x"), shape[0]);
-  EXPECT_EQ(process_mesh.dim_size("y"), shape[1]);
-  EXPECT_EQ(process_mesh.empty(), false);
-  EXPECT_EQ(process_mesh.contains(0), true);
-  EXPECT_EQ(process_mesh.contains(6), false);
-  std::stringstream sstream;
-  sstream << process_mesh;
-  EXPECT_EQ(sstream.str(), process_mesh.to_string());
-  auto proto = process_mesh.to_proto();
-  ProcessMesh new_process_mesh = ProcessMesh::from_proto(proto);
-  EXPECT_EQ(process_mesh, new_process_mesh);
-  std::cout << new_process_mesh << std::endl;
-}
-
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle

From 00b3aafad6c6623c04bfadc3d79810f6697d7036 Mon Sep 17 00:00:00 2001
From: Yulong Ao
Date: Sun, 7 Aug 2022 11:58:02 +0000
Subject: [PATCH 3/3] [Auto Parallel] Comment out unnecessary cmake statements

---
 .../distributed/auto_parallel/CMakeLists.txt  | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
index cfcc12515da91..192871c73c9a4 100644
--- a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
+++ b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
@@ -7,14 +7,14 @@ cc_test(
   SRCS device_mesh_test.cc
   DEPS device_mesh)
 
-cc_library(
-  process_mesh
-  SRCS process_mesh.cc
-  DEPS auto_parallel_proto)
-cc_test(
-  process_mesh_test
-  SRCS process_mesh_test.cc
-  DEPS process_mesh)
+# cc_library(
+#   process_mesh
+#   SRCS process_mesh.cc
+#   DEPS auto_parallel_proto)
+# cc_test(
+#   process_mesh_test
+#   SRCS process_mesh_test.cc
+#   DEPS process_mesh)
 
 # cc_library(
 #   dist_attr
@@ -25,14 +25,14 @@ cc_test(
 #   SRCS dist_attr_test.cc
 #   DEPS dist_attr)
 
-cc_library(
-  dist_mapper
-  SRCS dist_mapper.cc
-  DEPS device_mesh auto_parallel_proto)
-cc_test(
-  dist_mapper_test
-  SRCS dist_mapper_test.cc
-  DEPS dist_mapper)
+# cc_library(
+#   dist_mapper
+#   SRCS dist_mapper.cc
+#   DEPS device_mesh auto_parallel_proto)
+# cc_test(
+#   dist_mapper_test
+#   SRCS dist_mapper_test.cc
+#   DEPS dist_mapper)
 
 proto_library(auto_parallel_proto SRCS auto_parallel.proto)