From 67881ad22a40628db7a5eb73a1f01c4df11ee9e1 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 6 Feb 2018 22:21:47 +0000 Subject: [PATCH 01/26] compile with nccl2 --- CMakeLists.txt | 1 - Dockerfile | 2 +- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/dynload/CMakeLists.txt | 2 +- paddle/scripts/docker/build.sh | 5 ++++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 49334279f6dc8..3b4c7e65c67e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,7 +141,6 @@ include(external/boost) # download boost include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 -include(external/nccl) include(external/cares) include(external/grpc) diff --git a/Dockerfile b/Dockerfile index 6ac9901ac6cea..ed559ca5c432d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ - git python-pip python-dev openssh-server bison libnccl-dev \ + git python-pip python-dev openssh-server bison \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-matplotlib gcc-4.8 g++-4.8 \ diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index d68caea99719b..83164f07aaa1a 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_GPU) - cc_library(enforce SRCS enforce.cc DEPS nccl) + cc_library(enforce SRCS enforce.cc DEPS) else() cc_library(enforce SRCS enforce.cc) endif() diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index cf2081b434961..264b4ebf2c06d 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc - DEPS dynamic_loader nccl) + DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index ba496db5f834e..26ecb128eb82a 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -34,6 +34,7 @@ function cmake_gen() { Configuring cmake in /paddle/build ... -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} ${PYTHON_FLAGS} + -DWITH_DSO=ON -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} @@ -57,6 +58,7 @@ EOF cmake .. \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ ${PYTHON_FLAGS} \ + -DWITH_DSO=ON \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ @@ -173,7 +175,7 @@ EOF if [[ ${WITH_GPU} == "ON" ]]; then NCCL_DEPS="apt-get install -y libnccl-dev &&" else - NCCL_DEPS="" + NCCL_DEPS="" fi cat >> /paddle/build/Dockerfile < Date: Wed, 7 Feb 2018 23:53:15 +0000 Subject: [PATCH 02/26] add ncclGroup; it is necessary in nccl2 --- paddle/platform/nccl_test.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index ef6d845874745..5a75ff3382b8f 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -89,6 +89,7 @@ TEST(NCCL, all_reduce) { VLOG(1) << "Invoking ncclAllReduce"; + dynload::ncclGroupStart(); for (int i = 0; i < dev_count; ++i) { VLOG(1) << "Invoking ncclAllReduce with device " << i; SetDeviceId(i); @@ -97,6 +98,7 @@ TEST(NCCL, all_reduce) { ncclSum, comms[i], data[i]->dev_ctx.stream())); VLOG(1) << "Invoked ncclAllReduce for device " << i; } + dynload::ncclGroupEnd(); VLOG(1) << "Invoked ncclAllReduce"; From 1c91574bbdf5b7a46ad5cba35908354b178bf0e4 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 9 Feb 2018 21:35:14 +0000 Subject: [PATCH 03/26] backward insert callback pass compile --- python/paddle/v2/fluid/backward.py | 46 ++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 29243c90e872c..2946ef19678d0 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -199,6 +199,47 @@ def _op_can_be_removed_(op_desc, no_grad_set): return op_descs +def _callback_lookup_(op): + """ + Only used in _append_backward_ops_ + Build and returns a callback function for certain op. For example + + parallel_do: AllReduce + + :param op: + :return: callback function + """ + print(op.type) + if op.type == 'parallel_do': + param_names = set(op.input('parameters')) + param_grad_names = [n + "@GRAD" for n in param_names] + + class ParallelDoCallBack(object): + def __init__(self, param_grad_names): + self.has_inserted_nccl_init = False + self.param_grad_names = param_grad_names + + def __call__(self, block, context): + # TODO(tonyyang-svail): insert nccl init + + for o_param in context.output_names(): + for o_argu in context.output(o_param): + if o_argu in self.param_grad_names: + print("reduce", o_argu) + op_desc = block.desc.append_op() + framework.Operator( + block, + type='fill_constant', + desc=op_desc, + inputs={}, + attrs={'shape': [1], }, + outputs={'Out': [block.create_var()]}) + + return ParallelDoCallBack(param_grad_names) + else: + return None + + def _append_backward_ops_(block, ops, target_block, @@ -239,7 +280,8 @@ def empty_callback(block, context): sub_block = program.block(op.block_attr("sub_block")) grad_sub_block = program.create_block(parent_idx=sub_block.idx) _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var) + no_grad_dict, grad_to_var, + _callback_lookup_(op)) grad_sub_block_list.append(grad_sub_block.desc) # Getting op's corresponding grad_op @@ -258,7 +300,7 @@ def empty_callback(block, context): for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - callback(block=target_block, context=grad_to_var) + callback(block=target_block, context=new_op_desc) def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): From 672cdc21a0e95a59590be1b6be376f73ad9ba116 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 9 Feb 2018 21:52:48 +0000 Subject: [PATCH 04/26] add nccl --- CMakeLists.txt | 1 - paddle/framework/executor.cc | 6 ++++-- paddle/framework/framework.proto | 1 + paddle/operators/nccl_op.cc | 16 ++++++++-------- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/dynload/CMakeLists.txt | 2 +- paddle/pybind/protobuf.cc | 3 ++- paddle/scripts/docker/build.sh | 5 ++++- 8 files changed, 21 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a21574b855bc..37556a37a09f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,7 +141,6 @@ include(external/boost) # download boost include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 -include(external/nccl) include(external/cares) include(external/grpc) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 2a88e5a92985f..c604fdcc7b1fe 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/reader.h" +#include "paddle/operators/nccl/nccl_gpu_common.h" // platform::Communicator #include "paddle/platform/place.h" #include "paddle/platform/profiler.h" @@ -53,6 +54,8 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { var->GetMutable(); } else if (var_type == proto::VarDesc::PLACE_LIST) { var->GetMutable(); + } else if (var_type == proto::VarDesc::NCCL_COM) { + var->GetMutable(); } else if (var_type == proto::VarDesc::READER) { var->GetMutable(); } else { @@ -118,13 +121,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(4) << op->DebugStringEx(local_scope); + VLOG(3) << op->DebugStringEx(local_scope); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(op->Type(), pool.Get(place_)); op->Run(*local_scope, place_); - VLOG(3) << op->DebugStringEx(local_scope); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " << memory::memory_usage(place_); diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index d7be1a7352da5..1e3db1a3bab19 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -129,6 +129,7 @@ message VarDesc { LOD_TENSOR_ARRAY = 7; PLACE_LIST = 8; READER = 9; + NCCL_COM = 10; } required string name = 1; required VarType type = 2; diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 9d51153b0631b..83ac67f353dcd 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -31,8 +31,13 @@ class NCCLInitOp : public framework::OperatorBase { const auto &name = Output("Communicator"); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), "Can not find variable '%s' in the scope.", name); - std::vector gpus = Attr>("gpus"); - PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty."); + + int count = platform::GetCUDADeviceCount(); + std::vector gpus(count); + for (int i = 0; i < count; ++i) { + gpus[i] = i; + } + PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus."); if (scope.FindVar(name) == nullptr) { PADDLE_THROW("Output(Communicator) is needed for ncclInit operator."); @@ -50,11 +55,6 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Communicator", "Create Communicator for communicating between gpus"); - AddAttr>("gpus", "(vector) GPU id lists"); - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") - .SetDefault(framework::proto::DataType::FP32); AddComment(R"DOC( NCCLInit Operator. @@ -77,7 +77,7 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { ctx->HasInput("Communicator"), " Input(Communicator) of AllReduce op input should not be NULL"); PADDLE_ENFORCE(ctx->HasOutput("Out"), - " Input(X) of AllReduce op input should not be NULL"); + " Output(Out) of AllReduce op output should not be NULL"); auto x_dims = ctx->GetInputsDim("X"); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 5ce4b3de39d93..b91fd4cf5410b 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_GPU) - cc_library(enforce SRCS enforce.cc DEPS nccl) + cc_library(enforce SRCS enforce.cc DEPS) else() cc_library(enforce SRCS enforce.cc) endif() diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index cf2081b434961..264b4ebf2c06d 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc - DEPS dynamic_loader nccl) + DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 0a92e10927caf..02aeae8b3d2bd 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -241,7 +241,8 @@ void BindVarDsec(py::module &m) { .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE) .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY) .value("PLACE_LIST", proto::VarDesc::PLACE_LIST) - .value("READER", proto::VarDesc::READER); + .value("READER", proto::VarDesc::READER) + .value("NCCL_COM", proto::VarDesc::NCCL_COM); } void BindOpDesc(py::module &m) { diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index ba496db5f834e..26ecb128eb82a 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -34,6 +34,7 @@ function cmake_gen() { Configuring cmake in /paddle/build ... -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} ${PYTHON_FLAGS} + -DWITH_DSO=ON -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} @@ -57,6 +58,7 @@ EOF cmake .. \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ ${PYTHON_FLAGS} \ + -DWITH_DSO=ON \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ @@ -173,7 +175,7 @@ EOF if [[ ${WITH_GPU} == "ON" ]]; then NCCL_DEPS="apt-get install -y libnccl-dev &&" else - NCCL_DEPS="" + NCCL_DEPS="" fi cat >> /paddle/build/Dockerfile < Date: Fri, 9 Feb 2018 21:53:45 +0000 Subject: [PATCH 05/26] disable ncclInit infer shape & var type --- python/paddle/v2/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index a12427258e9d3..36f20d9155f28 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -490,7 +490,7 @@ def find_name(var_list, name): 'feed', 'fetch', 'save', 'load', 'recurrent', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', - 'load_combine' + 'load_combine', 'ncclInit' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) From f2129b193e91094b5e2a9faf8207599d3f2abd41 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 10 Feb 2018 02:35:45 +0000 Subject: [PATCH 06/26] pass run time --- python/paddle/v2/fluid/backward.py | 57 ++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 2946ef19678d0..34383827fd094 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -199,6 +199,15 @@ def _op_can_be_removed_(op_desc, no_grad_set): return op_descs +import proto.framework_pb2 as framework_pb2 + + +def serialize_op_decs(op_desc): + protostr = op_desc.serialize_to_string() + proto = framework_pb2.OpDesc.FromString(str(protostr)) + return proto.__str__() + + def _callback_lookup_(op): """ Only used in _append_backward_ops_ @@ -209,7 +218,6 @@ def _callback_lookup_(op): :param op: :return: callback function """ - print(op.type) if op.type == 'parallel_do': param_names = set(op.input('parameters')) param_grad_names = [n + "@GRAD" for n in param_names] @@ -220,20 +228,38 @@ def __init__(self, param_grad_names): self.param_grad_names = param_grad_names def __call__(self, block, context): - # TODO(tonyyang-svail): insert nccl init - - for o_param in context.output_names(): - for o_argu in context.output(o_param): + # move to parallel_do.py + # # TODO(tonyyang-svail): insert nccl init + if not self.has_inserted_nccl_init: + global_block = block.program.global_block() + op_desc = global_block.desc.append_op() + var_desc = global_block.desc.var('nccl_com') + var_desc.set_type(core.VarDesc.VarType.NCCL_COM) + self.nccl_com = global_block.create_var( + name='nccl_com', type=core.VarDesc.VarType.NCCL_COM) + framework.Operator( + global_block, + type='ncclInit', + desc=op_desc, + inputs={}, + outputs={'Communicator': [self.nccl_com]}) + self.has_inserted_nccl_init = True + + current_op_desc = context["__current_op_desc__"] + # print(serialize_op_decs(context)) + for o_param in current_op_desc.output_names(): + for o_argu in current_op_desc.output(o_param): if o_argu in self.param_grad_names: - print("reduce", o_argu) + # print("reduce", o_argu) op_desc = block.desc.append_op() - framework.Operator( - block, - type='fill_constant', - desc=op_desc, - inputs={}, - attrs={'shape': [1], }, - outputs={'Out': [block.create_var()]}) + op_desc.set_type("ncclAllReduce") + op_desc.set_input("X", [o_argu]) + # FIXME(tonyyang-svail): + # Looks like nccl_com has been changed to nccl_com_0 + op_desc.set_input("Communicator", ['nccl_com_0']) + out_var = block.create_var() + op_desc.set_output("Out", [out_var.name]) + op_desc.set_attr("reduction", "ncclSum") return ParallelDoCallBack(param_grad_names) else: @@ -300,7 +326,8 @@ def empty_callback(block, context): for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - callback(block=target_block, context=new_op_desc) + grad_to_var["__current_op_desc__"] = new_op_desc + callback(block=target_block, context=grad_to_var) def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): @@ -336,6 +363,8 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): continue grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block) # infer_shape and infer_type + if op_desc.type() == 'ncclInit': + continue op_desc.infer_var_type(block.desc) op_desc.infer_shape(block.desc) for arg in op_desc.output_arg_names(): From 0815c0f141d1df2088ed3c5a5391662bb4484e3d Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 10 Feb 2018 03:16:02 +0000 Subject: [PATCH 07/26] add assign op --- python/paddle/v2/fluid/backward.py | 36 ++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 34383827fd094..40c54bf220f3b 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -228,8 +228,6 @@ def __init__(self, param_grad_names): self.param_grad_names = param_grad_names def __call__(self, block, context): - # move to parallel_do.py - # # TODO(tonyyang-svail): insert nccl init if not self.has_inserted_nccl_init: global_block = block.program.global_block() op_desc = global_block.desc.append_op() @@ -250,16 +248,30 @@ def __call__(self, block, context): for o_param in current_op_desc.output_names(): for o_argu in current_op_desc.output(o_param): if o_argu in self.param_grad_names: - # print("reduce", o_argu) - op_desc = block.desc.append_op() - op_desc.set_type("ncclAllReduce") - op_desc.set_input("X", [o_argu]) - # FIXME(tonyyang-svail): - # Looks like nccl_com has been changed to nccl_com_0 - op_desc.set_input("Communicator", ['nccl_com_0']) - out_var = block.create_var() - op_desc.set_output("Out", [out_var.name]) - op_desc.set_attr("reduction", "ncclSum") + # # print("reduce", o_argu) + # op_desc = block.desc.append_op() + # op_desc.set_type("ncclAllReduce") + # op_desc.set_input("X", [o_argu]) + # + # # FIXME(tonyyang-svail): + # # Looks like nccl_com has been changed to nccl_com_0 + # op_desc.set_input("Communicator", ['nccl_com_0']) + # out_var = block.create_var() + # op_desc.set_output("Out", [out_var.name]) + # op_desc.set_attr("reduction", "ncclSum") + allreduce_out_name = o_argu + "__nccl_all_reduce__" + op_desc = _create_op_desc_( + "ncclAllReduce", { + "X": [o_argu], + "Communicator": ['nccl_com_0'] + }, {"Out": [allreduce_out_name]}, + {"reduction": "ncclSum"}) + block.desc.append_op().copy_from(op_desc) + + op_desc = _create_op_desc_( + "assign", {"X": [allreduce_out_name]}, + {"Out": [o_argu]}, {}) + block.desc.append_op().copy_from(op_desc) return ParallelDoCallBack(param_grad_names) else: From 0d57ca46ea06257447cc2a82839d64d94fc5e421 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 10 Feb 2018 23:31:12 +0000 Subject: [PATCH 08/26] nccl pass parallel_do test --- paddle/operators/nccl_op.cc | 21 +++++++++- paddle/operators/nccl_op.cu.cc | 8 ++++ paddle/operators/parallel_do_op.cc | 24 ++++++++++- python/paddle/v2/fluid/backward.py | 41 +++++++++++-------- python/paddle/v2/fluid/layers/control_flow.py | 6 ++- .../paddle/v2/fluid/tests/test_parallel_op.py | 33 +++++++++------ 6 files changed, 99 insertions(+), 34 deletions(-) diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 83ac67f353dcd..a906223f38caf 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/framework/op_registry.h" #include "paddle/operators/nccl/nccl_gpu_common.h" @@ -49,6 +50,22 @@ class NCCLInitOp : public framework::OperatorBase { } }; +class NCCLInitOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto out_var_name = op_desc.Output("Communicator").front(); + auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarDesc::NCCL_COM; + out_var.SetType(var_type); + } +}; + +class NCCLInitOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { public: NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker) @@ -214,7 +231,9 @@ Bcast the tensors. namespace ops = paddle::operators; REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp, - paddle::framework::EmptyGradOpMaker, ops::NCCLInitOpMaker); + paddle::framework::EmptyGradOpMaker, ops::NCCLInitOpMaker, + ops::NCCLInitOpVarTypeInference, + ops::NCCLInitOpShapeInference); REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, ops::NCCLAllReduceOpMaker); diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc index 1b986a13650de..b6db63ac6aed2 100644 --- a/paddle/operators/nccl_op.cu.cc +++ b/paddle/operators/nccl_op.cu.cc @@ -47,8 +47,11 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput("Out"); + LOG(INFO) << "------------------"; std::string reduction = ctx.Attr("reduction"); + LOG(INFO) << "------------------"; ncclRedOp_t reduction_op_ = ncclSum; + LOG(INFO) << "------------------"; if (reduction == "ncclMin") { reduction_op_ = ncclMin; @@ -62,14 +65,19 @@ class NCCLAllReduceKernel : public framework::OpKernel { PADDLE_THROW("Invalid reduction. default ncclSum."); } + LOG(INFO) << "------------------"; auto* comm = ctx.Input("Communicator"); + LOG(INFO) << "------------------"; auto stream = ctx.cuda_device_context().stream(); + LOG(INFO) << "------------------"; // device id int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + LOG(INFO) << "------------------"; int idx = comm->GetCommId(gpu_id); + LOG(INFO) << "------------------"; for (size_t i = 0; i < ins.size(); ++i) { VLOG(1) << "gpu : " << " invoke allreduce. send " << ins[i]->numel() << " recv " diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc index 89045923f9ff2..950a95ae360cf 100644 --- a/paddle/operators/parallel_do_op.cc +++ b/paddle/operators/parallel_do_op.cc @@ -30,6 +30,7 @@ static constexpr char kOutputs[] = "outputs"; static constexpr char kParallelScopes[] = "parallel_scopes"; static constexpr char kParallelBlock[] = "sub_block"; +static constexpr char kUseNCCL[] = "use_nccl"; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; @@ -159,6 +160,7 @@ class ParallelDoOp : public framework::OperatorBase { } WaitOnPlaces(places); + // PADDLE_ENFORCE_EQ(places.size(), sub_scopes.size()); std::vector> workers; workers.reserve(places.size()); for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) { @@ -202,6 +204,8 @@ class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput(kOutputs, "").AsDuplicable(); AddOutput(kParallelScopes, ""); AddAttr(kParallelBlock, ""); + AddAttr(kUseNCCL, "true if we use nccl on backward") + .SetDefault(false); AddComment(R"DOC( ParallelDo Operator. )DOC"); @@ -223,20 +227,22 @@ class ParallelDoGradOp : public framework::OperatorBase { auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) ->Get>(); - auto &places = scope.FindVar(Input(kPlaces))->Get(); + // PADDLE_ENFORCE_EQ(places.size(), sub_scopes.size()); // feed output@grad SplitTensorAndMoveTensorToScopes( scope, const_cast *>(&sub_scopes), places, Inputs(framework::GradVarName(kOutputs))); WaitOnPlaces(places); + LOG(INFO) << "places " << places.size(); // exe run std::vector> workers; for (size_t i = 0; i < sub_scopes.size(); ++i) { auto &place = places[i]; auto *cur_scope = sub_scopes[i]; + LOG(INFO) << place; // execute workers.emplace_back(framework::Async([program, cur_scope, place, block] { @@ -245,12 +251,26 @@ class ParallelDoGradOp : public framework::OperatorBase { false /*create_local_scope*/); })); } + LOG(INFO) << "places " << places.size(); for (auto &worker : workers) { worker.wait(); } WaitOnPlaces(places); - AccumulateGrad(scope, place, sub_scopes, places); + // NCCL allreduce op will be added by backward, + // so no need to explicitly accumulate grad + if (!(Attr(kUseNCCL))) { + AccumulateGrad(scope, place, sub_scopes, places); + } else { + for (auto &place : places) { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "NCCL only supports cuda place"); + } + } + for (auto &s : Outputs(framework::GradVarName(kParameters))) { + CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); + } + WaitOnPlaces(places); } void AccumulateGrad(const framework::Scope &scope, diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 40c54bf220f3b..28768ef07fcc9 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -218,7 +218,7 @@ def _callback_lookup_(op): :param op: :return: callback function """ - if op.type == 'parallel_do': + if op.type == 'parallel_do' and op.attr('use_nccl'): param_names = set(op.input('parameters')) param_grad_names = [n + "@GRAD" for n in param_names] @@ -229,18 +229,25 @@ def __init__(self, param_grad_names): def __call__(self, block, context): if not self.has_inserted_nccl_init: - global_block = block.program.global_block() - op_desc = global_block.desc.append_op() - var_desc = global_block.desc.var('nccl_com') - var_desc.set_type(core.VarDesc.VarType.NCCL_COM) - self.nccl_com = global_block.create_var( - name='nccl_com', type=core.VarDesc.VarType.NCCL_COM) - framework.Operator( - global_block, - type='ncclInit', - desc=op_desc, - inputs={}, - outputs={'Communicator': [self.nccl_com]}) + # global_block = block.program.global_block() + # op_desc = global_block.desc.append_op() + # var_desc = global_block.desc.var('nccl_com__do_not_change_') + # var_desc.set_type(core.VarDesc.VarType.NCCL_COM) + # self.nccl_com = global_block.create_var( + # name='nccl_com', type=core.VarDesc.VarType.NCCL_COM) + # framework.Operator( + # global_block, + # type='ncclInit', + # desc=op_desc, + # inputs={}, + # outputs={'Communicator': [self.nccl_com]}) + op_desc = _create_op_desc_( + "ncclInit", {}, + {"Communicator": ['nccl_com__do_not_change_']}, {}) + # block.desc.append_op().copy_from(op_desc) + print(serialize_op_decs(op_desc)) + block.program.global_block().desc.append_op().copy_from( + op_desc) self.has_inserted_nccl_init = True current_op_desc = context["__current_op_desc__"] @@ -263,7 +270,8 @@ def __call__(self, block, context): op_desc = _create_op_desc_( "ncclAllReduce", { "X": [o_argu], - "Communicator": ['nccl_com_0'] + "Communicator": + ['nccl_com__do_not_change_'] }, {"Out": [allreduce_out_name]}, {"reduction": "ncclSum"}) block.desc.append_op().copy_from(op_desc) @@ -375,10 +383,11 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): continue grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block) # infer_shape and infer_type - if op_desc.type() == 'ncclInit': - continue op_desc.infer_var_type(block.desc) op_desc.infer_shape(block.desc) + # ncclInit dones't need to set data_type + if op_desc.type() == 'ncclInit': + continue for arg in op_desc.output_arg_names(): if arg in new_vars: _infer_var_data_type_(arg, block) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 71a9459d556e2..5c9c247066173 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -237,12 +237,13 @@ class ParallelDo(object): ParallelDo class is used to create a ParallelDo. """ - def __init__(self, places, name=None): + def __init__(self, places, use_nccl=False, name=None): self.helper = LayerHelper("parallel_do", name=name) self.inputs = [] self.places = places self.outputs = [] self.status = StaticRNN.BEFORE_RNN_BLOCK + self.use_nccl = use_nccl def do(self): return BlockGuardWithCompletion(self) @@ -325,7 +326,8 @@ def complete_op(self): }, outputs={'outputs': outputs, 'parallel_scopes': [step_scope]}, - attrs={'sub_block': current_block}) + attrs={'sub_block': current_block, + 'use_nccl': self.use_nccl}) class BlockGuardWithCompletion(BlockGuard): diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py index 367cc8b1aaf0a..8452d6835fa77 100644 --- a/python/paddle/v2/fluid/tests/test_parallel_op.py +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -67,12 +67,25 @@ def run_test(self, callback, feed, fetch): fetch=fetch, place=gpu, use_parallel=True) + result_gpu_nccl = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=gpu, + use_parallel=True, + use_nccl=True) self._assert_same_(fetch, result_cpu, result_cpu_parallel, - result_gpu, result_gpu_parallel) + result_gpu, result_gpu_parallel, result_gpu_nccl) else: self._assert_same_(fetch, result_cpu, result_cpu_parallel) - def _run_test_impl_(self, callback, feed, fetch, place, use_parallel=False): + def _run_test_impl_(self, + callback, + feed, + fetch, + place, + use_parallel=False, + use_nccl=False): """ Run a single test, returns the fetch values Args: @@ -96,7 +109,7 @@ def _run_test_impl_(self, callback, feed, fetch, place, use_parallel=False): # Automatically insert parallel do if use_parallel = True if use_parallel: places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places) + pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) data = next(generator) if isinstance(data, fluid.Variable): @@ -137,7 +150,9 @@ def _assert_same_(self, fetch, *args): """ def _impl_(a, b, fetch_id, item_id): - item_str = ['CPU', 'ParallelCPU', 'GPU', 'ParallelGPU'] + item_str = [ + 'CPU', 'ParallelCPU', 'GPU', 'ParallelGPU', 'ParallelGPUNCCL' + ] flag = numpy.allclose(a, b, rtol=0.1) self.assertTrue(flag, "The {0} are different in {1}".format( fetch[fetch_id], item_str[item_id])) @@ -157,18 +172,10 @@ def __network__(): loss = fluid.layers.mean(x=hidden) yield loss - def test_simple_fc(self): - self.run_test( - callback=self.__network__, - feed={ - 'img': numpy.random.random(size=(51, 784)).astype('float32') - }, - fetch=['fc1.w@GRAD']) - def test_fc_with_tiny_data(self): self.run_test( callback=self.__network__, - feed={'img': numpy.random.random(size=(1, 784)).astype('float32')}, + feed={'img': numpy.random.random(size=(8, 784)).astype('float32')}, fetch=['fc1.w@GRAD']) From bb3ae20664a1bba5ce1e6d45e3afff274095e9e1 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 10 Feb 2018 23:34:36 +0000 Subject: [PATCH 09/26] nccl pass parallel_do test --- python/paddle/v2/fluid/tests/test_parallel_op.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py index 8452d6835fa77..dc8c806074146 100644 --- a/python/paddle/v2/fluid/tests/test_parallel_op.py +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -172,12 +172,18 @@ def __network__(): loss = fluid.layers.mean(x=hidden) yield loss - def test_fc_with_tiny_data(self): + def test_simple_fc(self): self.run_test( callback=self.__network__, feed={'img': numpy.random.random(size=(8, 784)).astype('float32')}, fetch=['fc1.w@GRAD']) + def test_fc_with_tiny_data(self): + self.run_test( + callback=self.__network__, + feed={'img': numpy.random.random(size=(1, 784)).astype('float32')}, + fetch=['fc1.w@GRAD']) + class ParallelOpTestMultipleInput(BaseParallelForTest): @staticmethod From 4bb492e76c09c1bd11953d3893d559f88b9ea219 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 11 Feb 2018 00:11:26 +0000 Subject: [PATCH 10/26] pass tiny data --- paddle/operators/nccl_op.cc | 24 ++++++++++++++++++++---- python/paddle/v2/fluid/backward.py | 9 ++++++--- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index a906223f38caf..8e4edb78bba61 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -19,6 +19,8 @@ limitations under the License. */ namespace paddle { namespace operators { +static constexpr char kParallelScopes[] = "parallel_scopes"; + // NCCLinitOp class NCCLInitOp : public framework::OperatorBase { public: @@ -29,24 +31,37 @@ class NCCLInitOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::Place &place) const override { + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kParallelScopes)), + "Can not find variable '%s' in the scope.", + kParallelScopes); const auto &name = Output("Communicator"); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), "Can not find variable '%s' in the scope.", name); - - int count = platform::GetCUDADeviceCount(); - std::vector gpus(count); - for (int i = 0; i < count; ++i) { + // A parallel do may not use all the gpus. For example, the batch size is 7 + // in the last batch while we have 8 gpu. In this case, parallel_do will + // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers + LOG(INFO) << "---------------"; + auto ¶llel_scopes = scope.FindVar(Input(kParallelScopes)) + ->Get>(); + LOG(INFO) << "---------------"; + std::vector gpus(parallel_scopes.size()); + for (int i = 0; i < static_cast(parallel_scopes.size()); ++i) { gpus[i] = i; } + LOG(INFO) << "---------------"; PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus."); + LOG(INFO) << "---------------"; if (scope.FindVar(name) == nullptr) { PADDLE_THROW("Output(Communicator) is needed for ncclInit operator."); } + LOG(INFO) << "---------------"; platform::Communicator *comm = scope.FindVar(name)->GetMutable(); + LOG(INFO) << "---------------"; comm->InitAll(gpus); + LOG(INFO) << "---------------"; } }; @@ -70,6 +85,7 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { public: NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kParallelScopes, "The working place of parallel do."); AddOutput("Communicator", "Create Communicator for communicating between gpus"); AddComment(R"DOC( diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 28768ef07fcc9..8ec9db81b38a7 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -223,9 +223,10 @@ def _callback_lookup_(op): param_grad_names = [n + "@GRAD" for n in param_names] class ParallelDoCallBack(object): - def __init__(self, param_grad_names): + def __init__(self, param_grad_names, parallel_scopes_name): self.has_inserted_nccl_init = False self.param_grad_names = param_grad_names + self.parallel_scopes_name = parallel_scopes_name def __call__(self, block, context): if not self.has_inserted_nccl_init: @@ -242,7 +243,8 @@ def __call__(self, block, context): # inputs={}, # outputs={'Communicator': [self.nccl_com]}) op_desc = _create_op_desc_( - "ncclInit", {}, + "ncclInit", + {"parallel_scopes": self.parallel_scopes_name}, {"Communicator": ['nccl_com__do_not_change_']}, {}) # block.desc.append_op().copy_from(op_desc) print(serialize_op_decs(op_desc)) @@ -281,7 +283,8 @@ def __call__(self, block, context): {"Out": [o_argu]}, {}) block.desc.append_op().copy_from(op_desc) - return ParallelDoCallBack(param_grad_names) + return ParallelDoCallBack(param_grad_names, + op.output("parallel_scopes")) else: return None From bfa78cacdfdf7988159419256432d5550a59c730 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 11 Feb 2018 00:11:56 +0000 Subject: [PATCH 11/26] clean up log(info) --- paddle/operators/nccl_op.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 8e4edb78bba61..ae912d7f362ef 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -40,28 +40,21 @@ class NCCLInitOp : public framework::OperatorBase { // A parallel do may not use all the gpus. For example, the batch size is 7 // in the last batch while we have 8 gpu. In this case, parallel_do will // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers - LOG(INFO) << "---------------"; auto ¶llel_scopes = scope.FindVar(Input(kParallelScopes)) ->Get>(); - LOG(INFO) << "---------------"; std::vector gpus(parallel_scopes.size()); for (int i = 0; i < static_cast(parallel_scopes.size()); ++i) { gpus[i] = i; } - LOG(INFO) << "---------------"; PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus."); - LOG(INFO) << "---------------"; if (scope.FindVar(name) == nullptr) { PADDLE_THROW("Output(Communicator) is needed for ncclInit operator."); } - LOG(INFO) << "---------------"; platform::Communicator *comm = scope.FindVar(name)->GetMutable(); - LOG(INFO) << "---------------"; comm->InitAll(gpus); - LOG(INFO) << "---------------"; } }; From 3067114f3a08f39e44cd1e828381e06b633a7a48 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 11 Feb 2018 00:22:17 +0000 Subject: [PATCH 12/26] clean up --- paddle/fluid/operators/nccl_op.cu.cc | 8 ------ paddle/fluid/operators/parallel_do_op.cc | 5 ---- python/paddle/v2/fluid/backward.py | 25 ------------------- .../paddle/v2/fluid/tests/test_parallel_op.py | 4 ++- 4 files changed, 3 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc index 7637c7ed163f6..333aed2903e78 100644 --- a/paddle/fluid/operators/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl_op.cu.cc @@ -47,11 +47,8 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput("Out"); - LOG(INFO) << "------------------"; std::string reduction = ctx.Attr("reduction"); - LOG(INFO) << "------------------"; ncclRedOp_t reduction_op_ = ncclSum; - LOG(INFO) << "------------------"; if (reduction == "ncclMin") { reduction_op_ = ncclMin; @@ -65,19 +62,14 @@ class NCCLAllReduceKernel : public framework::OpKernel { PADDLE_THROW("Invalid reduction. default ncclSum."); } - LOG(INFO) << "------------------"; auto* comm = ctx.Input("Communicator"); - LOG(INFO) << "------------------"; auto stream = ctx.cuda_device_context().stream(); - LOG(INFO) << "------------------"; // device id int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); - LOG(INFO) << "------------------"; int idx = comm->GetCommId(gpu_id); - LOG(INFO) << "------------------"; for (size_t i = 0; i < ins.size(); ++i) { VLOG(1) << "gpu : " << " invoke allreduce. send " << ins[i]->numel() << " recv " diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index ff5730bfe77a9..f808c7130668c 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -151,7 +151,6 @@ class ParallelDoOp : public framework::OperatorBase { } WaitOnPlaces(places); - // PADDLE_ENFORCE_EQ(places.size(), sub_scopes.size()); std::vector> workers; workers.reserve(places.size()); for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) { @@ -219,21 +218,18 @@ class ParallelDoGradOp : public framework::OperatorBase { auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) ->Get>(); auto &places = scope.FindVar(Input(kPlaces))->Get(); - // PADDLE_ENFORCE_EQ(places.size(), sub_scopes.size()); // feed output@grad SplitTensorAndMoveTensorToScopes( scope, const_cast *>(&sub_scopes), places, Inputs(framework::GradVarName(kOutputs))); WaitOnPlaces(places); - LOG(INFO) << "places " << places.size(); // exe run std::vector> workers; for (size_t i = 0; i < sub_scopes.size(); ++i) { auto &place = places[i]; auto *cur_scope = sub_scopes[i]; - LOG(INFO) << place; // execute workers.emplace_back(framework::Async([program, cur_scope, place, block] { @@ -242,7 +238,6 @@ class ParallelDoGradOp : public framework::OperatorBase { false /*create_local_scope*/); })); } - LOG(INFO) << "places " << places.size(); for (auto &worker : workers) { worker.wait(); } diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 8ec9db81b38a7..6da4325c64f4e 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -230,44 +230,19 @@ def __init__(self, param_grad_names, parallel_scopes_name): def __call__(self, block, context): if not self.has_inserted_nccl_init: - # global_block = block.program.global_block() - # op_desc = global_block.desc.append_op() - # var_desc = global_block.desc.var('nccl_com__do_not_change_') - # var_desc.set_type(core.VarDesc.VarType.NCCL_COM) - # self.nccl_com = global_block.create_var( - # name='nccl_com', type=core.VarDesc.VarType.NCCL_COM) - # framework.Operator( - # global_block, - # type='ncclInit', - # desc=op_desc, - # inputs={}, - # outputs={'Communicator': [self.nccl_com]}) op_desc = _create_op_desc_( "ncclInit", {"parallel_scopes": self.parallel_scopes_name}, {"Communicator": ['nccl_com__do_not_change_']}, {}) - # block.desc.append_op().copy_from(op_desc) print(serialize_op_decs(op_desc)) block.program.global_block().desc.append_op().copy_from( op_desc) self.has_inserted_nccl_init = True current_op_desc = context["__current_op_desc__"] - # print(serialize_op_decs(context)) for o_param in current_op_desc.output_names(): for o_argu in current_op_desc.output(o_param): if o_argu in self.param_grad_names: - # # print("reduce", o_argu) - # op_desc = block.desc.append_op() - # op_desc.set_type("ncclAllReduce") - # op_desc.set_input("X", [o_argu]) - # - # # FIXME(tonyyang-svail): - # # Looks like nccl_com has been changed to nccl_com_0 - # op_desc.set_input("Communicator", ['nccl_com_0']) - # out_var = block.create_var() - # op_desc.set_output("Out", [out_var.name]) - # op_desc.set_attr("reduction", "ncclSum") allreduce_out_name = o_argu + "__nccl_all_reduce__" op_desc = _create_op_desc_( "ncclAllReduce", { diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py index 2914c8dbaaebb..66bb6442af9c5 100644 --- a/python/paddle/v2/fluid/tests/test_parallel_op.py +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -175,7 +175,9 @@ def __network__(): def test_simple_fc(self): self.run_test( callback=self.__network__, - feed={'img': numpy.random.random(size=(8, 784)).astype('float32')}, + feed={ + 'img': numpy.random.random(size=(51, 784)).astype('float32') + }, fetch=['fc1.w@GRAD']) def test_fc_with_tiny_data(self): From 82c33c61d9e7584060916c604478b67f59fbdfc0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sun, 11 Feb 2018 10:46:25 +0800 Subject: [PATCH 13/26] Fix constructor bug in mixed_vector --- paddle/fluid/framework/mixed_vector.h | 5 ++--- paddle/fluid/framework/mixed_vector_test.cu | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 9756754260d46..902dedd48e111 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -37,9 +37,8 @@ class Vector { // Fill vector with value. The vector size is `count`. explicit Vector(size_t count, const T& value = T()) { - if (count == 0) { - InitEmpty(); - } else { + InitEmpty(); + if (count != 0) { resize(count); T* ptr = begin(); for (size_t i = 0; i < count; ++i) { diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index a89064525661a..20b79d60c1a52 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -15,6 +15,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" +#include "mixed_vector.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/gpu_info.h" @@ -91,3 +92,10 @@ TEST(mixed_vector, MultiGPU) { ASSERT_EQ(tmp[i], i * 100); } } + +TEST(mixed_vector, InitWithCount) { + paddle::framework::Vector vec(10, 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(vec[i], 10); + } +} From 816fa8f32e951f2c6bef9c7b59bd2cb8dd4d9f96 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sun, 11 Feb 2018 10:59:56 +0800 Subject: [PATCH 14/26] Fix warnings --- paddle/fluid/framework/mixed_vector.h | 7 ++++++- paddle/fluid/framework/mixed_vector_test.cu | 8 ++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 902dedd48e111..26f160e509329 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -121,6 +121,10 @@ class Vector { const T* begin() const { return &this->operator[](0); } const T* end() const { return &this->operator[](size()); } + const T* cbegin() const { return begin(); } + + const T* cend() const { return end(); } + const T& back() const { auto it = end(); --it; @@ -243,7 +247,8 @@ class Vector { bool operator==(const Vector& other) const { if (size() != other.size()) return false; - for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) { + for (const T *it1 = cbegin(), it2 = other.cbegin(); it1 < cend(); + ++it1, ++it2) { if (*it1 != *it2) { return false; } diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 20b79d60c1a52..83694a590f861 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -27,10 +27,10 @@ TEST(mixed_vector, CPU_VECTOR) { for (int i = 0; i < 10; ++i) { tmp.push_back(i); } - ASSERT_EQ(tmp.size(), 10); + ASSERT_EQ(tmp.size(), 10UL); vec tmp2; tmp2 = tmp; - ASSERT_EQ(tmp2.size(), 10); + ASSERT_EQ(tmp2.size(), 10UL); for (int i = 0; i < 10; ++i) { ASSERT_EQ(tmp2[i], i); ASSERT_EQ(tmp2[i], tmp[i]); @@ -59,7 +59,7 @@ TEST(mixed_vector, GPU_VECTOR) { for (int i = 0; i < 10; ++i) { tmp.push_back(i); } - ASSERT_EQ(tmp.size(), 10); + ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu(0); multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); @@ -80,7 +80,7 @@ TEST(mixed_vector, MultiGPU) { for (int i = 0; i < 10; ++i) { tmp.push_back(i); } - ASSERT_EQ(tmp.size(), 10); + ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu0(0); paddle::platform::SetDeviceId(0); multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); From ae2296e806fb3b70f2ffe326815ade868039715f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sun, 11 Feb 2018 11:01:11 +0800 Subject: [PATCH 15/26] Clean code --- paddle/fluid/framework/mixed_vector_test.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 83694a590f861..0d5a914eac780 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -15,7 +15,6 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "mixed_vector.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/gpu_info.h" From 190119bb98eaca554bb183488f6828fd2b3e18c0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sun, 11 Feb 2018 11:28:32 +0800 Subject: [PATCH 16/26] Extract for-loop init. Make nvcc happy --- paddle/fluid/framework/mixed_vector.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 26f160e509329..4dc3de54deef7 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -247,8 +247,9 @@ class Vector { bool operator==(const Vector& other) const { if (size() != other.size()) return false; - for (const T *it1 = cbegin(), it2 = other.cbegin(); it1 < cend(); - ++it1, ++it2) { + auto it1 = cbegin(); + auto it2 = other.cbegin(); + for (; it1 < cend(); ++it1, ++it2) { if (*it1 != *it2) { return false; } From 0c45eab7fffd94169ddda0d61e0613524dbcd8e6 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 11 Feb 2018 05:43:55 +0000 Subject: [PATCH 17/26] no getmutable nccl_com --- paddle/fluid/framework/executor.cc | 7 +++---- python/paddle/v2/fluid/tests/test_parallel_op.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5e1358ab0e91c..254df564e2f4f 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // platform::Communicator #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -54,15 +53,15 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { var->GetMutable(); } else if (var_type == proto::VarDesc::PLACE_LIST) { var->GetMutable(); - } else if (var_type == proto::VarDesc::NCCL_COM) { - var->GetMutable(); } else if (var_type == proto::VarDesc::READER) { var->GetMutable(); + } else if (var_type == proto::VarDesc::NCCL_COM) { + // GetMutable will be called in ncclInit } else { PADDLE_THROW( "Variable type %d is not in " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER]", + "LOD_RANK_TABLE, PLACE_LIST, READER, NCCL_COM]", var_type); } } diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py index 66bb6442af9c5..7f6d0b8d32e1b 100644 --- a/python/paddle/v2/fluid/tests/test_parallel_op.py +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -212,5 +212,5 @@ def test_simple_fc(self): fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD']) -#if __name__ == '__main__': -# unittest.main() +if __name__ == '__main__': + unittest.main() From f35401c4da32e575bcf902c293549465374e5d60 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 11 Feb 2018 05:47:06 +0000 Subject: [PATCH 18/26] diable debug string due to vector bug --- paddle/fluid/framework/executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 254df564e2f4f..3723a9131d71b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -120,11 +120,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(3) << op->DebugStringEx(local_scope); + // VLOG(3) << op->DebugStringEx(local_scope); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(op->Type(), pool.Get(place_)); + VLOG(3) << op->Type(); op->Run(*local_scope, place_); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " From 3c47c730483d52d50b7d83ab4edbdbaff5b6694b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 12 Feb 2018 19:43:22 +0000 Subject: [PATCH 19/26] add back libnccl-dev --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ed559ca5c432d..6ac9901ac6cea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ - git python-pip python-dev openssh-server bison \ + git python-pip python-dev openssh-server bison libnccl-dev \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-matplotlib gcc-4.8 g++-4.8 \ From a259ad41b03e52d3d3e97f53e5ffa163f80c79bb Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 12 Feb 2018 23:13:12 +0000 Subject: [PATCH 20/26] remove duplicated cbegin and cend in mixed vector --- paddle/fluid/framework/mixed_vector.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 114c21c26cd18..c1a89a1261c7f 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -132,10 +132,6 @@ class Vector { const T* cend() const { return end(); } - const T* cbegin() const { return begin(); } - - const T* cend() const { return end(); } - const T& back() const { auto it = end(); --it; From 3f09620ef2b1924bbeff8b9915ca2a46aed1aa5c Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 13 Feb 2018 22:09:05 +0000 Subject: [PATCH 21/26] pass compile --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/operators/nccl_op.cc | 2 +- python/paddle/v2/fluid/backward.py | 27 +++++++++++++++------------ 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 1d7eccbc650db..92b32b04d6b83 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -55,7 +55,7 @@ static void CreateTensor(Variable* var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { var->GetMutable(); - } else if (var_type == proto::VarDesc::NCCL_COM) { + } else if (var_type == proto::VarType::NCCL_COM) { // GetMutable will be called in ncclInit } else { PADDLE_THROW( diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc index f61b5003bde6c..0994bba782b42 100644 --- a/paddle/fluid/operators/nccl_op.cc +++ b/paddle/fluid/operators/nccl_op.cc @@ -65,7 +65,7 @@ class NCCLInitOpVarTypeInference : public framework::VarTypeInference { framework::BlockDesc *block) const override { auto out_var_name = op_desc.Output("Communicator").front(); auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto var_type = framework::proto::VarDesc::NCCL_COM; + auto var_type = framework::proto::VarType::NCCL_COM; out_var.SetType(var_type); } }; diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index cf32c6683b42b..682df3301b60f 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -269,7 +269,7 @@ def _append_backward_ops_(block, target_block, no_grad_dict, grad_to_var, - callback=None): + callbacks=None): """ Create all grad ops, and insert them into given block @@ -285,14 +285,13 @@ def _append_backward_ops_(block, val(str): corresponding forward variable name callback(callable object): a callable object used to decorate new generated grad ops """ - if callback is None: - - def empty_callback(block, context): - pass - - callback = empty_callback - elif not hasattr(callback, '__call__'): - raise ValueError("'callback' must be a callable object.") + if callbacks is None: + callbacks = [] + else: + assert (isinstance(callbacks, list)) + for cb in callbacks: + if not hasattr(cb, '__call__'): + raise ValueError("'callback' must be a callable object.") # grad_op_descs holds created grad_op, and will be appended to target_block grad_op_descs = [] @@ -303,9 +302,12 @@ def empty_callback(block, context): if op.has_attr("sub_block"): sub_block = program.block(op.block_attr("sub_block")) grad_sub_block = program.create_block(parent_idx=sub_block.idx) + if callbacks is None: + callbacks = [_callback_lookup_(op)] + else: + callbacks.append(_callback_lookup_(op)) _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var, - _callback_lookup_(op)) + no_grad_dict, grad_to_var, callbacks) grad_sub_block_list.append(grad_sub_block.desc) # Getting op's corresponding grad_op @@ -325,7 +327,8 @@ def empty_callback(block, context): new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) grad_to_var["__current_op_desc__"] = new_op_desc - callback(block=target_block, context=grad_to_var) + for cb in callbacks: + cb(block=target_block, context=grad_to_var) def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): From 9d26f1a3dfd03bbce40c52f0195d2ebba9ced22b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 15 Feb 2018 02:08:33 +0000 Subject: [PATCH 22/26] callback to list of callbacks --- paddle/fluid/framework/executor.cc | 3 +-- python/paddle/v2/fluid/backward.py | 33 +++++++++++++++++------------ python/paddle/v2/fluid/optimizer.py | 2 +- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 92b32b04d6b83..93a4883368aa1 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -120,12 +120,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - // VLOG(3) << op->DebugStringEx(local_scope); + VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(op->Type(), pool.Get(place_)); - VLOG(3) << op->Type(); op->Run(*local_scope, place_); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 682df3301b60f..4e494db93b86f 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -234,7 +234,6 @@ def __call__(self, block, context): "ncclInit", {"parallel_scopes": self.parallel_scopes_name}, {"Communicator": ['nccl_com__do_not_change_']}, {}) - print(serialize_op_decs(op_desc)) block.program.global_block().desc.append_op().copy_from( op_desc) self.has_inserted_nccl_init = True @@ -285,9 +284,7 @@ def _append_backward_ops_(block, val(str): corresponding forward variable name callback(callable object): a callable object used to decorate new generated grad ops """ - if callbacks is None: - callbacks = [] - else: + if callbacks is not None: assert (isinstance(callbacks, list)) for cb in callbacks: if not hasattr(cb, '__call__'): @@ -302,12 +299,17 @@ def _append_backward_ops_(block, if op.has_attr("sub_block"): sub_block = program.block(op.block_attr("sub_block")) grad_sub_block = program.create_block(parent_idx=sub_block.idx) - if callbacks is None: - callbacks = [_callback_lookup_(op)] + cb = _callback_lookup_(op) + if cb is not None: + if callbacks is None: + new_callbacks = [cb] + else: + new_callbacks = callbacks + [_callback_lookup_(op)] + _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, + no_grad_dict, grad_to_var, new_callbacks) else: - callbacks.append(_callback_lookup_(op)) - _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var, callbacks) + _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, + no_grad_dict, grad_to_var, callbacks) grad_sub_block_list.append(grad_sub_block.desc) # Getting op's corresponding grad_op @@ -327,8 +329,10 @@ def _append_backward_ops_(block, new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) grad_to_var["__current_op_desc__"] = new_op_desc - for cb in callbacks: - cb(block=target_block, context=grad_to_var) + if callbacks is not None: + assert (isinstance(callbacks, list)) + for cb in callbacks: + cb(block=target_block, context=grad_to_var) def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): @@ -408,7 +412,8 @@ def _get_stop_gradients_(program): return no_grad_dict -def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None): +def append_backward(loss, parameter_list=None, no_grad_set=None, + callbacks=None): """ Append backward part to main_program @@ -424,6 +429,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None): (list[(Variable,Variable)]): list of (parameter, gradient) pair. """ assert isinstance(loss, framework.Variable) + if callbacks is not None: + isinstance(callbacks, list) program = loss.block.program if no_grad_set is None: @@ -451,7 +458,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None): no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) _append_backward_ops_(root_block, op_path, root_block, no_grad_dict, - grad_to_var, callback) + grad_to_var, callbacks) # Because calc_gradient may be called multiple times, # we need rename the internal gradient variables so that they have diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 39391eb8e40ce..ecc42f6215bdd 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -225,7 +225,7 @@ def minimize(self, `create_optimization_pass()` into one. """ params_grads = append_backward(loss, parameter_list, no_grad_set, - error_clip_callback) + [error_clip_callback]) params_grads = append_gradient_clip_ops(params_grads) From 1d9fd1c0069beef175e9de9d2194a5f1620d3ed3 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 16 Feb 2018 00:10:28 +0000 Subject: [PATCH 23/26] pass test_recognize_digits --- paddle/fluid/operators/conv_op.cc | 5 +++-- paddle/fluid/operators/conv_op.h | 4 ++-- paddle/fluid/operators/parallel_do_op.cc | 7 +++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 6b378ec1bcb75..2ecece707314f 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -60,8 +60,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { "Due to the settings of paddings, filter_dims and " "dilations, the output size is less than 0, please check " "again."); - output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); ctx->ShareLoD("Input", "Output"); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index ecbe3d505ac0f..c93c2e73f720a 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -28,8 +28,8 @@ using Tensor = framework::Tensor; // Base convolution operator definations for other conv // like operators to reuse the implementation. -inline int OutputSize(int input_size, int filter_size, int dilation, - int padding, int stride) { +inline int ConvOutputSize(int input_size, int filter_size, int dilation, + int padding, int stride) { const int dkernel = dilation * (filter_size - 1) + 1; const int output_size = (input_size + 2 * padding - dkernel) / stride + 1; return output_size; diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index d63962bb52425..f09a79ffc5262 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -256,6 +256,10 @@ class ParallelDoGradOp : public framework::OperatorBase { } } for (auto &s : Outputs(framework::GradVarName(kParameters))) { + if (s == "@EMPTY@") { + continue; + } + VLOG(3) << "Moving " << s; CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); } WaitOnPlaces(places); @@ -266,6 +270,9 @@ class ParallelDoGradOp : public framework::OperatorBase { const std::vector &sub_scopes, const platform::PlaceList &places) const { for (auto &s : Outputs(framework::GradVarName(kParameters))) { + if (s == "@EMPTY@") { + continue; + } VLOG(3) << "Accumulating " << s; if (s == framework::kEmptyVarName) continue; std::string tmp_name; From eb82b5ccc0c5519682ba2836c00e7e5f38426f95 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 16 Feb 2018 00:22:17 +0000 Subject: [PATCH 24/26] test error clip --- python/paddle/v2/fluid/tests/test_error_clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/tests/test_error_clip.py b/python/paddle/v2/fluid/tests/test_error_clip.py index b331f16913d0f..3d6ffe807442f 100644 --- a/python/paddle/v2/fluid/tests/test_error_clip.py +++ b/python/paddle/v2/fluid/tests/test_error_clip.py @@ -43,7 +43,7 @@ avg_cost_clip = prog_clip.block(0).var(avg_cost.name) fluid.backward.append_backward(loss=avg_cost) fluid.backward.append_backward( - loss=avg_cost_clip, callback=fluid.clip.error_clip_callback) + loss=avg_cost_clip, callbacks=fluid.clip.error_clip_callback) hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD") hidden1_grad_clip = prog_clip.block(0).var(hidden1.name + "@GRAD") From 3494b79c4dee0e5382a2d7ea5f2754ff48893c79 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 16 Feb 2018 00:23:24 +0000 Subject: [PATCH 25/26] test error clip --- python/paddle/v2/fluid/tests/test_error_clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/tests/test_error_clip.py b/python/paddle/v2/fluid/tests/test_error_clip.py index 3d6ffe807442f..d577d0014dc13 100644 --- a/python/paddle/v2/fluid/tests/test_error_clip.py +++ b/python/paddle/v2/fluid/tests/test_error_clip.py @@ -43,7 +43,7 @@ avg_cost_clip = prog_clip.block(0).var(avg_cost.name) fluid.backward.append_backward(loss=avg_cost) fluid.backward.append_backward( - loss=avg_cost_clip, callbacks=fluid.clip.error_clip_callback) + loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback]) hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD") hidden1_grad_clip = prog_clip.block(0).var(hidden1.name + "@GRAD") From 4b957af2372c861942aa7193223d00f23a5a3318 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 17 Feb 2018 01:35:58 +0000 Subject: [PATCH 26/26] clean up --- .../v2/fluid/tests/book/test_recognize_digits.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py index 442cf9604a162..a0b4774da5fba 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py @@ -155,7 +155,6 @@ def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename): float(avg_loss_val), float(acc_val))) if math.isnan(float(avg_loss_val)): sys.exit("got NaN loss, training failed.") - exit(0) raise AssertionError("Loss of recognize digits is too large") @@ -231,14 +230,10 @@ def __impl__(self): def inject_all_tests(): - for use_cuda in [True]: - for parallel in [True]: - for nn_type in ['mlp']: + for use_cuda in (False, True): + for parallel in (False, True): + for nn_type in ('mlp', 'conv'): inject_test_method(use_cuda, parallel, nn_type, True) - # for use_cuda in (False, True): - # for parallel in (False, True): - # for nn_type in ('mlp', 'conv'): - # inject_test_method(use_cuda, parallel, nn_type, True) # One unit-test for saving parameters as separate files inject_test_method(False, False, 'mlp', False)