From 568a329c83312df89defe22f24dc9ef497ac0aca Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 9 May 2018 20:59:46 +0800 Subject: [PATCH 01/56] add checkpoint util class and implement --- paddle/fluid/operators/detail/checkpoint.cc | 54 +++++++++++++++++++++ paddle/fluid/operators/detail/checkpoint.h | 33 +++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 paddle/fluid/operators/detail/checkpoint.cc create mode 100644 paddle/fluid/operators/detail/checkpoint.h diff --git a/paddle/fluid/operators/detail/checkpoint.cc b/paddle/fluid/operators/detail/checkpoint.cc new file mode 100644 index 0000000000000..78506a0a72e42 --- /dev/null +++ b/paddle/fluid/operators/detail/checkpoint.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/detail/checkpoint.h" + +#include + +namespace paddle { +namespace framework { +namespace details { +Checkpoint::Save(const framework::Scope& scope, const platform::Place& place, + const std::string& save_dir, const std::string& var_name, + const bool overwrite) { + auto* var = scope.FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", + var_name); + PADDLE_ENFORCE(var->IsType(), + "Checkpoint only supports LoDTensor, %s has wrong type", + var_name); + + bool is_present = FileExists(save_dir); + if (is_present && !overwrite) { + PADDLE_THROW("%s exists!, checkpoint cannot write it when overwrite=false", + save_dir, overwrite); + } + + MkDirRecursively(DirName(save_dir).c_str()); + std::ofstream fout(save_dir); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", save_dir); + + // get device context from pool + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(place); + + auto& tensor = var->Get(); + // Serialize tensor + framework::SerializeToStream(fout, tensor, dev_ctx); + fout.close(); +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/detail/checkpoint.h b/paddle/fluid/operators/detail/checkpoint.h new file mode 100644 index 0000000000000..0f0f450ce17bb --- /dev/null +++ b/paddle/fluid/operators/detail/checkpoint.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include + +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace details { +class Checkpoint { + public: + static void Save(const framework::Scope& scope, const platform::Place& place, + const std::string& save_dir, const std::string& var_name, + const bool overwrite); + + static void Load(); +} +} // namespace details +} // namespace framework +} // namespace paddle From 1fabbbade28d4a642700c0df9ac6c4a0be0d4a66 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 10 May 2018 12:33:33 +0800 Subject: [PATCH 02/56] modify const to const & --- paddle/fluid/operators/detail/checkpoint.cc | 2 +- paddle/fluid/operators/detail/checkpoint.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/detail/checkpoint.cc b/paddle/fluid/operators/detail/checkpoint.cc index 78506a0a72e42..38e46532e6e18 100644 --- a/paddle/fluid/operators/detail/checkpoint.cc +++ b/paddle/fluid/operators/detail/checkpoint.cc @@ -22,7 +22,7 @@ namespace framework { namespace details { Checkpoint::Save(const framework::Scope& scope, const platform::Place& place, const std::string& save_dir, const std::string& var_name, - const bool overwrite) { + const bool& overwrite) { auto* var = scope.FindVar(var_name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", var_name); diff --git a/paddle/fluid/operators/detail/checkpoint.h b/paddle/fluid/operators/detail/checkpoint.h index 0f0f450ce17bb..dfa41979734ec 100644 --- a/paddle/fluid/operators/detail/checkpoint.h +++ b/paddle/fluid/operators/detail/checkpoint.h @@ -24,7 +24,7 @@ class Checkpoint { public: static void Save(const framework::Scope& scope, const platform::Place& place, const std::string& save_dir, const std::string& var_name, - const bool overwrite); + const bool& overwrite); static void Load(); } From 77c6b71ec44e3ba5220576fa528f3600b8784908 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 10 May 2018 20:03:13 +0800 Subject: [PATCH 03/56] add ckpt to sync loop --- paddle/fluid/operators/listen_and_serv_op.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 8acbf82025095..7fb7f07a610a7 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -101,6 +101,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, framework::Scope *recv_scope, framework::BlockDesc *prefetch_block) const { auto fan_in = Attr("Fanin"); + auto checkpoint = Attr("Checkpoint"); size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, @@ -188,6 +189,18 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, for (auto &var : sparse_vars) { var->GetMutable()->mutable_rows()->clear(); } + + /******************** CHECK POINT ***********************/ + std::vector all_vars = recv_scope.LocalVarNames(); + + std::vector::iterator it; + for (it = all_vars.begin(); it != all_vars.end(); it++) { + VLOG(2) << "Checkpoint Var: " << *it; + break; + } + + /******************** CHECK POINT ***********************/ + rpc_service_->SetCond(1); // FIXME(typhoonzero): use another condition to sync wait clients get. 
rpc_service_->WaitClientGet(fan_in); From b81671ecf214edca344cce12da51d6f0e1d21a66 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 10 May 2018 20:03:58 +0800 Subject: [PATCH 04/56] add ckpt attr to pserver python config --- python/paddle/fluid/transpiler/distribute_transpiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 640ac9f085e6d..8cd7cd5d3a9f8 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -481,7 +481,8 @@ def __append_optimize_op__(op, block, grad_to_block_id): "Fanin": self.trainer_num, "PrefetchBlock": prefetch_block, "sync_mode": self.sync_mode, - "grad_to_block_id": grad_to_block_id + "grad_to_block_id": grad_to_block_id, + "Checkpoint": "/tmp/tangwei_ckpt/" }) pserver_program.sync_with_cpp() From 2a05b3d5a3e8f8e58d01eebc2c0826e61c15c5dd Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 11 May 2018 16:23:30 +0800 Subject: [PATCH 05/56] delete checkpoint function --- paddle/fluid/operators/detail/checkpoint.cc | 54 ------------------- paddle/fluid/operators/detail/checkpoint.h | 33 ------------ paddle/fluid/operators/listen_and_serv_op.cc | 12 ----- .../fluid/transpiler/distribute_transpiler.py | 3 +- 4 files changed, 1 insertion(+), 101 deletions(-) delete mode 100644 paddle/fluid/operators/detail/checkpoint.cc delete mode 100644 paddle/fluid/operators/detail/checkpoint.h diff --git a/paddle/fluid/operators/detail/checkpoint.cc b/paddle/fluid/operators/detail/checkpoint.cc deleted file mode 100644 index 38e46532e6e18..0000000000000 --- a/paddle/fluid/operators/detail/checkpoint.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/operators/detail/checkpoint.h" - -#include - -namespace paddle { -namespace framework { -namespace details { -Checkpoint::Save(const framework::Scope& scope, const platform::Place& place, - const std::string& save_dir, const std::string& var_name, - const bool& overwrite) { - auto* var = scope.FindVar(var_name); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", - var_name); - PADDLE_ENFORCE(var->IsType(), - "Checkpoint only supports LoDTensor, %s has wrong type", - var_name); - - bool is_present = FileExists(save_dir); - if (is_present && !overwrite) { - PADDLE_THROW("%s exists!, checkpoint cannot write it when overwrite=false", - save_dir, overwrite); - } - - MkDirRecursively(DirName(save_dir).c_str()); - std::ofstream fout(save_dir); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", save_dir); - - // get device context from pool - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(place); - - auto& tensor = var->Get(); - // Serialize tensor - framework::SerializeToStream(fout, tensor, dev_ctx); - fout.close(); -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/operators/detail/checkpoint.h b/paddle/fluid/operators/detail/checkpoint.h deleted file mode 100644 index dfa41979734ec..0000000000000 --- a/paddle/fluid/operators/detail/checkpoint.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include - -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace framework { -namespace details { -class Checkpoint { - public: - static void Save(const framework::Scope& scope, const platform::Place& place, - const std::string& save_dir, const std::string& var_name, - const bool& overwrite); - - static void Load(); -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 7fb7f07a610a7..8a3d747f86cf6 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -101,7 +101,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, framework::Scope *recv_scope, framework::BlockDesc *prefetch_block) const { auto fan_in = Attr("Fanin"); - auto checkpoint = Attr("Checkpoint"); size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, @@ -190,17 +189,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, var->GetMutable()->mutable_rows()->clear(); } - /******************** CHECK POINT ***********************/ - std::vector all_vars = recv_scope.LocalVarNames(); - - std::vector::iterator it; - for (it = all_vars.begin(); it != all_vars.end(); it++) { - VLOG(2) << "Checkpoint Var: " << *it; - break; - } - - /******************** CHECK POINT ***********************/ - rpc_service_->SetCond(1); // FIXME(typhoonzero): use another condition to sync wait clients get. rpc_service_->WaitClientGet(fan_in); diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 635763ed35270..b45cb987d896b 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -483,8 +483,7 @@ def __append_optimize_op__(op, block, grad_to_block_id): "Fanin": self.trainer_num, "PrefetchBlock": prefetch_block, "sync_mode": self.sync_mode, - "grad_to_block_id": grad_to_block_id, - "Checkpoint": "/tmp/tangwei_ckpt/" + "grad_to_block_id": grad_to_block_id }) pserver_program.sync_with_cpp() From 87a08563841715806972398dfeb7770d2b69d30b Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 11 May 2018 16:24:08 +0800 Subject: [PATCH 06/56] add checkpoint save op --- paddle/fluid/operators/cpkt_save_op.cc | 158 +++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 paddle/fluid/operators/cpkt_save_op.cc diff --git a/paddle/fluid/operators/cpkt_save_op.cc b/paddle/fluid/operators/cpkt_save_op.cc new file mode 100644 index 0000000000000..352bd3350796d --- /dev/null +++ b/paddle/fluid/operators/cpkt_save_op.cc @@ -0,0 +1,158 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +// TODO(sidgoyal78): These function are needed by other files (save_op), move +// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op). +constexpr char kSEP = '/'; +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + if (mkdir(path, 0755)) { + PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); + } +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} + +class CkptSaveOp : public framework::OperatorBase { + public: + CkptSaveOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + auto overwrite = Attr("overwrite"); + + bool is_present = FileExists(filename); + if (is_present && !overwrite) { + PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto inp_var_names = Inputs("X"); + PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, + "The number of input variables should be greater than 0"); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < inp_var_names.size(); i++) { + auto *var = scope.FindVar(inp_var_names[i]); + + PADDLE_ENFORCE(var != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); + PADDLE_ENFORCE(var->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + auto &tensor = var->Get(); + // Serialize tensors one by one + + // Check types to see if a fp16 transformation is required + auto in_dtype = framework::ToDataType(tensor.type()); + auto out_dtype = in_dtype; + + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); + framework::SerializeToStream(fout, out, dev_ctx); + } else { + framework::SerializeToStream(fout, tensor, dev_ctx); + } + } + fout.close(); + } +}; + +class CkptSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CkptSaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : 
OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(vector) Input LoDTensors that need to be saved together in a file.") + .AsDuplicable(); + AddComment(R"DOC( +SaveCombine operator + +This operator will serialize and write a list of input LoDTensor variables +to a file on disk. +)DOC"); + AddAttr("overwrite", + "(boolean, default true)" + "Overwrite the output file if it exists.") + .SetDefault(true); + + AddAttr( + "file_path", + "(string)" + "The \"file_path\" where the LoDTensor variables will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(ckpt_save, ops::CkptSaveOp, ops::CkptSaveOpProtoMaker); From dc534fc19525b2671a9620863daa7ace47a37c00 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 11 May 2018 16:44:10 +0800 Subject: [PATCH 07/56] add checkpoint save op test --- paddle/fluid/operators/cpkt_save_op_test.cc | 44 +++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 paddle/fluid/operators/cpkt_save_op_test.cc diff --git a/paddle/fluid/operators/cpkt_save_op_test.cc b/paddle/fluid/operators/cpkt_save_op_test.cc new file mode 100644 index 0000000000000..3e620a0e9cbbd --- /dev/null +++ b/paddle/fluid/operators/cpkt_save_op_test.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" + +TEST(CkptSaveOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + float* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(paddle::platform::float16(i)); + } + + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("tensor.save")}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "ckpt_save", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); +} From 802d10cf53c693a6fe551a9d007ce988fe89ccab Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 11 May 2018 19:10:04 +0800 Subject: [PATCH 08/56] rename cpkt_save_op --- paddle/fluid/operators/{cpkt_save_op.cc => ckpt_save_op.cc} | 0 .../operators/{cpkt_save_op_test.cc => ckpt_save_op_test.cc} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename paddle/fluid/operators/{cpkt_save_op.cc => ckpt_save_op.cc} (100%) rename paddle/fluid/operators/{cpkt_save_op_test.cc => ckpt_save_op_test.cc} (100%) diff --git a/paddle/fluid/operators/cpkt_save_op.cc b/paddle/fluid/operators/ckpt_save_op.cc similarity index 100% rename from paddle/fluid/operators/cpkt_save_op.cc rename to paddle/fluid/operators/ckpt_save_op.cc diff --git a/paddle/fluid/operators/cpkt_save_op_test.cc b/paddle/fluid/operators/ckpt_save_op_test.cc similarity index 100% rename from paddle/fluid/operators/cpkt_save_op_test.cc rename to paddle/fluid/operators/ckpt_save_op_test.cc From d1bd3fdefc9ec5a2c8d3746ab833dabd9f841948 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 11 May 2018 19:10:24 +0800 Subject: [PATCH 09/56] add build and test make --- paddle/fluid/operators/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 256aded8ca234..a6c7690d6b7c7 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -229,6 +229,7 @@ op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) +op_library(ckpt_save_op DEPS lod_tensor) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) @@ -277,5 +278,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) +cc_test(ckpt_save_op_test SRCS ckpt_save_op_test.cc DEPS ckpt_save_op) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) From 5e74db3f2a1872b9433ec0348092f150f727359c Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 11 May 2018 21:38:49 +0800 Subject: [PATCH 10/56] add build and test make --- paddle/fluid/operators/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a6c7690d6b7c7..948ce79da7db5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -229,7 +229,6 @@ op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) -op_library(ckpt_save_op DEPS lod_tensor) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) @@ -243,6 +242,7 @@ op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) +op_library(ckpt_save_op DEPS lod_tensor) op_library(concat_op DEPS concat) # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency From a1419f1062826167313e6ff68f894eb00fe1f34f Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 11 May 2018 22:54:54 +0800 Subject: [PATCH 11/56] test add op declare --- paddle/fluid/operators/ckpt_save_op_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/ckpt_save_op_test.cc b/paddle/fluid/operators/ckpt_save_op_test.cc index 3e620a0e9cbbd..f8616ef53ce1a 100644 --- a/paddle/fluid/operators/ckpt_save_op_test.cc +++ b/paddle/fluid/operators/ckpt_save_op_test.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +USE_NO_KERNEL_OP(ckpt_save) + TEST(CkptSaveOp, CPU) { paddle::framework::Scope scope; paddle::platform::CPUPlace place; From 461d2fc0d7ef3ddfc2bcb47561facb43929ecd56 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 14 May 2018 15:21:08 +0800 Subject: [PATCH 12/56] rename ckpt -> checkpoint --- paddle/fluid/operators/CMakeLists.txt | 4 ++-- ...op_test.cc => che'ck'po'in't_save_op_test.cc} | 6 +++--- .../{ckpt_save_op.cc => checkpoint_save_op.cc} | 16 +++++++++------- 3 files changed, 14 insertions(+), 12 deletions(-) rename paddle/fluid/operators/{ckpt_save_op_test.cc => che'ck'po'in't_save_op_test.cc} (92%) rename paddle/fluid/operators/{ckpt_save_op.cc => checkpoint_save_op.cc} (90%) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 948ce79da7db5..34ec82c294b60 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -242,7 +242,7 @@ op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) -op_library(ckpt_save_op DEPS lod_tensor) +op_library(checkpoint_save_op DEPS lod_tensor) op_library(concat_op DEPS concat) # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency @@ -278,6 +278,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) -cc_test(ckpt_save_op_test SRCS ckpt_save_op_test.cc DEPS ckpt_save_op) +cc_test(checkpoint_save_op_test SRCS checkpoint_save_op_test.cc DEPS checkpoint_save_op) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/ckpt_save_op_test.cc 
b/paddle/fluid/operators/che'ck'po'in't_save_op_test.cc similarity index 92% rename from paddle/fluid/operators/ckpt_save_op_test.cc rename to paddle/fluid/operators/che'ck'po'in't_save_op_test.cc index f8616ef53ce1a..b49bbd1a58f2c 100644 --- a/paddle/fluid/operators/ckpt_save_op_test.cc +++ b/paddle/fluid/operators/che'ck'po'in't_save_op_test.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" -USE_NO_KERNEL_OP(ckpt_save) +USE_NO_KERNEL_OP(checkpoint_save) -TEST(CkptSaveOp, CPU) { +TEST(CheckpointSaveOp, CPU) { paddle::framework::Scope scope; paddle::platform::CPUPlace place; @@ -41,6 +41,6 @@ TEST(CkptSaveOp, CPU) { attrs.insert({"file_path", std::string("tensor.save")}); auto save_op = paddle::framework::OpRegistry::CreateOp( - "ckpt_save", {{"X", {"test_var"}}}, {}, attrs); + "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs); save_op->Run(scope, place); } diff --git a/paddle/fluid/operators/ckpt_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc similarity index 90% rename from paddle/fluid/operators/ckpt_save_op.cc rename to paddle/fluid/operators/checkpoint_save_op.cc index 352bd3350796d..2462ec09d6b86 100644 --- a/paddle/fluid/operators/ckpt_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -57,11 +57,12 @@ static void MkDirRecursively(const char *fullpath) { MkDir(fullpath); } -class CkptSaveOp : public framework::OperatorBase { +class CheckpointSaveOp : public framework::OperatorBase { public: - CkptSaveOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) + CheckpointSaveOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} private: @@ -122,9 +123,9 @@ class CkptSaveOp : public framework::OperatorBase { } }; -class CkptSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { +class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - CkptSaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + CheckpointSaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", @@ -155,4 +156,5 @@ to a file on disk. 
namespace ops = paddle::operators; -REGISTER_OPERATOR(ckpt_save, ops::CkptSaveOp, ops::CkptSaveOpProtoMaker); +REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp, + ops::CheckpointSaveOpProtoMaker); From 2f4c039e6218c68f6047c6ef8f1ba23431689e68 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 14 May 2018 21:36:34 +0800 Subject: [PATCH 13/56] rename, modify ckpt structure --- paddle/fluid/operators/checkpoint_save_op.cc | 34 ++++++------------- ..._op_test.cc => checkpoint_save_op_test.cc} | 2 +- .../fluid/transpiler/distribute_transpiler.py | 12 +++++++ 3 files changed, 24 insertions(+), 24 deletions(-) rename paddle/fluid/operators/{che'ck'po'in't_save_op_test.cc => checkpoint_save_op_test.cc} (96%) diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 2462ec09d6b86..1e621a00e5028 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -68,19 +68,16 @@ class CheckpointSaveOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto filename = Attr("file_path"); + auto dir = Attr("dir"); auto overwrite = Attr("overwrite"); - bool is_present = FileExists(filename); + bool is_present = FileExists(dir); if (is_present && !overwrite) { PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", - filename, overwrite); + dir, overwrite); } - MkDirRecursively(DirName(filename).c_str()); - std::ofstream fout(filename); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); + MkDirRecursively(dir.c_str()); auto inp_var_names = Inputs("X"); PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, @@ -92,6 +89,10 @@ class CheckpointSaveOp : public framework::OperatorBase { for (size_t i = 0; i < inp_var_names.size(); i++) { auto *var = scope.FindVar(inp_var_names[i]); + std::string var_file; + var_file.append(dir); + var_file.append("/"); + var_file.append(inp_var_names[i]); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_combine_op", @@ -103,23 +104,10 @@ class CheckpointSaveOp : public framework::OperatorBase { auto &tensor = var->Get(); // Serialize tensors one by one - // Check types to see if a fp16 transformation is required - auto in_dtype = framework::ToDataType(tensor.type()); - auto out_dtype = in_dtype; - - if (in_dtype != out_dtype) { - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor out; - // copy LoD info to the new tensor - out.set_lod(tensor.lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); - framework::SerializeToStream(fout, out, dev_ctx); - } else { - framework::SerializeToStream(fout, tensor, dev_ctx); - } + std::ofstream fout(var_file); + framework::SerializeToStream(fout, tensor, dev_ctx); + fout.close(); } - fout.close(); } }; diff --git a/paddle/fluid/operators/che'ck'po'in't_save_op_test.cc b/paddle/fluid/operators/checkpoint_save_op_test.cc similarity index 96% rename from paddle/fluid/operators/che'ck'po'in't_save_op_test.cc rename to paddle/fluid/operators/checkpoint_save_op_test.cc index b49bbd1a58f2c..7b5aa7bcde16e 100644 --- a/paddle/fluid/operators/che'ck'po'in't_save_op_test.cc +++ b/paddle/fluid/operators/checkpoint_save_op_test.cc @@ -38,7 +38,7 @@ TEST(CheckpointSaveOp, CPU) { } paddle::framework::AttributeMap attrs; - attrs.insert({"file_path", 
std::string("tensor.save")}); + attrs.insert({"dir", std::string("tensor/ckpt")}); auto save_op = paddle::framework::OpRegistry::CreateOp( "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs); diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index b45cb987d896b..b76f8de504010 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -207,6 +207,11 @@ def transpile(self, self.pserver_endpoints = pserver_endpoints self.optimize_ops, params_grads = self._get_optimize_pass() + # is_chief (no.0 triner) for checkpoint + # the no.0 trainer will save all variables and its own reader offset to checkpoint + # other trianers will save its own reader offset to checkpoint + self.is_chief = trainer_id == 0 + # process lookup_table_op # 1. check all lookup_table_op is distributed # 2. check all lookup_table_op share the same table. @@ -309,6 +314,13 @@ def transpile(self, "epmap": eplist, "sync_mode": self.sync_mode }) + + program.global_block().append_op( + type="checkpoint_save", + inputs={"X": send_outputs}, + attrs={"overwrite": True, + "file_path": "/workspace/ckpt/"}) + # step4: Concat the parameters splits together after recv. for varname, splited_var in param_var_mapping.iteritems(): if len(splited_var) <= 1: From 38596cfb1e3b034bd26e68e97f3291dbbdea3de0 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 14 May 2018 21:37:09 +0800 Subject: [PATCH 14/56] move file_path to dir --- python/paddle/fluid/transpiler/distribute_transpiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index b76f8de504010..6366ba8a58558 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -319,7 +319,7 @@ def transpile(self, type="checkpoint_save", inputs={"X": send_outputs}, attrs={"overwrite": True, - "file_path": "/workspace/ckpt/"}) + "dir": "/workspace/ckpt/"}) # step4: Concat the parameters splits together after recv. 
for varname, splited_var in param_var_mapping.iteritems(): From ce1bcc947f5d036dad34fabcc854531cb63cbc25 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 14 May 2018 23:11:23 +0800 Subject: [PATCH 15/56] add op to framework.py --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 28e54f5492e7b..4612263540477 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -489,7 +489,7 @@ def find_name(var_list, name): 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv', 'select' + 'channel_send', 'channel_recv', 'select', 'checkpoint_save' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) From 3c820064defc0ef2e24439f3674b7d1f34269436 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 14 May 2018 23:14:06 +0800 Subject: [PATCH 16/56] remove overwrite judge to test load --- paddle/fluid/operators/checkpoint_save_op.cc | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 1e621a00e5028..94a1cc05c76a3 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -30,6 +30,9 @@ namespace operators { // TODO(sidgoyal78): These function are needed by other files (save_op), move // them to paddle::filesystem namespace. (as noted by yuyang18 in save_op). constexpr char kSEP = '/'; +// write empty file named _SUCCESS +const char SUCCESS[] = "_SUCCESS"; + static bool FileExists(const std::string &filepath) { struct stat buffer; return (stat(filepath.c_str(), &buffer) == 0); @@ -73,8 +76,11 @@ class CheckpointSaveOp : public framework::OperatorBase { bool is_present = FileExists(dir); if (is_present && !overwrite) { - PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", - dir, overwrite); + return; + // todo(tangwei) judge the folder is exist + // PADDLE_THROW("%s exists!, cannot save_combine to it when + // overwrite=false", + // dir, overwrite); } MkDirRecursively(dir.c_str()); @@ -108,6 +114,13 @@ class CheckpointSaveOp : public framework::OperatorBase { framework::SerializeToStream(fout, tensor, dev_ctx); fout.close(); } + + std::string success; + success.append(dir); + success.append("/"); + success.append(SUCCESS); + std::ofstream fout(success); + fout.close(); } }; From f04b23adf96651185bd0b47d90f8b5f1fee77706 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 15 May 2018 16:13:41 +0800 Subject: [PATCH 17/56] add checkpoint_load, update checkpoint save --- paddle/fluid/operators/CMakeLists.txt | 3 +- paddle/fluid/operators/checkpoint_load_op.cc | 87 +++++++++++++++++++ ..._save_op_test.cc => checkpoint_op_test.cc} | 0 paddle/fluid/operators/checkpoint_save_op.cc | 21 +++-- 4 files changed, 103 insertions(+), 8 deletions(-) create mode 100644 paddle/fluid/operators/checkpoint_load_op.cc rename paddle/fluid/operators/{checkpoint_save_op_test.cc => checkpoint_op_test.cc} (100%) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 34ec82c294b60..df0292d902f2a 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -243,6 +243,7 @@ op_library(load_op DEPS lod_tensor) 
op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) op_library(checkpoint_save_op DEPS lod_tensor) +op_library(checkpoint_load_op DEPS lod_tensor) op_library(concat_op DEPS concat) # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency @@ -278,6 +279,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) -cc_test(checkpoint_save_op_test SRCS checkpoint_save_op_test.cc DEPS checkpoint_save_op) +cc_test(checkpoint_op_test SRCS checkpoint_op_test.cc DEPS checkpoint_save_op checkpoint_load_op) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc new file mode 100644 index 0000000000000..b2ca59f2b5b5b --- /dev/null +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +constexpr char kSEP = '/'; +// write empty file named _SUCCESS +const char SUCCESS[] = "_SUCCESS"; + +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +class CheckpointLoadOp : public framework::OperatorBase { + public: + CheckpointLoadOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dir = Attr("dir"); + bool is_present = FileExists(dir); + if (!is_present) { + return; + } + + // UPDATE LATER ... 
+ } +}; + +class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr( + "dir", + "(string)" + "The \"file_path\" where the LoDTensor variables will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp, + ops::CheckpointLoadOpProtoMaker); diff --git a/paddle/fluid/operators/checkpoint_save_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc similarity index 100% rename from paddle/fluid/operators/checkpoint_save_op_test.cc rename to paddle/fluid/operators/checkpoint_op_test.cc diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 94a1cc05c76a3..7007ab9e1a1a7 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -27,8 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -// TODO(sidgoyal78): These function are needed by other files (save_op), move -// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op). constexpr char kSEP = '/'; // write empty file named _SUCCESS const char SUCCESS[] = "_SUCCESS"; @@ -82,7 +80,14 @@ class CheckpointSaveOp : public framework::OperatorBase { // overwrite=false", // dir, overwrite); } + MkDirRecursively(dir.c_str()); + auto serial_var_name = Output("Serial"); + auto *serial_var = scope.FindVar(serial_var_name); + std::string *serial_num = serial_var->GetMutable(); + serial_num->append("0"); + dir.append("/"); + dir.append(serial_num); MkDirRecursively(dir.c_str()); auto inp_var_names = Inputs("X"); @@ -93,6 +98,7 @@ class CheckpointSaveOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); + // todo (tangwei) made it async for (size_t i = 0; i < inp_var_names.size(); i++) { auto *var = scope.FindVar(inp_var_names[i]); std::string var_file; @@ -132,19 +138,20 @@ class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { "X", "(vector) Input LoDTensors that need to be saved together in a file.") .AsDuplicable(); + AddOutput("Serial", "the serial number"); AddComment(R"DOC( -SaveCombine operator +CheckpointSave operator This operator will serialize and write a list of input LoDTensor variables to a file on disk. 
)DOC"); AddAttr("overwrite", - "(boolean, default true)" - "Overwrite the output file if it exists.") - .SetDefault(true); + "(boolean, default false)" + "Delete the output dir if it exists.") + .SetDefault(false); AddAttr( - "file_path", + "dir", "(string)" "The \"file_path\" where the LoDTensor variables will be saved.") .AddCustomChecker( From c80125f286fb641472b62a51c6f350e00e904519 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 15 May 2018 17:16:17 +0800 Subject: [PATCH 18/56] add checkpoint_load to python framework --- python/paddle/fluid/framework.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4612263540477..6ab31ec9463b8 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -489,7 +489,8 @@ def find_name(var_list, name): 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv', 'select', 'checkpoint_save' + 'channel_send', 'channel_recv', 'select', 'checkpoint_save', + 'checkpoint_save' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) From 2e25e739f33189002c8aea56a5180666794e5dcc Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 15 May 2018 17:17:14 +0800 Subject: [PATCH 19/56] write checkpoint_load code simply --- paddle/fluid/operators/checkpoint_load_op.cc | 8 ------- paddle/fluid/operators/checkpoint_op_test.cc | 22 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index b2ca59f2b5b5b..8edf3b6429dbd 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -36,14 +36,6 @@ static bool FileExists(const std::string &filepath) { return (stat(filepath.c_str(), &buffer) == 0); } -static std::string DirName(const std::string &filepath) { - auto pos = filepath.rfind(kSEP); - if (pos == std::string::npos) { - return ""; - } - return filepath.substr(0, pos); -} - class CheckpointLoadOp : public framework::OperatorBase { public: CheckpointLoadOp(const std::string &type, diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc index 7b5aa7bcde16e..1445d9f9acffc 100644 --- a/paddle/fluid/operators/checkpoint_op_test.cc +++ b/paddle/fluid/operators/checkpoint_op_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" USE_NO_KERNEL_OP(checkpoint_save) +USE_NO_KERNEL_OP(checkpoint_load) TEST(CheckpointSaveOp, CPU) { paddle::framework::Scope scope; @@ -37,10 +38,27 @@ TEST(CheckpointSaveOp, CPU) { expect[i] = static_cast(paddle::platform::float16(i)); } + scope.Var("SERIAL_NUMBER"); + paddle::framework::AttributeMap attrs; - attrs.insert({"dir", std::string("tensor/ckpt")}); + attrs.insert({"dir", std::string("ckpt")}); auto save_op = paddle::framework::OpRegistry::CreateOp( - "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs); + "checkpoint_save", {{"X", {"test_var"}}}, {{"Serial", {"SERIAL_NUMBER"}}}, + attrs); + save_op->Run(scope, place); +} + +TEST(CheckpointLoadOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + scope.Var("test_var"); + + paddle::framework::AttributeMap attrs; + attrs.insert({"dir", std::string("ckpt")}); + + auto save_op = + paddle::framework::OpRegistry::CreateOp("checkpoint_load", {}, {}, attrs); save_op->Run(scope, place); } From 30b50dcf8cd07efedd3d99a36199f589b29a448a Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 15 May 2018 17:23:48 +0800 Subject: [PATCH 20/56] fix Serial output type --- paddle/fluid/operators/checkpoint_save_op.cc | 25 +++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 7007ab9e1a1a7..7449352117b58 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -87,7 +87,7 @@ class CheckpointSaveOp : public framework::OperatorBase { std::string *serial_num = serial_var->GetMutable(); serial_num->append("0"); dir.append("/"); - dir.append(serial_num); + dir.append(serial_num->c_str()); MkDirRecursively(dir.c_str()); auto inp_var_names = Inputs("X"); @@ -159,10 +159,29 @@ to a file on disk. 
} }; +class CheckpointSaveOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto out_var_name = op_desc.Output("Serial").front(); + auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class CheckpointSaveOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp, - ops::CheckpointSaveOpProtoMaker); +REGISTER_OPERATOR(send_vars, ops::CheckpointSaveOp, + paddle::framework::EmptyGradOpMaker, + ops::CheckpointSaveOpProtoMaker, + ops::CheckpointSaveOpVarTypeInference, + ops::CheckpointSaveOpShapeInference); From 0334d494406ff3fc0ac6e9a078ce17bee38a2fd6 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 15 May 2018 18:58:00 +0800 Subject: [PATCH 21/56] fix bug --- paddle/fluid/operators/checkpoint_load_op.cc | 7 +++++++ paddle/fluid/operators/checkpoint_save_op.cc | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index 8edf3b6429dbd..ec451c9f3f0cd 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -61,6 +61,13 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC( +CheckpointLoad operator + +This operator will serialize and write a list of input LoDTensor variables +to a file on disk. 
+)DOC"); + AddAttr( "dir", "(string)" diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 7449352117b58..1082bb4a345a2 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -180,7 +180,7 @@ class CheckpointSaveOpShapeInference : public framework::InferShapeBase { namespace ops = paddle::operators; -REGISTER_OPERATOR(send_vars, ops::CheckpointSaveOp, +REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp, paddle::framework::EmptyGradOpMaker, ops::CheckpointSaveOpProtoMaker, ops::CheckpointSaveOpVarTypeInference, From d081256cd541521c17c1e8f1988e02109582d2f2 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 15:29:57 +0800 Subject: [PATCH 22/56] add api in distribute transpiler --- .../fluid/transpiler/distribute_transpiler.py | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 6366ba8a58558..104e2405322e9 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -315,10 +315,21 @@ def transpile(self, "sync_mode": self.sync_mode }) + serial_var = program.global_block().create_var( + name="SERIAL_NUMBER", + persistable=True, + type=core.VarDesc.VarType.RAW) + + save_vars = [] + for var in self.origin_program.list_vars(): + if self.is_persistable(var): + save_vars.append(var.name) + program.global_block().append_op( type="checkpoint_save", - inputs={"X": send_outputs}, - attrs={"overwrite": True, + inputs={"X": save_vars}, + outputs={"Serial": serial_var}, + attrs={"overwrite": False, "dir": "/workspace/ckpt/"}) # step4: Concat the parameters splits together after recv. @@ -501,6 +512,27 @@ def __append_optimize_op__(op, block, grad_to_block_id): pserver_program.sync_with_cpp() return pserver_program + def is_persistable(self, var): + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW : + return False + return var.persistable + + def get_train_startup_program(self, checkpoint_load_dir=None): + startup_prog = default_startup_program() + + if not checkpoint_load_dir: + return startup_prog + + for var in startup_prog.list_vars(): + if self.is_persistable(var): + print("var: %s" % var.name) + + startup_prog.global_block().append_op( + type="checkpoint_load", attrs={"dir": checkpoint_load_dir}) + return startup_prog + def get_startup_program(self, endpoint, pserver_program): """ Get startup program for current parameter server. 
From 886897ccf742f3c95714703b5ed925d35a56e46e Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 16:05:33 +0800 Subject: [PATCH 23/56] load implement --- paddle/fluid/operators/checkpoint_load_op.cc | 48 ++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index ec451c9f3f0cd..ba8b5dbb51c10 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -47,13 +47,54 @@ class CheckpointLoadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto dir = Attr("dir"); - bool is_present = FileExists(dir); + std::string dir = Attr("dir"); + + VLOG(3) << "Load checkpoint from dir: " << dir; + + std::string success; + success.append(dir); + success.append("/"); + success.append(SUCCESS); + + bool is_present = FileExists(success); if (!is_present) { + VLOG(3) << "can not find _SUCCESS from path: " << success; return; } - // UPDATE LATER ... + auto inp_var_names = Output("Out"); + PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, + "The number of input variables should be greater than 0"); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // todo (tangwei) made it async + for (size_t i = 0; i < inp_var_names.size(); i++) { + auto *var = scope.FindVar(inp_var_names[i]); + + PADDLE_ENFORCE(var != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); + PADDLE_ENFORCE(var->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + std::string var_file; + var_file.append(dir); + var_file.append("/"); + var_file.append(inp_var_names[i]); + VLOG(3) << "ready to load var: " << inp_var_names[i]; + + auto &tensor = var->Get(); + + std::ifstream fin(var_file); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", + var_file); + DeserializeFromStream(fin, tensor, *dev_ctx); + fin.close(); + VLOG(3) << " load var: " << inp_var_names[i] << " finished"; + } } }; @@ -61,6 +102,7 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) The tensor need to be loaded"); AddComment(R"DOC( CheckpointLoad operator From 9cf47afe6154d86214b74f7082155cc3ae014ea8 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 16:06:21 +0800 Subject: [PATCH 24/56] modify get trainer param --- python/paddle/fluid/transpiler/distribute_transpiler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 104e2405322e9..04aa51d2cdd38 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -525,12 +525,15 @@ def get_train_startup_program(self, checkpoint_load_dir=None): if not checkpoint_load_dir: return startup_prog + load_vars = [] for var in startup_prog.list_vars(): if self.is_persistable(var): - print("var: %s" % var.name) + load_vars.append(var.name) startup_prog.global_block().append_op( - type="checkpoint_load", attrs={"dir": checkpoint_load_dir}) + type="checkpoint_load", + 
outputs={"Out": load_vars}, + attrs={"dir": checkpoint_load_dir}) return startup_prog def get_startup_program(self, endpoint, pserver_program): From c6f042f5d653af725b8af31f73570c153cb790be Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 16:27:33 +0800 Subject: [PATCH 25/56] modify load op --- paddle/fluid/operators/checkpoint_load_op.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index ba8b5dbb51c10..026820ca30324 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -62,7 +62,7 @@ class CheckpointLoadOp : public framework::OperatorBase { return; } - auto inp_var_names = Output("Out"); + auto inp_var_names = Inputs("X"); PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, "The number of input variables should be greater than 0"); // get device context from pool @@ -102,7 +102,10 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddInput( + "X", + "(vector) Input LoDTensors that need to be saved together in a file.") + .AsDuplicable(); AddComment(R"DOC( CheckpointLoad operator From b677d8216e4454fadfc5204e00f7d483bb189368 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 16:48:16 +0800 Subject: [PATCH 26/56] bug fix --- paddle/fluid/operators/checkpoint_load_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index 026820ca30324..241886e2be566 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -87,12 +87,12 @@ class CheckpointLoadOp : public framework::OperatorBase { VLOG(3) << "ready to load var: " << inp_var_names[i]; auto &tensor = var->Get(); - std::ifstream fin(var_file); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", var_file); - DeserializeFromStream(fin, tensor, *dev_ctx); + DeserializeFromStream(fin, tensor, dev_ctx); fin.close(); + VLOG(3) << " load var: " << inp_var_names[i] << " finished"; } } From 744e95d30559cc5518b612678e0af6d0680fdbbe Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 19:06:02 +0800 Subject: [PATCH 27/56] add ckpt load --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 6ab31ec9463b8..c33d15e32f9e5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -490,7 +490,7 @@ def find_name(var_list, name): 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', 'channel_send', 'channel_recv', 'select', 'checkpoint_save', - 'checkpoint_save' + 'checkpoint_load' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) From 955c79340c24adad885539d8e89b67835f666481 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 19:07:23 +0800 Subject: [PATCH 28/56] add X to test --- paddle/fluid/operators/checkpoint_op_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc index 
1445d9f9acffc..bea44b35cada2 100644 --- a/paddle/fluid/operators/checkpoint_op_test.cc +++ b/paddle/fluid/operators/checkpoint_op_test.cc @@ -58,7 +58,7 @@ TEST(CheckpointLoadOp, CPU) { paddle::framework::AttributeMap attrs; attrs.insert({"dir", std::string("ckpt")}); - auto save_op = - paddle::framework::OpRegistry::CreateOp("checkpoint_load", {}, {}, attrs); + auto save_op = paddle::framework::OpRegistry::CreateOp( + "checkpoint_load", {{"X", {"test_var"}}}, {}, attrs); save_op->Run(scope, place); } From 3dd274657fb20c17e02fb2f76e1169b218828d93 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 19:08:15 +0800 Subject: [PATCH 29/56] modify Get -> GetMutable --- paddle/fluid/operators/checkpoint_load_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index 241886e2be566..72cfccaaa22b7 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -86,11 +86,11 @@ class CheckpointLoadOp : public framework::OperatorBase { var_file.append(inp_var_names[i]); VLOG(3) << "ready to load var: " << inp_var_names[i]; - auto &tensor = var->Get(); + auto *tensor = var->GetMutable(); std::ifstream fin(var_file); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", var_file); - DeserializeFromStream(fin, tensor, dev_ctx); + framework::DeserializeFromStream(fin, tensor, dev_ctx); fin.close(); VLOG(3) << " load var: " << inp_var_names[i] << " finished"; From 4220b31d4f45918fbc0a74cc05ba14ffd4ab093c Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 16 May 2018 20:50:24 +0800 Subject: [PATCH 30/56] update pserver startup --- .../fluid/transpiler/distribute_transpiler.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 04aa51d2cdd38..84cfc6e0117e8 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -520,6 +520,11 @@ def is_persistable(self, var): return var.persistable def get_train_startup_program(self, checkpoint_load_dir=None): + """ + Get train startup program. + If checkpoint_load_dir is None, rerurn default startup program. + IF checkpoint_load_dir is Exist, add checkpoint_load op and load Var. + """ startup_prog = default_startup_program() if not checkpoint_load_dir: @@ -536,7 +541,10 @@ def get_train_startup_program(self, checkpoint_load_dir=None): attrs={"dir": checkpoint_load_dir}) return startup_prog - def get_startup_program(self, endpoint, pserver_program): + def get_startup_program(self, + endpoint, + pserver_program, + checkpoint_load_dir=None): """ Get startup program for current parameter server. Modify operator input variables if there are variables that @@ -561,6 +569,7 @@ def _get_splited_name_and_shape(varname): created_var_map[var.name] = tmpvar # 2. 
rename op outputs + load_vars = [] for op in orig_s_prog.global_block().ops: new_inputs = dict() new_outputs = dict() @@ -588,6 +597,16 @@ def _get_splited_name_and_shape(varname): inputs=new_inputs, outputs=new_outputs, attrs=op.attrs) + for var in new_outputs.values(): + load_vars.append(var.name) + # add checkpoint op + if not checkpoint_load_dir: + return s_prog + + s_prog.global_block().append_op( + type="checkpoint_load", + inputs={"X": load_vars}, + attrs={"dir": checkpoint_load_dir}) return s_prog # transpiler function for dis lookup_table From 6d53dceeec5b0b014c614821d6e1bf355a280d64 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 17 May 2018 21:47:44 +0800 Subject: [PATCH 31/56] optimized checkpoint serial number and folder --- paddle/fluid/operators/checkpoint_load_op.cc | 121 +++++++++++++------ paddle/fluid/operators/checkpoint_op_test.cc | 10 +- paddle/fluid/operators/checkpoint_save_op.cc | 103 ++++++++-------- 3 files changed, 143 insertions(+), 91 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index 72cfccaaa22b7..ad237a889ad0a 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -17,6 +17,10 @@ limitations under the License. */ #include #include #include +#include + +#include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/framework.pb.h" @@ -30,12 +34,70 @@ namespace operators { constexpr char kSEP = '/'; // write empty file named _SUCCESS const char SUCCESS[] = "_SUCCESS"; +const char SERIAL_VAR[] = "SERIAL_NUMBER"; static bool FileExists(const std::string &filepath) { struct stat buffer; return (stat(filepath.c_str(), &buffer) == 0); } +static std::string GenePath(const std::string &dir, const std::string &file) { + boost::filesystem::path dir(dir); + boost::filesystem::path file(file); + boost::filesystem::path full_path = dir / file; + return full_path; +} + +static void LoadInputVars(const framework::Scope &scope, + const platform::Place &place, + const std::vector &inp_var_names, + const std::string &dir) { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // todo (tangwei) made it async + for (size_t i = 0; i < inp_var_names.size(); i++) { + auto *var = scope.FindVar(inp_var_names[i]); + + PADDLE_ENFORCE(var != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); + PADDLE_ENFORCE(var->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + std::string var_file = GenePath(dir, inp_var_names[i]); + auto *tensor = var->GetMutable(); + std::ifstream fin(var_file); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", + var_file); + framework::DeserializeFromStream(fin, tensor, dev_ctx); + fin.close(); + VLOG(3) << " load var: " << inp_var_names[i] << " finished"; + } +} + +static void LoadStringArgv(const framework::Scope &scope, + const platform::Place &place, + const std::string &argv, const std::string &dir) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < argv.size(); i++) { + auto *var = scope.FindVar(inp_var_names[i]); + std::string *var_str = var->GetMutable(); + + std::string var_file = GenePath(dir, argv); + std::ifstream fin(var_file); + 
PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", + var_file); + std::getline(fin, var_str); + fin.close(); + VLOG(3) << " load String argv: " << argv << " value is: " << var_str; + } +} + class CheckpointLoadOp : public framework::OperatorBase { public: CheckpointLoadOp(const std::string &type, @@ -48,53 +110,33 @@ class CheckpointLoadOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { std::string dir = Attr("dir"); + int serial_num = Attr("Serial"); - VLOG(3) << "Load checkpoint from dir: " << dir; + auto *serial_var = scope.FindVar(SERIAL_VAR); + serial_var = serial_num; + VLOG(1) << "CheckpointLoadOp set " << SERIAL_NUMBER + << " value: " << serial_num; std::string success; - success.append(dir); - success.append("/"); - success.append(SUCCESS); - + = GenePath(dir, std::to_string(serial_num)); + VLOG(3) << "Load checkpoint from dir: " << success; + success = GenePath(success, SUCCESS); bool is_present = FileExists(success); if (!is_present) { - VLOG(3) << "can not find _SUCCESS from path: " << success; + VLOG(1) << "CheckpointLoadOp can not find " << SUCCESS + << " from: " << success; return; } + VLOG(3) << "Ready to load vars to scope"; auto inp_var_names = Inputs("X"); PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, "The number of input variables should be greater than 0"); - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - // todo (tangwei) made it async - for (size_t i = 0; i < inp_var_names.size(); i++) { - auto *var = scope.FindVar(inp_var_names[i]); - - PADDLE_ENFORCE(var != nullptr, - "Cannot find variable %s for save_combine_op", - inp_var_names[i]); - PADDLE_ENFORCE(var->IsType(), - "SaveCombineOp only supports LoDTensor, %s has wrong type", - inp_var_names[i]); - - std::string var_file; - var_file.append(dir); - var_file.append("/"); - var_file.append(inp_var_names[i]); - VLOG(3) << "ready to load var: " << inp_var_names[i]; - - auto *tensor = var->GetMutable(); - std::ifstream fin(var_file); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", - var_file); - framework::DeserializeFromStream(fin, tensor, dev_ctx); - fin.close(); - - VLOG(3) << " load var: " << inp_var_names[i] << " finished"; - } + LoadInputVars(scope, place, &inp_var_names); + + VLOG(3) << "Ready to load string argv to scope"; + auto argv = Inputs("Argv"); + LoadStringArgv(scope, place, &argv, &dir); } }; @@ -106,6 +148,10 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { "X", "(vector) Input LoDTensors that need to be saved together in a file.") .AsDuplicable(); + AddInput( + "Argv", + "(vector) Input LoDTensors that need to be saved together in a file.") + .AsDuplicable(); AddComment(R"DOC( CheckpointLoad operator @@ -113,6 +159,9 @@ This operator will serialize and write a list of input LoDTensor variables to a file on disk. 
)DOC"); + AddAttr("Serial", + "(int)" + "The serial number of the checkpoint will to be load."); AddAttr( "dir", "(string)" diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc index bea44b35cada2..75bfc3f840765 100644 --- a/paddle/fluid/operators/checkpoint_op_test.cc +++ b/paddle/fluid/operators/checkpoint_op_test.cc @@ -44,8 +44,7 @@ TEST(CheckpointSaveOp, CPU) { attrs.insert({"dir", std::string("ckpt")}); auto save_op = paddle::framework::OpRegistry::CreateOp( - "checkpoint_save", {{"X", {"test_var"}}}, {{"Serial", {"SERIAL_NUMBER"}}}, - attrs); + "checkpoint_save", {{"X", {"test_var"}}}, attrs); save_op->Run(scope, place); } @@ -58,7 +57,8 @@ TEST(CheckpointLoadOp, CPU) { paddle::framework::AttributeMap attrs; attrs.insert({"dir", std::string("ckpt")}); - auto save_op = paddle::framework::OpRegistry::CreateOp( - "checkpoint_load", {{"X", {"test_var"}}}, {}, attrs); - save_op->Run(scope, place); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "checkpoint_load", {{"X", {"test_var"}}}, {{"Serial", {"SERIAL_NUMBER"}}}, + attrs); + load_op->Run(scope, place); } diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 1082bb4a345a2..54911fc054c21 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -17,6 +17,10 @@ limitations under the License. */ #include #include #include +#include + +#include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/framework.pb.h" @@ -30,6 +34,14 @@ namespace operators { constexpr char kSEP = '/'; // write empty file named _SUCCESS const char SUCCESS[] = "_SUCCESS"; +const char SERIAL_VAR[] = "SERIAL_NUMBER"; + +static std::string GenePath(const std::string &dir, const std::string &file) { + boost::filesystem::path dir(dir); + boost::filesystem::path file(file); + boost::filesystem::path full_path = dir / file; + return full_path; +} static bool FileExists(const std::string &filepath) { struct stat buffer; @@ -72,24 +84,20 @@ class CheckpointSaveOp : public framework::OperatorBase { auto dir = Attr("dir"); auto overwrite = Attr("overwrite"); + auto serial_num = scope.FindVar(SERIAL_VAR); + if (serial_num == nullptr) { + serial_num = scope.Var(SERIAL_VAR); + } + serial_num = serial_num + 1; + + dir = GenePath(dir, std::to_string(serial_num)); bool is_present = FileExists(dir); if (is_present && !overwrite) { - return; - // todo(tangwei) judge the folder is exist - // PADDLE_THROW("%s exists!, cannot save_combine to it when - // overwrite=false", - // dir, overwrite); + PADDLE_THROW("%s exists!, checkpoint save cannot to overwrite it", dir, + overwrite); } MkDirRecursively(dir.c_str()); - auto serial_var_name = Output("Serial"); - auto *serial_var = scope.FindVar(serial_var_name); - std::string *serial_num = serial_var->GetMutable(); - serial_num->append("0"); - dir.append("/"); - dir.append(serial_num->c_str()); - MkDirRecursively(dir.c_str()); - auto inp_var_names = Inputs("X"); PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, "The number of input variables should be greater than 0"); @@ -101,30 +109,24 @@ class CheckpointSaveOp : public framework::OperatorBase { // todo (tangwei) made it async for (size_t i = 0; i < inp_var_names.size(); i++) { auto *var = scope.FindVar(inp_var_names[i]); - std::string var_file; - var_file.append(dir); - var_file.append("/"); - 
var_file.append(inp_var_names[i]); PADDLE_ENFORCE(var != nullptr, - "Cannot find variable %s for save_combine_op", - inp_var_names[i]); - PADDLE_ENFORCE(var->IsType(), - "SaveCombineOp only supports LoDTensor, %s has wrong type", + "Cannot find variable %s for checkpoint save op", inp_var_names[i]); + PADDLE_ENFORCE( + var->IsType(), + "CheckpointSaveOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); auto &tensor = var->Get(); // Serialize tensors one by one - + std::string var_file = GenePath(dir, inp_var_names[i]); std::ofstream fout(var_file); framework::SerializeToStream(fout, tensor, dev_ctx); fout.close(); } - std::string success; - success.append(dir); - success.append("/"); - success.append(SUCCESS); + std::string success = GenePath(dir, SUCCESS); std::ofstream fout(success); fout.close(); } @@ -138,7 +140,6 @@ class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { "X", "(vector) Input LoDTensors that need to be saved together in a file.") .AsDuplicable(); - AddOutput("Serial", "the serial number"); AddComment(R"DOC( CheckpointSave operator @@ -150,30 +151,29 @@ to a file on disk. "Delete the output dir if it exists.") .SetDefault(false); - AddAttr( - "dir", - "(string)" - "The \"file_path\" where the LoDTensor variables will be saved.") + AddAttr("dir", + "(string)" + "The dir where the LoDTensor variables will be saved.") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); } }; -class CheckpointSaveOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto out_var_name = op_desc.Output("Serial").front(); - auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto var_type = framework::proto::VarType::RAW; - out_var.SetType(var_type); - } -}; - -class CheckpointSaveOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override {} -}; +// class CheckpointSaveOpVarTypeInference : public framework::VarTypeInference { +// public: +// void operator()(const framework::OpDesc &op_desc, +// framework::BlockDesc *block) const override { +// auto out_var_name = op_desc.Output("Serial").front(); +// auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); +// auto var_type = framework::proto::VarType::RAW; +// out_var.SetType(var_type); +// } +// }; + +// class CheckpointSaveOpShapeInference : public framework::InferShapeBase { +// public: +// void operator()(framework::InferShapeContext *ctx) const override {} +// }; } // namespace operators } // namespace paddle @@ -181,7 +181,10 @@ class CheckpointSaveOpShapeInference : public framework::InferShapeBase { namespace ops = paddle::operators; REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp, - paddle::framework::EmptyGradOpMaker, - ops::CheckpointSaveOpProtoMaker, - ops::CheckpointSaveOpVarTypeInference, - ops::CheckpointSaveOpShapeInference); + ops::CheckpointSaveOpProtoMaker); + +// REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp, +// paddle::framework::EmptyGradOpMaker, +// ops::CheckpointSaveOpProtoMaker, +// ops::CheckpointSaveOpVarTypeInference, +// ops::CheckpointSaveOpShapeInference); From 8430c8d798d4b722ad8da940c94c7696fd308606 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 17 May 2018 21:56:49 +0800 Subject: [PATCH 32/56] remove boost filesystem --- paddle/fluid/operators/checkpoint_load_op.cc | 10 ++++------ 
paddle/fluid/operators/checkpoint_save_op.cc | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index ad237a889ad0a..d270ae31ed791 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -18,9 +18,6 @@ limitations under the License. */ #include #include #include - -#include - #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/framework.pb.h" @@ -42,9 +39,10 @@ static bool FileExists(const std::string &filepath) { } static std::string GenePath(const std::string &dir, const std::string &file) { - boost::filesystem::path dir(dir); - boost::filesystem::path file(file); - boost::filesystem::path full_path = dir / file; + std::string file_path; + file_path.append(file_path); + file_path.append("/"); + file_path.append(file); return full_path; } diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 54911fc054c21..ee494c68822c4 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -18,9 +18,6 @@ limitations under the License. */ #include #include #include - -#include - #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/framework.pb.h" @@ -37,9 +34,10 @@ const char SUCCESS[] = "_SUCCESS"; const char SERIAL_VAR[] = "SERIAL_NUMBER"; static std::string GenePath(const std::string &dir, const std::string &file) { - boost::filesystem::path dir(dir); - boost::filesystem::path file(file); - boost::filesystem::path full_path = dir / file; + std::string file_path; + file_path.append(file_path); + file_path.append("/"); + file_path.append(file); return full_path; } From 7b6c0abfc9b1e5ab44404ed0c253d4250d9a440a Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 17 May 2018 22:41:02 +0800 Subject: [PATCH 33/56] modify variable point --- paddle/fluid/operators/checkpoint_load_op.cc | 24 +++++++++++++------- paddle/fluid/operators/checkpoint_save_op.cc | 20 ++++++++++++---- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index d270ae31ed791..0f0d989ccd2f7 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -108,15 +108,22 @@ class CheckpointLoadOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { std::string dir = Attr("dir"); - int serial_num = Attr("Serial"); + std::string serial_num = Attr("Serial"); + + std::string serial_var_name = std::string(SERIAL_VAR); + auto *serial_var = scope.FindVar(serial_var_name); + auto *serial_num; + if (serial_var == nullptr) { + *serial_var = scope.Var(serial_var_name); + *serial_num = serial_var->GetMutable(); + serial_num->append("0"); + } - auto *serial_var = scope.FindVar(SERIAL_VAR); - serial_var = serial_num; + *serial_num = serial_var->GetMutable(); VLOG(1) << "CheckpointLoadOp set " << SERIAL_NUMBER << " value: " << serial_num; - std::string success; - = GenePath(dir, std::to_string(serial_num)); + std::string success = GenePath(dir, serial_num); VLOG(3) << "Load checkpoint from dir: " << success; success = GenePath(success, SUCCESS); bool is_present = FileExists(success); @@ 
-157,9 +164,10 @@ This operator will serialize and write a list of input LoDTensor variables to a file on disk. )DOC"); - AddAttr("Serial", - "(int)" - "The serial number of the checkpoint will to be load."); + AddAttr( + "Serial", + "(std::string)" + "The serial number of the checkpoint will to be load."); AddAttr( "dir", "(string)" diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index ee494c68822c4..3c2cc50ac490a 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -82,13 +82,23 @@ class CheckpointSaveOp : public framework::OperatorBase { auto dir = Attr("dir"); auto overwrite = Attr("overwrite"); - auto serial_num = scope.FindVar(SERIAL_VAR); - if (serial_num == nullptr) { - serial_num = scope.Var(SERIAL_VAR); + std::string serial_var_name = std::string(SERIAL_VAR); + auto *serial_var = scope.FindVar(serial_var_name); + auto *serial_num; + if (serial_var == nullptr) { + *serial_var = scope.Var(serial_var_name); + *serial_num = serial_var->GetMutable(); + serial_num->append("0"); } - serial_num = serial_num + 1; - dir = GenePath(dir, std::to_string(serial_num)); + *serial_num = serial_var->GetMutable(); + VLOG(1) << "CheckpointSaveOp get " << SERIAL_NUMBER + << " value: " << serial_num; + + auto *serial_num = serial_var->GetMutable(); + serial_num->append("1"); + + dir = GenePath(dir, serial_num); bool is_present = FileExists(dir); if (is_present && !overwrite) { PADDLE_THROW("%s exists!, checkpoint save cannot to overwrite it", dir, From f9d4b9dabfcf33de11154aa5dc67be5537a34bb8 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 17 May 2018 22:49:40 +0800 Subject: [PATCH 34/56] fix auto serial_num has no initializer --- paddle/fluid/operators/checkpoint_load_op.cc | 8 ++++---- paddle/fluid/operators/checkpoint_save_op.cc | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index 0f0d989ccd2f7..5fd3a7af9cf1f 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -112,14 +112,14 @@ class CheckpointLoadOp : public framework::OperatorBase { std::string serial_var_name = std::string(SERIAL_VAR); auto *serial_var = scope.FindVar(serial_var_name); - auto *serial_num; + if (serial_var == nullptr) { *serial_var = scope.Var(serial_var_name); - *serial_num = serial_var->GetMutable(); - serial_num->append("0"); + auto *serial_tmp = serial_var->GetMutable(); + serial_tmp->append("0"); } - *serial_num = serial_var->GetMutable(); + auto *serial_num = serial_var->GetMutable(); VLOG(1) << "CheckpointLoadOp set " << SERIAL_NUMBER << " value: " << serial_num; diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 3c2cc50ac490a..5fccefeed251a 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -84,14 +84,13 @@ class CheckpointSaveOp : public framework::OperatorBase { std::string serial_var_name = std::string(SERIAL_VAR); auto *serial_var = scope.FindVar(serial_var_name); - auto *serial_num; + if (serial_var == nullptr) { *serial_var = scope.Var(serial_var_name); - *serial_num = serial_var->GetMutable(); - serial_num->append("0"); + *serial_tmp = serial_var->GetMutable(); + serial_tmp->append("0"); } - - *serial_num = serial_var->GetMutable(); + auto *serial_num = serial_var->GetMutable(); VLOG(1) << 
"CheckpointSaveOp get " << SERIAL_NUMBER << " value: " << serial_num; From a4fd3756bbd95fb8c676af9aab7a22cfe87d9cc5 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 09:46:14 +0800 Subject: [PATCH 35/56] bug fix --- paddle/fluid/operators/checkpoint_load_op.cc | 85 +++++++++++++------- paddle/fluid/operators/checkpoint_op_test.cc | 24 +++++- paddle/fluid/operators/checkpoint_save_op.cc | 36 +++++---- 3 files changed, 95 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index 5fd3a7af9cf1f..d24c7819990f0 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" @@ -43,7 +44,13 @@ static std::string GenePath(const std::string &dir, const std::string &file) { file_path.append(file_path); file_path.append("/"); file_path.append(file); - return full_path; + return file_path; +} + +static bool IsNumber(const std::string &s) { + std::string::const_iterator it = s.begin(); + while (it != s.end() && std::isdigit(*it)) ++it; + return !s.empty() && it == s.end(); } static void LoadInputVars(const framework::Scope &scope, @@ -62,7 +69,7 @@ static void LoadInputVars(const framework::Scope &scope, "Cannot find variable %s for save_combine_op", inp_var_names[i]); PADDLE_ENFORCE(var->IsType(), - "SaveCombineOp only supports LoDTensor, %s has wrong type", + "LoadCombineOp only supports LoDTensor, %s has wrong type", inp_var_names[i]); std::string var_file = GenePath(dir, inp_var_names[i]); @@ -78,21 +85,18 @@ static void LoadInputVars(const framework::Scope &scope, static void LoadStringArgv(const framework::Scope &scope, const platform::Place &place, - const std::string &argv, const std::string &dir) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - + const std::vector &argv, + const std::string &dir) { for (size_t i = 0; i < argv.size(); i++) { - auto *var = scope.FindVar(inp_var_names[i]); + auto *var = scope.FindVar(argv[i]); std::string *var_str = var->GetMutable(); - - std::string var_file = GenePath(dir, argv); + std::string var_file = GenePath(dir, argv[i]); std::ifstream fin(var_file); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", var_file); - std::getline(fin, var_str); + std::getline(fin, *var_str); fin.close(); - VLOG(3) << " load String argv: " << argv << " value is: " << var_str; + VLOG(3) << " load String argv: " << argv[i] << " value is: " << var_str; } } @@ -108,22 +112,24 @@ class CheckpointLoadOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { std::string dir = Attr("dir"); - std::string serial_num = Attr("Serial"); + std::string serial_num_attr = Attr("Serial"); + + PADDLE_ENFORCE(IsNumber(serial_num_attr), + "Checkpoint Serial must be a number"); std::string serial_var_name = std::string(SERIAL_VAR); auto *serial_var = scope.FindVar(serial_var_name); - - if (serial_var == nullptr) { - *serial_var = scope.Var(serial_var_name); - auto *serial_tmp = serial_var->GetMutable(); - serial_tmp->append("0"); - } + PADDLE_ENFORCE(serial_var != nullptr, + "Cannot find variable %s for checkpoint_load_op", + serial_var_name); auto *serial_num = serial_var->GetMutable(); - VLOG(1) << 
"CheckpointLoadOp set " << SERIAL_NUMBER + serial_num = serial_num_attr; + + VLOG(1) << "CheckpointLoadOp set " << SERIAL_VAR << " value: " << serial_num; - std::string success = GenePath(dir, serial_num); + std::string success = GenePath(dir, serial_num->c_str()); VLOG(3) << "Load checkpoint from dir: " << success; success = GenePath(success, SUCCESS); bool is_present = FileExists(success); @@ -137,11 +143,11 @@ class CheckpointLoadOp : public framework::OperatorBase { auto inp_var_names = Inputs("X"); PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, "The number of input variables should be greater than 0"); - LoadInputVars(scope, place, &inp_var_names); + LoadInputVars(scope, place, inp_var_names, dir); - VLOG(3) << "Ready to load string argv to scope"; - auto argv = Inputs("Argv"); - LoadStringArgv(scope, place, &argv, &dir); + // VLOG(3) << "Ready to load string argv to scope"; + // auto argv = Output("Argv"); + // LoadStringArgv(scope, place, argv, dir); } }; @@ -153,14 +159,13 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { "X", "(vector) Input LoDTensors that need to be saved together in a file.") .AsDuplicable(); - AddInput( + AddOutput( "Argv", - "(vector) Input LoDTensors that need to be saved together in a file.") - .AsDuplicable(); + "(vector) Input LoDTensors that need to be saved together in a file."); AddComment(R"DOC( CheckpointLoad operator -This operator will serialize and write a list of input LoDTensor variables +This operator will serialize and write a list of input LoDTensor variables to a file on disk. )DOC"); @@ -177,10 +182,32 @@ to a file on disk. } }; +class CheckpointLoadOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto out_var_name = op_desc.Output("Argv").front(); + auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class CheckpointLoadOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp, - ops::CheckpointLoadOpProtoMaker); + paddle::framework::EmptyGradOpMaker, + ops::CheckpointLoadOpProtoMaker, + ops::CheckpointLoadOpVarTypeInference, + ops::CheckpointLoadOpShapeInference); + +// REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp, +// ops::CheckpointLoadOpProtoMaker); diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc index 75bfc3f840765..2acce227d23de 100644 --- a/paddle/fluid/operators/checkpoint_op_test.cc +++ b/paddle/fluid/operators/checkpoint_op_test.cc @@ -44,7 +44,7 @@ TEST(CheckpointSaveOp, CPU) { attrs.insert({"dir", std::string("ckpt")}); auto save_op = paddle::framework::OpRegistry::CreateOp( - "checkpoint_save", {{"X", {"test_var"}}}, attrs); + "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs); save_op->Run(scope, place); } @@ -52,13 +52,29 @@ TEST(CheckpointLoadOp, CPU) { paddle::framework::Scope scope; paddle::platform::CPUPlace place; - scope.Var("test_var"); + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + 
expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + float* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(paddle::platform::float16(i)); + } + + scope.Var("SERIAL_NUMBER"); paddle::framework::AttributeMap attrs; attrs.insert({"dir", std::string("ckpt")}); + attrs.insert({"Serial", std::string("SERIAL_NUMBER")}); auto load_op = paddle::framework::OpRegistry::CreateOp( - "checkpoint_load", {{"X", {"test_var"}}}, {{"Serial", {"SERIAL_NUMBER"}}}, - attrs); + "checkpoint_load", {{"X", {"test_var"}}}, {{"Argv", {}}}, attrs); load_op->Run(scope, place); } diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 5fccefeed251a..bab979e4074a6 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -33,12 +33,18 @@ constexpr char kSEP = '/'; const char SUCCESS[] = "_SUCCESS"; const char SERIAL_VAR[] = "SERIAL_NUMBER"; +static bool IsNumber(const std::string &s) { + std::string::const_iterator it = s.begin(); + while (it != s.end() && std::isdigit(*it)) ++it; + return !s.empty() && it == s.end(); +} + static std::string GenePath(const std::string &dir, const std::string &file) { std::string file_path; - file_path.append(file_path); + file_path.append(dir); file_path.append("/"); file_path.append(file); - return full_path; + return file_path; } static bool FileExists(const std::string &filepath) { @@ -79,28 +85,24 @@ class CheckpointSaveOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto dir = Attr("dir"); + auto ck_dir = Attr("dir"); auto overwrite = Attr("overwrite"); std::string serial_var_name = std::string(SERIAL_VAR); - auto *serial_var = scope.FindVar(serial_var_name); - - if (serial_var == nullptr) { - *serial_var = scope.Var(serial_var_name); - *serial_tmp = serial_var->GetMutable(); - serial_tmp->append("0"); - } - auto *serial_num = serial_var->GetMutable(); - VLOG(1) << "CheckpointSaveOp get " << SERIAL_NUMBER + auto *serial_num = + scope.FindVar(serial_var_name)->GetMutable(); + VLOG(1) << "CheckpointSaveOp get " << SERIAL_VAR << " value: " << serial_num; - auto *serial_num = serial_var->GetMutable(); - serial_num->append("1"); + if (!IsNumber(serial_num)) { + serial_num = "0"; + } - dir = GenePath(dir, serial_num); + std::string dir = GenePath(ck_dir, serial_num->c_str()); + VLOG(1) << "CheckpointSaveOp current dir: " << dir; bool is_present = FileExists(dir); if (is_present && !overwrite) { - PADDLE_THROW("%s exists!, checkpoint save cannot to overwrite it", dir, + PADDLE_THROW("%s exists!, checkpoint save cannot to overwrite it", dir, overwrite); } MkDirRecursively(dir.c_str()); @@ -150,7 +152,7 @@ class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( CheckpointSave operator -This operator will serialize and write a list of input LoDTensor variables +This operator will serialize and write a list of input LoDTensor variables to a file on disk. 
)DOC"); AddAttr("overwrite", From f688652f1e3ee2eaf949ef79cbd56c05fc4980cd Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 10:26:41 +0800 Subject: [PATCH 36/56] bug fix --- paddle/fluid/operators/checkpoint_load_op.cc | 5 +++-- paddle/fluid/operators/checkpoint_op_test.cc | 2 ++ paddle/fluid/operators/checkpoint_save_op.cc | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index d24c7819990f0..a9676de369b4b 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -114,7 +114,7 @@ class CheckpointLoadOp : public framework::OperatorBase { std::string dir = Attr("dir"); std::string serial_num_attr = Attr("Serial"); - PADDLE_ENFORCE(IsNumber(serial_num_attr), + PADDLE_ENFORCE(!IsNumber(serial_num_attr), "Checkpoint Serial must be a number"); std::string serial_var_name = std::string(SERIAL_VAR); @@ -124,7 +124,8 @@ class CheckpointLoadOp : public framework::OperatorBase { serial_var_name); auto *serial_num = serial_var->GetMutable(); - serial_num = serial_num_attr; + serial_num->clear(); + serial_num->append(serial_num_attr); VLOG(1) << "CheckpointLoadOp set " << SERIAL_VAR << " value: " << serial_num; diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc index 2acce227d23de..5312225e5f952 100644 --- a/paddle/fluid/operators/checkpoint_op_test.cc +++ b/paddle/fluid/operators/checkpoint_op_test.cc @@ -69,6 +69,8 @@ TEST(CheckpointLoadOp, CPU) { } scope.Var("SERIAL_NUMBER"); + auto* serial_num = scope.FindVar("SERIAL_NUMBER")->GetMutable(); + serial_num->append("0"); paddle::framework::AttributeMap attrs; attrs.insert({"dir", std::string("ckpt")}); diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index bab979e4074a6..30eda30c5f52f 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -94,8 +94,8 @@ class CheckpointSaveOp : public framework::OperatorBase { VLOG(1) << "CheckpointSaveOp get " << SERIAL_VAR << " value: " << serial_num; - if (!IsNumber(serial_num)) { - serial_num = "0"; + if (serial_num->empty()) { + serial_num->append("0"); } std::string dir = GenePath(ck_dir, serial_num->c_str()); From 821acdb3bffdf0594d4bf94a4cddc47c2c681ca6 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 11:18:22 +0800 Subject: [PATCH 37/56] update op to trianer and pserver --- .../fluid/transpiler/distribute_transpiler.py | 99 ++++++++++++++----- 1 file changed, 72 insertions(+), 27 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 84cfc6e0117e8..4e15718771162 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import math import distributed_splitter as splitter @@ -26,6 +27,10 @@ LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR" +# for checkpoint +SUCCESS = "_SUCCESS" +SERIAL_VAR_NAME = "SERIAL_NUMBER" + class VarBlock: def __init__(self, varname, offset, size): @@ -153,7 +158,8 @@ def transpile(self, pservers="127.0.0.1:6174", trainers=1, split_method=splitter.round_robin, - sync_mode=True): + sync_mode=True, + checkpoint_dir=None): """ Transpile the program to distributed 
data-parallelism programs. The main_program will be transformed to use a remote parameter server @@ -315,22 +321,22 @@ def transpile(self, "sync_mode": self.sync_mode }) - serial_var = program.global_block().create_var( - name="SERIAL_NUMBER", - persistable=True, - type=core.VarDesc.VarType.RAW) + if checkpoint_dir and self.is_chief: + program.global_block().create_var( + name=SERIAL_VAR_NAME, + persistable=True, + type=core.VarDesc.VarType.RAW) - save_vars = [] - for var in self.origin_program.list_vars(): - if self.is_persistable(var): - save_vars.append(var.name) + save_vars = [] + for var in self.origin_program.list_vars(): + if self._is_persistable(var): + save_vars.append(var.name) - program.global_block().append_op( - type="checkpoint_save", - inputs={"X": save_vars}, - outputs={"Serial": serial_var}, - attrs={"overwrite": False, - "dir": "/workspace/ckpt/"}) + program.global_block().append_op( + type="checkpoint_save", + inputs={"X": save_vars}, + attrs={"overwrite": True, + "dir": checkpoint_dir}) # step4: Concat the parameters splits together after recv. for varname, splited_var in param_var_mapping.iteritems(): @@ -512,13 +518,6 @@ def __append_optimize_op__(op, block, grad_to_block_id): pserver_program.sync_with_cpp() return pserver_program - def is_persistable(self, var): - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.RAW : - return False - return var.persistable - def get_train_startup_program(self, checkpoint_load_dir=None): """ Get train startup program. @@ -532,13 +531,16 @@ def get_train_startup_program(self, checkpoint_load_dir=None): load_vars = [] for var in startup_prog.list_vars(): - if self.is_persistable(var): + if self._is_persistable(var): load_vars.append(var.name) + serial_number = self._get_lastest_checkpoint_dir(checkpoint_load_dir) + startup_prog.global_block().append_op( type="checkpoint_load", - outputs={"Out": load_vars}, - attrs={"dir": checkpoint_load_dir}) + inputs={"X": load_vars}, + attrs={"dir": checkpoint_load_dir, + "Serial": serial_number}) return startup_prog def get_startup_program(self, @@ -599,16 +601,59 @@ def _get_splited_name_and_shape(varname): attrs=op.attrs) for var in new_outputs.values(): load_vars.append(var.name) - # add checkpoint op + # add checkpoint op if not checkpoint_load_dir: return s_prog + serial_number = self._get_lastest_checkpoint_dir(checkpoint_load_dir) + s_prog.global_block().append_op( type="checkpoint_load", inputs={"X": load_vars}, - attrs={"dir": checkpoint_load_dir}) + attrs={"dir": checkpoint_load_dir, + "Serial": serial_number}) + return s_prog + def _is_persistable(self, var): + """only save LodTensor variable""" + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW : + return False + return var.persistable + + def _get_lastest_checkpoint_dir(self, checkpoint_dir): + """ + get the biggest number in checkpoint_dir, which has _SUCCESS + """ + if not checkpoint_dir.strip(): + return "" + + def has_success(checkpoint_dir, cur_dir): + """ + is _SUCCESS in this dir + """ + if not os.path.isdir(cur_dir): + return -1 + + try: + int(cur_dir) + except ValueError: + return -1 + + success_path = os.path.join(checkpoint_dir, cur_dir, SUCCESS) + if os.path.isfile(success_path): + return int(cur_dir) + + current_dir = 0 + dirs = os.listdir(checkpoint_dir) + for cur_dir in dirs: + success_num = 
has_success(checkpoint_dir, cur_dir) + if success_num > current_dir: + current_dir = success_num + return str(current_dir) + # transpiler function for dis lookup_table def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var, eplist): From cd98f2b7e0eb251659565c9f9171f52c95c819f8 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 17:44:48 +0800 Subject: [PATCH 38/56] bug fix --- paddle/fluid/operators/checkpoint_load_op.cc | 3 +-- paddle/fluid/operators/checkpoint_save_op.cc | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index a9676de369b4b..82a40e18d5c9e 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -154,8 +154,7 @@ class CheckpointLoadOp : public framework::OperatorBase { class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + void Make() override { AddInput( "X", "(vector) Input LoDTensors that need to be saved together in a file.") diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 30eda30c5f52f..790fd4ea68505 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -143,8 +143,7 @@ class CheckpointSaveOp : public framework::OperatorBase { class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - CheckpointSaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + void Make() override { AddInput( "X", "(vector) Input LoDTensors that need to be saved together in a file.") From dbd023771f82cb69574b374bea30836f3804015a Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 19:37:49 +0800 Subject: [PATCH 39/56] fix serial number --- paddle/fluid/operators/checkpoint_load_op.cc | 2 +- paddle/fluid/operators/checkpoint_save_op.cc | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index 82a40e18d5c9e..c18edf6306204 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -176,7 +176,7 @@ to a file on disk. 
AddAttr( "dir", "(string)" - "The \"file_path\" where the LoDTensor variables will be saved.") + "The \"dir\" where the checkpoint files will be loaded.") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); } diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 790fd4ea68505..1832c5792a18c 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -94,10 +94,16 @@ class CheckpointSaveOp : public framework::OperatorBase { VLOG(1) << "CheckpointSaveOp get " << SERIAL_VAR << " value: " << serial_num; - if (serial_num->empty()) { - serial_num->append("0"); + int serials = 0; + if (!serial_num->empty()) { + std::string::size_type sz; + serials = std::stoi(serial_num->data, &sz); + serials += 1; } + serial_num->clear(); + serial_num->append(std::to_string(serials)); + std::string dir = GenePath(ck_dir, serial_num->c_str()); VLOG(1) << "CheckpointSaveOp current dir: " << dir; bool is_present = FileExists(dir); From 22df4c278c19ab5eca71431d878eb78f053e6bc5 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 21:17:37 +0800 Subject: [PATCH 40/56] fix serial number --- paddle/fluid/operators/checkpoint_load_op.cc | 2 +- paddle/fluid/operators/checkpoint_save_op.cc | 3 +-- python/paddle/fluid/transpiler/distribute_transpiler.py | 4 +++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index c18edf6306204..6c88cbdab0758 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -114,7 +114,7 @@ class CheckpointLoadOp : public framework::OperatorBase { std::string dir = Attr("dir"); std::string serial_num_attr = Attr("Serial"); - PADDLE_ENFORCE(!IsNumber(serial_num_attr), + PADDLE_ENFORCE(IsNumber(serial_num_attr), "Checkpoint Serial must be a number"); std::string serial_var_name = std::string(SERIAL_VAR); diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc index 1832c5792a18c..f904cdc8269e7 100644 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ b/paddle/fluid/operators/checkpoint_save_op.cc @@ -96,8 +96,7 @@ class CheckpointSaveOp : public framework::OperatorBase { int serials = 0; if (!serial_num->empty()) { - std::string::size_type sz; - serials = std::stoi(serial_num->data, &sz); + serials = std::stoi(serial_num->data()); serials += 1; } diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index e1a2fe86a5804..335dc2342d08c 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -545,6 +545,7 @@ def get_train_startup_program(self, checkpoint_load_dir=None): startup_prog.global_block().append_op( type="checkpoint_load", inputs={"X": load_vars}, + outputs={"Argv": []}, attrs={"dir": checkpoint_load_dir, "Serial": serial_number}) return startup_prog @@ -616,6 +617,7 @@ def _get_splited_name_and_shape(varname): s_prog.global_block().append_op( type="checkpoint_load", inputs={"X": load_vars}, + outputs={"Argv": []}, attrs={"dir": checkpoint_load_dir, "Serial": serial_number}) @@ -640,7 +642,7 @@ def has_success(checkpoint_dir, cur_dir): """ is _SUCCESS in this dir """ - if not os.path.isdir(cur_dir): + if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): return -1 try: From 
d98480cff5fe2e08fadc79fccd5bce9ab01ed28c Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 21:55:33 +0800 Subject: [PATCH 41/56] fix serial number --- paddle/fluid/operators/checkpoint_load_op.cc | 4 ++-- python/paddle/fluid/transpiler/distribute_transpiler.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc index 6c88cbdab0758..18871e56c5017 100644 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ b/paddle/fluid/operators/checkpoint_load_op.cc @@ -114,8 +114,8 @@ class CheckpointLoadOp : public framework::OperatorBase { std::string dir = Attr("dir"); std::string serial_num_attr = Attr("Serial"); - PADDLE_ENFORCE(IsNumber(serial_num_attr), - "Checkpoint Serial must be a number"); + VLOG(3) << "CheckpointLoadOp get Attr dir: " << dir; + VLOG(3) << "CheckpointLoadOp get Attr Serial: " << serial_num_attr; std::string serial_var_name = std::string(SERIAL_VAR); auto *serial_var = scope.FindVar(serial_var_name); diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 335dc2342d08c..bb1e026a6b1ff 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -654,6 +654,9 @@ def has_success(checkpoint_dir, cur_dir): if os.path.isfile(success_path): return int(cur_dir) + if os.path.isdir(checkpoint_dir): + return "-1" + current_dir = 0 dirs = os.listdir(checkpoint_dir) for cur_dir in dirs: From ee91e48e346a0504cd3c478ca5ba4e905f5442ff Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 22:20:07 +0800 Subject: [PATCH 42/56] fix serial number --- python/paddle/fluid/transpiler/distribute_transpiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index bb1e026a6b1ff..8b379ddcf89f0 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -654,7 +654,7 @@ def has_success(checkpoint_dir, cur_dir): if os.path.isfile(success_path): return int(cur_dir) - if os.path.isdir(checkpoint_dir): + if not os.path.isdir(checkpoint_dir): return "-1" current_dir = 0 From b6ee59ae2573fbbe66ab574be299d6b6fe52552c Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 22:24:24 +0800 Subject: [PATCH 43/56] optimize python checkpint dir config --- .../fluid/transpiler/distribute_transpiler.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 8b379ddcf89f0..dc9d254fa5531 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -219,7 +219,8 @@ def transpile(self, # is_chief (no.0 triner) for checkpoint # the no.0 trainer will save all variables and its own reader offset to checkpoint # other trianers will save its own reader offset to checkpoint - self.is_chief = trainer_id == 0 + self._is_chief = trainer_id == 0 + self.checkpoint_dir = checkpoint_dir # process lookup_table_op # 1. 
check all lookup_table_op is distributed @@ -327,7 +328,7 @@ def transpile(self, "sync_mode": self.sync_mode }) - if checkpoint_dir and self.is_chief: + if self.checkpoint_dir and self._is_chief: program.global_block().create_var( name=SERIAL_VAR_NAME, persistable=True, @@ -342,7 +343,7 @@ def transpile(self, type="checkpoint_save", inputs={"X": save_vars}, attrs={"overwrite": True, - "dir": checkpoint_dir}) + "dir": self.checkpoint_dir}) # step4: Concat the parameters splits together after recv. for varname, splited_var in param_var_mapping.iteritems(): @@ -524,15 +525,15 @@ def __append_optimize_op__(op, block, grad_to_block_id): pserver_program.sync_with_cpp() return pserver_program - def get_train_startup_program(self, checkpoint_load_dir=None): + def get_train_startup_program(self): """ Get train startup program. - If checkpoint_load_dir is None, rerurn default startup program. - IF checkpoint_load_dir is Exist, add checkpoint_load op and load Var. + If self.checkpoint_dir is None, rerurn default startup program. + IF self.checkpoint_dir is Exist, add checkpoint_load op and load Var. """ startup_prog = default_startup_program() - if not checkpoint_load_dir: + if not self.checkpoint_dir: return startup_prog load_vars = [] @@ -540,20 +541,17 @@ def get_train_startup_program(self, checkpoint_load_dir=None): if self._is_persistable(var): load_vars.append(var.name) - serial_number = self._get_lastest_checkpoint_dir(checkpoint_load_dir) + serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir) startup_prog.global_block().append_op( type="checkpoint_load", inputs={"X": load_vars}, outputs={"Argv": []}, - attrs={"dir": checkpoint_load_dir, + attrs={"dir": self.checkpoint_dir, "Serial": serial_number}) return startup_prog - def get_startup_program(self, - endpoint, - pserver_program, - checkpoint_load_dir=None): + def get_startup_program(self, endpoint, pserver_program): """ Get startup program for current parameter server. 
Modify operator input variables if there are variables that @@ -609,16 +607,16 @@ def _get_splited_name_and_shape(varname): for var in new_outputs.values(): load_vars.append(var.name) # add checkpoint op - if not checkpoint_load_dir: + if not self.checkpoint_dir: return s_prog - serial_number = self._get_lastest_checkpoint_dir(checkpoint_load_dir) + serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir) s_prog.global_block().append_op( type="checkpoint_load", inputs={"X": load_vars}, outputs={"Argv": []}, - attrs={"dir": checkpoint_load_dir, + attrs={"dir": self.checkpoint_dir, "Serial": serial_number}) return s_prog From e130bf375235cf349904c433f3ff1c1c99f12083 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 18 May 2018 23:28:46 +0800 Subject: [PATCH 44/56] optimize python checkpint dir config --- .../paddle/fluid/transpiler/distribute_transpiler.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index dc9d254fa5531..1d51ed4579266 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -543,6 +543,11 @@ def get_train_startup_program(self): serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir) + startup_prog.global_block().create_var( + name=SERIAL_VAR_NAME, + persistable=True, + type=core.VarDesc.VarType.RAW) + startup_prog.global_block().append_op( type="checkpoint_load", inputs={"X": load_vars}, @@ -612,6 +617,11 @@ def _get_splited_name_and_shape(varname): serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir) + s_prog.global_block().create_var( + name=SERIAL_VAR_NAME, + persistable=True, + type=core.VarDesc.VarType.RAW) + s_prog.global_block().append_op( type="checkpoint_load", inputs={"X": load_vars}, From 5451c78ded15dedb1e9e89f25d5145e646f83563 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 21 May 2018 15:13:58 +0800 Subject: [PATCH 45/56] add checkpoint in io --- python/paddle/fluid/io.py | 87 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 54506e97ed5c9..502386016cfad 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -454,3 +454,90 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) + + +SUCCESS = "_SUCCESS" + + +def save_checkpoint(executor, + dirname, + keep_max=10, + save_secs=600, + main_program=None): + """ + Save Variables to Checkpint Dir + + :param dirname + :param keep_max + :param save_secs + """ + if dirname is None: + raise Exception("save checkpoint dir can not be none") + + if not os.path.isdir(dirname): + os.makedirs(dirname) + serial = _get_lastest_checkpoint_dir(dirname) + 1 + + cur_dir = os.path.join(dirname, serial) + save_persistables(executor, cur_dir, main_program) + _write_success(cur_dir) + + +def restore_checkpoint(dirname, executor, main_program=None): + """ + Load Variables from Checkpint Dir + + :param dir + """ + if dirname is None and os.path.isdir(dirname): + raise Exception("restore checkpoint can not load variables from %s" % + dirname) + serial = _get_lastest_checkpoint_dir(dirname) + 1 + + if serial < -1: + return + cur_dir = os.path.join(dirname, serial) + load_persistables(executor, cur_dir, main_program) + + +def _write_success(dirname): + 
""" + """ + success_file = os.path.join(dirname, SUCCESS) + with open(success_file, 'a'): + pass + + +def _get_lastest_checkpoint_dir(checkpoint_dir): + """ + get the biggest number in checkpoint_dir, which has _SUCCESS + """ + if not checkpoint_dir.strip(): + return "" + + def has_success(checkpoint_dir, cur_dir): + """ + is _SUCCESS in this dir + """ + if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): + return -1 + + try: + int(cur_dir) + except ValueError: + return -1 + + success_path = os.path.join(checkpoint_dir, cur_dir, SUCCESS) + if os.path.isfile(success_path): + return int(cur_dir) + + if not os.path.isdir(checkpoint_dir): + return -1 + + current_dir = -1 + dirs = os.listdir(checkpoint_dir) + for cur_dir in dirs: + success_num = has_success(checkpoint_dir, cur_dir) + if success_num > current_dir: + current_dir = success_num + return current_dir From 01975ec1c749c9576a1124a7f029234caa86e0ed Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 21 May 2018 16:53:59 +0800 Subject: [PATCH 46/56] add checkpoint in io --- python/paddle/fluid/io.py | 65 +++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 502386016cfad..83c32fe9d6ebd 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -13,21 +13,17 @@ # limitations under the License. import os +import time +import shutil from paddle.fluid.evaluator import Evaluator from paddle.fluid.framework import Program, Parameter, default_main_program, Variable from . import core __all__ = [ - 'save_vars', - 'save_params', - 'save_persistables', - 'load_vars', - 'load_params', - 'load_persistables', - 'save_inference_model', - 'load_inference_model', - 'get_inference_program', + 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', + 'load_persistables', 'save_inference_model', 'load_inference_model', + 'get_inference_program', 'save_checkpoint', 'restore_checkpoint' ] @@ -195,6 +191,8 @@ def load_vars(executor, load_var_map = {} for each_var in vars: assert isinstance(each_var, Variable) + if each_var.type == core.VarDesc.VarType.RAW: + continue new_var = _clone_var_in_block_(load_block, each_var) if filename is None: load_block.append_op( @@ -457,11 +455,12 @@ def get_parameter_value_by_name(name, executor, program=None): SUCCESS = "_SUCCESS" +BEGIN_SECS = time.time() def save_checkpoint(executor, dirname, - keep_max=10, + keep_max=3, save_secs=600, main_program=None): """ @@ -470,38 +469,70 @@ def save_checkpoint(executor, :param dirname :param keep_max :param save_secs + :param main_program """ if dirname is None: raise Exception("save checkpoint dir can not be none") if not os.path.isdir(dirname): os.makedirs(dirname) - serial = _get_lastest_checkpoint_dir(dirname) + 1 - cur_dir = os.path.join(dirname, serial) + global BEGIN_SECS + if time.time() - BEGIN_SECS < save_secs: + return + BEGIN_SECS = time.time() + + serial = _get_lastest_checkpoint_dir(dirname) + 1 + cur_dir = os.path.join(dirname, str(serial)) save_persistables(executor, cur_dir, main_program) _write_success(cur_dir) + _lru_delete(dirname, keep_max) def restore_checkpoint(dirname, executor, main_program=None): """ Load Variables from Checkpint Dir - :param dir + :param dirname + :param executor + :param main_program """ if dirname is None and os.path.isdir(dirname): raise Exception("restore checkpoint can not load variables from %s" % dirname) - serial = _get_lastest_checkpoint_dir(dirname) + 1 + serial = 
_get_lastest_checkpoint_dir(dirname) - if serial < -1: + if serial < 0: return - cur_dir = os.path.join(dirname, serial) + cur_dir = os.path.join(dirname, str(serial)) load_persistables(executor, cur_dir, main_program) +def _lru_delete(dirname, keep_max=3): + """ + retain checkpoint nums with keep_max + """ + dirs = os.listdir(dirname) + serials = [] + for serial in dirs: + try: + serials.append(int(serial)) + except ValueError: + continue + + if len(serials) <= keep_max: + return + + serials.sort(reverse=True) + serials = serials[keep_max:] + for serial in serials: + cur_dir = os.path.join(dirname, str(serial)) + shutil.rmtree(cur_dir) + + def _write_success(dirname): """ + write _SUCCESS to checkpoint dir """ success_file = os.path.join(dirname, SUCCESS) with open(success_file, 'a'): @@ -513,7 +544,7 @@ def _get_lastest_checkpoint_dir(checkpoint_dir): get the biggest number in checkpoint_dir, which has _SUCCESS """ if not checkpoint_dir.strip(): - return "" + return -1 def has_success(checkpoint_dir, cur_dir): """ From ed2129cc50b794f76574065430577e0303a6703d Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 21 May 2018 16:57:40 +0800 Subject: [PATCH 47/56] revert distribute_transpiler.py --- .../fluid/transpiler/distribute_transpiler.py | 126 +----------------- 1 file changed, 1 insertion(+), 125 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 1d51ed4579266..42ff0a9eb1112 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -14,7 +14,6 @@ from __future__ import print_function -import os import math import distributed_splitter as splitter @@ -27,10 +26,6 @@ LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR" -# for checkpoint -SUCCESS = "_SUCCESS" -SERIAL_VAR_NAME = "SERIAL_NUMBER" - class VarBlock: def __init__(self, varname, offset, size): @@ -161,8 +156,7 @@ def transpile(self, pservers="127.0.0.1:6174", trainers=1, split_method=splitter.round_robin, - sync_mode=True, - checkpoint_dir=None): + sync_mode=True): """ Transpile the program to distributed data-parallelism programs. The main_program will be transformed to use a remote parameter server @@ -216,12 +210,6 @@ def transpile(self, self.pserver_endpoints = pserver_endpoints self.optimize_ops, params_grads = self._get_optimize_pass() - # is_chief (no.0 triner) for checkpoint - # the no.0 trainer will save all variables and its own reader offset to checkpoint - # other trianers will save its own reader offset to checkpoint - self._is_chief = trainer_id == 0 - self.checkpoint_dir = checkpoint_dir - # process lookup_table_op # 1. check all lookup_table_op is distributed # 2. check all lookup_table_op share the same table. @@ -327,24 +315,6 @@ def transpile(self, "epmap": eplist, "sync_mode": self.sync_mode }) - - if self.checkpoint_dir and self._is_chief: - program.global_block().create_var( - name=SERIAL_VAR_NAME, - persistable=True, - type=core.VarDesc.VarType.RAW) - - save_vars = [] - for var in self.origin_program.list_vars(): - if self._is_persistable(var): - save_vars.append(var.name) - - program.global_block().append_op( - type="checkpoint_save", - inputs={"X": save_vars}, - attrs={"overwrite": True, - "dir": self.checkpoint_dir}) - # step4: Concat the parameters splits together after recv. 
for varname, splited_var in param_var_mapping.iteritems(): if len(splited_var) <= 1: @@ -525,37 +495,6 @@ def __append_optimize_op__(op, block, grad_to_block_id): pserver_program.sync_with_cpp() return pserver_program - def get_train_startup_program(self): - """ - Get train startup program. - If self.checkpoint_dir is None, rerurn default startup program. - IF self.checkpoint_dir is Exist, add checkpoint_load op and load Var. - """ - startup_prog = default_startup_program() - - if not self.checkpoint_dir: - return startup_prog - - load_vars = [] - for var in startup_prog.list_vars(): - if self._is_persistable(var): - load_vars.append(var.name) - - serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir) - - startup_prog.global_block().create_var( - name=SERIAL_VAR_NAME, - persistable=True, - type=core.VarDesc.VarType.RAW) - - startup_prog.global_block().append_op( - type="checkpoint_load", - inputs={"X": load_vars}, - outputs={"Argv": []}, - attrs={"dir": self.checkpoint_dir, - "Serial": serial_number}) - return startup_prog - def get_startup_program(self, endpoint, pserver_program): """ Get startup program for current parameter server. @@ -581,7 +520,6 @@ def _get_splited_name_and_shape(varname): created_var_map[var.name] = tmpvar # 2. rename op outputs - load_vars = [] for op in orig_s_prog.global_block().ops: new_inputs = dict() new_outputs = dict() @@ -609,70 +547,8 @@ def _get_splited_name_and_shape(varname): inputs=new_inputs, outputs=new_outputs, attrs=op.attrs) - for var in new_outputs.values(): - load_vars.append(var.name) - # add checkpoint op - if not self.checkpoint_dir: - return s_prog - - serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir) - - s_prog.global_block().create_var( - name=SERIAL_VAR_NAME, - persistable=True, - type=core.VarDesc.VarType.RAW) - - s_prog.global_block().append_op( - type="checkpoint_load", - inputs={"X": load_vars}, - outputs={"Argv": []}, - attrs={"dir": self.checkpoint_dir, - "Serial": serial_number}) - return s_prog - def _is_persistable(self, var): - """only save LodTensor variable""" - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.RAW : - return False - return var.persistable - - def _get_lastest_checkpoint_dir(self, checkpoint_dir): - """ - get the biggest number in checkpoint_dir, which has _SUCCESS - """ - if not checkpoint_dir.strip(): - return "" - - def has_success(checkpoint_dir, cur_dir): - """ - is _SUCCESS in this dir - """ - if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): - return -1 - - try: - int(cur_dir) - except ValueError: - return -1 - - success_path = os.path.join(checkpoint_dir, cur_dir, SUCCESS) - if os.path.isfile(success_path): - return int(cur_dir) - - if not os.path.isdir(checkpoint_dir): - return "-1" - - current_dir = 0 - dirs = os.listdir(checkpoint_dir) - for cur_dir in dirs: - success_num = has_success(checkpoint_dir, cur_dir) - if success_num > current_dir: - current_dir = success_num - return str(current_dir) - # transpiler function for dis lookup_table def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var, eplist): From be050565241780003cef777e0b0ad0e49cd7f6b1 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 21 May 2018 19:11:23 +0800 Subject: [PATCH 48/56] delete old checkpoint code --- paddle/fluid/operators/CMakeLists.txt | 3 - paddle/fluid/operators/checkpoint_load_op.cc | 213 ------------------- 
paddle/fluid/operators/checkpoint_op_test.cc | 82 ------- paddle/fluid/operators/checkpoint_save_op.cc | 203 ------------------ python/paddle/fluid/framework.py | 3 +- python/paddle/fluid/io.py | 36 +++- 6 files changed, 32 insertions(+), 508 deletions(-) delete mode 100644 paddle/fluid/operators/checkpoint_load_op.cc delete mode 100644 paddle/fluid/operators/checkpoint_op_test.cc delete mode 100644 paddle/fluid/operators/checkpoint_save_op.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 2288987eaf9f9..ac1f3f44ae870 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -252,8 +252,6 @@ op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) -op_library(checkpoint_save_op DEPS lod_tensor) -op_library(checkpoint_load_op DEPS lod_tensor) op_library(concat_op DEPS concat) # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency @@ -294,6 +292,5 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) -cc_test(checkpoint_op_test SRCS checkpoint_op_test.cc DEPS checkpoint_save_op checkpoint_load_op) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc deleted file mode 100644 index 18871e56c5017..0000000000000 --- a/paddle/fluid/operators/checkpoint_load_op.cc +++ /dev/null @@ -1,213 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { - -constexpr char kSEP = '/'; -// write empty file named _SUCCESS -const char SUCCESS[] = "_SUCCESS"; -const char SERIAL_VAR[] = "SERIAL_NUMBER"; - -static bool FileExists(const std::string &filepath) { - struct stat buffer; - return (stat(filepath.c_str(), &buffer) == 0); -} - -static std::string GenePath(const std::string &dir, const std::string &file) { - std::string file_path; - file_path.append(file_path); - file_path.append("/"); - file_path.append(file); - return file_path; -} - -static bool IsNumber(const std::string &s) { - std::string::const_iterator it = s.begin(); - while (it != s.end() && std::isdigit(*it)) ++it; - return !s.empty() && it == s.end(); -} - -static void LoadInputVars(const framework::Scope &scope, - const platform::Place &place, - const std::vector &inp_var_names, - const std::string &dir) { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - // todo (tangwei) made it async - for (size_t i = 0; i < inp_var_names.size(); i++) { - auto *var = scope.FindVar(inp_var_names[i]); - - PADDLE_ENFORCE(var != nullptr, - "Cannot find variable %s for save_combine_op", - inp_var_names[i]); - PADDLE_ENFORCE(var->IsType(), - "LoadCombineOp only supports LoDTensor, %s has wrong type", - inp_var_names[i]); - - std::string var_file = GenePath(dir, inp_var_names[i]); - auto *tensor = var->GetMutable(); - std::ifstream fin(var_file); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", - var_file); - framework::DeserializeFromStream(fin, tensor, dev_ctx); - fin.close(); - VLOG(3) << " load var: " << inp_var_names[i] << " finished"; - } -} - -static void LoadStringArgv(const framework::Scope &scope, - const platform::Place &place, - const std::vector &argv, - const std::string &dir) { - for (size_t i = 0; i < argv.size(); i++) { - auto *var = scope.FindVar(argv[i]); - std::string *var_str = var->GetMutable(); - std::string var_file = GenePath(dir, argv[i]); - std::ifstream fin(var_file); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", - var_file); - std::getline(fin, *var_str); - fin.close(); - VLOG(3) << " load String argv: " << argv[i] << " value is: " << var_str; - } -} - -class CheckpointLoadOp : public framework::OperatorBase { - public: - CheckpointLoadOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - std::string dir = Attr("dir"); - std::string serial_num_attr = Attr("Serial"); - - VLOG(3) << "CheckpointLoadOp get Attr dir: " << dir; - VLOG(3) << "CheckpointLoadOp get Attr Serial: " << serial_num_attr; - - std::string serial_var_name = std::string(SERIAL_VAR); - auto *serial_var = scope.FindVar(serial_var_name); - PADDLE_ENFORCE(serial_var != nullptr, - "Cannot find variable %s for checkpoint_load_op", - serial_var_name); - - auto *serial_num = 
serial_var->GetMutable(); - serial_num->clear(); - serial_num->append(serial_num_attr); - - VLOG(1) << "CheckpointLoadOp set " << SERIAL_VAR - << " value: " << serial_num; - - std::string success = GenePath(dir, serial_num->c_str()); - VLOG(3) << "Load checkpoint from dir: " << success; - success = GenePath(success, SUCCESS); - bool is_present = FileExists(success); - if (!is_present) { - VLOG(1) << "CheckpointLoadOp can not find " << SUCCESS - << " from: " << success; - return; - } - - VLOG(3) << "Ready to load vars to scope"; - auto inp_var_names = Inputs("X"); - PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, - "The number of input variables should be greater than 0"); - LoadInputVars(scope, place, inp_var_names, dir); - - // VLOG(3) << "Ready to load string argv to scope"; - // auto argv = Output("Argv"); - // LoadStringArgv(scope, place, argv, dir); - } -}; - -class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(vector) Input LoDTensors that need to be saved together in a file.") - .AsDuplicable(); - AddOutput( - "Argv", - "(vector) Input LoDTensors that need to be saved together in a file."); - AddComment(R"DOC( -CheckpointLoad operator - -This operator will serialize and write a list of input LoDTensor variables -to a file on disk. -)DOC"); - - AddAttr( - "Serial", - "(std::string)" - "The serial number of the checkpoint will to be load."); - AddAttr( - "dir", - "(string)" - "The \"dir\" where the checkpoint files will be loaded.") - .AddCustomChecker( - [](const std::string &path) { return !path.empty(); }); - } -}; - -class CheckpointLoadOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto out_var_name = op_desc.Output("Argv").front(); - auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto var_type = framework::proto::VarType::RAW; - out_var.SetType(var_type); - } -}; - -class CheckpointLoadOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp, - paddle::framework::EmptyGradOpMaker, - ops::CheckpointLoadOpProtoMaker, - ops::CheckpointLoadOpVarTypeInference, - ops::CheckpointLoadOpShapeInference); - -// REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp, -// ops::CheckpointLoadOpProtoMaker); diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc deleted file mode 100644 index 5312225e5f952..0000000000000 --- a/paddle/fluid/operators/checkpoint_op_test.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" - -USE_NO_KERNEL_OP(checkpoint_save) -USE_NO_KERNEL_OP(checkpoint_load) - -TEST(CheckpointSaveOp, CPU) { - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - auto var = scope.Var("test_var"); - auto tensor = var->GetMutable(); - tensor->Resize({3, 10}); - paddle::framework::LoD expect_lod; - expect_lod.resize(1); - expect_lod[0].push_back(0); - expect_lod[0].push_back(1); - expect_lod[0].push_back(2); - expect_lod[0].push_back(3); - - tensor->set_lod(expect_lod); - float* expect = tensor->mutable_data(place); - for (int64_t i = 0; i < tensor->numel(); ++i) { - expect[i] = static_cast(paddle::platform::float16(i)); - } - - scope.Var("SERIAL_NUMBER"); - - paddle::framework::AttributeMap attrs; - attrs.insert({"dir", std::string("ckpt")}); - - auto save_op = paddle::framework::OpRegistry::CreateOp( - "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs); - save_op->Run(scope, place); -} - -TEST(CheckpointLoadOp, CPU) { - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - auto var = scope.Var("test_var"); - auto tensor = var->GetMutable(); - tensor->Resize({3, 10}); - paddle::framework::LoD expect_lod; - expect_lod.resize(1); - expect_lod[0].push_back(0); - expect_lod[0].push_back(1); - expect_lod[0].push_back(2); - expect_lod[0].push_back(3); - - tensor->set_lod(expect_lod); - float* expect = tensor->mutable_data(place); - for (int64_t i = 0; i < tensor->numel(); ++i) { - expect[i] = static_cast(paddle::platform::float16(i)); - } - - scope.Var("SERIAL_NUMBER"); - auto* serial_num = scope.FindVar("SERIAL_NUMBER")->GetMutable(); - serial_num->append("0"); - - paddle::framework::AttributeMap attrs; - attrs.insert({"dir", std::string("ckpt")}); - attrs.insert({"Serial", std::string("SERIAL_NUMBER")}); - - auto load_op = paddle::framework::OpRegistry::CreateOp( - "checkpoint_load", {{"X", {"test_var"}}}, {{"Argv", {}}}, attrs); - load_op->Run(scope, place); -} diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc deleted file mode 100644 index f904cdc8269e7..0000000000000 --- a/paddle/fluid/operators/checkpoint_save_op.cc +++ /dev/null @@ -1,203 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { - -constexpr char kSEP = '/'; -// write empty file named _SUCCESS -const char SUCCESS[] = "_SUCCESS"; -const char SERIAL_VAR[] = "SERIAL_NUMBER"; - -static bool IsNumber(const std::string &s) { - std::string::const_iterator it = s.begin(); - while (it != s.end() && std::isdigit(*it)) ++it; - return !s.empty() && it == s.end(); -} - -static std::string GenePath(const std::string &dir, const std::string &file) { - std::string file_path; - file_path.append(dir); - file_path.append("/"); - file_path.append(file); - return file_path; -} - -static bool FileExists(const std::string &filepath) { - struct stat buffer; - return (stat(filepath.c_str(), &buffer) == 0); -} - -static std::string DirName(const std::string &filepath) { - auto pos = filepath.rfind(kSEP); - if (pos == std::string::npos) { - return ""; - } - return filepath.substr(0, pos); -} - -static void MkDir(const char *path) { - if (mkdir(path, 0755)) { - PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); - } -} - -static void MkDirRecursively(const char *fullpath) { - if (*fullpath == '\0') return; // empty string - if (FileExists(fullpath)) return; - - MkDirRecursively(DirName(fullpath).c_str()); - MkDir(fullpath); -} - -class CheckpointSaveOp : public framework::OperatorBase { - public: - CheckpointSaveOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto ck_dir = Attr("dir"); - auto overwrite = Attr("overwrite"); - - std::string serial_var_name = std::string(SERIAL_VAR); - auto *serial_num = - scope.FindVar(serial_var_name)->GetMutable(); - VLOG(1) << "CheckpointSaveOp get " << SERIAL_VAR - << " value: " << serial_num; - - int serials = 0; - if (!serial_num->empty()) { - serials = std::stoi(serial_num->data()); - serials += 1; - } - - serial_num->clear(); - serial_num->append(std::to_string(serials)); - - std::string dir = GenePath(ck_dir, serial_num->c_str()); - VLOG(1) << "CheckpointSaveOp current dir: " << dir; - bool is_present = FileExists(dir); - if (is_present && !overwrite) { - PADDLE_THROW("%s exists!, checkpoint save cannot to overwrite it", dir, - overwrite); - } - MkDirRecursively(dir.c_str()); - - auto inp_var_names = Inputs("X"); - PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, - "The number of input variables should be greater than 0"); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - // todo (tangwei) made it async - for (size_t i = 0; i < inp_var_names.size(); i++) { - auto *var = scope.FindVar(inp_var_names[i]); - - PADDLE_ENFORCE(var != nullptr, - "Cannot find variable %s for checkpoint save op", - inp_var_names[i]); - PADDLE_ENFORCE( - var->IsType(), - "CheckpointSaveOp only supports LoDTensor, %s has wrong type", - inp_var_names[i]); - - auto &tensor = var->Get(); - // Serialize tensors one by one - std::string var_file = 
GenePath(dir, inp_var_names[i]); - std::ofstream fout(var_file); - framework::SerializeToStream(fout, tensor, dev_ctx); - fout.close(); - } - - std::string success = GenePath(dir, SUCCESS); - std::ofstream fout(success); - fout.close(); - } -}; - -class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(vector) Input LoDTensors that need to be saved together in a file.") - .AsDuplicable(); - AddComment(R"DOC( -CheckpointSave operator - -This operator will serialize and write a list of input LoDTensor variables -to a file on disk. -)DOC"); - AddAttr("overwrite", - "(boolean, default false)" - "Delete the output dir if it exists.") - .SetDefault(false); - - AddAttr("dir", - "(string)" - "The dir where the LoDTensor variables will be saved.") - .AddCustomChecker( - [](const std::string &path) { return !path.empty(); }); - } -}; - -// class CheckpointSaveOpVarTypeInference : public framework::VarTypeInference { -// public: -// void operator()(const framework::OpDesc &op_desc, -// framework::BlockDesc *block) const override { -// auto out_var_name = op_desc.Output("Serial").front(); -// auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); -// auto var_type = framework::proto::VarType::RAW; -// out_var.SetType(var_type); -// } -// }; - -// class CheckpointSaveOpShapeInference : public framework::InferShapeBase { -// public: -// void operator()(framework::InferShapeContext *ctx) const override {} -// }; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp, - ops::CheckpointSaveOpProtoMaker); - -// REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp, -// paddle::framework::EmptyGradOpMaker, -// ops::CheckpointSaveOpProtoMaker, -// ops::CheckpointSaveOpVarTypeInference, -// ops::CheckpointSaveOpShapeInference); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index c5044a07c9421..38c765938fe9d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -489,8 +489,7 @@ def find_name(var_list, name): 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv', 'select', 'gen_nccl_id', - 'checkpoint_save', 'checkpoint_load' + 'channel_send', 'channel_recv', 'select', 'gen_nccl_id' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 83c32fe9d6ebd..b1748f0ad0a39 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -455,7 +455,7 @@ def get_parameter_value_by_name(name, executor, program=None): SUCCESS = "_SUCCESS" -BEGIN_SECS = time.time() +BEGIN_SECS = None def save_checkpoint(executor, @@ -478,13 +478,21 @@ def save_checkpoint(executor, os.makedirs(dirname) global BEGIN_SECS - if time.time() - BEGIN_SECS < save_secs: - return + if BEGIN_SECS is not None: + if time.time() - BEGIN_SECS < save_secs: + return BEGIN_SECS = time.time() serial = _get_lastest_checkpoint_dir(dirname) + 1 cur_dir = os.path.join(dirname, str(serial)) - save_persistables(executor, cur_dir, main_program) + # save_persistables(executor, cur_dir, main_program) + save_vars( + executor, + dirname=cur_dir, + main_program=main_program, + vars=None, + predicate=is_checkpoint_var, + filename=None) _write_success(cur_dir) 
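With this change the checkpoint writer no longer goes through save_persistables: save_vars is called with a predicate so that only checkpoint-eligible variables are written, each to its own file inside the serial directory (filename=None), before the _SUCCESS marker is dropped in last. The predicate itself, is_checkpoint_var, shown just below in this same diff, keeps persistable variables and skips feed/fetch/RAW bookkeeping variables as well as gradients. A standalone, hedged sketch of that selection rule, using plain strings in place of the VarDesc type enum and made-up variable names:

def keep_for_checkpoint(var_type, var_name, persistable):
    # mirrors is_checkpoint_var: drop feed/fetch/RAW vars and anything that
    # looks like a gradient, keep the remaining persistable variables
    if var_type in ("FEED_MINIBATCH", "FETCH_LIST", "RAW"):
        return False
    if var_name.endswith("@GRAD"):
        return False
    return persistable

print(keep_for_checkpoint("LOD_TENSOR", "fc_0.w_0", True))       # True
print(keep_for_checkpoint("LOD_TENSOR", "fc_0.w_0@GRAD", True))  # False
print(keep_for_checkpoint("RAW", "SERIAL_NUMBER", True))         # False
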
_lru_delete(dirname, keep_max) @@ -505,7 +513,25 @@ def restore_checkpoint(dirname, executor, main_program=None): if serial < 0: return cur_dir = os.path.join(dirname, str(serial)) - load_persistables(executor, cur_dir, main_program) + # load_persistables(executor, cur_dir, main_program) + load_vars( + executor, + dirname=cur_dir, + main_program=main_program, + predicate=is_checkpoint_var, + filename=None) + + +def is_checkpoint_var(var): + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW: + return False + + if var.name.endswith("@GRAD"): + return False + + return var.persistable def _lru_delete(dirname, keep_max=3): From 2412dee3769189e2e1f94cc0e2c298c4c1035699 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 21 May 2018 21:20:03 +0800 Subject: [PATCH 49/56] code optimized --- python/paddle/fluid/io.py | 55 +++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index b1748f0ad0a39..01debaff56a61 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -454,17 +454,16 @@ def get_parameter_value_by_name(name, executor, program=None): return get_parameter_value(var, executor) -SUCCESS = "_SUCCESS" -BEGIN_SECS = None +SUCCESS_MARK_FILENAME = "_SUCCESS" def save_checkpoint(executor, - dirname, - keep_max=3, - save_secs=600, + dirname=None, + max_num_checkpoints=3, + save_interval_secs=600, main_program=None): """ - Save Variables to Checkpint Dir + Save Variables to Checkpoint Directory :param dirname :param keep_max @@ -472,20 +471,19 @@ def save_checkpoint(executor, :param main_program """ if dirname is None: - raise Exception("save checkpoint dir can not be none") + dirname = os.getcwd() if not os.path.isdir(dirname): os.makedirs(dirname) - global BEGIN_SECS - if BEGIN_SECS is not None: - if time.time() - BEGIN_SECS < save_secs: - return - BEGIN_SECS = time.time() + serial = _get_lastest_checkpoint_dir(dirname) + if serial >= 0 and not _interval_secs_exceed( + os.path.join(dirname, str(serial)), save_interval_secs): + return - serial = _get_lastest_checkpoint_dir(dirname) + 1 + serial = serial + 1 cur_dir = os.path.join(dirname, str(serial)) - # save_persistables(executor, cur_dir, main_program) + save_vars( executor, dirname=cur_dir, @@ -494,10 +492,10 @@ def save_checkpoint(executor, predicate=is_checkpoint_var, filename=None) _write_success(cur_dir) - _lru_delete(dirname, keep_max) + _lru_delete(dirname, max_num_checkpoints) -def restore_checkpoint(dirname, executor, main_program=None): +def restore_checkpoint(executor, dirname=None, main_program=None): """ Load Variables from Checkpint Dir @@ -505,15 +503,16 @@ def restore_checkpoint(dirname, executor, main_program=None): :param executor :param main_program """ - if dirname is None and os.path.isdir(dirname): - raise Exception("restore checkpoint can not load variables from %s" % - dirname) + + if dirname is None: + dirname = os.getcwd() + serial = _get_lastest_checkpoint_dir(dirname) if serial < 0: return cur_dir = os.path.join(dirname, str(serial)) - # load_persistables(executor, cur_dir, main_program) + load_vars( executor, dirname=cur_dir, @@ -523,6 +522,10 @@ def restore_checkpoint(dirname, executor, main_program=None): def is_checkpoint_var(var): + """ + VarType will fliter out FEED_MINIBATCH FETCH_LIST RAW + VarName will fliter out Gradient + """ if var.desc.type() == 
core.VarDesc.VarType.FEED_MINIBATCH or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.RAW: @@ -534,6 +537,13 @@ def is_checkpoint_var(var): return var.persistable +def _interval_secs_exceed(dirname, save_interval_secs): + dir_time = os.path.getmtime(dirname) + if save_interval_secs > (time.time() - dir_time): + return False + return True + + def _lru_delete(dirname, keep_max=3): """ retain checkpoint nums with keep_max @@ -560,7 +570,7 @@ def _write_success(dirname): """ write _SUCCESS to checkpoint dir """ - success_file = os.path.join(dirname, SUCCESS) + success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) with open(success_file, 'a'): pass @@ -584,7 +594,8 @@ def has_success(checkpoint_dir, cur_dir): except ValueError: return -1 - success_path = os.path.join(checkpoint_dir, cur_dir, SUCCESS) + success_path = os.path.join(checkpoint_dir, cur_dir, + SUCCESS_MARK_FILENAME) if os.path.isfile(success_path): return int(cur_dir) From e901de66814041adfec471673ac970de2ffe7bbc Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 22 May 2018 10:07:15 +0800 Subject: [PATCH 50/56] update var name --- python/paddle/fluid/io.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 01debaff56a61..ac26991d41dd6 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -23,7 +23,7 @@ __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model', - 'get_inference_program', 'save_checkpoint', 'restore_checkpoint' + 'get_inference_program', 'save_checkpoint', 'load_checkpoint' ] @@ -466,7 +466,7 @@ def save_checkpoint(executor, Save Variables to Checkpoint Directory :param dirname - :param keep_max + :param max_num_checkpoints :param save_secs :param main_program """ @@ -495,7 +495,7 @@ def save_checkpoint(executor, _lru_delete(dirname, max_num_checkpoints) -def restore_checkpoint(executor, dirname=None, main_program=None): +def load_checkpoint(executor, dirname=None, main_program=None): """ Load Variables from Checkpint Dir @@ -544,9 +544,9 @@ def _interval_secs_exceed(dirname, save_interval_secs): return True -def _lru_delete(dirname, keep_max=3): +def _lru_delete(dirname, max_num_checkpoints=3): """ - retain checkpoint nums with keep_max + retain checkpoint nums with max_num_checkpoints """ dirs = os.listdir(dirname) serials = [] @@ -556,11 +556,11 @@ def _lru_delete(dirname, keep_max=3): except ValueError: continue - if len(serials) <= keep_max: + if len(serials) <= max_num_checkpoints: return serials.sort(reverse=True) - serials = serials[keep_max:] + serials = serials[max_num_checkpoints:] for serial in serials: cur_dir = os.path.join(dirname, str(serial)) shutil.rmtree(cur_dir) From 27b717516f466ca1068af5a211fdda9d35f5334d Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 22 May 2018 18:02:31 +0800 Subject: [PATCH 51/56] update python annotation --- python/paddle/fluid/io.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index ac26991d41dd6..3a7b68a682d04 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -463,8 +463,11 @@ def save_checkpoint(executor, save_interval_secs=600, main_program=None): """ - Save Variables to Checkpoint Directory - + Save Checkpoint will save persistable LodTensor variables from 
main_program in checkpoint directory, + directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy + to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, + The interval time between two save_checkpoint must great than or equal to save_interval_secs. + :param dirname :param max_num_checkpoints :param save_secs @@ -489,7 +492,7 @@ def save_checkpoint(executor, dirname=cur_dir, main_program=main_program, vars=None, - predicate=is_checkpoint_var, + predicate=_is_checkpoint_var, filename=None) _write_success(cur_dir) _lru_delete(dirname, max_num_checkpoints) @@ -497,10 +500,11 @@ def save_checkpoint(executor, def load_checkpoint(executor, dirname=None, main_program=None): """ - Load Variables from Checkpint Dir + Load checkpoint from directory by executor, + it will find lastest checkpoint file and load it auto. - :param dirname :param executor + :param dirname :param main_program """ @@ -517,14 +521,16 @@ def load_checkpoint(executor, dirname=None, main_program=None): executor, dirname=cur_dir, main_program=main_program, - predicate=is_checkpoint_var, + predicate=_is_checkpoint_var, filename=None) -def is_checkpoint_var(var): +def _is_checkpoint_var(var): """ - VarType will fliter out FEED_MINIBATCH FETCH_LIST RAW - VarName will fliter out Gradient + checkpoint will not save or load all the variables. + var type is FEED_MINIBATCH/FETCH_LIST/RAW and var name is end with @GRAD are discarded. + + :param var """ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ @@ -545,9 +551,6 @@ def _interval_secs_exceed(dirname, save_interval_secs): def _lru_delete(dirname, max_num_checkpoints=3): - """ - retain checkpoint nums with max_num_checkpoints - """ dirs = os.listdir(dirname) serials = [] for serial in dirs: @@ -568,7 +571,9 @@ def _lru_delete(dirname, max_num_checkpoints=3): def _write_success(dirname): """ - write _SUCCESS to checkpoint dir + write an empty _SUCCESS file to checkpoint dir, indicate this checkpoint is correct. + + :param dirname """ success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) with open(success_file, 'a'): @@ -577,7 +582,9 @@ def _write_success(dirname): def _get_lastest_checkpoint_dir(checkpoint_dir): """ - get the biggest number in checkpoint_dir, which has _SUCCESS + get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory + + :param checkpoint_dir """ if not checkpoint_dir.strip(): return -1 From 9d985340e5a1ebf7df7a1a8f9d324c08d4d07a97 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 23 May 2018 10:24:22 +0800 Subject: [PATCH 52/56] update annotation grammar --- python/paddle/fluid/io.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 3a7b68a682d04..845e8c9ca2765 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -464,10 +464,10 @@ def save_checkpoint(executor, main_program=None): """ Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, - directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy + the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, The interval time between two save_checkpoint must great than or equal to save_interval_secs. 
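The policy this docstring describes, one serial-numbered directory per successful save, a _SUCCESS marker written last, at most max_num_checkpoints directories retained, and no new save until save_interval_secs have passed, can be pictured with a small standalone sketch that needs no Paddle at all (the temporary directory and the number of simulated saves are assumptions for illustration):

import os
import shutil
import tempfile

root = tempfile.mkdtemp(prefix="ckpt_demo_")
max_num_checkpoints = 3

for serial in range(5):                        # pretend five checkpoints were written
    cur_dir = os.path.join(root, str(serial))  # one serial-numbered directory per save
    os.makedirs(cur_dir)
    open(os.path.join(cur_dir, "_SUCCESS"), "a").close()  # marker written last

# LRU pruning as in _lru_delete: keep only the newest max_num_checkpoints serials.
serials = sorted((int(d) for d in os.listdir(root) if d.isdigit()), reverse=True)
for stale in serials[max_num_checkpoints:]:
    shutil.rmtree(os.path.join(root, str(stale)))

print(sorted(os.listdir(root)))  # -> ['2', '3', '4']

shutil.rmtree(root)

In the real save_checkpoint the interval check happens first: if the newest serial directory was modified less than save_interval_secs ago, the call returns without writing anything.
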
- + :param dirname :param max_num_checkpoints :param save_secs @@ -500,8 +500,8 @@ def save_checkpoint(executor, def load_checkpoint(executor, dirname=None, main_program=None): """ - Load checkpoint from directory by executor, - it will find lastest checkpoint file and load it auto. + Load checkpoint from a directory by executor, + it will find latest checkpoint file and load it auto. :param executor :param dirname @@ -527,9 +527,9 @@ def load_checkpoint(executor, dirname=None, main_program=None): def _is_checkpoint_var(var): """ - checkpoint will not save or load all the variables. - var type is FEED_MINIBATCH/FETCH_LIST/RAW and var name is end with @GRAD are discarded. - + the checkpoint will not save or load all the variables. + var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. + :param var """ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ @@ -571,7 +571,7 @@ def _lru_delete(dirname, max_num_checkpoints=3): def _write_success(dirname): """ - write an empty _SUCCESS file to checkpoint dir, indicate this checkpoint is correct. + write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. :param dirname """ From d96b4427a25a7839d11aa9c94224570c35e51d76 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 23 May 2018 11:05:09 +0800 Subject: [PATCH 53/56] rename checkpoint folder to checkpoint_serial --- python/paddle/fluid/io.py | 66 +++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 845e8c9ca2765..239736aad0854 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -455,10 +455,12 @@ def get_parameter_value_by_name(name, executor, program=None): SUCCESS_MARK_FILENAME = "_SUCCESS" +CHECKPOINT_PREFIX = "checkpoint" +CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, - dirname=None, + checkpoint_dir=None, max_num_checkpoints=3, save_interval_secs=600, main_program=None): @@ -466,26 +468,27 @@ def save_checkpoint(executor, Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, - The interval time between two save_checkpoint must great than or equal to save_interval_secs. + The interval between two saved checkpoints must greater than save_interval_secs. 
- :param dirname + :param executor + :param checkpoint_dir :param max_num_checkpoints - :param save_secs + :param save_interval_secs :param main_program """ - if dirname is None: - dirname = os.getcwd() + if checkpoint_dir is None: + checkpoint_dir = os.getcwd() - if not os.path.isdir(dirname): - os.makedirs(dirname) + if not os.path.isdir(checkpoint_dir): + os.makedirs(checkpoint_dir) - serial = _get_lastest_checkpoint_dir(dirname) + serial = _get_lastest_checkpoint_dir(checkpoint_dir) if serial >= 0 and not _interval_secs_exceed( - os.path.join(dirname, str(serial)), save_interval_secs): + _get_serial_dir(serial, checkpoint_dir), save_interval_secs): return - serial = serial + 1 - cur_dir = os.path.join(dirname, str(serial)) + serial += 1 + cur_dir = _get_serial_dir(serial, checkpoint_dir) save_vars( executor, @@ -495,27 +498,28 @@ def save_checkpoint(executor, predicate=_is_checkpoint_var, filename=None) _write_success(cur_dir) - _lru_delete(dirname, max_num_checkpoints) + _lru_delete(checkpoint_dir, max_num_checkpoints) -def load_checkpoint(executor, dirname=None, main_program=None): +def load_checkpoint(executor, checkpoint_dir=None, main_program=None): """ Load checkpoint from a directory by executor, - it will find latest checkpoint file and load it auto. + it will find the most recent saved checkpoint file and load it auto. :param executor - :param dirname + :param checkpoint_dir :param main_program """ - if dirname is None: - dirname = os.getcwd() + if checkpoint_dir is None: + checkpoint_dir = os.getcwd() - serial = _get_lastest_checkpoint_dir(dirname) + serial = _get_lastest_checkpoint_dir(checkpoint_dir) if serial < 0: return - cur_dir = os.path.join(dirname, str(serial)) + + cur_dir = _get_serial_dir(serial, checkpoint_dir) load_vars( executor, @@ -525,6 +529,11 @@ def load_checkpoint(executor, dirname=None, main_program=None): filename=None) +def _get_serial_dir(serial, checkpoint_dir): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + return os.path.join(checkpoint_dir, serial_folder) + + def _is_checkpoint_var(var): """ the checkpoint will not save or load all the variables. 
@@ -577,7 +586,8 @@ def _write_success(dirname): """ success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) with open(success_file, 'a'): - pass + now = time.ctime() + success_file.write(now) def _get_lastest_checkpoint_dir(checkpoint_dir): @@ -593,18 +603,20 @@ def has_success(checkpoint_dir, cur_dir): """ is _SUCCESS in this dir """ - if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): - return -1 + _, serial = cur_dir.split(CHECKPOINT_SEPARATOR) try: - int(cur_dir) + int(serial) except ValueError: return -1 - success_path = os.path.join(checkpoint_dir, cur_dir, - SUCCESS_MARK_FILENAME) + if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): + return -1 + + success_path = os.path.join( + _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME) if os.path.isfile(success_path): - return int(cur_dir) + return int(serial) if not os.path.isdir(checkpoint_dir): return -1 From 192f9a5a70a12bf57ec487d791f535e515524bd0 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 23 May 2018 11:37:24 +0800 Subject: [PATCH 54/56] bug fix --- python/paddle/fluid/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 239736aad0854..c638da67c825d 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -585,9 +585,9 @@ def _write_success(dirname): :param dirname """ success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) - with open(success_file, 'a'): + with open(success_file, 'a') as f: now = time.ctime() - success_file.write(now) + f.write(now) def _get_lastest_checkpoint_dir(checkpoint_dir): From cf3fb2488c667b1cfbf7bc4a5c7441bdf837b6e7 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 23 May 2018 17:26:21 +0800 Subject: [PATCH 55/56] add clean checkpoint --- python/paddle/fluid/io.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index c638da67c825d..9e0bc425f0e34 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -529,6 +529,19 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None): filename=None) +def clean_checkpoint(checkpoint_dir, delete_dir=False): + """ + clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. + delete_dir only works when the directory is empty, otherwise, OSError is raised. 
+ """ + if checkpoint_dir is None: + checkpoint_dir = os.getcwd() + _lru_delete(checkpoint_dir, max_num_checkpoints=0) + + if delete_dir and not os.listdir(checkpoint_dir): + os.rmdir(checkpoint_dir) + + def _get_serial_dir(serial, checkpoint_dir): serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) return os.path.join(checkpoint_dir, serial_folder) From 2c47e067ae8485c6ad1ae0be870b792775e4e276 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 23 May 2018 18:03:20 +0800 Subject: [PATCH 56/56] add clean checkpoint --- python/paddle/fluid/io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 9e0bc425f0e34..8e58e5eb794e1 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -23,7 +23,8 @@ __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model', - 'get_inference_program', 'save_checkpoint', 'load_checkpoint' + 'get_inference_program', 'save_checkpoint', 'load_checkpoint', + 'clean_checkpoint' ]