[Inference] Add TryShrinkMemory interface. #28409

Merged: 10 commits, Nov 11, 2020
15 changes: 13 additions & 2 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -175,7 +175,10 @@ bool AnalysisPredictor::PrepareScope(
status_is_cloned_ = true;
} else {
paddle::framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
scope_.reset(new paddle::framework::Scope(), [&](framework::Scope *scope) {
delete scope;
memory::Release(place_);
});
status_is_cloned_ = false;
}
sub_scope_ = &scope_->NewScope();
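
The scope_ change above uses the deleter overload of std::shared_ptr::reset: when the last owner of the scope goes away, the lambda first destroys the scope and then hands the pooled memory for place_ back. A minimal standalone sketch of that pattern follows, using placeholder Scope/ReleasePool stand-ins rather than Paddle's real types:

#include <iostream>
#include <memory>

struct Scope {  // stand-in for framework::Scope
  ~Scope() { std::cout << "scope destroyed\n"; }
};
void ReleasePool() { std::cout << "pool released\n"; }  // stand-in for memory::Release(place_)

int main() {
  std::shared_ptr<Scope> scope;
  // reset(ptr, deleter): the lambda runs once, when the last shared_ptr owner
  // drops the scope, so the pool is shrunk only after every tensor owned by
  // the scope has been destroyed.
  scope.reset(new Scope(), [](Scope *s) {
    delete s;
    ReleasePool();
  });
  scope.reset();  // prints "scope destroyed", then "pool released"
  return 0;
}
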
@@ -591,7 +594,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
gflags.push_back("--allocator_strategy=thread_local");
process_level_allocator_enabled = false;
} else {
gflags.push_back("--allocator_strategy=naive_best_fit");
process_level_allocator_enabled = true;
}

@@ -890,6 +892,11 @@ bool AnalysisPredictor::LoadParameters() {
return true;
}

uint64_t AnalysisPredictor::TryShrinkMemory() {
ClearIntermediateTensor();
return paddle::memory::Release(place_);
}

void AnalysisPredictor::ClearIntermediateTensor() {
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
platform::errors::PreconditionNotMet(
@@ -985,6 +992,8 @@ AnalysisPredictor::~AnalysisPredictor() {
mkldnn_quantizer_ = nullptr;
}
#endif

memory::Release(place_);
}

std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -1142,6 +1151,8 @@ void Predictor::ClearIntermediateTensor() {
predictor_->ClearIntermediateTensor();
}

uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }

int GetNumBytesOfDataType(DataType dtype) {
switch (dtype) {
case DataType::FLOAT32:
11 changes: 11 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -193,6 +193,17 @@ class AnalysisPredictor : public PaddlePredictor {
///
void ClearIntermediateTensor();

///
/// \brief Release all temporary tensors to shrink the memory pool.
/// The memory pool is treated as a list of chunks; a chunk can be released
/// only when it is not occupied.
///
/// \return The number of bytes released. This may be smaller than the amount
/// of memory actually freed, because part of the memory is not managed by
/// the MemoryPool.
///
uint64_t TryShrinkMemory() override;

///
/// \brief Get the argument used by predictor
///
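
The chunk model described in the comment above can be pictured with a small toy allocator. This is purely illustrative and is not Paddle's actual memory pool: only chunks that hold no live tensor can be handed back, which is also why the reported byte count can differ from the memory the system actually reclaims.

#include <cstdint>
#include <vector>

struct Chunk {        // toy stand-in for a memory-pool chunk
  uint64_t size = 0;
  bool occupied = false;
};

// Release every unoccupied chunk and report how many bytes were freed.
uint64_t TryShrink(std::vector<Chunk> *pool) {
  uint64_t released = 0;
  for (auto it = pool->begin(); it != pool->end();) {
    if (!it->occupied) {
      released += it->size;   // idle chunk: give it back
      it = pool->erase(it);
    } else {
      ++it;                   // chunk still in use: keep it
    }
  }
  return released;            // counts pool-managed bytes only
}
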
46 changes: 44 additions & 2 deletions paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -135,6 +135,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
auto* out_data = out->data<float>(&place, &size);
LOG(INFO) << "output size: " << size / sizeof(float);
LOG(INFO) << "output_data: " << out_data;
predictor->TryShrinkMemory();
}

TEST(AnalysisPredictor, Clone) {
@@ -253,8 +254,7 @@ class MkldnnQuantizerTest : public testing::Test {
public:
MkldnnQuantizerTest() {
AnalysisConfig config(FLAGS_dirname);

predictor.reset(new AnalysisPredictor(config));
predictor = std::move(CreatePaddlePredictor(config));
auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());

auto qconfig = new MkldnnQuantizerConfig();
@@ -507,3 +507,45 @@ TEST(AnalysisPredictor, bf16_pass_strategy) {
}

} // namespace paddle

namespace paddle_infer {

TEST(Predictor, Run) {
Config config;
config.SetModel(FLAGS_dirname);

auto predictor = CreatePredictor(config);

auto w0 = predictor->GetInputHandle("firstw");
auto w1 = predictor->GetInputHandle("secondw");
auto w2 = predictor->GetInputHandle("thirdw");
auto w3 = predictor->GetInputHandle("forthw");

w0->Reshape({4, 1});
w1->Reshape({4, 1});
w2->Reshape({4, 1});
w3->Reshape({4, 1});

auto* w0_data = w0->mutable_data<int64_t>(PlaceType::kCPU);
auto* w1_data = w1->mutable_data<int64_t>(PlaceType::kCPU);
auto* w2_data = w2->mutable_data<int64_t>(PlaceType::kCPU);
auto* w3_data = w3->mutable_data<int64_t>(PlaceType::kCPU);

for (int i = 0; i < 4; i++) {
w0_data[i] = i;
w1_data[i] = i;
w2_data[i] = i;
w3_data[i] = i;
}

predictor->Run();

auto out = predictor->GetOutputHandle("fc_1.tmp_2");
PlaceType place;
int size = 0;
out->data<float>(&place, &size);
LOG(INFO) << "output size: " << size / sizeof(float);
predictor->TryShrinkMemory();
}

} // namespace paddle_infer
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/api_tester.cc
@@ -60,6 +60,7 @@ TEST(paddle_inference_api, demo) {
auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> outputs;
predictor->Run({}, &outputs);
predictor->TryShrinkMemory();
}

TEST(paddle_inference_api, get_version) {
11 changes: 11 additions & 0 deletions paddle/fluid/inference/api/paddle_api.h
@@ -319,6 +319,17 @@ class PD_INFER_DECL PaddlePredictor {
///
virtual void ClearIntermediateTensor() {}

///
/// \brief Release all temporary tensors to shrink the memory pool.
/// The memory pool is treated as a list of chunks; a chunk can be released
/// only when it is not occupied.
///
/// \return The number of bytes released. This may be smaller than the amount
/// of memory actually freed, because part of the memory is not managed by
/// the MemoryPool.
///
virtual uint64_t TryShrinkMemory() { return 0; }

/// \brief Clone an existing predictor
/// When using clone, the same network will be created,
/// and the parameters between them are shared.
11 changes: 11 additions & 0 deletions paddle/fluid/inference/api/paddle_inference_api.h
@@ -224,6 +224,17 @@ class PD_INFER_DECL Predictor {
/// \brief Clear the intermediate tensors of the predictor
void ClearIntermediateTensor();

///
/// \brief Release all temporary tensors to shrink the memory pool.
/// The memory pool is treated as a list of chunks; a chunk can be released
/// only when it is not occupied.
///
/// \return The number of bytes released. This may be smaller than the amount
/// of memory actually freed, because part of the memory is not managed by
/// the MemoryPool.
///
uint64_t TryShrinkMemory();

private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
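
A typical way to use the new method from C++ is to shrink the pool right after a peak-memory request, so a long-running service does not keep holding its high-water mark. The sketch below assumes a model like the one in the test above (an input named firstw); the model directory, input name, and installed header path are placeholders to adapt to your setup.

#include <cstdint>
#include <iostream>
#include <string>

#include "paddle_inference_api.h"  // installed header path may differ

void RunOnceAndShrink(const std::string &model_dir) {
  paddle_infer::Config config;
  config.SetModel(model_dir);  // placeholder model directory
  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed an unusually large batch; the memory pool grows to serve it.
  auto input = predictor->GetInputHandle("firstw");  // placeholder input name
  input->Reshape({1024, 1});
  auto *data = input->mutable_data<int64_t>(paddle_infer::PlaceType::kCPU);
  for (int i = 0; i < 1024; ++i) data[i] = i;
  predictor->Run();

  // After the peak request, return the now-idle chunks to the system.
  uint64_t released = predictor->TryShrinkMemory();
  std::cout << "released " << released << " pool-managed bytes" << std::endl;
}
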
2 changes: 2 additions & 0 deletions paddle/fluid/pybind/inference_api.cc
@@ -566,6 +566,7 @@ void BindAnalysisPredictor(py::module *m) {
.def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun)
.def("clear_intermediate_tensor",
&AnalysisPredictor::ClearIntermediateTensor)
.def("try_shrink_memory", &AnalysisPredictor::TryShrinkMemory)
.def("create_feed_fetch_var", &AnalysisPredictor::CreateFeedFetchVar)
.def("prepare_feed_fetch", &AnalysisPredictor::PrepareFeedFetch)
.def("prepare_argument", &AnalysisPredictor::PrepareArgument)
Expand Down Expand Up @@ -593,6 +594,7 @@ void BindPaddleInferPredictor(py::module *m) {
.def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
.def("run", &paddle_infer::Predictor::Run)
.def("clone", &paddle_infer::Predictor::Clone)
.def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory)
.def("clear_intermediate_tensor",
&paddle_infer::Predictor::ClearIntermediateTensor);
}