[Inference Tensorrt] Add attr for trt engine and handle the input seq problem for ernie var len. #33575

Merged · 5 commits · Jun 17, 2021
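
Change summary: the emb_eltwise_layernorm converter now records the name of the ERNIE PosId input as a TensorRT engine attribute ("ernie_pos_name"), and the multihead_matmul and slice converters read that attribute instead of hard-coding network input index 2, so the variable-length ERNIE path no longer depends on a fixed input order. To carry such values between converters, the engine gains a small attribute store (Set / SetNotOwned / Get / Has / Erase). The sketch below illustrates the intended lookup pattern; it paraphrases the diff rather than quoting it, and the helper name ResolvePosTensor plus the engine pointer are assumptions for illustration.

#include <string>
#include "paddle/fluid/inference/tensorrt/engine.h"

// Resolve the cu_seqlens tensor through the "ernie_pos_name" attribute set by
// the embedding converter; fall back to network input 2 for graphs converted
// without that attribute (the old, position-dependent behavior).
nvinfer1::ITensor* ResolvePosTensor(
    paddle::inference::tensorrt::TensorRTEngine* engine) {
  std::string pos_name;
  if (engine->Has("ernie_pos_name")) {
    pos_name = engine->Get<std::string>("ernie_pos_name");
  } else {
    pos_name = engine->network()->getInput(2)->getName();  // cu_seqlens
  }
  return engine->GetITensor(pos_name);
}
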
@@ -36,6 +36,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
framework::OpDesc op_desc(op, nullptr);
auto word_id_name = op_desc.Input("WordId").front();
auto pos_id_name = op_desc.Input("PosId").front();
engine_->Set("ernie_pos_name", new std::string(pos_id_name));

auto sent_id_name = op_desc.Input("SentId").front();
auto word_emb_name = op_desc.Input("WordEmbedding").front();
auto pos_emb_name = op_desc.Input("PosEmbedding").front();
12 changes: 9 additions & 3 deletions paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -191,9 +191,15 @@ class MultiheadMatMulOpConverter : public OpConverter {
std::vector<nvinfer1::ITensor*> plugin_inputs;
plugin_inputs.emplace_back(fc_layer->getOutput(0));
plugin_inputs.emplace_back(mask_tensor);
plugin_inputs.emplace_back(engine_->GetITensor(
engine_->network()->getInput(2)->getName())); // cu_seqlens,
// eval_placeholder_2
if (engine_->Has("ernie_pos_name")) {
plugin_inputs.emplace_back(
engine_->GetITensor(engine_->Get<std::string>("ernie_pos_name")));
} else {
plugin_inputs.emplace_back(engine_->GetITensor(
engine_->network()
->getInput(2)
->getName())); // cu_seqlens, eval_placeholder_2
}
auto max_seqlen_tensor =
engine_->GetITensor(engine_->network()->getInput(3)->getName());
auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
13 changes: 10 additions & 3 deletions paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -76,9 +76,16 @@ class SliceOpConverter : public OpConverter {
std::vector<nvinfer1::ITensor*> plugin_inputs;
// plugin_inputs.emplace_back(trans_layer->getOutput(0));
plugin_inputs.emplace_back(input);
plugin_inputs.emplace_back(engine_->GetITensor(
engine_->network()->getInput(2)->getName())); // cu_seqlens,
// eval_placeholder_2

std::string pos_name;
if (engine_->Has("ernie_pos_name")) {
pos_name = engine_->Get<std::string>("ernie_pos_name");
} else {
// hard-coded input index for compatibility
pos_name = engine_->network()->getInput(2)->getName();
}
plugin_inputs.emplace_back(
engine_->GetITensor(pos_name)); // cu_seqlens, eval_placeholder_2

// bool ban_fp16 = engine_->disable_trt_plugin_fp16();
plugin::SpecialSlicePluginDynamic* plugin =
89 changes: 88 additions & 1 deletion paddle/fluid/inference/tensorrt/engine.h
@@ -202,7 +202,15 @@ class TensorRTEngine {
dy::initLibNvInferPlugins(&logger, "");
}

~TensorRTEngine() {}
~TensorRTEngine() {
for (auto& attr : attrs_) {
if (attr_dels_.find(attr.first) != attr_dels_.end()) {
attr_dels_[attr.first]();
}
}
attrs_.clear();
attr_dels_.clear();
}

// Add an input and set its name, data type and dimension.
nvinfer1::ITensor* DeclareInput(const std::string& name,
@@ -386,6 +394,82 @@ class TensorRTEngine {
}
#endif

bool Has(const std::string& attr_name) const {
return attrs_.count(attr_name) > 0;
}

void Erase(const std::string& attr_name) {
if (!Has(attr_name)) {
return;
}
if (attr_dels_.find(attr_name) != attr_dels_.end()) {
attr_dels_[attr_name]();
attr_dels_.erase(attr_name);
}
attrs_.erase(attr_name);
}

// Set a pointer to the attribute. Engine takes ownership of the attribute.
template <typename AttrType>
void Set(const std::string& attr_name, AttrType* attr) {
PADDLE_ENFORCE_EQ(
attrs_.count(attr_name), 0,
platform::errors::AlreadyExists(
"Attribute %s already set in trt engine.", attr_name));
VLOG(3) << "Setting the attribute " << attr_name << " for trt engine "
<< this;
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
VLOG(3) << "deleting " << attr_name;
delete attr;
};
}

// Set a pointer to the attribute. Engine doesn't take ownership. Caller
// should delete the attribute.
template <typename AttrType>
void SetNotOwned(const std::string& attr_name, AttrType* attr) {
PADDLE_ENFORCE_EQ(
attrs_.count(attr_name), 0,
platform::errors::AlreadyExists(
"Attribute %s already set in trt engine.", attr_name));
attrs_[attr_name] = attr;
}

// Get a reference to the attribute previously set.
template <typename AttrType>
AttrType& Get(const std::string& attr_name) const {
PADDLE_ENFORCE_NE(attrs_.find(attr_name), attrs_.end(),
platform::errors::InvalidArgument(
"Attribute %s not found in trt engine.", attr_name));
try {
return *boost::any_cast<AttrType*>(attrs_.at(attr_name));
} catch (boost::bad_any_cast&) {
auto TypeToString = [](const std::type_info& info) -> std::string {
if (std::type_index(info) == std::type_index(typeid(bool*))) {
return "bool";
} else if (std::type_index(info) == std::type_index(typeid(int*))) {
return "int";
} else if (std::type_index(info) ==
std::type_index(typeid(const int*))) {
return "const int";
} else if (std::type_index(info) ==
std::type_index(typeid(std::string*))) {
return "std::string";
}
return info.name();
};

PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid type for attritube %s, expected: %s, actual: %s.", attr_name,
TypeToString(typeid(AttrType*)),
TypeToString(attrs_.at(attr_name).type())));
}
}

private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
@@ -441,6 +525,9 @@ class TensorRTEngine {
infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;

std::unordered_map<std::string, boost::any> attrs_;
std::unordered_map<std::string, std::function<void(void)>> attr_dels_;

// For dynamic shape
bool with_dynamic_shape_{false};
infer_ptr<nvinfer1::INetworkDefinition> infer_networkv2_;
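
Usage note for the attribute store added above (a minimal sketch, assuming an already-built TensorRTEngine* and illustrative attribute names): Set() transfers ownership of the pointer to the engine, which deletes it in Erase() or in the destructor, while SetNotOwned() leaves ownership with the caller.

#include <string>
#include "paddle/fluid/inference/tensorrt/engine.h"

void DemoEngineAttrs(paddle::inference::tensorrt::TensorRTEngine* engine) {
  // Owned attribute: the engine runs the registered deleter on Erase() or
  // when the engine itself is destroyed.
  engine->Set("ernie_pos_name", new std::string("pos_id"));
  if (engine->Has("ernie_pos_name")) {
    const std::string& pos_name = engine->Get<std::string>("ernie_pos_name");
    // ... use pos_name, e.g. engine->GetITensor(pos_name) ...
    engine->Erase("ernie_pos_name");  // deletes the owned std::string
  }

  // Not-owned attribute: the caller keeps ownership and must keep it alive.
  static std::string opt_info = "varlen";
  engine->SetNotOwned("opt_info", &opt_info);
}
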
11 changes: 11 additions & 0 deletions paddle/fluid/inference/tensorrt/test_engine.cc
@@ -91,6 +91,15 @@ TEST_F(TensorRTEngineTest, add_layer) {
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);

LOG(INFO) << "Set attr";
engine_->Set("test_attr", new std::string("test_attr"));
if (engine_->Has("test_attr")) {
auto attr_val = engine_->Get<std::string>("test_attr");
engine_->Erase("test_attr");
}
std::string *attr_key = new std::string("attr_key");
engine_->SetNotOwned("attr1", attr_key);

LOG(INFO) << "to execute";
engine_->Execute(1, &buffers, ctx_->stream());

@@ -99,6 +108,8 @@

LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);

delete attr_key;
}

TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
1 change: 1 addition & 0 deletions paddle/fluid/inference/tests/api/tester_helper.h
@@ -33,6 +33,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h"
132 changes: 132 additions & 0 deletions paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -16,6 +16,7 @@ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"

#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"

namespace paddle {
@@ -143,5 +144,136 @@ TEST(AnalysisPredictor, fp16) {
#endif
}

// ernie_varlen
std::shared_ptr<paddle_infer::Predictor> InitPredictor() {
paddle_infer::Config config;
config.SetModel(FLAGS_infer_model);

config.EnableUseGpu(100, 0);

// Enable the memory optimization.
config.EnableMemoryOptim();

int max_batch = 32;
int max_single_seq_len = 128;
int opt_single_seq_len = 64;
int min_batch_seq_len = 1;
int max_batch_seq_len = 512;
int opt_batch_seq_len = 256;

std::string input_name0 = "read_file_0.tmp_0";
std::string input_name1 = "read_file_0.tmp_1";
std::string input_name2 = "read_file_0.tmp_2";
std::string input_name3 = "read_file_0.tmp_4";

std::vector<int> min_shape = {min_batch_seq_len};
std::vector<int> max_shape = {max_batch_seq_len};
std::vector<int> opt_shape = {opt_batch_seq_len};
// Set the input's min, max, opt shape
std::map<std::string, std::vector<int>> min_input_shape = {
{input_name0, min_shape},
{input_name1, min_shape},
{input_name2, {1}},
{input_name3, {1, 1, 1}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{input_name0, max_shape},
{input_name1, max_shape},
{input_name2, {max_batch + 1}},
{input_name3, {1, max_single_seq_len, 1}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{input_name0, opt_shape},
{input_name1, opt_shape},
{input_name2, {max_batch + 1}},
{input_name3, {1, opt_single_seq_len, 1}}};

// only kHalf supported
config.EnableTensorRtEngine(
1 << 30, 1, 5, paddle_infer::Config::Precision::kHalf, false, false);
// ernie varlen must be used with dynamic shape
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
// ernie varlen must be used with oss
config.EnableTensorRtOSS();

return paddle_infer::CreatePredictor(config);
}

void run(paddle_infer::Predictor* predictor, std::vector<float>* out_data) {
const int run_batch = 2;
const int run_seq_len = 71;
const int max_seq_len = 128;

int32_t i1[run_seq_len] = {
// sentence 1
1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4,
134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44,
486, 218, 1140, 279, 12043, 2,
// sentence 2
101, 2054, 2234, 2046, 2486, 2044, 1996, 2047, 4552, 2001, 9536, 1029,
102, 2004, 1997, 2008, 2154, 1010, 1996, 2047, 4552, 9536, 2075, 1996,
2117, 3072, 2234, 2046, 2486, 1012, 102,
};
int32_t i2[run_seq_len] = {
// sentence 1
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// sentence 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1};
// shape info of this batch
int32_t i3[3] = {0, 40, 71};
// max_seq_len represents the max sentence length of all the sentences; only
// the length of input i4 is used, its data means nothing.
int32_t i4[max_seq_len] = {0};

auto input_names = predictor->GetInputNames();
// first input
auto input_t1 = predictor->GetInputHandle(input_names[0]);
input_t1->Reshape({run_seq_len});
input_t1->CopyFromCpu(i1);

// second input
auto input_t2 = predictor->GetInputHandle(input_names[1]);
input_t2->Reshape({run_seq_len});
input_t2->CopyFromCpu(i2);

// third input
auto input_t3 = predictor->GetInputHandle(input_names[2]);
input_t3->Reshape({run_batch + 1});
input_t3->CopyFromCpu(i3);

// fourth input
auto input_t4 = predictor->GetInputHandle(input_names[3]);
input_t4->Reshape({1, max_seq_len, 1});
input_t4->CopyFromCpu(i4);

CHECK(predictor->Run());

auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
out_data->resize(out_num);
output_t->CopyToCpu(out_data->data());

return;
}

TEST(AnalysisPredictor, ernie_varlen) {
#if IS_TRT_VERSION_GE(7234)
auto predictor = InitPredictor();
std::vector<float> out_data;
run(predictor.get(), &out_data);
std::vector<float> ref_data{0.59814, 0.219882, 0.181978,
0.359796, 0.577414, 0.0627908};
float near_tolerance = 1e-3;
for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(ref_data[i], out_data[i], near_tolerance);
}
#endif
}

} // namespace inference
} // namespace paddle
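
For reference, the third input (i3) in run() above is the cumulative sequence-length tensor consumed by the varlen plugins: the two packed sentences have 40 and 31 tokens (run_seq_len = 71), giving offsets {0, 40, 71}. Below is a minimal sketch of how such a tensor can be derived from per-sentence lengths; the helper BuildCuSeqlens is illustrative and not part of the test.

#include <cstdint>
#include <vector>

// Cumulative sequence lengths: element i is the start offset of sentence i in
// the packed token stream; the last element is the total number of tokens.
std::vector<int32_t> BuildCuSeqlens(const std::vector<int32_t>& seq_lens) {
  std::vector<int32_t> cu_seqlens(seq_lens.size() + 1, 0);
  for (size_t i = 0; i < seq_lens.size(); ++i) {
    cu_seqlens[i + 1] = cu_seqlens[i] + seq_lens[i];
  }
  return cu_seqlens;
}
// BuildCuSeqlens({40, 31}) yields {0, 40, 71}, matching i3 in run().
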