PaddlePaddle · heavengate · Jul 15, 2022 · Jul 12, 2022 · Jul 13, 2022 · Jul 13, 2022
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2089,6 +2089,7 @@ USE_TRT_CONVERTER(top_k)
 USE_TRT_CONVERTER(top_k_v2)
 USE_TRT_CONVERTER(squeeze2)
 USE_TRT_CONVERTER(unsqueeze2)
+USE_TRT_CONVERTER(fused_token_prune)
 #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000)
 USE_TRT_CONVERTER(sparse_fc)
 USE_TRT_CONVERTER(sparse_multihead_matmul)

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -68,7 +68,8 @@ list(
   c_allreduce_op.cc
   top_k_op.cc
   squeeze2_op.cc
-  unsqueeze2_op.cc)
+  unsqueeze2_op.cc
+  fused_token_prune_op.cc)
 
 if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8)
   list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc)

diff --git a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Maps a fused_token_prune op onto a FusedTokenPrunePluginDynamic TRT layer.
+class FusedTokenPruneOpConverter : public OpConverter {
+ public:
+  // Feeds Attn/X/Mask/NewMask to the plugin and binds its outputs to
+  // SlimmedX and CLSInds. Raises a Fatal error in static-shape mode.
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    framework::OpDesc op_desc(op, nullptr);
+    // Input tensors are fetched in the order they are handed to the plugin.
+    std::vector<nvinfer1::ITensor*> inputs;
+    for (const char* slot : {"Attn", "X", "Mask", "NewMask"}) {
+      inputs.push_back(engine_->GetITensor(op_desc.Input(slot).front()));
+    }
+    // Optional attributes: fall back to the defaults when absent.
+    bool keep_first_token = true;
+    if (op_desc.HasAttr("keep_first_token")) {
+      keep_first_token =
+          BOOST_GET_CONST(bool, op_desc.GetAttr("keep_first_token"));
+    }
+    bool keep_order = false;
+    if (op_desc.HasAttr("keep_order")) {
+      keep_order = BOOST_GET_CONST(bool, op_desc.GetAttr("keep_order"));
+    }
+    const std::string slimmed_x = op_desc.Output("SlimmedX")[0];
+    const std::string cls_inds = op_desc.Output("CLSInds")[0];
+    if (!engine_->with_dynamic_shape()) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "You are running the Ernie(Bert) model in static shape mode, which "
+          "is not supported for the time being.\n"
+          "You can use the config.SetTRTDynamicShapeInfo(...) interface to set "
+          "the shape information to run the dynamic shape mode."));
+    }
+    nvinfer1::ILayer* layer = nullptr;
+#if IS_TRT_VERSION_GE(6000)
+    // The plugin's with_fp16 flag is also set when the precision is INT8.
+    const bool fp16_ok =
+        engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+    const bool int8_on =
+        engine_->precision() == AnalysisConfig::Precision::kInt8;
+    auto* pruner = new plugin::FusedTokenPrunePluginDynamic(
+        fp16_ok || int8_on, keep_first_token, keep_order);
+    layer = engine_->AddDynamicPlugin(inputs.data(), 4, pruner);
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "You are running the TRT Dynamic Shape mode, need to confirm that "
+        "your TRT version is no less than 6.0"));
+#endif
+    RreplenishLayerAndOutput(
+        layer, "fused_token_prune", {slimmed_x, cls_inds}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(fused_token_prune, FusedTokenPruneOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -275,7 +275,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "recover_padding",
       "remove_padding",
       "squeeze2",
-      "unsqueeze2"};
+      "unsqueeze2",
+      "fused_token_prune"};
 };
 
 bool OpTeller::Tell(const framework::ir::Node* node,

diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -29,7 +29,8 @@ list(
   remove_padding_plugin.cu
   recover_padding_plugin.cu
   c_allreduce_op_plugin.cu
-  preln_residual_bias_plugin.cu)
+  preln_residual_bias_plugin.cu
+  fused_token_prune_op_plugin.cu)
 
 if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8)
   list(APPEND TRT_FILES spmm_plugin.cu)
@@ -44,3 +45,10 @@ nv_test(
   test_split_plugin
   SRCS test_split_plugin.cc
   DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin)
+
+if(NOT WIN32)  # the plugin unit test below is not registered on Windows
+  nv_test(
+    test_fused_token_prune_plugin
+    SRCS test_fused_token_prune_plugin.cc
+    DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin)
+endif()