Dingxiang opt #9

Merged · 4 commits · Jan 16, 2024
6 changes: 3 additions & 3 deletions paddle/fluid/framework/trie.h
@@ -64,7 +64,7 @@ struct File {
 
 struct Node {
     uint32_t id = 0;
-    uint16_t label = 0;
+    uint32_t label = 0;
     std::vector<uint32_t> child;
     uint8_t aleaf = 0;
 };
@@ -74,7 +74,7 @@ struct Node {
   virtual ~Trie() {}
   int load(const std::string& dir, const uint32_t thr_num=20u);
 
-  uint16_t label(uint32_t id) {
+  uint32_t label(uint32_t id) {
     return label_.at(id);
   }
@@ -157,7 +157,7 @@ struct Node {
   void load_file(uint32_t thr_id, File& file);
   void stat_file(uint32_t thr_id, File& file);
 
-  std::vector<uint16_t> label_;
+  std::vector<uint32_t> label_;
   std::vector<uint8_t> aleaf_;
   std::vector<uint32_t> child_mem_;
   std::vector<uint32_t> mem_off_;
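The widening from uint16_t to uint32_t matters because the trie labels appear to carry token ids, and a 16-bit field silently wraps for any id above 65535. A minimal sketch (not part of the patch, hypothetical values) of the failure mode the change avoids:

// Sketch: what a 16-bit label does to an id above 65535.
#include <cstdint>
#include <iostream>

int main() {
    uint32_t id = 70000;                          // e.g. a large-vocabulary token id
    uint16_t narrow = static_cast<uint16_t>(id);  // wraps modulo 2^16 -> 4464
    uint32_t wide = id;                           // preserved exactly
    std::cout << narrow << " vs " << wide << "\n";  // prints "4464 vs 70000"
    return 0;
}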
43 changes: 41 additions & 2 deletions paddle/fluid/framework/trie_manager.cc
@@ -19,6 +19,45 @@ namespace paddle {
 namespace framework {
 std::shared_ptr<TrieManager> TrieManager::_s_instance = nullptr;
 
+void TrieManager::reset(const std::vector<int>& labels) {
+  VLOG(3) << "trie reset...";
+  std::unique_lock<std::mutex> lock(mtx_);
+
+  size_t root = 0;
+  size_t chs = trie_.child_size(root);
+  std::unordered_map<uint32_t, uint32_t> l2n;
+  for (size_t i = 0; i < chs; ++i) {
+    uint32_t cid = trie_.child_at(root, i);
+    uint32_t lab = trie_.label(cid);
+    l2n.insert({lab, cid});
+  }
+
+  parent_idx_.mutable_data<int64_t>({int(labels.size())}, phi::GPUPinnedPlace());
+  int64_t* parent_idx = parent_idx_.data<int64_t>();
+
+  select_ids_.mutable_data<int64_t>({int(labels.size())}, phi::GPUPinnedPlace());
+  int64_t* select_ids = select_ids_.data<int64_t>();
+
+  label2node_.resize(labels.size());
+  for (size_t i = 0; i < labels.size(); ++i) {
+    auto it = l2n.find(labels[i]);
+    uint32_t label = endid_;
+    uint32_t nodeid = end_nodeid_;
+
+    if (it != l2n.end()) {
+      label = labels[i];
+      nodeid = it->second;
+    }
+
+    parent_idx[i] = i;
+    select_ids[i] = label;
+    label2node_[i].insert({label, nodeid});
+  }
+
+  phase_ = Phase::run;
+  cv_.notify_one();
+}
+
 void TrieManager::reset() {
   VLOG(3) << "trie reset...";
   std::unique_lock<std::mutex> lock(mtx_);
@@ -84,8 +123,8 @@ void TrieManager::run() {
   int64_t* parent_idx = parent_idx_.data<int64_t>();
   int64_t* select_ids = select_ids_.data<int64_t>();
 
-  std::vector<std::unordered_map<uint16_t, uint32_t>> label2node(numel);
-  std::vector<std::vector<uint16_t>> outs(numel);
+  std::vector<std::unordered_map<uint32_t, uint32_t>> label2node(numel);
+  std::vector<std::vector<uint32_t>> outs(numel);
   parallel_run_range(numel, thr_num, [this, parent_idx, select_ids, &outs, &label2node] (
       uint32_t thr_id, uint32_t start, uint32_t end) {
     for (size_t i = start; i < end; ++i) {
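The new reset(labels) overload builds a label-to-node map over the root's children once, then seeds each slot: a label found under the root keeps its child node, anything else falls back to the endid_/end_nodeid_ sentinel. A standalone sketch of that lookup-with-fallback, where l2n, endid, and end_nodeid are hypothetical stand-ins for the manager's members:

// Sketch of the lookup-with-fallback used in reset(labels).
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

int main() {
    std::unordered_map<uint32_t, uint32_t> l2n = {{5, 100}, {9, 101}};  // label -> child node id
    const uint32_t endid = 2, end_nodeid = 0;  // sentinel for labels absent under the root
    std::vector<int> labels = {5, 7, 9};

    for (size_t i = 0; i < labels.size(); ++i) {
        auto it = l2n.find(static_cast<uint32_t>(labels[i]));
        uint32_t label = (it != l2n.end()) ? it->first : endid;
        uint32_t nodeid = (it != l2n.end()) ? it->second : end_nodeid;
        std::cout << labels[i] << " -> (" << label << ", " << nodeid << ")\n";
        // prints: 5 -> (5, 100), 7 -> (2, 0), 9 -> (9, 101)
    }
    return 0;
}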
7 changes: 4 additions & 3 deletions paddle/fluid/framework/trie_manager.h
@@ -69,7 +69,7 @@ enum class Phase {
 };
 
 public:
-  TrieManager(uint16_t endid) : endid_(endid),
+  TrieManager(uint32_t endid) : endid_(endid),
       place_(platform::GetCurrentDeviceId()) {
     thread_ = std::thread(&TrieManager::run, this);
   }
@@ -94,7 +94,7 @@ enum class Phase {
     return _s_instance;
   }
 
-  static std::shared_ptr<TrieManager> SetInstance(uint16_t endid) {
+  static std::shared_ptr<TrieManager> SetInstance(uint32_t endid) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (nullptr == _s_instance) {
@@ -111,6 +111,7 @@ enum class Phase {
     return trie_.load(dir, thr_num);
   }
   void reset();
+  void reset(const std::vector<int>& labels);
   void search_start(const Tensor* d_parent, const Tensor* d_select);
   void search_wait();
 
@@ -124,7 +125,7 @@ enum class Phase {
   // cpu
   Tensor parent_idx_;
  Tensor select_ids_;
-  std::vector<std::unordered_map<uint16_t, uint32_t>> label2node_;
+  std::vector<std::unordered_map<uint32_t, uint32_t>> label2node_;
 
   // cpu
   Tensor next_out_;
@@ -245,7 +245,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel<T> {
     qktv_out.Resize({{bsz, num_head, seq_len, dim_head}});
     auto *qktv_out_data =
         dev_ctx.Alloc<T>(&qktv_out, qktv_out.numel() * sizeof(T));
-    fmha_out.Resize({{token_num, num_head, dim_head}});
+    fmha_out.Resize({{bsz, seq_len, num_head, dim_head}});
     auto *fmha_out_data =
         dev_ctx.Alloc<T>(&fmha_out, fmha_out.numel() * sizeof(T));
 
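The fmha_out resize moves from a packed {token_num, num_head, dim_head} layout to a padded {bsz, seq_len, num_head, dim_head} one: the padded form allocates a full bsz x seq_len grid regardless of how many real tokens the batch holds. A quick arithmetic sketch with made-up sizes:

// Element counts of the two fmha_out layouts (illustrative sizes only).
#include <cstdio>

int main() {
    const long bsz = 2, seq_len = 8, num_head = 4, dim_head = 16;
    const long token_num = 10;  // unpadded tokens in the batch, <= bsz * seq_len
    const long packed = token_num * num_head * dim_head;      // 640 elements
    const long padded = bsz * seq_len * num_head * dim_head;  // 1024 elements
    std::printf("packed=%ld padded=%ld\n", packed, padded);
    return 0;
}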
5 changes: 4 additions & 1 deletion paddle/fluid/pybind/box_helper_py.cc
@@ -142,7 +142,10 @@ void BindTrieManager(py::module* m) {
         py::arg("thr_num")=20u,
         py::call_guard<py::gil_scoped_release>())
     .def("reset",
-        &framework::TrieManager::reset,
+        py::overload_cast<>(&framework::TrieManager::reset),
+        py::call_guard<py::gil_scoped_release>())
+    .def("reset",
+        py::overload_cast<const std::vector<int>&>(&framework::TrieManager::reset),
         py::call_guard<py::gil_scoped_release>());
 } // end TrieManager
 
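Once reset is overloaded, a plain &framework::TrieManager::reset is ambiguous, so the binding switches to py::overload_cast, which selects an overload by its argument list. A self-contained sketch of the pattern, where Widget is a hypothetical class and not part of the patch:

// py::overload_cast disambiguates overloaded member functions for pybind11.
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>
namespace py = pybind11;

struct Widget {
    void reset() {}
    void reset(const std::vector<int>&) {}
};

PYBIND11_MODULE(example, m) {
    py::class_<Widget>(m, "Widget")
        .def(py::init<>())
        .def("reset", py::overload_cast<>(&Widget::reset))
        .def("reset", py::overload_cast<const std::vector<int>&>(&Widget::reset));
}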
49 changes: 42 additions & 7 deletions paddle/fluid/pybind/inference_api.cc
@@ -228,6 +228,35 @@ void PaddleInferTensorCreate(
   tensor.CopyFromCpu(static_cast<const T *>(data.data()));
 }
 
+void CopyFromCpuPaddleTensor(paddle_infer::Tensor &tensor,
+                             paddle::experimental::Tensor &&paddle_tensor) {
+  std::vector<int> shape;
+  for (int i = 0; i < paddle_tensor.dims().size(); ++i) {
+    shape.push_back(paddle_tensor.dims()[i]);
+  }
+  tensor.Reshape(std::move(shape));
+
+  switch (paddle_tensor.dtype()) {
+    case paddle::experimental::DataType::FLOAT16:
+      tensor.CopyFromCpu(static_cast<const paddle::platform::float16 *>(
+          paddle_tensor.data<paddle::platform::float16>()));
+      break;
+    case paddle::experimental::DataType::FLOAT32:
+      tensor.CopyFromCpu(static_cast<const float *>(paddle_tensor.data<float>()));
+      break;
+    case paddle::experimental::DataType::INT32:
+      tensor.CopyFromCpu(static_cast<const int32_t *>(paddle_tensor.data<int32_t>()));
+      break;
+    case paddle::experimental::DataType::INT64:
+      tensor.CopyFromCpu(static_cast<const int64_t *>(paddle_tensor.data<int64_t>()));
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported data type. Now copy_from_cpu only supports FLOAT16, FLOAT32, "
+          "INT32, and INT64."));
+  }
+}
+
 paddle_infer::PlaceType ToPaddleInferPlace(
     phi::AllocationType allocation_type) {
   if (allocation_type == phi::AllocationType::CPU) {
@@ -585,7 +614,8 @@ void BindPaddlePredictor(py::module *m) {
           std::vector<PaddleTensor> outputs;
           self.Run(inputs, &outputs);
           return outputs;
-        })
+        },
+        py::call_guard<py::gil_scoped_release>())
     .def("get_input_tensor", &PaddlePredictor::GetInputTensor)
     .def("get_output_tensor", &PaddlePredictor::GetOutputTensor)
     .def("get_input_names", &PaddlePredictor::GetInputNames)
@@ -634,7 +664,8 @@ void BindNativePredictor(py::module *m) {
           std::vector<PaddleTensor> outputs;
           self.Run(inputs, &outputs);
           return outputs;
-        })
+        },
+        py::call_guard<py::gil_scoped_release>())
     .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor)
     .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor)
     .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun)
@@ -926,7 +957,8 @@ void BindAnalysisPredictor(py::module *m) {
           std::vector<PaddleTensor> outputs;
           self.Run(inputs, &outputs);
           return outputs;
-        })
+        },
+        py::call_guard<py::gil_scoped_release>())
     .def("get_input_tensor", &AnalysisPredictor::GetInputTensor)
     .def("get_output_tensor", &AnalysisPredictor::GetOutputTensor)
     .def("get_input_names", &AnalysisPredictor::GetInputNames)
@@ -972,11 +1004,9 @@ void BindPaddleInferPredictor(py::module *m) {
     .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
     .def("run",
          [](paddle_infer::Predictor &self) {
-#ifdef PADDLE_WITH_ASCEND_CL
-           pybind11::gil_scoped_release release;
-#endif
           self.Run();
-         })
+         },
+         py::call_guard<py::gil_scoped_release>())
     .def("clone",
          [](paddle_infer::Predictor &self) { return self.Clone(nullptr); })
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -1024,6 +1054,11 @@ void BindPaddleInferTensor(py::module *m) {
     .def("copy_from_cpu_bind",
          &PaddleInferTensorCreate<paddle_infer::float16>)
     .def("copy_from_cpu_bind", &PaddleInferStringTensorCreate)
+    .def("_copy_from_cpu_bind",
+         [](paddle_infer::Tensor &self, const py::handle &input) {
+           PyObject *obj = input.ptr();
+           CopyFromCpuPaddleTensor(self, std::move(CastPyArg2Tensor(obj, 0)));
+         })
     .def("share_external_data_bind", &PaddleInferShareExternalData)
     .def("_share_external_data_paddle_tensor_bind",
          [](paddle_infer::Tensor &self, const py::handle &input) {
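The run bindings previously released the GIL only under PADDLE_WITH_ASCEND_CL; with py::call_guard<py::gil_scoped_release> they now release it unconditionally for the duration of the call, so other Python threads can make progress while C++ inference runs. A minimal sketch of the mechanism, where long_running is a hypothetical function:

// call_guard releases the GIL on entry and reacquires it on return.
// The guarded body must not touch Python objects while the GIL is released.
#include <pybind11/pybind11.h>
#include <chrono>
#include <thread>
namespace py = pybind11;

void long_running() {
    // pure C++ work; safe to execute without the GIL
    std::this_thread::sleep_for(std::chrono::seconds(1));
}

PYBIND11_MODULE(example, m) {
    m.def("long_running", &long_running,
          py::call_guard<py::gil_scoped_release>());
}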
2 changes: 1 addition & 1 deletion paddle/phi/api/yaml/ops.yaml
@@ -210,7 +210,7 @@
   backward : flip_grad
 
 - op : beam_search_softmax
-  args : (Tensor logits, Tensor cum_scores, Tensor sequence_lengths, Tensor stop_flags, Tensor end_ids, Tensor step_ids, Tensor last_cache_ids, Tensor last_beam_offsets, int beam_size, int max_seq_len, int max_dec_len, bool fuse_softmax, bool early_stop, float length_penalty=0.0)
+  args : (Tensor logits, Tensor cum_scores, Tensor sequence_lengths, Tensor stop_flags, Tensor end_ids, Tensor step_ids, Tensor last_cache_ids, Tensor last_beam_offsets, int beam_size, int max_seq_len, int max_dec_len, bool fuse_softmax, bool early_stop, float length_penalty=0.0, bool one_stage_topk=false)
   output : Tensor(ids_this_time), Tensor(out_cum_scores), Tensor(cache_ids), Tensor(beam_offsets), Tensor(parent_idx), Tensor(stop_flags_out), Tensor(seq_lens_out), Tensor(step_ids_out)
   infer_meta :
     func : BeamSearchSoftmaxInferMeta
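The new one_stage_topk attribute defaults to false, so existing beam_search_softmax call sites keep their behavior and only callers that pass it explicitly opt into the one-stage top-k path. The same defaulted-trailing-parameter idea in plain C++, with beam_step as a hypothetical function:

// A defaulted trailing parameter keeps old call sites compiling unchanged.
#include <iostream>

void beam_step(int beam_size, bool one_stage_topk = false) {
    std::cout << "beam_size=" << beam_size
              << " one_stage_topk=" << std::boolalpha << one_stage_topk << "\n";
}

int main() {
    beam_step(4);        // old call site: one_stage_topk stays false
    beam_step(4, true);  // new call site opting into one-stage top-k
    return 0;
}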
1 change: 1 addition & 0 deletions paddle/phi/infermeta/multiary.cc
@@ -688,6 +688,7 @@ void BeamSearchSoftmaxInferMeta(const MetaTensor& logits,
     bool fuse_softmax,
     bool early_stop,
     float length_penalty,
+    bool one_stage_topk,
     MetaTensor* ids_this_time,
     MetaTensor* out_cum_scores,
     MetaTensor* cache_ids,
1 change: 1 addition & 0 deletions paddle/phi/infermeta/multiary.h
@@ -204,6 +204,7 @@ void BeamSearchSoftmaxInferMeta(const MetaTensor& logits,
     bool fuse_softmax,
     bool early_stop,
     float length_penalty,
+    bool one_stage_topk,
     MetaTensor* ids_this_time,
     MetaTensor* out_cum_scores,
     MetaTensor* cache_ids,
1 change: 1 addition & 0 deletions paddle/phi/kernels/fusion/beam_search_softmax.h
@@ -35,6 +35,7 @@ void BeamSearchSoftmaxKernel(const Context &dev_ctx,
     bool fuse_softmax,
     bool early_stop,
     float length_penalty,
+    bool one_stage_topk,
     DenseTensor *ids_this_time,
     DenseTensor *out_cum_scores,
     DenseTensor *cache_ids,