Dingxiang opt #9

Merged · 4 commits · Jan 16, 2024
6 changes: 3 additions & 3 deletions paddle/fluid/framework/trie.h
@@ -64,7 +64,7 @@ struct File {
 
 struct Node {
     uint32_t id = 0;
-    uint16_t label = 0;
+    uint32_t label = 0;
     std::vector<uint32_t> child;
     uint8_t aleaf = 0;
 };
@@ -74,7 +74,7 @@ struct Node {
   virtual ~Trie() {}
   int load(const std::string& dir, const uint32_t thr_num=20u);
 
-  uint16_t label(uint32_t id) {
+  uint32_t label(uint32_t id) {
     return label_.at(id);
   }
@@ -157,7 +157,7 @@ struct Node {
   void load_file(uint32_t thr_id, File& file);
   void stat_file(uint32_t thr_id, File& file);
 
-  std::vector<uint16_t> label_;
+  std::vector<uint32_t> label_;
   std::vector<uint8_t> aleaf_;
   std::vector<uint32_t> child_mem_;
   std::vector<uint32_t> mem_off_;
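The widening from uint16_t to uint32_t matters because the trie labels appear to carry token ids, and a 16-bit field silently wraps for any id above 65535. A minimal sketch (not part of the patch, hypothetical values) of the failure mode the change avoids:

// Sketch: what a 16-bit label does to an id above 65535.
#include <cstdint>
#include <iostream>

int main() {
    uint32_t id = 70000;                          // e.g. a large-vocabulary token id
    uint16_t narrow = static_cast<uint16_t>(id);  // wraps modulo 2^16 -> 4464
    uint32_t wide = id;                           // preserved exactly
    std::cout << narrow << " vs " << wide << "\n";  // prints "4464 vs 70000"
    return 0;
}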
43 changes: 41 additions & 2 deletions paddle/fluid/framework/trie_manager.cc
@@ -19,6 +19,45 @@ namespace paddle {
 namespace framework {
 std::shared_ptr<TrieManager> TrieManager::_s_instance = nullptr;
 
+void TrieManager::reset(const std::vector<int>& labels) {
+  VLOG(3) << "trie reset...";
+  std::unique_lock<std::mutex> lock(mtx_);
+
+  size_t root = 0;
+  size_t chs = trie_.child_size(root);
+  std::unordered_map<uint32_t, uint32_t> l2n;
+  for (size_t i = 0; i < chs; ++i) {
+    uint32_t cid = trie_.child_at(root, i);
+    uint32_t lab = trie_.label(cid);
+    l2n.insert({lab, cid});
+  }
+
+  parent_idx_.mutable_data<int64_t>({int(labels.size())}, phi::GPUPinnedPlace());
+  int64_t* parent_idx = parent_idx_.data<int64_t>();
+
+  select_ids_.mutable_data<int64_t>({int(labels.size())}, phi::GPUPinnedPlace());
+  int64_t* select_ids = select_ids_.data<int64_t>();
+
+  label2node_.resize(labels.size());
+  for (size_t i = 0; i < labels.size(); ++i) {
+    auto it = l2n.find(labels[i]);
+    uint32_t label = endid_;
+    uint32_t nodeid = end_nodeid_;
+
+    if (it != l2n.end()) {
+      label = labels[i];
+      nodeid = it->second;
+    }
+
+    parent_idx[i] = i;
+    select_ids[i] = label;
+    label2node_[i].insert({label, nodeid});
+  }
+
+  phase_ = Phase::run;
+  cv_.notify_one();
+}
+
 void TrieManager::reset() {
   VLOG(3) << "trie reset...";
   std::unique_lock<std::mutex> lock(mtx_);
@@ -84,8 +123,8 @@ void TrieManager::run() {
   int64_t* parent_idx = parent_idx_.data<int64_t>();
   int64_t* select_ids = select_ids_.data<int64_t>();
 
-  std::vector<std::unordered_map<uint16_t, uint32_t>> label2node(numel);
-  std::vector<std::vector<uint16_t>> outs(numel);
+  std::vector<std::unordered_map<uint32_t, uint32_t>> label2node(numel);
+  std::vector<std::vector<uint32_t>> outs(numel);
   parallel_run_range(numel, thr_num, [this, parent_idx, select_ids, &outs, &label2node] (
       uint32_t thr_id, uint32_t start, uint32_t end) {
     for (size_t i = start; i < end; ++i) {
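The new reset(labels) overload builds a label-to-node map over the root's children once, then seeds each slot: a label found under the root keeps its child node, anything else falls back to the endid_/end_nodeid_ sentinel. A standalone sketch of that lookup-with-fallback, where l2n, endid, and end_nodeid are hypothetical stand-ins for the manager's members:

// Sketch of the lookup-with-fallback used in reset(labels).
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

int main() {
    std::unordered_map<uint32_t, uint32_t> l2n = {{5, 100}, {9, 101}};  // label -> child node id
    const uint32_t endid = 2, end_nodeid = 0;  // sentinel for labels absent under the root
    std::vector<int> labels = {5, 7, 9};

    for (size_t i = 0; i < labels.size(); ++i) {
        auto it = l2n.find(static_cast<uint32_t>(labels[i]));
        uint32_t label = (it != l2n.end()) ? it->first : endid;
        uint32_t nodeid = (it != l2n.end()) ? it->second : end_nodeid;
        std::cout << labels[i] << " -> (" << label << ", " << nodeid << ")\n";
        // prints: 5 -> (5, 100), 7 -> (2, 0), 9 -> (9, 101)
    }
    return 0;
}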
7 changes: 4 additions & 3 deletions paddle/fluid/framework/trie_manager.h
@@ -69,7 +69,7 @@ enum class Phase {
 };
 
 public:
-  TrieManager(uint16_t endid) : endid_(endid),
+  TrieManager(uint32_t endid) : endid_(endid),
       place_(platform::GetCurrentDeviceId()) {
     thread_ = std::thread(&TrieManager::run, this);
   }
@@ -94,7 +94,7 @@ enum class Phase {
     return _s_instance;
   }
 
-  static std::shared_ptr<TrieManager> SetInstance(uint16_t endid) {
+  static std::shared_ptr<TrieManager> SetInstance(uint32_t endid) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (nullptr == _s_instance) {
@@ -111,6 +111,7 @@ enum class Phase {
     return trie_.load(dir, thr_num);
   }
   void reset();
+  void reset(const std::vector<int>& labels);
   void search_start(const Tensor* d_parent, const Tensor* d_select);
   void search_wait();
 
@@ -124,7 +125,7 @@ enum class Phase {
   // cpu
   Tensor parent_idx_;
  Tensor select_ids_;
-  std::vector<std::unordered_map<uint16_t, uint32_t>> label2node_;
+  std::vector<std::unordered_map<uint32_t, uint32_t>> label2node_;
 
   // cpu
   Tensor next_out_;
@@ -245,7 +245,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel<T> {
     qktv_out.Resize({{bsz, num_head, seq_len, dim_head}});
     auto *qktv_out_data =
         dev_ctx.Alloc<T>(&qktv_out, qktv_out.numel() * sizeof(T));
-    fmha_out.Resize({{token_num, num_head, dim_head}});
+    fmha_out.Resize({{bsz, seq_len, num_head, dim_head}});
     auto *fmha_out_data =
         dev_ctx.Alloc<T>(&fmha_out, fmha_out.numel() * sizeof(T));
 
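The fmha_out resize moves from a packed {token_num, num_head, dim_head} layout to a padded {bsz, seq_len, num_head, dim_head} one: the padded form allocates a full bsz x seq_len grid regardless of how many real tokens the batch holds. A quick arithmetic sketch with made-up sizes:

// Element counts of the two fmha_out layouts (illustrative sizes only).
#include <cstdio>

int main() {
    const long bsz = 2, seq_len = 8, num_head = 4, dim_head = 16;
    const long token_num = 10;  // unpadded tokens in the batch, <= bsz * seq_len
    const long packed = token_num * num_head * dim_head;      // 640 elements
    const long padded = bsz * seq_len * num_head * dim_head;  // 1024 elements
    std::printf("packed=%ld padded=%ld\n", packed, padded);
    return 0;
}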
5 changes: 4 additions & 1 deletion paddle/fluid/pybind/box_helper_py.cc
@@ -142,7 +142,10 @@ void BindTrieManager(py::module* m) {
         py::arg("thr_num")=20u,
         py::call_guard<py::gil_scoped_release>())
     .def("reset",
-        &framework::TrieManager::reset,
+        py::overload_cast<>(&framework::TrieManager::reset),
+        py::call_guard<py::gil_scoped_release>())
+    .def("reset",
+        py::overload_cast<const std::vector<int>&>(&framework::TrieManager::reset),
         py::call_guard<py::gil_scoped_release>());
 } // end TrieManager
 
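Once reset is overloaded, a plain &framework::TrieManager::reset is ambiguous, so the binding switches to py::overload_cast, which selects an overload by its argument list. A self-contained sketch of the pattern, where Widget is a hypothetical class and not part of the patch:

// py::overload_cast disambiguates overloaded member functions for pybind11.
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>
namespace py = pybind11;

struct Widget {
    void reset() {}
    void reset(const std::vector<int>&) {}
};

PYBIND11_MODULE(example, m) {
    py::class_<Widget>(m, "Widget")
        .def(py::init<>())
        .def("reset", py::overload_cast<>(&Widget::reset))
        .def("reset", py::overload_cast<const std::vector<int>&>(&Widget::reset));
}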
49 changes: 42 additions & 7 deletions paddle/fluid/pybind/inference_api.cc
@@ -228,6 +228,35 @@ void PaddleInferTensorCreate(
   tensor.CopyFromCpu(static_cast<const T *>(data.data()));
 }
 
+void CopyFromCpuPaddleTensor(paddle_infer::Tensor &tensor,
+                             paddle::experimental::Tensor &&paddle_tensor) {
+  std::vector<int> shape;
+  for (int i = 0; i < paddle_tensor.dims().size(); ++i) {
+    shape.push_back(paddle_tensor.dims()[i]);
+  }
+  tensor.Reshape(std::move(shape));
+
+  switch (paddle_tensor.dtype()) {
+    case paddle::experimental::DataType::FLOAT16:
+      tensor.CopyFromCpu(static_cast<const paddle::platform::float16 *>(
+          paddle_tensor.data<paddle::platform::float16>()));
+      break;
+    case paddle::experimental::DataType::FLOAT32:
+      tensor.CopyFromCpu(static_cast<const float *>(paddle_tensor.data<float>()));
+      break;
+    case paddle::experimental::DataType::INT32:
+      tensor.CopyFromCpu(static_cast<const int32_t *>(paddle_tensor.data<int32_t>()));
+      break;
+    case paddle::experimental::DataType::INT64:
+      tensor.CopyFromCpu(static_cast<const int64_t *>(paddle_tensor.data<int64_t>()));
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported data type. Now copy_from_cpu only supports FLOAT16, FLOAT32, "
+          "INT32, and INT64."));
+  }
+}
+
 paddle_infer::PlaceType ToPaddleInferPlace(
     phi::AllocationType allocation_type) {
   if (allocation_type == phi::AllocationType::CPU) {
@@ -585,7 +614,8 @@ void BindPaddlePredictor(py::module *m) {
           std::vector<PaddleTensor> outputs;
           self.Run(inputs, &outputs);
           return outputs;
-        })
+        },
+        py::call_guard<py::gil_scoped_release>())
     .def("get_input_tensor", &PaddlePredictor::GetInputTensor)
     .def("get_output_tensor", &PaddlePredictor::GetOutputTensor)
     .def("get_input_names", &PaddlePredictor::GetInputNames)
@@ -634,7 +664,8 @@ void BindNativePredictor(py::module *m) {
           std::vector<PaddleTensor> outputs;
           self.Run(inputs, &outputs);
           return outputs;
-        })
+        },
+        py::call_guard<py::gil_scoped_release>())
     .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor)
     .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor)
     .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun)
@@ -926,7 +957,8 @@ void BindAnalysisPredictor(py::module *m) {
           std::vector<PaddleTensor> outputs;
           self.Run(inputs, &outputs);
           return outputs;
-        })
+        },
+        py::call_guard<py::gil_scoped_release>())
     .def("get_input_tensor", &AnalysisPredictor::GetInputTensor)
     .def("get_output_tensor", &AnalysisPredictor::GetOutputTensor)
     .def("get_input_names", &AnalysisPredictor::GetInputNames)
@@ -972,11 +1004,9 @@ void BindPaddleInferPredictor(py::module *m) {
     .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
     .def("run",
          [](paddle_infer::Predictor &self) {
-#ifdef PADDLE_WITH_ASCEND_CL
-           pybind11::gil_scoped_release release;
-#endif
           self.Run();
-         })
+         },
+         py::call_guard<py::gil_scoped_release>())
     .def("clone",
          [](paddle_infer::Predictor &self) { return self.Clone(nullptr); })
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -1024,6 +1054,11 @@ void BindPaddleInferTensor(py::module *m) {
     .def("copy_from_cpu_bind",
          &PaddleInferTensorCreate<paddle_infer::float16>)
     .def("copy_from_cpu_bind", &PaddleInferStringTensorCreate)
+    .def("_copy_from_cpu_bind",
+         [](paddle_infer::Tensor &self, const py::handle &input) {
+           PyObject *obj = input.ptr();
+           CopyFromCpuPaddleTensor(self, std::move(CastPyArg2Tensor(obj, 0)));
+         })
     .def("share_external_data_bind", &PaddleInferShareExternalData)
     .def("_share_external_data_paddle_tensor_bind",
          [](paddle_infer::Tensor &self, const py::handle &input) {
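The run bindings previously released the GIL only under PADDLE_WITH_ASCEND_CL; with py::call_guard<py::gil_scoped_release> they now release it unconditionally for the duration of the call, so other Python threads can make progress while C++ inference runs. A minimal sketch of the mechanism, where long_running is a hypothetical function:

// call_guard releases the GIL on entry and reacquires it on return.
// The guarded body must not touch Python objects while the GIL is released.
#include <pybind11/pybind11.h>
#include <chrono>
#include <thread>
namespace py = pybind11;

void long_running() {
    // pure C++ work; safe to execute without the GIL
    std::this_thread::sleep_for(std::chrono::seconds(1));
}

PYBIND11_MODULE(example, m) {
    m.def("long_running", &long_running,
          py::call_guard<py::gil_scoped_release>());
}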
2 changes: 1 addition & 1 deletion paddle/phi/api/yaml/ops.yaml
@@ -210,7 +210,7 @@
   backward : flip_grad
 
 - op : beam_search_softmax
-  args : (Tensor logits, Tensor cum_scores, Tensor sequence_lengths, Tensor stop_flags, Tensor end_ids, Tensor step_ids, Tensor last_cache_ids, Tensor last_beam_offsets, int beam_size, int max_seq_len, int max_dec_len, bool fuse_softmax, bool early_stop, float length_penalty=0.0)
+  args : (Tensor logits, Tensor cum_scores, Tensor sequence_lengths, Tensor stop_flags, Tensor end_ids, Tensor step_ids, Tensor last_cache_ids, Tensor last_beam_offsets, int beam_size, int max_seq_len, int max_dec_len, bool fuse_softmax, bool early_stop, float length_penalty=0.0, bool one_stage_topk=false)
   output : Tensor(ids_this_time), Tensor(out_cum_scores), Tensor(cache_ids), Tensor(beam_offsets), Tensor(parent_idx), Tensor(stop_flags_out), Tensor(seq_lens_out), Tensor(step_ids_out)
   infer_meta :
     func : BeamSearchSoftmaxInferMeta
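The new one_stage_topk attribute defaults to false, so existing beam_search_softmax call sites keep their behavior and only callers that pass it explicitly opt into the one-stage top-k path. The same defaulted-trailing-parameter idea in plain C++, with beam_step as a hypothetical function:

// A defaulted trailing parameter keeps old call sites compiling unchanged.
#include <iostream>

void beam_step(int beam_size, bool one_stage_topk = false) {
    std::cout << "beam_size=" << beam_size
              << " one_stage_topk=" << std::boolalpha << one_stage_topk << "\n";
}

int main() {
    beam_step(4);        // old call site: one_stage_topk stays false
    beam_step(4, true);  // new call site opting into one-stage top-k
    return 0;
}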
1 change: 1 addition & 0 deletions paddle/phi/infermeta/multiary.cc
@@ -688,6 +688,7 @@ void BeamSearchSoftmaxInferMeta(const MetaTensor& logits,
     bool fuse_softmax,
     bool early_stop,
     float length_penalty,
+    bool one_stage_topk,
     MetaTensor* ids_this_time,
     MetaTensor* out_cum_scores,
     MetaTensor* cache_ids,
1 change: 1 addition & 0 deletions paddle/phi/infermeta/multiary.h
@@ -204,6 +204,7 @@ void BeamSearchSoftmaxInferMeta(const MetaTensor& logits,
     bool fuse_softmax,
     bool early_stop,
     float length_penalty,
+    bool one_stage_topk,
     MetaTensor* ids_this_time,
     MetaTensor* out_cum_scores,
     MetaTensor* cache_ids,
1 change: 1 addition & 0 deletions paddle/phi/kernels/fusion/beam_search_softmax.h
@@ -35,6 +35,7 @@ void BeamSearchSoftmaxKernel(const Context &dev_ctx,
     bool fuse_softmax,
     bool early_stop,
     float length_penalty,
+    bool one_stage_topk,
     DenseTensor *ids_this_time,
     DenseTensor *out_cum_scores,
     DenseTensor *cache_ids,