From 4b7242b0d8c7917a8e23e49ee8ebf4c460a392cd Mon Sep 17 00:00:00 2001
From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com>
Date: Mon, 26 Apr 2021 19:05:12 +0800
Subject: [PATCH] [PsCore] optimize performance of large kv (#32535)

* optimize pull sparse

* optimize pull sparse

* change macro

* format
---
 CMakeLists.txt                                |   5 +
 .../distributed/service/brpc_ps_server.cc     |  23 +--
 .../distributed/table/common_sparse_table.cc  |  55 +++---
 .../table/depends/large_scale_kv.h            | 158 ++++++++++--------
 .../framework/fleet/heter_ps/CMakeLists.txt   |   7 +-
 .../distributed/fleet/runtime/the_one_ps.py   |  45 +++--
 .../distributed_strategy.py                   |   1 +
 .../fleet/parameter_server/ir/public.py       |   1 +
 8 files changed, 176 insertions(+), 119 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d874b21b0873d..2d13874f178d4 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -352,6 +352,11 @@ if (WITH_MIPS)
     add_definitions(-DPADDLE_WITH_MIPS)
 endif()
 
+if (WITH_HETERPS)
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new")
+    endif()
+endif()
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc
index a9370561a540b..a1440260bf2e7 100644
--- a/paddle/fluid/distributed/service/brpc_ps_server.cc
+++ b/paddle/fluid/distributed/service/brpc_ps_server.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/distributed/service/brpc_ps_server.h"
 #include <thread>  // NOLINT
+#include "butil/object_pool.h"
 #include "paddle/fluid/distributed/table/depends/sparse_utils.h"
 #include "paddle/fluid/distributed/table/table.h"
 #include "paddle/fluid/framework/archive.h"
@@ -196,12 +197,13 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request,
     return 0;
   }
 
-  std::vector<float> res_data;
-  res_data.resize(num * table->value_accesor()->select_size() / sizeof(float));
-  table->pull_dense(res_data.data(), num);
+  auto res_data = butil::get_object<std::vector<float>>();
+  res_data->resize(num * table->value_accesor()->select_size() / sizeof(float));
+  table->pull_dense(res_data->data(), num);
 
-  cntl->response_attachment().append((char *)res_data.data(),
-                                     res_data.size() * sizeof(float));
+  cntl->response_attachment().append((char *)(res_data->data()),
+                                     res_data->size() * sizeof(float));
+  butil::return_object(res_data);
 
   return 0;
 }
@@ -367,12 +369,13 @@ int32_t BrpcPsService::pull_sparse(Table *table,
 
   value.DeserializeFromBytes(const_cast<void *>(data));
 
-  std::vector<float> res_data;
-  res_data.resize(num * dim);
-  table->pull_sparse(res_data.data(), value);
+  auto res_data = butil::get_object<std::vector<float>>();
+  res_data->resize(num * dim);
+  table->pull_sparse(res_data->data(), value);
 
-  cntl->response_attachment().append((char *)res_data.data(),
-                                     res_data.size() * sizeof(float));
+  cntl->response_attachment().append((char *)(res_data->data()),
+                                     res_data->size() * sizeof(float));
+  butil::return_object(res_data);
 
   return 0;
 }
diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc
index 1c315d34abcb6..718fce9950719 100644
--- a/paddle/fluid/distributed/table/common_sparse_table.cc
+++ b/paddle/fluid/distributed/table/common_sparse_table.cc
@@ -125,34 +125,37 @@ void ProcessALine(const std::vector<std::string>& columns, const Meta& meta,
 
 int64_t SaveToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
                    const int mode) {
-  int64_t not_save_num = 0;
-  for (auto& value : block->values_) {
-    if (mode == SaveMode::delta && !value.second.need_save_) {
-      not_save_num++;
-      continue;
-    }
-
-    auto* vs = value.second.data_;
-    std::stringstream ss;
-    auto id = value.first;
-    ss << id << "\t" << value.second.count_ << "\t" << value.second.unseen_days_
-       << "\t" << value.second.is_entry_ << "\t";
-
-    for (int i = 0; i < block->value_length_; i++) {
-      ss << vs[i];
-      ss << ",";
-    }
+  int64_t save_num = 0;
+  for (auto& table : block->values_) {
+    for (auto& value : table) {
+      if (mode == SaveMode::delta && !value.second->need_save_) {
+        continue;
+      }
+      save_num += 1;
+
+      auto* vs = value.second->data_.data();
+      std::stringstream ss;
+      auto id = value.first;
+      ss << id << "\t" << value.second->count_ << "\t"
+         << value.second->unseen_days_ << "\t" << value.second->is_entry_
+         << "\t";
+
+      for (int i = 0; i < block->value_length_; i++) {
+        ss << vs[i];
+        ss << ",";
+      }
 
-    ss << "\n";
+      ss << "\n";
 
-    os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
+      os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
 
-    if (mode == SaveMode::base || mode == SaveMode::delta) {
-      value.second.need_save_ = false;
+      if (mode == SaveMode::base || mode == SaveMode::delta) {
+        value.second->need_save_ = false;
+      }
     }
   }
 
-  return block->values_.size() - not_save_num;
+  return save_num;
 }
@@ -183,7 +186,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
 
     block->Init(id, false);
 
-    auto value_instant = block->GetValue(id);
+    VALUE* value_instant = block->GetValue(id);
     if (values.size() == 5) {
       value_instant->count_ = std::stoi(values[1]);
       value_instant->unseen_days_ = std::stoi(values[2]);
@@ -373,8 +376,10 @@ std::pair<int64_t, int64_t> CommonSparseTable::print_table_stat() {
   int64_t feasign_size = 0;
   int64_t mf_size = 0;
 
-  for (auto& value : shard_values_) {
-    feasign_size += value->values_.size();
+  for (auto& shard : shard_values_) {
+    for (auto& table : shard->values_) {
+      feasign_size += table.size();
+    }
   }
 
   return {feasign_size, mf_size};
diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h
index bb4174bd2c579..5c10fca98cda4 100644
--- a/paddle/fluid/distributed/table/depends/large_scale_kv.h
+++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h
@@ -26,6 +26,7 @@
 #include <vector>
 
 #include "gflags/gflags.h"
+#include "butil/object_pool.h"
 #include "paddle/fluid/distributed/common/utils.h"
 #include "paddle/fluid/distributed/table/depends/initializers.h"
 #include "paddle/fluid/distributed/thirdparty/round_robin.h"
@@ -48,6 +49,10 @@ namespace distributed {
 
 enum Mode { training, infer };
 
+static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6;
+static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1
+                                              << SPARSE_SHARD_BUCKET_NUM_BITS;
+
 struct VALUE {
   explicit VALUE(size_t length)
       : length_(length),
@@ -55,46 +60,16 @@ struct VALUE {
         unseen_days_(0),
         need_save_(false),
         is_entry_(false) {
-    data_ = new float[length];
-    memset(data_, 0, sizeof(float) * length);
-  }
-
-  VALUE(const VALUE &value) {
-    length_ = value.length_;
-    count_ = value.count_;
-    unseen_days_ = value.unseen_days_;
-    need_save_ = value.need_save_;
-    is_entry_ = value.is_entry_;
-    data_ = new float[length_];
-    memcpy(data_, value.data_, sizeof(float) * length_);
-  }
-
-  VALUE &operator=(const VALUE &value) {
-    if (this != &value) {
-      delete[] data_;
-      length_ = value.length_;
-      count_ = value.count_;
-      unseen_days_ = value.unseen_days_;
-      need_save_ = value.need_save_;
-      is_entry_ = value.is_entry_;
-
-      data_ = new float[length_];
-      memcpy(data_, value.data_, sizeof(float) * length_);
-    }
-    return *this;
-  }
-
-  ~VALUE() {
-    delete[] data_;
-    data_ = nullptr;
+    data_.resize(length);
+    memset(data_.data(), 0, sizeof(float) * length);
   }
 
   size_t length_;
+  std::vector<float> data_;
   int count_;
   int unseen_days_;  // use to check knock-out
  bool need_save_;   // whether need to save
   bool is_entry_;    // whether knock-in
-  float *data_;
 };
 
 inline bool count_entry(VALUE *value, int threshold) {
@@ -176,12 +151,12 @@ class ValueBlock {
                          const std::vector<int> &value_dims) {
     auto pts = std::vector<float *>();
     pts.reserve(value_names.size());
-    auto &values = values_.at(id);
+    auto values = GetValue(id);
     for (int i = 0; i < static_cast<int>(value_names.size()); i++) {
       PADDLE_ENFORCE_EQ(
           value_dims[i], value_dims_[i],
           platform::errors::InvalidArgument("value dims is not match"));
-      pts.push_back(values.data_ +
+      pts.push_back(values->data_.data() +
                     value_offsets_.at(value_idx_.at(value_names[i])));
     }
     return pts;
@@ -190,33 +165,45 @@ class ValueBlock {
   // pull
   float *Init(const uint64_t &id, const bool with_update = true,
               const int counter = 1) {
-    if (!Has(id)) {
-      values_.emplace(std::make_pair(id, VALUE(value_length_)));
-    }
+    size_t hash = _hasher(id);
+    size_t bucket = compute_bucket(hash);
 
-    auto &value = values_.at(id);
+    auto &table = values_[bucket];
+    auto res = table.find(id);
 
-    if (with_update) {
-      AttrUpdate(&value, counter);
+    VALUE *value = nullptr;
+    if (res == table.end()) {
+      value = butil::get_object<VALUE>(value_length_);
+
+      table[id] = value;
+
+    } else {
+      value = res->second;
     }
 
-    return value.data_;
+    if (with_update) {
+      AttrUpdate(value, counter);
+    }
+    return value->data_.data();
   }
 
-
   VALUE *InitGet(const uint64_t &id, const bool with_update = true,
                  const int counter = 1) {
-    if (!Has(id)) {
-      values_.emplace(std::make_pair(id, VALUE(value_length_)));
-    }
+    size_t hash = _hasher(id);
+    size_t bucket = compute_bucket(hash);
 
-    auto &value = values_.at(id);
+    auto &table = values_[bucket];
+    auto res = table.find(id);
 
-    if (with_update) {
-      AttrUpdate(&value, counter);
+    VALUE *value = nullptr;
+    if (res == table.end()) {
+      value = butil::get_object<VALUE>(value_length_);
+      table[id] = value;
+    } else {
+      value = res->second;
     }
-
-    return &value;
+    return value;
   }
 
   void AttrUpdate(VALUE *value, const int counter) {
@@ -229,7 +216,7 @@ class ValueBlock {
     if (value->is_entry_) {
       // initialize
       for (size_t x = 0; x < value_names_.size(); ++x) {
-        initializers_[x]->GetValue(value->data_ + value_offsets_[x],
+        initializers_[x]->GetValue(value->data_.data() + value_offsets_[x],
                                    value_dims_[x]);
       }
       value->need_save_ = true;
@@ -243,42 +230,73 @@ class ValueBlock {
 
   // no Has(id) check: assumes the id already exists
   float *Get(const uint64_t &id) {
-    auto &value = values_.at(id);
-    return value.data_;
+    size_t hash = _hasher(id);
+    size_t bucket = compute_bucket(hash);
+    auto &table = values_[bucket];
+
+    auto res = table.find(id);
+    VALUE *value = res->second;
+    return value->data_.data();
   }
 
   // for load, to reset count, unseen_days
-  VALUE *GetValue(const uint64_t &id) { return &values_.at(id); }
+  VALUE *GetValue(const uint64_t &id) {
+    size_t hash = _hasher(id);
+    size_t bucket = compute_bucket(hash);
+
+    auto &table = values_[bucket];
+    auto res = table.find(id);
+    return res->second;
+  }
 
   bool GetEntry(const uint64_t &id) {
-    auto &value = values_.at(id);
-    return value.is_entry_;
+    auto value = GetValue(id);
+    return value->is_entry_;
   }
 
   void SetEntry(const uint64_t &id, const bool state) {
-    auto &value = values_.at(id);
-    value.is_entry_ = state;
+    auto value = GetValue(id);
+    value->is_entry_ = state;
   }
 
   void Shrink(const int threshold) {
-    for (auto iter = values_.begin(); iter != values_.end();) {
-      auto &value = iter->second;
-      value.unseen_days_++;
-      if (value.unseen_days_ >= threshold) {
-        iter = values_.erase(iter);
-      } else {
-        ++iter;
+    for (auto &table : values_) {
+      for (auto iter = table.begin(); iter != table.end();) {
+        VALUE *value = iter->second;
+        value->unseen_days_++;
+        if (value->unseen_days_ >= threshold) {
+          butil::return_object(iter->second);
+          iter = table.erase(iter);
+        } else {
+          ++iter;
+        }
       }
     }
     return;
  }
 
   float GetThreshold() { return threshold_; }
+
+  size_t compute_bucket(size_t hash) {
+    if (SPARSE_SHARD_BUCKET_NUM == 1) {
+      return 0;
+    } else {
+      return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS);
+    }
+  }
 
  private:
   bool Has(const uint64_t id) {
-    auto got = values_.find(id);
-    if (got == values_.end()) {
+    size_t hash = _hasher(id);
+    size_t bucket = compute_bucket(hash);
+    auto &table = values_[bucket];
+
+    auto got = table.find(id);
+    if (got == table.end()) {
      return false;
    } else {
      return true;
    }
@@ -286,8 +304,9 @@ class ValueBlock {
  }
 
 public:
-  robin_hood::unordered_map<uint64_t, VALUE> values_;
+  robin_hood::unordered_map<uint64_t, VALUE *> values_[SPARSE_SHARD_BUCKET_NUM];
  size_t value_length_ = 0;
+  std::hash<uint64_t> _hasher;
 
 private:
  const std::vector<std::string> &value_names_;
@@ -302,4 +321,3 @@ class ValueBlock {
 
 }  // namespace distributed
 }  // namespace paddle
-
diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
index 6df2cd52bb401..db562045dcc41 100644
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -1,5 +1,10 @@
 IF(WITH_GPU)
-    nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
+    SET(HETERPS_DEPS device_context)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+        SET(HETERPS_DEPS ${HETERPS_DEPS} cub)
+    endif()
+
+    nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS})
     nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
     nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
 ENDIF()
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index df07a7a6e7783..24b83662c9dbf 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -77,10 +77,13 @@ def define_optimize_map(self):
                                  ("Moment2", None), ("Beta1Pow", 1),
                                  ("Beta2Pow", 1), ("LearningRate", 1)]
         opt_input_map["sum"] = [("Param", None)]
+        opt_input_map["naive_adagrad"] = [("Param", None), ("G2Sum", 1),
+                                          ("LearningRate", 1)]
 
         opt_attr_map = {}
         opt_attr_map["sgd"] = []
         opt_attr_map["sum"] = []
+        opt_attr_map["naive_adagrad"] = []
         opt_attr_map["adam"] = [("beta1", "f"), ("beta2", "f"),
                                 ("epsilon", "f")]
@@ -169,6 +172,10 @@ def parse_by_optimizer(self, grad_name, is_sparse, total_dims,
             param_varnames = self.opt_input_map["sum"]
             attr_varnames = self.opt_attr_map["sum"]
             self.accessor_class = "sum"
+        elif compiled_strategy.use_ps_gpu and is_sparse:
+            param_varnames = self.opt_input_map["naive_adagrad"]
+            attr_varnames = self.opt_attr_map["naive_adagrad"]
+            self.accessor_class = "sgd"
         else:
             param_varnames = self.opt_input_map[oop.type]
             attr_varnames = self.opt_attr_map[oop.type]
@@ -176,20 +183,28 @@ def parse_by_optimizer(self, grad_name, is_sparse, total_dims,
 
         for (formal_name, shape) in param_varnames:
             params.append(formal_name)
-            param = main_program.global_block().vars[oop.input(formal_name)[0]]
-            if formal_name == "LearningRate" and param.name != "learning_rate_0":
-                warnings.warn("will support decay soon")
-                param = main_program.global_block().vars["learning_rate_0"]
-
-            if shape is None:
-                if is_sparse:
-                    shape = total_dims
-                else:
-                    shape = self.get_shard(total_dims, pserver_num, pserver_id)
-            dims.append(shape)
+            if formal_name == "G2Sum":
+                dims.append(1)
+                initializer = "fill_constant&0"
+                initializers.append(initializer)
+            else:
+                param = main_program.global_block().vars[oop.input(formal_name)[
+                    0]]
+                if formal_name == "LearningRate" and param.name != "learning_rate_0":
+                    warnings.warn("will support decay soon")
+                    param = main_program.global_block().vars["learning_rate_0"]
+
+                if shape is None:
+                    if is_sparse:
+                        shape = total_dims
+                    else:
+                        shape = self.get_shard(total_dims, pserver_num,
+                                               pserver_id)
+                dims.append(shape)
 
-            initializer = self.get_initializer_attr(param.name, startup_program)
-            initializers.append(initializer)
+                initializer = self.get_initializer_attr(param.name,
+                                                        startup_program)
+                initializers.append(initializer)
 
         for (attr_varname, type_) in attr_varnames:
             value = oop.attr(attr_varname)
@@ -435,6 +450,8 @@ def _get_distributed_strategy(self):
 
         if not strategy:
             raise ValueError("k_steps must be invalid value, please check")
+        if dist_strategy.a_sync_configs["use_ps_gpu"]:
+            strategy.use_ps_gpu = True
         return strategy
 
     def build_compiled_startegy(self):
         compiled_config = CompileTimeStrategy(
             self.origin_main_program, self.origin_main_program,
             self.async_strategy, self.role_maker)
+        if self.async_strategy.use_ps_gpu:
+            compiled_config.use_ps_gpu = True
         return compiled_config
 
     def _init_worker(self):
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
index 35029a3dfc7e7..2a9d26daaed90 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
@@ -149,6 +149,7 @@ def __init__(self):
         if num_threads > 1:
             self._build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
         self.debug_opt = None
+        self.use_ps_gpu = False
 
     def set_debug_opt(self, opt_info):
         self.debug_opt = opt_info
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index baf8add04caad..b2735727f6755 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -138,6 +138,7 @@ def __init__(self, main_program, startup_program, strategy, role_maker):
 
         self.strategy = strategy
         self.role_maker = role_maker
+        self.use_ps_gpu = False
         try:
             self.is_heter_ps_mode = role_maker._is_heter_parameter_server_mode
         except:
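---

Notes on the main techniques in this patch, with small self-contained sketches. Any name below that does not appear in the diff (FillDense, HandlePullDense, BucketedMap, and so on) is an illustrative stand-in, not a PS-Core API.

First, the brpc_ps_server.cc change removes a per-request allocation: pull_dense and pull_sparse used to construct a fresh std::vector<float> for every RPC, which for a large kv table means a multi-megabyte allocation on each request. The patch instead rents the buffer from brpc's freelist object pool (butil/object_pool.h) and returns it after the data has been appended to the response attachment, so the vector's capacity is recycled across requests. A minimal sketch of the rent/return pattern, assuming brpc is available:

```cpp
#include <cstddef>
#include <vector>

#include "butil/object_pool.h"  // brpc: butil::get_object / butil::return_object

// Illustrative stand-in for Table::pull_dense().
static void FillDense(float *out, size_t n) {
  for (size_t i = 0; i < n; ++i) out[i] = 0.0f;
}

void HandlePullDense(size_t num, size_t select_size /* bytes per key */) {
  // Rent a vector from the pool instead of constructing one per request.
  // A recycled vector keeps its capacity, so resize() usually allocates
  // nothing once the pool has warmed up.
  auto *res_data = butil::get_object<std::vector<float>>();
  res_data->resize(num * select_size / sizeof(float));
  FillDense(res_data->data(), res_data->size());

  // ... append res_data->data() to the RPC response attachment here ...

  // Hand the vector back, capacity intact, for the next request to reuse.
  butil::return_object(res_data);
}
```

One caveat the handlers already respect: a rented vector comes back with stale contents (resize() only zero-fills elements beyond the old size), so every element must be overwritten before it is sent.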
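Second, the large_scale_kv.h change splits each ValueBlock's single robin_hood map into SPARSE_SHARD_BUCKET_NUM = 1 << 6 = 64 buckets keyed by the top 6 bits of std::hash<uint64_t>, and stores pooled VALUE pointers instead of inline VALUE objects; that switch to pointer-plus-std::vector<float> payloads is also what lets the hand-written copy constructor, operator=, and destructor be deleted. Each bucket is 64 times smaller, so any single rehash moves far less data and lookups probe a smaller table. A stripped-down sketch of the bucket selection, with std::unordered_map and a float payload standing in for robin_hood::unordered_map and VALUE *:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>

// Same constants as the patch: 1 << 6 = 64 buckets per shard.
static const int kBucketBits = 6;
static const size_t kBucketNum = (size_t)1 << kBucketBits;

struct BucketedMap {
  std::unordered_map<uint64_t, float> buckets_[kBucketNum];
  std::hash<uint64_t> hasher_;

  // Select a bucket from the *top* bits of the hash (hash >> 58 on a 64-bit
  // size_t, giving a value in [0, 64)). Note that std::hash is the identity
  // for integers on common standard libraries, so this relies on ids already
  // being spread across the high bits; feature signs usually are, since they
  // are themselves hashes.
  size_t bucket(uint64_t id) const {
    size_t h = hasher_(id);
    return h >> (sizeof(size_t) * 8 - kBucketBits);
  }

  float &operator[](uint64_t id) { return buckets_[bucket(id)][id]; }
};

int main() {
  BucketedMap m;
  const uint64_t id = 0xFA00000000001234ULL;  // top 6 bits = 0b111110 = 62
  m[id] = 1.0f;
  std::printf("id %llx -> bucket %zu\n", (unsigned long long)id, m.bucket(id));
  return 0;
}
```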
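On the Python side, use_ps_gpu routes sparse tables to the new "naive_adagrad" slot layout: Param (the embedding itself), a dim-1 "G2Sum" accumulator initialized to zero via the "fill_constant&0" initializer string, and LearningRate, all reported to the server under accessor class "sgd". The diff only defines the slot layout, not the update rule; the sketch below shows one common "naive adagrad" step with a single scalar accumulator per feature, as an assumption for illustration rather than the exact formula the server-side accessor applies:

```cpp
#include <cmath>
#include <cstddef>

// One common "naive adagrad" formulation with a scalar G2Sum per feature,
// matching the dim-1 "G2Sum" slot registered above. Illustrative only; the
// actual server-side accessor may scale or damp differently.
void NaiveAdagradUpdate(float *param, const float *grad, size_t dim,
                        float *g2sum, float lr, float epsilon = 1e-8f) {
  float sq = 0.0f;
  for (size_t i = 0; i < dim; ++i) sq += grad[i] * grad[i];
  *g2sum += sq;  // the accumulator starts at 0 (fill_constant&0)
  const float scale = lr / (std::sqrt(*g2sum) + epsilon);
  for (size_t i = 0; i < dim; ++i) param[i] -= scale * grad[i];
}
```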
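Finally, the build changes. heter_comm keeps the external cub dependency only for CUDA toolkits older than 11.0, since CUB ships with CUDA 11 and later. And WITH_HETERPS builds on GCC newer than 7.0 add -faligned-new, which enables C++17-style aligned allocation for over-aligned types even in pre-C++17 dialects; without it, new-ing a type whose alignas exceeds the default new alignment falls back to plain operator new and GCC merely warns. A minimal demonstration of the flag in isolation (CacheLineSlot is a made-up type):

```cpp
// Build: g++ -std=c++14 -faligned-new aligned_demo.cc
// Without -faligned-new (or -std=c++17), GCC routes this `new` through plain
// operator new, which only guarantees the default new alignment (typically
// 16 bytes), not the 64 bytes requested below.
#include <cassert>
#include <cstdint>

struct alignas(64) CacheLineSlot {  // made-up over-aligned type
  float vals[16];
};

int main() {
  CacheLineSlot *slot = new CacheLineSlot;
  assert(reinterpret_cast<uintptr_t>(slot) % 64 == 0);
  delete slot;
  return 0;
}
```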