From a5cc08c66a13b783300c3d36dd9f74578702b45c Mon Sep 17 00:00:00 2001 From: qingshui Date: Thu, 13 May 2021 14:04:02 +0800 Subject: [PATCH 1/2] 1. fix nncross expand_embedx size bug --- paddle/fluid/framework/boxps_worker.cc | 23 +++++++----- paddle/fluid/framework/fleet/box_wrapper.h | 37 +++++++++++++++++-- .../fluid/framework/fleet/box_wrapper_impl.h | 23 +++++------- 3 files changed, 58 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc index 385185154a24e..c173bb888e7a3 100644 --- a/paddle/fluid/framework/boxps_worker.cc +++ b/paddle/fluid/framework/boxps_worker.cc @@ -322,8 +322,9 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { } int64_t pad_len = 0; - AllocParamTensor(&pad_len); - + if (sync_mode_ > 0) { + AllocParamTensor(&pad_len); + } auto& block = program_->Block(0); thread_scope_ = &(root_scope_->NewScope()); @@ -338,18 +339,22 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { const LoDTensor& root_tensor = root_scope_->FindVar(name)->Get(); LoDTensor* gpu_tensor = thread_scope_->Var(name)->GetMutable(); - if (CheckNeedParam(var)) { - auto dim = root_tensor.dims(); - size_t len = root_tensor.numel(); - gpu_tensor->ShareDataWith(param_sync_.Slice(offset, offset + len)) - .Resize(dim); - offset += len; + if (sync_mode_ > 0) { + if (CheckNeedParam(var)) { + auto dim = root_tensor.dims(); + size_t len = root_tensor.numel(); + gpu_tensor->ShareDataWith(param_sync_.Slice(offset, offset + len)) + .Resize(dim); + offset += len; + } } TensorCopy(*static_cast(&root_tensor), place_, static_cast(gpu_tensor)); } } - CHECK(offset <= (param_sync_.numel() - pad_len)); + if (sync_mode_ > 0) { + CHECK(offset <= (param_sync_.numel() - pad_len)); + } } void BoxPSWorker::SyncParam(void) { if (sync_mode_ == DenseKStepNode && node_size_ == 1) { diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index fc7b040bcb88a..d6191fc100415 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -240,14 +240,45 @@ class InputTable { std::vector table_; std::atomic miss_; }; +class DCacheBuffer { + public: + DCacheBuffer() : buf_(nullptr) {} + ~DCacheBuffer() {} + /** + * @Brief get data + */ + template + T* mutable_data(const size_t total_bytes, + const paddle::platform::Place& place) { + if (buf_ == nullptr) { + buf_ = memory::AllocShared(place, total_bytes); + } else if (buf_->size() < total_bytes) { + buf_.reset(); + buf_ = memory::AllocShared(place, total_bytes); + } + return reinterpret_cast(buf_->ptr()); + } + template + T* data() { + return reinterpret_cast(buf_->ptr()); + } + size_t memory_size() { + if (buf_ == nullptr) { + return 0; + } + return buf_->size(); + } + private: + std::shared_ptr buf_ = nullptr; +}; class BoxWrapper { struct DeviceBoxData { LoDTensor keys_tensor; LoDTensor dims_tensor; - LoDTensor pull_push_tensor; - LoDTensor keys_ptr_tensor; - LoDTensor values_ptr_tensor; + DCacheBuffer pull_push_tensor; + DCacheBuffer keys_ptr_tensor; + DCacheBuffer values_ptr_tensor; LoDTensor slot_lens; LoDTensor d_slot_vector; diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 6cbc6b12e826e..4a7a2d0a61579 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -51,8 +51,8 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, int64_t 
total_bytes = total_length * sizeof(FEATURE_VALUE_GPU_TYPE); FEATURE_VALUE_GPU_TYPE* total_values_gpu = - reinterpret_cast( - dev.pull_push_tensor.mutable_data({total_bytes, 1}, place)); + dev.pull_push_tensor.mutable_data(total_bytes, + place); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( @@ -69,9 +69,8 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, dev.dims_tensor.mutable_data({total_length, 1}, place)); int* key2slot = reinterpret_cast( dev.keys2slot.mutable_data({total_length, 1}, place)); - uint64_t** gpu_keys = - reinterpret_cast(dev.keys_ptr_tensor.mutable_data( - {static_cast(slot_num * sizeof(uint64_t*)), 1}, place)); + uint64_t** gpu_keys = dev.keys_ptr_tensor.mutable_data( + static_cast(keys.size() * sizeof(uint64_t*)), place); int64_t* slot_lens = reinterpret_cast( dev.slot_lens.mutable_data({(slot_num + 1), 1}, place)); cudaMemcpyAsync(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), @@ -89,10 +88,9 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "PullSparseGPU failed in BoxPS.")); pull_boxps_timer.Pause(); - - float** gpu_values = - reinterpret_cast(dev.values_ptr_tensor.mutable_data( - {static_cast(slot_num * sizeof(float*)), 1}, place)); + // values.size() not sure equal slot_num + float** gpu_values = dev.values_ptr_tensor.mutable_data( + static_cast(values.size() * sizeof(float*)), place); cudaMemcpyAsync(gpu_values, values.data(), values.size() * sizeof(float*), cudaMemcpyHostToDevice, stream); @@ -132,8 +130,8 @@ void BoxWrapper::PushSparseGradCase( int64_t total_length = dev.total_key_length; int64_t total_bytes = total_length * sizeof(FeaturePushValueGpuType); FeaturePushValueGpuType* total_grad_values_gpu = - reinterpret_cast( - dev.pull_push_tensor.mutable_data({total_bytes, 1}, place)); + dev.pull_push_tensor.mutable_data(total_bytes, + place); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); @@ -158,8 +156,7 @@ void BoxWrapper::PushSparseGradCase( reinterpret_cast(dev.slot_lens.data()); const int* d_slot_vector = dev.d_slot_vector.data(); const int* key2slot = reinterpret_cast(dev.keys2slot.data()); - float** gpu_values = - reinterpret_cast(dev.values_ptr_tensor.data()); + float** gpu_values = dev.values_ptr_tensor.data(); cudaMemcpyAsync(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice, stream); From 878ad7ba3c9c904b1e1abe076ff1d840b9c41e1e Mon Sep 17 00:00:00 2001 From: qingshui Date: Thu, 13 May 2021 18:35:28 +0800 Subject: [PATCH 2/2] 1. 
Simplify macro definition code --- paddle/fluid/framework/fleet/box_wrapper.cu | 495 ++++++++------------ 1 file changed, 206 insertions(+), 289 deletions(-) diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 8a04cc9ba1fc3..3e2f8fc63183d 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -433,6 +433,52 @@ __global__ void AddMaskCalculator(const float* pred, const int64_t* label, } } +template +void FeaturePullCopy(cudaStream_t stream, uint64_t** gpu_keys, + float** gpu_values, void* src, const int hidden_size, + const size_t embedx_dim, const int total_length, + int* total_dims, const int64_t* slot_lens, + const int slot_num, const int* key2slot, const float scale, + const int cvm_offset) { + FeaturePullValueType* pull_values_gpu = + reinterpret_cast(src); + // normal + PullCopyBase< + FeaturePullValueType><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( + gpu_values, pull_values_gpu, hidden_size, total_length, gpu_keys, + total_dims, slot_lens, slot_num, key2slot, cvm_offset); + // embedx + int embedx_total_length = total_length * embedx_dim; + PullCopyExpand<<<(embedx_total_length + 512 - 1) / 512, + 512, 0, stream>>>( + gpu_values, pull_values_gpu, embedx_dim, embedx_total_length, total_dims, + slot_lens, slot_num, key2slot, scale, cvm_offset); +} + +template +void FeaturePullCopyNNCross(cudaStream_t stream, uint64_t** gpu_keys, + float** gpu_values, void* src, + const int hidden_size, const size_t embedx_dim, + const size_t expand_dim, const int total_length, + int* total_dims, const int64_t* slot_lens, + const int slot_num, const int* key2slot, + const float scale, const int cvm_offset) { + FeaturePullValueType* pull_values_gpu = + reinterpret_cast(src); + // nncross + PullCopyBaseNNCross< + FeaturePullValueType><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( + gpu_values, pull_values_gpu, hidden_size, expand_dim, total_length, + gpu_keys, total_dims, slot_lens, slot_num, key2slot, cvm_offset); + // embedx + expand_embedx + int embedx_total_length = total_length * (embedx_dim + expand_dim); + PullCopyExpandNNCross<<< + (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( + gpu_values, pull_values_gpu, (embedx_dim + expand_dim), embedx_dim, + expand_dim, embedx_total_length, total_dims, slot_lens, slot_num, + key2slot, scale, cvm_offset); +} + void BoxWrapper::CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, float** gpu_values, void* total_values_gpu, const int64_t* slot_lens, @@ -454,190 +500,70 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, } \ } break -#define EXPAND_EMBED_PULL_CASE(i, ...) 
\ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PullCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, pull_embedx_scale_, cvm_offset_); \ - } else if (feature_type_ == static_cast(boxps::FEATURE_QUANT) || \ - feature_type_ == static_cast(boxps::FEATURE_SHOWCLK)) { \ - PullCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, pull_embedx_scale_, cvm_offset_); \ - } else { \ - PullCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, 1.0, cvm_offset_); \ - } \ +#define EXPAND_EMBED_PULL_CASE(i, ...) \ + case i: { \ + constexpr size_t ExpandDim = i; \ + if (feature_type_ == static_cast(boxps::FEATURE_SHARE_EMBEDDING)) { \ + constexpr size_t SingleEmbedxDim = \ + EmbedxDim / boxps::SHARE_EMBEDDING_NUM; \ + FeaturePullCopy>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \ + pull_embedx_scale_, cvm_offset_); \ + } else if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ + FeaturePullCopy>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \ + pull_embedx_scale_, cvm_offset_); \ + } else if (feature_type_ == static_cast(boxps::FEATURE_QUANT) || \ + feature_type_ == static_cast(boxps::FEATURE_SHOWCLK)) { \ + FeaturePullCopy>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \ + pull_embedx_scale_, cvm_offset_); \ + } else { \ + FeaturePullCopy>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \ + 1.0, cvm_offset_); \ + } \ } break -#define EXPAND_EMBED_PULL_CASE2(i, ...) \ +#define EXPAND_EMBED_PULL_NNCROSS(i, ...) 
\ case i: { \ constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_SHARE_EMBEDDING)) { \ - constexpr size_t SingleEmbedxDim = \ - EmbedxDim / boxps::SHARE_EMBEDDING_NUM; \ - typedef boxps::FeaturePullValueGpuShareEmbedding \ - PullShareEmbedding; \ - PullCopyBase<<<(total_length + 512 - 1) / 512, 512, \ - 0, stream>>>( \ - gpu_values, reinterpret_cast(total_values_gpu), \ - hidden_size, total_length, gpu_keys, total_dims, slot_lens, \ - slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PullCopyExpand<<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, reinterpret_cast(total_values_gpu), \ - EmbedxDim, embedx_total_length, total_dims, slot_lens, slot_num, \ - key2slot, pull_embedx_scale_, cvm_offset_); \ - } else if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PullCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - hidden_size, total_length, gpu_keys, total_dims, slot_lens, \ - slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PullCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - EmbedxDim, embedx_total_length, total_dims, slot_lens, slot_num, \ + if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ + FeaturePullCopyNNCross< \ + boxps::FeaturePullValueGpuPCOC>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, ExpandDim, total_length, total_dims, slot_lens, slot_num, \ key2slot, pull_embedx_scale_, cvm_offset_); \ } else if (feature_type_ == static_cast(boxps::FEATURE_QUANT) || \ feature_type_ == static_cast(boxps::FEATURE_SHOWCLK)) { \ - PullCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - hidden_size, total_length, gpu_keys, total_dims, slot_lens, \ - slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PullCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - EmbedxDim, embedx_total_length, total_dims, slot_lens, slot_num, \ + FeaturePullCopyNNCross< \ + boxps::FeaturePullValueGpuQuant>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, ExpandDim, total_length, total_dims, slot_lens, slot_num, \ key2slot, pull_embedx_scale_, cvm_offset_); \ } else { \ - PullCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - hidden_size, total_length, gpu_keys, total_dims, slot_lens, \ - slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PullCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - EmbedxDim, embedx_total_length, total_dims, slot_lens, slot_num, \ + FeaturePullCopyNNCross< \ + boxps::FeaturePullValueGpu>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, ExpandDim, total_length, total_dims, slot_lens, slot_num, \ key2slot, 1.0, cvm_offset_); \ } \ } break -#define EXPAND_EMBED_PULL_NNCROSS(i, ...) 
\ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PullCopyBaseNNCross><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PullCopyExpandNNCross< \ - boxps::FeaturePullValueGpuPCOC><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, embedx_total_length, \ - total_dims, slot_lens, slot_num, key2slot, pull_embedx_scale_, \ - cvm_offset_); \ - } else if (feature_type_ == static_cast(boxps::FEATURE_QUANT) || \ - feature_type_ == static_cast(boxps::FEATURE_SHOWCLK)) { \ - PullCopyBaseNNCross><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PullCopyExpandNNCross< \ - boxps::FeaturePullValueGpuQuant><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, embedx_total_length, \ - total_dims, slot_lens, slot_num, key2slot, pull_embedx_scale_, \ - cvm_offset_); \ - } else { \ - PullCopyBaseNNCross><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PullCopyExpandNNCross< \ - boxps::FeaturePullValueGpu><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, embedx_total_length, \ - total_dims, slot_lens, slot_num, key2slot, 1.0, cvm_offset_); \ - } \ - } break - switch (hidden_size - cvm_offset_) { EMBEDX_CASE(8, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_NNCROSS(8); EXPAND_EMBED_PULL_NNCROSS(64);); - EMBEDX_CASE(16, EXPAND_EMBED_PULL_CASE2(0); EXPAND_EMBED_PULL_NNCROSS(64);); - EMBEDX_CASE(32, EXPAND_EMBED_PULL_CASE2(0);); - EMBEDX_CASE(64, EXPAND_EMBED_PULL_CASE2(0);); - EMBEDX_CASE(256, EXPAND_EMBED_PULL_CASE2(0);); - EMBEDX_CASE(128, EXPAND_EMBED_PULL_CASE2(0);); - EMBEDX_CASE(280, EXPAND_EMBED_PULL_CASE2(0);); + EMBEDX_CASE(16, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_NNCROSS(64);); + EMBEDX_CASE(32, EXPAND_EMBED_PULL_CASE(0);); + EMBEDX_CASE(64, EXPAND_EMBED_PULL_CASE(0);); + EMBEDX_CASE(256, EXPAND_EMBED_PULL_CASE(0);); + EMBEDX_CASE(128, EXPAND_EMBED_PULL_CASE(0);); + EMBEDX_CASE(280, EXPAND_EMBED_PULL_CASE(0);); default: PADDLE_THROW(platform::errors::InvalidArgument( "Unsupport this embedding size [%d]", hidden_size - cvm_offset_)); @@ -662,6 +588,73 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +template +void FeaturePushCopy(cudaStream_t stream, void* dest, float** grad_values, + const 
int hidden_size, const int embedx_dim, + const int total_length, const int batch_size, + const int* slot_vector, const int* total_dims, + const int64_t* slot_lens, const int slot_num, + const int* key2slot, const int cvm_offset) { + FeaturePushValueGpuType* push_grad_values = + reinterpret_cast(dest); + // normal + PushCopyBase<<<(total_length + 512 - 1) / 512, 512, + 0, stream>>>( + push_grad_values, grad_values, hidden_size, total_length, batch_size, + slot_vector, total_dims, slot_lens, slot_num, key2slot, cvm_offset); + // normal + int embedx_total_length = total_length * embedx_dim; + PushCopyExpand<<< + (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( + push_grad_values, grad_values, embedx_dim, embedx_total_length, + batch_size, slot_vector, total_dims, slot_lens, slot_num, key2slot, + cvm_offset); +} + +template +void FeaturePushCopyNNCross(cudaStream_t stream, void* dest, + float** grad_values, const int hidden_size, + const int embedx_dim, const int expand_dim, + const int total_length, const int batch_size, + const int* slot_vector, const int* total_dims, + const int64_t* slot_lens, const int slot_num, + const int* key2slot, const int cvm_offset) { + FeaturePushValueGpuType* push_grad_values = + reinterpret_cast(dest); + // nncross + PushCopyBaseNNCross<<<(total_length + 512 - 1) / 512, + 512, 0, stream>>>( + push_grad_values, grad_values, hidden_size, total_length, batch_size, + slot_vector, total_dims, slot_lens, slot_num, key2slot, cvm_offset); + int embedx_total_length = total_length * (embedx_dim + expand_dim); + PushCopyExpandNNCross<<< + (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( + push_grad_values, grad_values, (embedx_dim + expand_dim), embedx_dim, + expand_dim, embedx_total_length, batch_size, slot_vector, total_dims, + slot_lens, slot_num, key2slot, cvm_offset); +} + +template +void FeaturePushCopyShareEmbedding( + cudaStream_t stream, void* dest, float** grad_values, const int hidden_size, + const size_t embedx_dim, const size_t expand_dim, const int total_length, + const int batch_size, const int* slot_vector, const int* total_dims, + const int64_t* slot_lens, const int slot_num, const int* key2slot, + const int cvm_offset) { + FeaturePushValueGpuType* push_grad_values = + reinterpret_cast(dest); + // share embedding + PushCopyBaseShareEmbedding<<< + (total_length + 512 - 1) / 512, 512, 0, stream>>>( + push_grad_values, grad_values, hidden_size, total_length, batch_size, + slot_vector, total_dims, slot_lens, slot_num, key2slot, cvm_offset); + int embedx_total_length = total_length * embedx_dim; + PushCopyExpand<<< + (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( + push_grad_values, grad_values, embedx_dim, embedx_total_length, + batch_size, slot_vector, total_dims, slot_lens, slot_num, key2slot, + cvm_offset); +} void BoxWrapper::CopyForPush(const paddle::platform::Place& place, float** grad_values, void* total_grad_values_gpu, const int* d_slot_vector, const int64_t* slot_lens, @@ -683,134 +676,58 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, "Unsupport this expand embedding size [%d]", expand_embed_dim)); \ } \ } break -#define EXPAND_EMBED_PUSH_CASE(i, ...) 
\ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PushCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, expand_embed_dim, total_length, \ - batch_size, d_slot_vector, total_dims, slot_lens, slot_num, \ - key2slot, cvm_offset_); \ - } else { \ - PushCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, expand_embed_dim, total_length, \ - batch_size, d_slot_vector, total_dims, slot_lens, slot_num, \ - key2slot, cvm_offset_); \ - } \ - } break -#define EXPAND_EMBED_PUSH_CASE2(i, ...) \ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_SHARE_EMBEDDING)) { \ - constexpr size_t SingleEmbedxDim = \ - EmbedxDim / boxps::SHARE_EMBEDDING_NUM; \ - typedef boxps::FeaturePushValueGpuShareEmbedding \ - PushShareEmbedding; \ - PushCopyBaseShareEmbedding<<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast(total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PushCopyExpand<<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast(total_grad_values_gpu), \ - grad_values, EmbedxDim, embedx_total_length, batch_size, \ - d_slot_vector, total_dims, slot_lens, slot_num, key2slot, \ - cvm_offset_); \ - } else if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PushCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PushCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, EmbedxDim, embedx_total_length, batch_size, \ - d_slot_vector, total_dims, slot_lens, slot_num, key2slot, \ - cvm_offset_); \ - } else { \ - PushCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PushCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, EmbedxDim, embedx_total_length, batch_size, \ - d_slot_vector, total_dims, slot_lens, slot_num, key2slot, \ - cvm_offset_); \ - } \ +#define EXPAND_EMBED_PUSH_CASE(i, ...) 
\ + case i: { \ + constexpr size_t ExpandDim = i; \ + if (feature_type_ == static_cast(boxps::FEATURE_SHARE_EMBEDDING)) { \ + constexpr size_t SingleEmbedxDim = \ + EmbedxDim / boxps::SHARE_EMBEDDING_NUM; \ + FeaturePushCopyShareEmbedding>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + ExpandDim, total_length, batch_size, d_slot_vector, total_dims, \ + slot_lens, slot_num, key2slot, cvm_offset_); \ + } else if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ + FeaturePushCopy>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + total_length, batch_size, d_slot_vector, total_dims, slot_lens, \ + slot_num, key2slot, cvm_offset_); \ + } else { \ + FeaturePushCopy>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + total_length, batch_size, d_slot_vector, total_dims, slot_lens, \ + slot_num, key2slot, cvm_offset_); \ + } \ } break -#define EXPAND_EMBED_PUSH_NNCROSS(i, ...) \ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PushCopyBaseNNCross><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PushCopyExpandNNCross< \ - boxps::FeaturePushValueGpuPCOC><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, \ - embedx_total_length, batch_size, d_slot_vector, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - } else { \ - PushCopyBaseNNCross><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PushCopyExpandNNCross< \ - boxps::FeaturePushValueGpu><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, \ - embedx_total_length, batch_size, d_slot_vector, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - } \ +#define EXPAND_EMBED_PUSH_NNCROSS(i, ...) 
\ + case i: { \ + constexpr size_t ExpandDim = i; \ + if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ + FeaturePushCopyNNCross< \ + boxps::FeaturePushValueGpuPCOC>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + ExpandDim, total_length, batch_size, d_slot_vector, total_dims, \ + slot_lens, slot_num, key2slot, cvm_offset_); \ + } else { \ + FeaturePushCopyNNCross< \ + boxps::FeaturePushValueGpu>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + ExpandDim, total_length, batch_size, d_slot_vector, total_dims, \ + slot_lens, slot_num, key2slot, cvm_offset_); \ + } \ } break + switch (hidden_size - cvm_offset_) { EMBEDX_CASE(8, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_NNCROSS(8); EXPAND_EMBED_PUSH_NNCROSS(64);); - EMBEDX_CASE(16, EXPAND_EMBED_PUSH_CASE2(0); EXPAND_EMBED_PUSH_NNCROSS(64);); - EMBEDX_CASE(32, EXPAND_EMBED_PUSH_CASE2(0);); - EMBEDX_CASE(64, EXPAND_EMBED_PUSH_CASE2(0);); - EMBEDX_CASE(256, EXPAND_EMBED_PUSH_CASE2(0);); - EMBEDX_CASE(128, EXPAND_EMBED_PUSH_CASE2(0);); - EMBEDX_CASE(280, EXPAND_EMBED_PUSH_CASE2(0);); + EMBEDX_CASE(16, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_NNCROSS(64);); + EMBEDX_CASE(32, EXPAND_EMBED_PUSH_CASE(0);); + EMBEDX_CASE(64, EXPAND_EMBED_PUSH_CASE(0);); + EMBEDX_CASE(256, EXPAND_EMBED_PUSH_CASE(0);); + EMBEDX_CASE(128, EXPAND_EMBED_PUSH_CASE(0);); + EMBEDX_CASE(280, EXPAND_EMBED_PUSH_CASE(0);); default: PADDLE_THROW(platform::errors::InvalidArgument( "Unsupport this embedding size [%d]", hidden_size - cvm_offset_));
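
Note on the first patch: the pointer-array sizing change is the "nncross expand_embedx size bug" fix. With expand embeddings, values.size() (and keys.size()) is not guaranteed to equal slot_num, so the scratch pointer arrays are now sized from the vectors themselves, and the new DCacheBuffer keeps a single grow-only device allocation instead of resizing a LoDTensor every batch. The sketch below is a minimal standalone model of that reuse pattern, assuming raw cudaMalloc/cudaFree in place of memory::AllocShared; ScratchBuffer and its main() are illustrative names only, not part of the patch.

#include <cuda_runtime.h>
#include <cstdio>

// Minimal stand-in for the grow-only device scratch buffer pattern used by
// DCacheBuffer (illustrative only; the real class wraps memory::AllocShared).
class ScratchBuffer {
 public:
  ~ScratchBuffer() {
    if (ptr_ != nullptr) cudaFree(ptr_);
  }
  // Returns a device pointer with at least total_bytes of storage.
  // Reuses the previous allocation when it is already large enough.
  template <typename T>
  T* mutable_data(size_t total_bytes) {
    if (ptr_ == nullptr || bytes_ < total_bytes) {
      if (ptr_ != nullptr) cudaFree(ptr_);
      cudaMalloc(&ptr_, total_bytes);
      bytes_ = total_bytes;
    }
    return reinterpret_cast<T*>(ptr_);
  }
  template <typename T>
  T* data() const { return reinterpret_cast<T*>(ptr_); }
  size_t memory_size() const { return bytes_; }

 private:
  void* ptr_ = nullptr;
  size_t bytes_ = 0;
};

int main() {
  ScratchBuffer buf;
  // Pull phase: one pointer slot per input key tensor. keys.size() rather
  // than slot_num, since expand embeddings can add extra outputs.
  size_t keys_size = 24;
  uint64_t** gpu_keys =
      buf.mutable_data<uint64_t*>(keys_size * sizeof(uint64_t*));
  printf("allocated %zu bytes at %p\n", buf.memory_size(),
         static_cast<void*>(gpu_keys));
  // A second, smaller request reuses the same allocation.
  uint64_t** again = buf.mutable_data<uint64_t*>(8 * sizeof(uint64_t*));
  printf("reused    %zu bytes at %p\n", buf.memory_size(),
         static_cast<void*>(again));
  return 0;
}

Because the buffer only grows, later batches reuse the same allocation, which is the behaviour pull_push_tensor, keys_ptr_tensor and values_ptr_tensor rely on in box_wrapper_impl.h.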
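
Note on the second patch: the per-feature-type kernel launches that used to be spelled out inside the EXPAND_EMBED_* macros are folded into templated helpers (FeaturePullCopy, FeaturePullCopyNNCross, FeaturePushCopy, FeaturePushCopyNNCross, FeaturePushCopyShareEmbedding). Each helper launches a base kernel with one thread per pulled key and an expand kernel with one thread per (key, embedding element) pair, using (N + 512 - 1) / 512 blocks of 512 threads. A minimal sketch of that launch split follows, assuming toy stand-in names (ToyPullValue, ToyCopyBase, ToyCopyExpand, ToyPullCopy) rather than the real boxps value structs and kernels.

#include <cuda_runtime.h>

// Toy pull value: show/clk plus an embedding payload (illustrative only).
template <size_t EmbedDim>
struct ToyPullValue {
  float show;
  float clk;
  float embedx[EmbedDim];
};

// Base copy: one thread per pulled key, copies the leading cvm fields.
template <typename ValueType>
__global__ void ToyCopyBase(float** dst, const ValueType* src,
                            int total_length) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= total_length) return;
  dst[i][0] = src[i].show;
  dst[i][1] = src[i].clk;
}

// Expand copy: one thread per (key, embedding element) pair.
template <typename ValueType>
__global__ void ToyCopyExpand(float** dst, const ValueType* src,
                              int embedx_dim, int embedx_total_length,
                              int cvm_offset) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= embedx_total_length) return;
  int key = idx / embedx_dim;  // which pulled key
  int col = idx % embedx_dim;  // which embedding element
  dst[key][cvm_offset + col] = src[key].embedx[col];
}

// Host helper mirroring the FeaturePullCopy launch pattern: one value type,
// two kernels, grids sized as (N + 512 - 1) / 512 blocks of 512 threads.
template <typename ValueType>
void ToyPullCopy(cudaStream_t stream, float** dst, const ValueType* src,
                 int embedx_dim, int total_length, int cvm_offset) {
  ToyCopyBase<ValueType><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>(
      dst, src, total_length);
  int embedx_total_length = total_length * embedx_dim;
  ToyCopyExpand<ValueType><<<(embedx_total_length + 512 - 1) / 512, 512, 0,
                             stream>>>(dst, src, embedx_dim,
                                       embedx_total_length, cvm_offset);
}

// Explicit instantiation so the sketch compiles as a standalone .cu file.
template void ToyPullCopy<ToyPullValue<8>>(cudaStream_t, float**,
                                           const ToyPullValue<8>*, int, int,
                                           int);

Keeping the value layout as a single template parameter is what lets one helper serve the PCOC, Quant/ShowClk and plain feature layouts that the old EXPAND_EMBED_PULL_CASE2 / EXPAND_EMBED_PUSH_CASE2 macro bodies previously duplicated.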