From a5cc08c66a13b783300c3d36dd9f74578702b45c Mon Sep 17 00:00:00 2001 From: qingshui Date: Thu, 13 May 2021 14:04:02 +0800 Subject: [PATCH 1/2] 1. fix nncross expand_embedx size bug --- paddle/fluid/framework/boxps_worker.cc | 23 +++++++----- paddle/fluid/framework/fleet/box_wrapper.h | 37 +++++++++++++++++-- .../fluid/framework/fleet/box_wrapper_impl.h | 23 +++++------- 3 files changed, 58 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc index 385185154a24e..c173bb888e7a3 100644 --- a/paddle/fluid/framework/boxps_worker.cc +++ b/paddle/fluid/framework/boxps_worker.cc @@ -322,8 +322,9 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { } int64_t pad_len = 0; - AllocParamTensor(&pad_len); - + if (sync_mode_ > 0) { + AllocParamTensor(&pad_len); + } auto& block = program_->Block(0); thread_scope_ = &(root_scope_->NewScope()); @@ -338,18 +339,22 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { const LoDTensor& root_tensor = root_scope_->FindVar(name)->Get(); LoDTensor* gpu_tensor = thread_scope_->Var(name)->GetMutable(); - if (CheckNeedParam(var)) { - auto dim = root_tensor.dims(); - size_t len = root_tensor.numel(); - gpu_tensor->ShareDataWith(param_sync_.Slice(offset, offset + len)) - .Resize(dim); - offset += len; + if (sync_mode_ > 0) { + if (CheckNeedParam(var)) { + auto dim = root_tensor.dims(); + size_t len = root_tensor.numel(); + gpu_tensor->ShareDataWith(param_sync_.Slice(offset, offset + len)) + .Resize(dim); + offset += len; + } } TensorCopy(*static_cast(&root_tensor), place_, static_cast(gpu_tensor)); } } - CHECK(offset <= (param_sync_.numel() - pad_len)); + if (sync_mode_ > 0) { + CHECK(offset <= (param_sync_.numel() - pad_len)); + } } void BoxPSWorker::SyncParam(void) { if (sync_mode_ == DenseKStepNode && node_size_ == 1) { diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index fc7b040bcb88a..d6191fc100415 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -240,14 +240,45 @@ class InputTable { std::vector table_; std::atomic miss_; }; +class DCacheBuffer { + public: + DCacheBuffer() : buf_(nullptr) {} + ~DCacheBuffer() {} + /** + * @Brief get data + */ + template + T* mutable_data(const size_t total_bytes, + const paddle::platform::Place& place) { + if (buf_ == nullptr) { + buf_ = memory::AllocShared(place, total_bytes); + } else if (buf_->size() < total_bytes) { + buf_.reset(); + buf_ = memory::AllocShared(place, total_bytes); + } + return reinterpret_cast(buf_->ptr()); + } + template + T* data() { + return reinterpret_cast(buf_->ptr()); + } + size_t memory_size() { + if (buf_ == nullptr) { + return 0; + } + return buf_->size(); + } + private: + std::shared_ptr buf_ = nullptr; +}; class BoxWrapper { struct DeviceBoxData { LoDTensor keys_tensor; LoDTensor dims_tensor; - LoDTensor pull_push_tensor; - LoDTensor keys_ptr_tensor; - LoDTensor values_ptr_tensor; + DCacheBuffer pull_push_tensor; + DCacheBuffer keys_ptr_tensor; + DCacheBuffer values_ptr_tensor; LoDTensor slot_lens; LoDTensor d_slot_vector; diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 6cbc6b12e826e..4a7a2d0a61579 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -51,8 +51,8 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, int64_t 
total_bytes = total_length * sizeof(FEATURE_VALUE_GPU_TYPE); FEATURE_VALUE_GPU_TYPE* total_values_gpu = - reinterpret_cast( - dev.pull_push_tensor.mutable_data({total_bytes, 1}, place)); + dev.pull_push_tensor.mutable_data(total_bytes, + place); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( @@ -69,9 +69,8 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, dev.dims_tensor.mutable_data({total_length, 1}, place)); int* key2slot = reinterpret_cast( dev.keys2slot.mutable_data({total_length, 1}, place)); - uint64_t** gpu_keys = - reinterpret_cast(dev.keys_ptr_tensor.mutable_data( - {static_cast(slot_num * sizeof(uint64_t*)), 1}, place)); + uint64_t** gpu_keys = dev.keys_ptr_tensor.mutable_data( + static_cast(keys.size() * sizeof(uint64_t*)), place); int64_t* slot_lens = reinterpret_cast( dev.slot_lens.mutable_data({(slot_num + 1), 1}, place)); cudaMemcpyAsync(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), @@ -89,10 +88,9 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "PullSparseGPU failed in BoxPS.")); pull_boxps_timer.Pause(); - - float** gpu_values = - reinterpret_cast(dev.values_ptr_tensor.mutable_data( - {static_cast(slot_num * sizeof(float*)), 1}, place)); + // values.size() not sure equal slot_num + float** gpu_values = dev.values_ptr_tensor.mutable_data( + static_cast(values.size() * sizeof(float*)), place); cudaMemcpyAsync(gpu_values, values.data(), values.size() * sizeof(float*), cudaMemcpyHostToDevice, stream); @@ -132,8 +130,8 @@ void BoxWrapper::PushSparseGradCase( int64_t total_length = dev.total_key_length; int64_t total_bytes = total_length * sizeof(FeaturePushValueGpuType); FeaturePushValueGpuType* total_grad_values_gpu = - reinterpret_cast( - dev.pull_push_tensor.mutable_data({total_bytes, 1}, place)); + dev.pull_push_tensor.mutable_data(total_bytes, + place); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); @@ -158,8 +156,7 @@ void BoxWrapper::PushSparseGradCase( reinterpret_cast(dev.slot_lens.data()); const int* d_slot_vector = dev.d_slot_vector.data(); const int* key2slot = reinterpret_cast(dev.keys2slot.data()); - float** gpu_values = - reinterpret_cast(dev.values_ptr_tensor.data()); + float** gpu_values = dev.values_ptr_tensor.data(); cudaMemcpyAsync(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice, stream); From 878ad7ba3c9c904b1e1abe076ff1d840b9c41e1e Mon Sep 17 00:00:00 2001 From: qingshui Date: Thu, 13 May 2021 18:35:28 +0800 Subject: [PATCH 2/2] 1. 
Simplify macro definition code --- paddle/fluid/framework/fleet/box_wrapper.cu | 495 ++++++++------------ 1 file changed, 206 insertions(+), 289 deletions(-) diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 8a04cc9ba1fc3..3e2f8fc63183d 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -433,6 +433,52 @@ __global__ void AddMaskCalculator(const float* pred, const int64_t* label, } } +template +void FeaturePullCopy(cudaStream_t stream, uint64_t** gpu_keys, + float** gpu_values, void* src, const int hidden_size, + const size_t embedx_dim, const int total_length, + int* total_dims, const int64_t* slot_lens, + const int slot_num, const int* key2slot, const float scale, + const int cvm_offset) { + FeaturePullValueType* pull_values_gpu = + reinterpret_cast(src); + // normal + PullCopyBase< + FeaturePullValueType><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( + gpu_values, pull_values_gpu, hidden_size, total_length, gpu_keys, + total_dims, slot_lens, slot_num, key2slot, cvm_offset); + // embedx + int embedx_total_length = total_length * embedx_dim; + PullCopyExpand<<<(embedx_total_length + 512 - 1) / 512, + 512, 0, stream>>>( + gpu_values, pull_values_gpu, embedx_dim, embedx_total_length, total_dims, + slot_lens, slot_num, key2slot, scale, cvm_offset); +} + +template +void FeaturePullCopyNNCross(cudaStream_t stream, uint64_t** gpu_keys, + float** gpu_values, void* src, + const int hidden_size, const size_t embedx_dim, + const size_t expand_dim, const int total_length, + int* total_dims, const int64_t* slot_lens, + const int slot_num, const int* key2slot, + const float scale, const int cvm_offset) { + FeaturePullValueType* pull_values_gpu = + reinterpret_cast(src); + // nncross + PullCopyBaseNNCross< + FeaturePullValueType><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( + gpu_values, pull_values_gpu, hidden_size, expand_dim, total_length, + gpu_keys, total_dims, slot_lens, slot_num, key2slot, cvm_offset); + // embedx + expand_embedx + int embedx_total_length = total_length * (embedx_dim + expand_dim); + PullCopyExpandNNCross<<< + (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( + gpu_values, pull_values_gpu, (embedx_dim + expand_dim), embedx_dim, + expand_dim, embedx_total_length, total_dims, slot_lens, slot_num, + key2slot, scale, cvm_offset); +} + void BoxWrapper::CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, float** gpu_values, void* total_values_gpu, const int64_t* slot_lens, @@ -454,190 +500,70 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, } \ } break -#define EXPAND_EMBED_PULL_CASE(i, ...) 
\ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PullCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, pull_embedx_scale_, cvm_offset_); \ - } else if (feature_type_ == static_cast(boxps::FEATURE_QUANT) || \ - feature_type_ == static_cast(boxps::FEATURE_SHOWCLK)) { \ - PullCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, pull_embedx_scale_, cvm_offset_); \ - } else { \ - PullCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, 1.0, cvm_offset_); \ - } \ +#define EXPAND_EMBED_PULL_CASE(i, ...) \ + case i: { \ + constexpr size_t ExpandDim = i; \ + if (feature_type_ == static_cast(boxps::FEATURE_SHARE_EMBEDDING)) { \ + constexpr size_t SingleEmbedxDim = \ + EmbedxDim / boxps::SHARE_EMBEDDING_NUM; \ + FeaturePullCopy>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \ + pull_embedx_scale_, cvm_offset_); \ + } else if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ + FeaturePullCopy>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \ + pull_embedx_scale_, cvm_offset_); \ + } else if (feature_type_ == static_cast(boxps::FEATURE_QUANT) || \ + feature_type_ == static_cast(boxps::FEATURE_SHOWCLK)) { \ + FeaturePullCopy>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \ + pull_embedx_scale_, cvm_offset_); \ + } else { \ + FeaturePullCopy>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \ + 1.0, cvm_offset_); \ + } \ } break -#define EXPAND_EMBED_PULL_CASE2(i, ...) \ +#define EXPAND_EMBED_PULL_NNCROSS(i, ...) 
\ case i: { \ constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_SHARE_EMBEDDING)) { \ - constexpr size_t SingleEmbedxDim = \ - EmbedxDim / boxps::SHARE_EMBEDDING_NUM; \ - typedef boxps::FeaturePullValueGpuShareEmbedding \ - PullShareEmbedding; \ - PullCopyBase<<<(total_length + 512 - 1) / 512, 512, \ - 0, stream>>>( \ - gpu_values, reinterpret_cast(total_values_gpu), \ - hidden_size, total_length, gpu_keys, total_dims, slot_lens, \ - slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PullCopyExpand<<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, reinterpret_cast(total_values_gpu), \ - EmbedxDim, embedx_total_length, total_dims, slot_lens, slot_num, \ - key2slot, pull_embedx_scale_, cvm_offset_); \ - } else if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PullCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - hidden_size, total_length, gpu_keys, total_dims, slot_lens, \ - slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PullCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - EmbedxDim, embedx_total_length, total_dims, slot_lens, slot_num, \ + if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ + FeaturePullCopyNNCross< \ + boxps::FeaturePullValueGpuPCOC>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, ExpandDim, total_length, total_dims, slot_lens, slot_num, \ key2slot, pull_embedx_scale_, cvm_offset_); \ } else if (feature_type_ == static_cast(boxps::FEATURE_QUANT) || \ feature_type_ == static_cast(boxps::FEATURE_SHOWCLK)) { \ - PullCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - hidden_size, total_length, gpu_keys, total_dims, slot_lens, \ - slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PullCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - EmbedxDim, embedx_total_length, total_dims, slot_lens, slot_num, \ + FeaturePullCopyNNCross< \ + boxps::FeaturePullValueGpuQuant>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, ExpandDim, total_length, total_dims, slot_lens, slot_num, \ key2slot, pull_embedx_scale_, cvm_offset_); \ } else { \ - PullCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - hidden_size, total_length, gpu_keys, total_dims, slot_lens, \ - slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PullCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - EmbedxDim, embedx_total_length, total_dims, slot_lens, slot_num, \ + FeaturePullCopyNNCross< \ + boxps::FeaturePullValueGpu>( \ + stream, gpu_keys, gpu_values, total_values_gpu, hidden_size, \ + EmbedxDim, ExpandDim, total_length, total_dims, slot_lens, slot_num, \ key2slot, 1.0, cvm_offset_); \ } \ } break -#define EXPAND_EMBED_PULL_NNCROSS(i, ...) 
\ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PullCopyBaseNNCross><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PullCopyExpandNNCross< \ - boxps::FeaturePullValueGpuPCOC><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuPCOC*>( \ - total_values_gpu), \ - (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, embedx_total_length, \ - total_dims, slot_lens, slot_num, key2slot, pull_embedx_scale_, \ - cvm_offset_); \ - } else if (feature_type_ == static_cast(boxps::FEATURE_QUANT) || \ - feature_type_ == static_cast(boxps::FEATURE_SHOWCLK)) { \ - PullCopyBaseNNCross><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PullCopyExpandNNCross< \ - boxps::FeaturePullValueGpuQuant><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast< \ - boxps::FeaturePullValueGpuQuant*>( \ - total_values_gpu), \ - (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, embedx_total_length, \ - total_dims, slot_lens, slot_num, key2slot, pull_embedx_scale_, \ - cvm_offset_); \ - } else { \ - PullCopyBaseNNCross><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - hidden_size, expand_embed_dim, total_length, gpu_keys, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PullCopyExpandNNCross< \ - boxps::FeaturePullValueGpu><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, embedx_total_length, \ - total_dims, slot_lens, slot_num, key2slot, 1.0, cvm_offset_); \ - } \ - } break - switch (hidden_size - cvm_offset_) { EMBEDX_CASE(8, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_NNCROSS(8); EXPAND_EMBED_PULL_NNCROSS(64);); - EMBEDX_CASE(16, EXPAND_EMBED_PULL_CASE2(0); EXPAND_EMBED_PULL_NNCROSS(64);); - EMBEDX_CASE(32, EXPAND_EMBED_PULL_CASE2(0);); - EMBEDX_CASE(64, EXPAND_EMBED_PULL_CASE2(0);); - EMBEDX_CASE(256, EXPAND_EMBED_PULL_CASE2(0);); - EMBEDX_CASE(128, EXPAND_EMBED_PULL_CASE2(0);); - EMBEDX_CASE(280, EXPAND_EMBED_PULL_CASE2(0);); + EMBEDX_CASE(16, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_NNCROSS(64);); + EMBEDX_CASE(32, EXPAND_EMBED_PULL_CASE(0);); + EMBEDX_CASE(64, EXPAND_EMBED_PULL_CASE(0);); + EMBEDX_CASE(256, EXPAND_EMBED_PULL_CASE(0);); + EMBEDX_CASE(128, EXPAND_EMBED_PULL_CASE(0);); + EMBEDX_CASE(280, EXPAND_EMBED_PULL_CASE(0);); default: PADDLE_THROW(platform::errors::InvalidArgument( "Unsupport this embedding size [%d]", hidden_size - cvm_offset_)); @@ -662,6 +588,73 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +template +void FeaturePushCopy(cudaStream_t stream, void* dest, float** grad_values, + const 
int hidden_size, const int embedx_dim, + const int total_length, const int batch_size, + const int* slot_vector, const int* total_dims, + const int64_t* slot_lens, const int slot_num, + const int* key2slot, const int cvm_offset) { + FeaturePushValueGpuType* push_grad_values = + reinterpret_cast(dest); + // normal + PushCopyBase<<<(total_length + 512 - 1) / 512, 512, + 0, stream>>>( + push_grad_values, grad_values, hidden_size, total_length, batch_size, + slot_vector, total_dims, slot_lens, slot_num, key2slot, cvm_offset); + // normal + int embedx_total_length = total_length * embedx_dim; + PushCopyExpand<<< + (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( + push_grad_values, grad_values, embedx_dim, embedx_total_length, + batch_size, slot_vector, total_dims, slot_lens, slot_num, key2slot, + cvm_offset); +} + +template +void FeaturePushCopyNNCross(cudaStream_t stream, void* dest, + float** grad_values, const int hidden_size, + const int embedx_dim, const int expand_dim, + const int total_length, const int batch_size, + const int* slot_vector, const int* total_dims, + const int64_t* slot_lens, const int slot_num, + const int* key2slot, const int cvm_offset) { + FeaturePushValueGpuType* push_grad_values = + reinterpret_cast(dest); + // nncross + PushCopyBaseNNCross<<<(total_length + 512 - 1) / 512, + 512, 0, stream>>>( + push_grad_values, grad_values, hidden_size, total_length, batch_size, + slot_vector, total_dims, slot_lens, slot_num, key2slot, cvm_offset); + int embedx_total_length = total_length * (embedx_dim + expand_dim); + PushCopyExpandNNCross<<< + (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( + push_grad_values, grad_values, (embedx_dim + expand_dim), embedx_dim, + expand_dim, embedx_total_length, batch_size, slot_vector, total_dims, + slot_lens, slot_num, key2slot, cvm_offset); +} + +template +void FeaturePushCopyShareEmbedding( + cudaStream_t stream, void* dest, float** grad_values, const int hidden_size, + const size_t embedx_dim, const size_t expand_dim, const int total_length, + const int batch_size, const int* slot_vector, const int* total_dims, + const int64_t* slot_lens, const int slot_num, const int* key2slot, + const int cvm_offset) { + FeaturePushValueGpuType* push_grad_values = + reinterpret_cast(dest); + // share embedding + PushCopyBaseShareEmbedding<<< + (total_length + 512 - 1) / 512, 512, 0, stream>>>( + push_grad_values, grad_values, hidden_size, total_length, batch_size, + slot_vector, total_dims, slot_lens, slot_num, key2slot, cvm_offset); + int embedx_total_length = total_length * embedx_dim; + PushCopyExpand<<< + (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( + push_grad_values, grad_values, embedx_dim, embedx_total_length, + batch_size, slot_vector, total_dims, slot_lens, slot_num, key2slot, + cvm_offset); +} void BoxWrapper::CopyForPush(const paddle::platform::Place& place, float** grad_values, void* total_grad_values_gpu, const int* d_slot_vector, const int64_t* slot_lens, @@ -683,134 +676,58 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, "Unsupport this expand embedding size [%d]", expand_embed_dim)); \ } \ } break -#define EXPAND_EMBED_PUSH_CASE(i, ...) 
\ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PushCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, expand_embed_dim, total_length, \ - batch_size, d_slot_vector, total_dims, slot_lens, slot_num, \ - key2slot, cvm_offset_); \ - } else { \ - PushCopy><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, expand_embed_dim, total_length, \ - batch_size, d_slot_vector, total_dims, slot_lens, slot_num, \ - key2slot, cvm_offset_); \ - } \ - } break -#define EXPAND_EMBED_PUSH_CASE2(i, ...) \ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_SHARE_EMBEDDING)) { \ - constexpr size_t SingleEmbedxDim = \ - EmbedxDim / boxps::SHARE_EMBEDDING_NUM; \ - typedef boxps::FeaturePushValueGpuShareEmbedding \ - PushShareEmbedding; \ - PushCopyBaseShareEmbedding<<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast(total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PushCopyExpand<<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast(total_grad_values_gpu), \ - grad_values, EmbedxDim, embedx_total_length, batch_size, \ - d_slot_vector, total_dims, slot_lens, slot_num, key2slot, \ - cvm_offset_); \ - } else if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PushCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PushCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, EmbedxDim, embedx_total_length, batch_size, \ - d_slot_vector, total_dims, slot_lens, slot_num, key2slot, \ - cvm_offset_); \ - } else { \ - PushCopyBase><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * EmbedxDim; \ - PushCopyExpand><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, EmbedxDim, embedx_total_length, batch_size, \ - d_slot_vector, total_dims, slot_lens, slot_num, key2slot, \ - cvm_offset_); \ - } \ +#define EXPAND_EMBED_PUSH_CASE(i, ...) 
\ + case i: { \ + constexpr size_t ExpandDim = i; \ + if (feature_type_ == static_cast(boxps::FEATURE_SHARE_EMBEDDING)) { \ + constexpr size_t SingleEmbedxDim = \ + EmbedxDim / boxps::SHARE_EMBEDDING_NUM; \ + FeaturePushCopyShareEmbedding>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + ExpandDim, total_length, batch_size, d_slot_vector, total_dims, \ + slot_lens, slot_num, key2slot, cvm_offset_); \ + } else if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ + FeaturePushCopy>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + total_length, batch_size, d_slot_vector, total_dims, slot_lens, \ + slot_num, key2slot, cvm_offset_); \ + } else { \ + FeaturePushCopy>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + total_length, batch_size, d_slot_vector, total_dims, slot_lens, \ + slot_num, key2slot, cvm_offset_); \ + } \ } break -#define EXPAND_EMBED_PUSH_NNCROSS(i, ...) \ - case i: { \ - constexpr size_t ExpandDim = i; \ - if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ - PushCopyBaseNNCross><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PushCopyExpandNNCross< \ - boxps::FeaturePushValueGpuPCOC><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast< \ - boxps::FeaturePushValueGpuPCOC*>( \ - total_grad_values_gpu), \ - grad_values, (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, \ - embedx_total_length, batch_size, d_slot_vector, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - } else { \ - PushCopyBaseNNCross><<< \ - (total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, hidden_size, total_length, batch_size, d_slot_vector, \ - total_dims, slot_lens, slot_num, key2slot, cvm_offset_); \ - int embedx_total_length = total_length * (EmbedxDim + ExpandDim); \ - PushCopyExpandNNCross< \ - boxps::FeaturePushValueGpu><<< \ - (embedx_total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - reinterpret_cast*>( \ - total_grad_values_gpu), \ - grad_values, (EmbedxDim + ExpandDim), EmbedxDim, ExpandDim, \ - embedx_total_length, batch_size, d_slot_vector, total_dims, \ - slot_lens, slot_num, key2slot, cvm_offset_); \ - } \ +#define EXPAND_EMBED_PUSH_NNCROSS(i, ...) 
\ + case i: { \ + constexpr size_t ExpandDim = i; \ + if (feature_type_ == static_cast(boxps::FEATURE_PCOC)) { \ + FeaturePushCopyNNCross< \ + boxps::FeaturePushValueGpuPCOC>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + ExpandDim, total_length, batch_size, d_slot_vector, total_dims, \ + slot_lens, slot_num, key2slot, cvm_offset_); \ + } else { \ + FeaturePushCopyNNCross< \ + boxps::FeaturePushValueGpu>( \ + stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \ + ExpandDim, total_length, batch_size, d_slot_vector, total_dims, \ + slot_lens, slot_num, key2slot, cvm_offset_); \ + } \ } break + switch (hidden_size - cvm_offset_) { EMBEDX_CASE(8, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_NNCROSS(8); EXPAND_EMBED_PUSH_NNCROSS(64);); - EMBEDX_CASE(16, EXPAND_EMBED_PUSH_CASE2(0); EXPAND_EMBED_PUSH_NNCROSS(64);); - EMBEDX_CASE(32, EXPAND_EMBED_PUSH_CASE2(0);); - EMBEDX_CASE(64, EXPAND_EMBED_PUSH_CASE2(0);); - EMBEDX_CASE(256, EXPAND_EMBED_PUSH_CASE2(0);); - EMBEDX_CASE(128, EXPAND_EMBED_PUSH_CASE2(0);); - EMBEDX_CASE(280, EXPAND_EMBED_PUSH_CASE2(0);); + EMBEDX_CASE(16, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_NNCROSS(64);); + EMBEDX_CASE(32, EXPAND_EMBED_PUSH_CASE(0);); + EMBEDX_CASE(64, EXPAND_EMBED_PUSH_CASE(0);); + EMBEDX_CASE(256, EXPAND_EMBED_PUSH_CASE(0);); + EMBEDX_CASE(128, EXPAND_EMBED_PUSH_CASE(0);); + EMBEDX_CASE(280, EXPAND_EMBED_PUSH_CASE(0);); default: PADDLE_THROW(platform::errors::InvalidArgument( "Unsupport this embedding size [%d]", hidden_size - cvm_offset_));
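
Note on the first patch: the pointer-array sizing change is the "nncross expand_embedx size bug" fix. With expand embeddings, values.size() (and keys.size()) is not guaranteed to equal slot_num, so the scratch pointer arrays are now sized from the vectors themselves, and the new DCacheBuffer keeps a single grow-only device allocation instead of resizing a LoDTensor every batch. The sketch below is a minimal standalone model of that reuse pattern, assuming raw cudaMalloc/cudaFree in place of memory::AllocShared; ScratchBuffer and its main() are illustrative names only, not part of the patch.

#include <cuda_runtime.h>
#include <cstdio>

// Minimal stand-in for the grow-only device scratch buffer pattern used by
// DCacheBuffer (illustrative only; the real class wraps memory::AllocShared).
class ScratchBuffer {
 public:
  ~ScratchBuffer() {
    if (ptr_ != nullptr) cudaFree(ptr_);
  }
  // Returns a device pointer with at least total_bytes of storage.
  // Reuses the previous allocation when it is already large enough.
  template <typename T>
  T* mutable_data(size_t total_bytes) {
    if (ptr_ == nullptr || bytes_ < total_bytes) {
      if (ptr_ != nullptr) cudaFree(ptr_);
      cudaMalloc(&ptr_, total_bytes);
      bytes_ = total_bytes;
    }
    return reinterpret_cast<T*>(ptr_);
  }
  template <typename T>
  T* data() const { return reinterpret_cast<T*>(ptr_); }
  size_t memory_size() const { return bytes_; }

 private:
  void* ptr_ = nullptr;
  size_t bytes_ = 0;
};

int main() {
  ScratchBuffer buf;
  // Pull phase: one pointer slot per input key tensor. keys.size() rather
  // than slot_num, since expand embeddings can add extra outputs.
  size_t keys_size = 24;
  uint64_t** gpu_keys =
      buf.mutable_data<uint64_t*>(keys_size * sizeof(uint64_t*));
  printf("allocated %zu bytes at %p\n", buf.memory_size(),
         static_cast<void*>(gpu_keys));
  // A second, smaller request reuses the same allocation.
  uint64_t** again = buf.mutable_data<uint64_t*>(8 * sizeof(uint64_t*));
  printf("reused    %zu bytes at %p\n", buf.memory_size(),
         static_cast<void*>(again));
  return 0;
}

Because the buffer only grows, later batches reuse the same allocation, which is the behaviour pull_push_tensor, keys_ptr_tensor and values_ptr_tensor rely on in box_wrapper_impl.h.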
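
Note on the second patch: the per-feature-type kernel launches that used to be spelled out inside the EXPAND_EMBED_* macros are folded into templated helpers (FeaturePullCopy, FeaturePullCopyNNCross, FeaturePushCopy, FeaturePushCopyNNCross, FeaturePushCopyShareEmbedding). Each helper launches a base kernel with one thread per pulled key and an expand kernel with one thread per (key, embedding element) pair, using (N + 512 - 1) / 512 blocks of 512 threads. A minimal sketch of that launch split follows, assuming toy stand-in names (ToyPullValue, ToyCopyBase, ToyCopyExpand, ToyPullCopy) rather than the real boxps value structs and kernels.

#include <cuda_runtime.h>

// Toy pull value: show/clk plus an embedding payload (illustrative only).
template <size_t EmbedDim>
struct ToyPullValue {
  float show;
  float clk;
  float embedx[EmbedDim];
};

// Base copy: one thread per pulled key, copies the leading cvm fields.
template <typename ValueType>
__global__ void ToyCopyBase(float** dst, const ValueType* src,
                            int total_length) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= total_length) return;
  dst[i][0] = src[i].show;
  dst[i][1] = src[i].clk;
}

// Expand copy: one thread per (key, embedding element) pair.
template <typename ValueType>
__global__ void ToyCopyExpand(float** dst, const ValueType* src,
                              int embedx_dim, int embedx_total_length,
                              int cvm_offset) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= embedx_total_length) return;
  int key = idx / embedx_dim;  // which pulled key
  int col = idx % embedx_dim;  // which embedding element
  dst[key][cvm_offset + col] = src[key].embedx[col];
}

// Host helper mirroring the FeaturePullCopy launch pattern: one value type,
// two kernels, grids sized as (N + 512 - 1) / 512 blocks of 512 threads.
template <typename ValueType>
void ToyPullCopy(cudaStream_t stream, float** dst, const ValueType* src,
                 int embedx_dim, int total_length, int cvm_offset) {
  ToyCopyBase<ValueType><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>(
      dst, src, total_length);
  int embedx_total_length = total_length * embedx_dim;
  ToyCopyExpand<ValueType><<<(embedx_total_length + 512 - 1) / 512, 512, 0,
                             stream>>>(dst, src, embedx_dim,
                                       embedx_total_length, cvm_offset);
}

// Explicit instantiation so the sketch compiles as a standalone .cu file.
template void ToyPullCopy<ToyPullValue<8>>(cudaStream_t, float**,
                                           const ToyPullValue<8>*, int, int,
                                           int);

Keeping the value layout as a single template parameter is what lets one helper serve the PCOC, Quant/ShowClk and plain feature layouts that the old EXPAND_EMBED_PULL_CASE2 / EXPAND_EMBED_PUSH_CASE2 macro bodies previously duplicated.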