From c48a9ad56e69a5d27d1b36df8c731c9c32f84d78 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Mon, 17 Jan 2022 13:28:03 +0800
Subject: [PATCH] [Pten] Replace platform::Place to pten::Place. (#38899)

* add pten::Place data structure.
* update ci problem
* fix ci problem
* update
* using platform::Place=pten::Place
* remove BOOST_GET_CONST for CPUPlace and GPUPlace
* compile pass 25%.
* compile pass 45%
* compile pass 60%
* remove boost_get for xpu npu mlu and ipu
* compile pass on cpu and gpu.
* fix compile problem
* fix compile error.
* update
* fix ci problem
* update
* ci approve
* fix ci problem
* fix ci eager test problem
* remove BOOST_GET_CONST
* fix npu compile
---
 .../distributed/fleet_executor/carrier.cc | 4 +-
 .../fluid/distributed/service/brpc_utils.cc | 59 ++-
 .../fluid/distributed/service/heter_client.cc | 3 +-
 .../accumulation/gradient_accumulation.cc | 32 +-
 paddle/fluid/eager/legacy/op_runner.cc | 9 +-
 .../fluid/eager/legacy/prepared_operator.cc | 4 +-
 .../fluid/framework/data_device_transform.cc | 2 +-
 .../framework/details/all_reduce_op_handle.cc | 7 +-
 .../bind_threaded_ssa_graph_executor.cc | 5 +-
 .../fluid/framework/details/bkcl_op_handle.h | 2 +-
 .../framework/details/broadcast_op_handle.cc | 14 +-
 .../details/eager_deletion_op_handle.cc | 8 +-
 .../details/fused_all_reduce_op_handle.cc | 5 +-
 .../framework/details/nan_inf_utils_detail.cc | 2 +-
 .../framework/details/nan_inf_utils_detail.cu | 2 +-
 .../fluid/framework/details/nccl_op_handle.h | 10 +-
 .../fluid/framework/details/op_handle_base.cc | 15 +-
 .../details/parallel_ssa_graph_executor.cc | 2 +-
 .../framework/details/reduce_op_handle.cc | 12 +-
 .../details/scale_loss_grad_op_handle.cc | 9 +-
 .../details/share_tensor_buffer_op_handle.cc | 3 +-
 .../details/sparse_all_reduce_op_handle.cc | 2 +-
 .../framework/details/variable_visitor.cc | 18 +-
 paddle/fluid/framework/dlpack_tensor.cc | 2 +-
 paddle/fluid/framework/dlpack_tensor_test.cc | 3 +-
 paddle/fluid/framework/executor.cc | 29 +-
 paddle/fluid/framework/fleet/box_wrapper.cu | 9 +-
 .../fluid/framework/fleet/box_wrapper_impl.h | 7 +-
 paddle/fluid/framework/fleet/fleet_wrapper.cc | 6 +-
 paddle/fluid/framework/fleet/heter_wrapper.cc | 12 +-
 .../fluid/framework/fleet/ps_gpu_wrapper.cc | 4 +-
 .../fluid/framework/fleet/ps_gpu_wrapper.cu | 9 +-
 paddle/fluid/framework/garbage_collector.cc | 4 +-
 .../fluid/framework/heter_section_worker.cc | 4 +-
 paddle/fluid/framework/heterxpu_trainer.cc | 34 +-
 paddle/fluid/framework/ir/pass.cc | 2 +-
 paddle/fluid/framework/mixed_vector.h | 14 +-
 paddle/fluid/framework/naive_executor.cc | 2 +-
 .../fluid/framework/new_executor/profiler.h | 2 +-
 paddle/fluid/framework/op_kernel_type.cc | 2 +-
 paddle/fluid/framework/op_kernel_type_test.cc | 4 +-
 paddle/fluid/framework/operator.cc | 14 +-
 paddle/fluid/framework/parallel_executor.cc | 29 +-
 paddle/fluid/framework/pull_dense_worker.cc | 6 +-
 paddle/fluid/framework/section_worker.cc | 9 +-
 paddle/fluid/framework/selected_rows.h | 8 +-
 paddle/fluid/framework/tensor_util.cc | 194 +++----
 paddle/fluid/framework/tensor_util.h | 72 +--
 paddle/fluid/imperative/bkcl_context.cc | 4 +-
 .../fluid/imperative/gradient_accumulator.cc | 28 +-
 paddle/fluid/imperative/hccl_context.cc | 20 +-
 paddle/fluid/imperative/layer.cc | 4 +-
 paddle/fluid/imperative/nccl_context.cc | 19 +-
 paddle/fluid/imperative/prepared_operator.cc | 6 +-
 paddle/fluid/imperative/reducer.cc | 2 +-
 paddle/fluid/imperative/tracer.cc | 29 +-
 .../fluid/inference/api/analysis_predictor.cc | 20 +-
 paddle/fluid/inference/api/api_impl.cc | 6 +-
.../inference/api/details/zero_copy_tensor.cc | 6 +- paddle/fluid/inference/lite/tensor_utils.cc | 2 +- .../memory/allocation/allocator_facade.cc | 11 +- .../memory/allocation/best_fit_allocator.h | 7 +- .../fluid/memory/allocation/cuda_allocator.cc | 2 +- .../cuda_device_context_allocator.h | 4 +- .../allocation/cuda_virtual_mem_allocator.cc | 2 +- .../allocation/naive_best_fit_allocator.cc | 11 +- .../fluid/memory/allocation/npu_allocator.cc | 2 +- .../allocation/stream_safe_cuda_allocator.cc | 5 +- .../allocation/stream_safe_cuda_allocator.h | 2 +- .../allocation/thread_local_allocator.cc | 3 +- paddle/fluid/memory/memcpy.cc | 452 ++++++++++++++++++ .../fluid/operators/activation_cudnn_op.cu.cc | 6 - paddle/fluid/operators/allclose_op.cu | 3 +- .../amp/check_finite_and_unscale_op.cu | 9 +- .../amp/check_finite_and_unscale_op_xpu.cc | 12 +- .../operators/amp/update_loss_scaling_op.cu | 10 +- .../amp/update_loss_scaling_op_npu.cc | 4 +- .../amp/update_loss_scaling_op_xpu.cc | 29 +- paddle/fluid/operators/assign_op.cc | 2 - paddle/fluid/operators/assign_op_npu.cc | 2 - paddle/fluid/operators/assign_op_xpu.cc | 2 - paddle/fluid/operators/assign_value_op.cc | 3 - .../fluid/operators/average_accumulates_op.cu | 6 +- paddle/fluid/operators/bernoulli_op.cu | 3 +- paddle/fluid/operators/cholesky_op.cu | 3 +- .../fluid/operators/class_center_sample_op.cu | 5 +- .../fluid/operators/collective/allreduce_op.h | 2 +- .../operators/collective/broadcast_op.cu.cc | 2 +- .../operators/collective/broadcast_op_xpu.cc | 2 +- .../collective/c_allreduce_max_op.cc | 1 - .../collective/c_allreduce_max_op.cu.cc | 2 +- .../collective/c_allreduce_max_op_xpu.cc | 2 +- .../collective/c_allreduce_min_op.cc | 1 - .../collective/c_allreduce_min_op.cu.cc | 2 +- .../collective/c_allreduce_min_op_xpu.cc | 2 +- .../collective/c_allreduce_prod_op.cc | 1 - .../collective/c_allreduce_prod_op.cu.cc | 2 +- .../collective/c_allreduce_prod_op_xpu.cc | 2 +- .../collective/c_allreduce_sum_op.cc | 1 - .../collective/c_allreduce_sum_op.cu.cc | 2 +- .../collective/c_allreduce_sum_op_xpu.cc | 2 +- .../collective/c_comm_init_all_op.cc | 2 +- .../collective/c_comm_init_hccl_op.cc | 4 +- .../operators/collective/c_comm_init_op.cc | 9 +- .../operators/collective/c_reduce_max_op.cc | 1 - .../collective/c_reduce_max_op.cu.cc | 2 +- .../collective/c_reduce_max_op_xpu.cc | 2 +- .../operators/collective/c_reduce_min_op.cc | 1 - .../collective/c_reduce_min_op.cu.cc | 2 +- .../collective/c_reduce_min_op_xpu.cc | 2 +- .../fluid/operators/collective/c_reduce_op.h | 2 +- .../operators/collective/c_reduce_prod_op.cc | 1 - .../collective/c_reduce_prod_op.cu.cc | 2 +- .../collective/c_reduce_prod_op_xpu.cc | 2 +- .../operators/collective/c_reduce_sum_op.cc | 1 - .../collective/c_reduce_sum_op.cu.cc | 2 +- .../collective/c_reduce_sum_op_xpu.cc | 2 +- .../collective/c_sync_calc_stream_op.cc | 2 +- .../collective/c_sync_comm_stream_op.cc | 9 +- .../operators/collective/c_wait_comm_op.cc | 8 +- .../operators/collective/c_wait_compute_op.cc | 5 +- .../fluid/operators/controlflow/compare_op.cc | 5 +- .../operators/controlflow/fetch_v2_op.cc | 2 - paddle/fluid/operators/cudnn_lstm_op.cu.cc | 17 +- paddle/fluid/operators/cumprod_op.cu | 2 +- paddle/fluid/operators/cumprod_op.h | 2 +- .../operators/deformable_psroi_pooling_op.cu | 4 +- .../fluid/operators/dequantize_abs_max_op.cc | 4 +- paddle/fluid/operators/dequantize_log_op.cc | 3 - .../fluid/operators/detail/strided_memcpy.h | 8 +- .../fluid/operators/detection/bbox_util.cu.h | 4 +- 
.../fluid/operators/detection/box_coder_op.cu | 3 +- .../detection/collect_fpn_proposals_op.cu | 2 +- .../detection/distribute_fpn_proposals_op.cu | 2 +- .../detection/generate_proposals_op.cu | 4 +- .../detection/generate_proposals_v2_op.cu | 4 +- .../fluid/operators/detection/yolo_box_op.cu | 2 +- paddle/fluid/operators/dirichlet_op.cu | 3 +- paddle/fluid/operators/distribution_helper.h | 3 +- .../fluid/operators/dlnne/dlnne_engine_op.h | 3 +- paddle/fluid/operators/dropout_impl_util.h | 3 +- paddle/fluid/operators/edit_distance_op.cu | 4 +- .../elementwise/elementwise_add_op.cc | 1 - .../elementwise/elementwise_floordiv_op.cc | 1 - .../elementwise/elementwise_max_op.cc | 1 - .../elementwise/elementwise_min_op.cc | 1 - .../elementwise/elementwise_mod_op.cc | 1 - .../elementwise/elementwise_pow_op.cc | 1 - .../elementwise/elementwise_sub_op.cc | 1 - .../elementwise/elementwise_sub_op.h | 1 + .../mkldnn/elementwise_add_mkldnn_op.cc | 1 - .../mkldnn/elementwise_div_mkldnn_op.cc | 1 - .../mkldnn/elementwise_mul_mkldnn_op.cc | 1 - .../mkldnn/elementwise_sub_mkldnn_op.cc | 1 - .../test_elementwise_op_grad_grad.h | 4 +- paddle/fluid/operators/expand_op_npu.cc | 4 +- paddle/fluid/operators/fake_quantize_op.cu | 4 +- .../operators/fill_diagonal_tensor_op.cu | 5 +- paddle/fluid/operators/flip_op.cu | 2 +- .../operators/fused/fused_dropout_helper.h | 2 +- .../fused_layernorm_residual_dropout_bias.h | 2 +- .../fused/fused_residual_dropout_bias.h | 2 +- .../fusion_transpose_flatten_concat_op.cu.cc | 7 +- paddle/fluid/operators/gather.cu.h | 2 +- paddle/fluid/operators/gaussian_random_op.cu | 6 +- .../fluid/operators/gaussian_random_op_xpu.cc | 5 +- paddle/fluid/operators/gru_op.cu.cc | 2 +- paddle/fluid/operators/gumbel_softmax_op.cu | 3 +- paddle/fluid/operators/hash_op.cc | 3 - paddle/fluid/operators/increment_op.cc | 1 - paddle/fluid/operators/isclose_op.cu | 3 +- paddle/fluid/operators/isfinite_op.cc | 1 - paddle/fluid/operators/isfinite_v2_op.cc | 1 - paddle/fluid/operators/jit/kernel_key.h | 4 +- paddle/fluid/operators/label_smooth_op.cc | 1 - paddle/fluid/operators/lookup_table_op.cu | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 2 +- paddle/fluid/operators/lstsq_op.cu | 10 +- paddle/fluid/operators/lu_op.h | 2 +- .../fluid/operators/masked_select_op_xpu.cc | 3 +- .../fluid/operators/math/concat_and_split.cc | 15 +- .../fluid/operators/math/concat_and_split.cu | 20 +- .../operators/math/eigen_values_vectors.h | 3 +- paddle/fluid/operators/math/math_function.cc | 3 +- paddle/fluid/operators/math/math_function.cu | 3 +- paddle/fluid/operators/math/math_function.h | 5 +- .../fluid/operators/math/matrix_inverse.cu.cc | 15 +- .../fluid/operators/math/matrix_solve.cu.cc | 13 +- .../operators/math/selected_rows_functor.cc | 15 +- .../operators/math/selected_rows_functor.cu | 11 +- paddle/fluid/operators/math/tree2col.cc | 4 +- paddle/fluid/operators/math/tree2col.cu | 4 +- paddle/fluid/operators/matrix_rank_op.cu | 12 +- paddle/fluid/operators/mean_op.cu | 2 +- paddle/fluid/operators/mean_op_xpu.cc | 4 +- paddle/fluid/operators/memcpy_d2h_op.cc | 2 - paddle/fluid/operators/memcpy_h2d_op.cc | 2 - paddle/fluid/operators/memcpy_op.cc | 2 - .../operators/metrics/accuracy_op_xpu.cc | 16 +- paddle/fluid/operators/multiplex_op.cu | 6 +- paddle/fluid/operators/multiplex_op.h | 6 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 9 +- paddle/fluid/operators/partial_concat_op.cu | 6 +- paddle/fluid/operators/partial_sum_op.cu | 8 +- paddle/fluid/operators/poisson_op.cu | 3 +- 
paddle/fluid/operators/prroi_pool_op.cu | 4 +- .../pscore/send_and_recv_op_gpu_test.cc | 6 +- paddle/fluid/operators/psroi_pool_op.cu | 6 +- paddle/fluid/operators/qr_op.cu | 29 +- paddle/fluid/operators/range_op_xpu.cc | 5 +- paddle/fluid/operators/rank_loss_op.cc | 1 - .../fluid/operators/reader/buffered_reader.cc | 46 +- .../reader/create_double_buffer_reader_op.cc | 6 +- .../operators/reduce_ops/frobenius_norm_op.cc | 1 - .../operators/reduce_ops/reduce_all_op.cc | 1 - .../operators/reduce_ops/reduce_any_op.cc | 1 - .../operators/reduce_ops/reduce_prod_op.cc | 1 - .../operators/reduce_ops/reduce_sum_op.cc | 1 - .../operators/reduce_ops/reduce_sum_op.h | 2 +- paddle/fluid/operators/reshape_op.cc | 2 - paddle/fluid/operators/rnn_op.cu.cc | 21 +- paddle/fluid/operators/roi_align_op.cu | 4 +- paddle/fluid/operators/roi_align_op_xpu.cc | 4 +- paddle/fluid/operators/roi_pool_op.cu | 4 +- paddle/fluid/operators/run_program_op.h | 2 +- paddle/fluid/operators/scatter.cu.h | 2 +- paddle/fluid/operators/seed_op.cu | 3 +- paddle/fluid/operators/segment_pool_op.h | 3 +- .../sequence_ops/sequence_expand_op.cu | 3 +- paddle/fluid/operators/set_value_op.cc | 1 - paddle/fluid/operators/split_op.h | 2 - paddle/fluid/operators/split_op_npu.cc | 1 - paddle/fluid/operators/stack_op.cu | 6 +- paddle/fluid/operators/strided_memcpy.h | 6 +- paddle/fluid/operators/sum_op.cu | 7 +- paddle/fluid/operators/svd_op.cu | 6 +- paddle/fluid/operators/tensor_formatter.cc | 2 +- .../operators/tensorrt/tensorrt_engine_op.h | 7 +- .../operators/truncated_gaussian_random_op.cu | 3 +- .../truncated_gaussian_random_op_xpu.cc | 5 +- paddle/fluid/operators/unbind_op.h | 3 - .../operators/uniform_random_inplace_op.cu | 3 +- .../uniform_random_inplace_op_xpu.cc | 8 +- paddle/fluid/operators/uniform_random_op.cu | 3 +- .../fluid/operators/uniform_random_op_xpu.cc | 5 +- paddle/fluid/operators/where_index_op.cu | 8 +- paddle/fluid/operators/where_index_op_xpu.cc | 4 +- paddle/fluid/platform/CMakeLists.txt | 4 +- paddle/fluid/platform/collective_helper.cc | 8 +- paddle/fluid/platform/collective_helper.h | 6 +- .../fluid/platform/device/gpu/nccl_helper.h | 14 +- .../device/mlu/device_context_allocator.h | 3 +- .../fluid/platform/device/npu/hccl_helper.h | 16 +- .../device/npu/npu_collective_helper.cc | 4 +- .../fluid/platform/device/npu/npu_op_runner.h | 5 +- .../fluid/platform/device/npu/npu_stream.cc | 4 +- .../fluid/platform/device/xpu/bkcl_helper.h | 14 +- paddle/fluid/platform/device/xpu/xpu_info.h | 2 +- .../fluid/platform/device/xpu/xpu_op_list.cc | 3 +- paddle/fluid/platform/device_context.cc | 36 +- paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/device_tracer.cc | 3 +- paddle/fluid/platform/place.cc | 59 +-- paddle/fluid/platform/place.h | 362 +++----------- paddle/fluid/platform/place_test.cc | 8 +- paddle/fluid/platform/stream/cuda_stream.cc | 4 +- paddle/fluid/pybind/eager.cc | 25 +- paddle/fluid/pybind/imperative.cc | 20 +- paddle/fluid/pybind/pybind.cc | 27 +- paddle/fluid/pybind/tensor_py.h | 56 +-- paddle/pten/api/include/tensor.h | 7 +- paddle/pten/common/place.cc | 8 +- paddle/pten/common/place.h | 70 ++- paddle/pten/kernels/cpu/copy_kernel.cc | 6 +- paddle/pten/kernels/funcs/transpose.cu | 3 +- paddle/pten/kernels/gpu/copy_kernel.cu | 61 +-- paddle/pten/kernels/gpu/elementwise.h | 3 +- paddle/pten/kernels/xpu/copy_kernel.cc | 18 +- paddle/pten/tests/common/test_place.cc | 31 +- .../tests/unittests/test_egr_python_api.py | 2 +- .../fluid/tests/unittests/test_memcpy_op.py | 2 +- 
.../fluid/tests/unittests/test_var_base.py | 34 +- 282 files changed, 1466 insertions(+), 1613 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 3e198dc3eeea4..56d8da3eca4b5 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -221,8 +221,8 @@ static std::shared_ptr GetGC( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { - gc.reset(new framework::UnsafeFastGPUGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); + gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, + max_memory_size)); } } #endif diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index db55c9ad438a7..4d9f84fdc6e0f 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -106,13 +106,12 @@ void SerializeLodTensor(framework::Variable* var, iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA - char* temp_ptr = - new char[tensor->numel() * framework::SizeOfType(tensor->type())]; + char* temp_ptr = new char[tensor->numel() * + framework::SizeOfType(tensor->type())]; // NOLINT auto stream = reinterpret_cast(ctx).stream(); memory::Copy( - platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(), tensor->numel() * framework::SizeOfType(tensor->type()), stream); auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); @@ -148,13 +147,12 @@ void SerializeSelectedRows(framework::Variable* var, iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA - char* temp_ptr = - new char[tensor->numel() * framework::SizeOfType(tensor->type())]; + char* temp_ptr = new char[tensor->numel() * + framework::SizeOfType(tensor->type())]; // NOLINT auto stream = reinterpret_cast(ctx).stream(); memory::Copy( - platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(), tensor->numel() * framework::SizeOfType(tensor->type()), stream); auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); @@ -204,7 +202,7 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg, } void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, - butil::IOBufBytesIterator& io_buffer_itr, + butil::IOBufBytesIterator& io_buffer_itr, // NOLINT const platform::DeviceContext& ctx) { const auto place = ctx.GetPlace(); framework::LoDTensor* tensor = var->GetMutable(); @@ -229,30 +227,30 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, // IO Buffer if (platform::is_cpu_place(place)) { - unsigned long data_len; - io_buffer_itr.copy_and_forward((void*)(&data_len), 8); + unsigned long data_len; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(tensor_data, data_len); } else if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA - unsigned long data_len; - char* temp_ptr = - new char[tensor->numel() * framework::SizeOfType(tensor->type())]; - 
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); - io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); + unsigned long data_len; // NOLINT + char* temp_ptr = new char[tensor->numel() * + framework::SizeOfType(tensor->type())]; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT + io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, - platform::CPUPlace(), (void*)temp_ptr, - tensor->numel() * framework::SizeOfType(tensor->type()), - stream); + memory::Copy( + place, tensor_data, platform::CPUPlace(), (void*)temp_ptr, // NOLINT + tensor->numel() * framework::SizeOfType(tensor->type()), stream); delete[] temp_ptr; #endif } } -void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, - butil::IOBufBytesIterator& io_buffer_itr, - const platform::DeviceContext& ctx) { +void DeserializeSelectedRows( + framework::Variable* var, const VarMsg& msg, + butil::IOBufBytesIterator& io_buffer_itr, // NOLINT + const platform::DeviceContext& ctx) { const auto place = ctx.GetPlace(); auto* slr = var->GetMutable(); framework::Tensor* tensor = slr->mutable_value(); @@ -269,20 +267,19 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, tensor->mutable_data(place, VarMessageToVarType(msg.data_type())); // IO Buffer if (platform::is_cpu_place(place)) { - unsigned long data_len; - io_buffer_itr.copy_and_forward((void*)(&data_len), 8); + unsigned long data_len; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(tensor_data, data_len); } else if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA - char* temp_ptr = - new char[tensor->numel() * framework::SizeOfType(tensor->type())]; - unsigned long data_len; - io_buffer_itr.copy_and_forward((void*)(&data_len), 8); + char* temp_ptr = new char[tensor->numel() * + framework::SizeOfType(tensor->type())]; // NOLINT + unsigned long data_len; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(temp_ptr, data_len); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, - platform::CPUPlace(), temp_ptr, + memory::Copy(place, tensor_data, platform::CPUPlace(), temp_ptr, tensor->numel() * framework::SizeOfType(tensor->type()), stream); delete[] temp_ptr; diff --git a/paddle/fluid/distributed/service/heter_client.cc b/paddle/fluid/distributed/service/heter_client.cc index 13016d60515dd..95023704f9d51 100644 --- a/paddle/fluid/distributed/service/heter_client.cc +++ b/paddle/fluid/distributed/service/heter_client.cc @@ -44,8 +44,7 @@ int GetMicroId(const platform::DeviceContext& ctx, auto stream = reinterpret_cast(ctx).stream(); memory::Copy( - platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(), tensor->numel() * framework::SizeOfType(tensor->type()), stream); float* temp_ptr_float = reinterpret_cast(temp_ptr); micro_id = static_cast(temp_ptr_float[0]); diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc index 1f66596a0b578..ffd76c5bda621 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -43,7 +43,7 @@ class 
TensorAddFunctor : public boost::static_visitor<> { TensorAddFunctor(int64_t numel, const T* x, T* y) : numel_(numel), x_(x), y_(y) {} - void operator()(const paddle::platform::CPUPlace& place) { + void operator()(const paddle::platform::CPUPlace& place) const { paddle::platform::CPUDeviceContext* ctx = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); @@ -56,7 +56,7 @@ class TensorAddFunctor : public boost::static_visitor<> { // TODO(jiabin): Support xpu here from gradient_accumulator.cc #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void operator()(const paddle::platform::CUDAPlace& place) { + void operator()(const paddle::platform::CUDAPlace& place) const { paddle::platform::CUDADeviceContext* ctx = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); @@ -66,7 +66,7 @@ class TensorAddFunctor : public boost::static_visitor<> { blas.AXPY(numel_, 1., x_, y_); } #else - void operator()(const paddle::platform::CUDAPlace& place) { + void operator()(const paddle::platform::CUDAPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -76,7 +76,7 @@ class TensorAddFunctor : public boost::static_visitor<> { // TODO(jiabin): Support Npu here from gradient_accumulator.cc // there is NO blas in CUDAPinnedPlace - void operator()(const paddle::platform::CUDAPinnedPlace& place) { + void operator()(const paddle::platform::CUDAPinnedPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -84,14 +84,14 @@ class TensorAddFunctor : public boost::static_visitor<> { } #ifdef PADDLE_WITH_ASCEND_CL - void operator()(const paddle::platform::NPUPlace& place) { + void operator()(const paddle::platform::NPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } #else - void operator()(const paddle::platform::NPUPlace& place) { + void operator()(const paddle::platform::NPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -100,14 +100,14 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_XPU - void operator()(const paddle::platform::XPUPlace& place) { + void operator()(const paddle::platform::XPUPlace& place) const { paddle::platform::XPUDeviceContext* ctx = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); xpu::add(ctx->x_context(), x_, y_, y_, static_cast(numel_)); } #else - void operator()(const paddle::platform::XPUPlace& place) { + void operator()(const paddle::platform::XPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -116,14 +116,14 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_MLU - void operator()(const paddle::platform::MLUPlace& place) { + void operator()(const paddle::platform::MLUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } #else - void operator()(const paddle::platform::MLUPlace& place) { + void operator()(const paddle::platform::MLUPlace& place) const { 
PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -132,14 +132,14 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_IPU - void operator()(const paddle::platform::IPUPlace& place) { + void operator()(const paddle::platform::IPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } #else - void operator()(const paddle::platform::IPUPlace& place) { + void operator()(const paddle::platform::IPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -147,7 +147,7 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif - void operator()(const paddle::platform::NPUPinnedPlace& place) { + void operator()(const paddle::platform::NPUPinnedPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -157,7 +157,7 @@ class TensorAddFunctor : public boost::static_visitor<> { private: int64_t numel_; const T* x_; - T* y_; + mutable T* y_; }; template @@ -218,7 +218,7 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { if (data_type == paddle::framework::DataTypeTrait::DataType()) { \ TensorAddFunctor func(numel, src_tensor->data(), \ dst_tensor->mutable_data()); \ - boost::apply_visitor(func, place); \ + paddle::platform::VisitPlace(place, func); \ return; \ } @@ -294,7 +294,7 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { TensorAddFunctor func( \ numel, src_tensor.data(), \ dst_tensor->mutable_data(place)); \ - boost::apply_visitor(func, place); \ + paddle::platform::VisitPlace(place, func); \ return; \ } diff --git a/paddle/fluid/eager/legacy/op_runner.cc b/paddle/fluid/eager/legacy/op_runner.cc index 4dab96c53eca4..305d66d134c36 100644 --- a/paddle/fluid/eager/legacy/op_runner.cc +++ b/paddle/fluid/eager/legacy/op_runner.cc @@ -150,24 +150,21 @@ void RunOp(const std::string& type, const NameTensorMap& ins, VLOG(6) << "Get Device id"; if (paddle::platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::platform::SetDeviceId( - BOOST_GET_CONST(paddle::platform::CUDAPlace, place).device); + paddle::platform::SetDeviceId(place.device); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU if use CUDAPlace.")); #endif } else if (paddle::platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU - paddle::platform::SetXPUDeviceId( - BOOST_GET_CONST(paddle::platform::XPUPlace, place).device); + paddle::platform::SetXPUDeviceId(place.device); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif } else if (paddle::platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL - paddle::platform::SetNPUDeviceId( - BOOST_GET_CONST(paddle::platform::NPUPlace, place).device); + paddle::platform::SetNPUDeviceId(place.device); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU if use NPUPlace.")); diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index fbf2d678740ab..bd7e5c549872d 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ 
b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -116,7 +116,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_XPU - if (is_xpu_place(expected_kernel_key.place_) && + if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()))) { @@ -129,7 +129,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs, #endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && - is_npu_place(expected_kernel_key.place_)) { + paddle::platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "missing NPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index f447a00f37c80..d06f5a0227af7 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -22,7 +22,7 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, << " dst_place: " << dst_place; PADDLE_ENFORCE_NE( - in.place().which(), dst_place.which(), + in.place().GetType(), dst_place.GetType(), platform::errors::Unavailable("Currently, model parallelism is only " "supported between CPU and CUDA.")); diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index f93202769dbd0..633963d1793d3 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -181,7 +182,7 @@ void AllReduceOpHandle::AllReduceFunc( const framework::proto::VarType::Type &dtype, int64_t numel, const std::vector &places, const std::vector &out_var_names) { - if (is_gpu_place(places[0])) { + if (platform::is_gpu_place(places[0])) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::InvalidArgument( @@ -200,7 +201,7 @@ void AllReduceOpHandle::AllReduceFunc( PADDLE_THROW( platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif - } else if (is_xpu_place(places[0])) { + } else if (platform::is_xpu_place(places[0])) { #if defined(PADDLE_WITH_XPU_BKCL) PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_, platform::errors::InvalidArgument( @@ -286,7 +287,7 @@ void AllReduceOpHandle::NCCLAllReduceFunc( void AllReduceOpHandle::SyncNCCLAllReduce() { if (FLAGS_sync_nccl_allreduce) { for (auto &p : places_) { - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device; + int dev_id = p.device; auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_); auto &nccl_ctx = nccl_ctxs->at(dev_id); diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc index 6ce1eac2e30d2..0d8f71a7555ec 100644 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc +++ 
b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc @@ -46,7 +46,7 @@ BindThreadedSSAGraphExecutor::BindThreadedSSAGraphExecutor( } int index = 0; for (uint32_t i = 0; i < places.size(); i++) { - int id = BOOST_GET_CONST(platform::XPUPlace, places_[i]).device; + int id = places_[i].device; if (place_to_index_.find(id) == place_to_index_.end()) { place_to_index_[id] = index; index++; @@ -145,8 +145,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream( RunMultiDeviceOpAsync(cur_op, op_deps.get(), ready_ops); continue; } else { - cur_place = - BOOST_GET_CONST(platform::XPUPlace, dev_ctxes_.begin()->first); + cur_place = dev_ctxes_.begin()->first; int cur_index = place_to_index_[cur_place.device]; RunOpAsyncMainStream(cur_op, op_deps.get(), ready_ops, cur_index); } diff --git a/paddle/fluid/framework/details/bkcl_op_handle.h b/paddle/fluid/framework/details/bkcl_op_handle.h index f863cb123a8af..1a098f06f08f9 100644 --- a/paddle/fluid/framework/details/bkcl_op_handle.h +++ b/paddle/fluid/framework/details/bkcl_op_handle.h @@ -85,7 +85,7 @@ class BKCLOpHandleBase : public OpHandleBase { platform::errors::InvalidArgument( "The argument run_order_ must be >= 0, but got %d.", run_order_)); auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_); - int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + int dev_id = place.device; auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id); auto comm = bkcl_ctx.comm_; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 01dc5a45146f1..e8fa500e094b3 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -83,8 +84,7 @@ void BroadcastOpHandle::BroadcastOneVar( } else if (platform::is_gpu_place(in_tensor.place())) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) VarHandle *out_handle = nullptr; - int root_id = - BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device; + int root_id = in_tensor.place().device; std::vector> broadcast_calls; int type = platform::ToNCCLDataType(in_tensor.type()); @@ -94,8 +94,7 @@ void BroadcastOpHandle::BroadcastOneVar( Variable *out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - int dst_id = - BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place()).device; + int dst_id = out_var_handle->place().device; auto &nccl_ctx = nccl_ctxs_->at(dst_id); @@ -145,7 +144,7 @@ void BroadcastOpHandle::BroadcastOneVar( } else { #if defined(PADDLE_WITH_XPU_BKCL) VarHandle *out_handle = nullptr; - int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device; + int root_id = in_tensor.place().device; std::vector> broadcast_calls; int type = platform::ToBKCLDataType(in_tensor.type()); @@ -155,8 +154,7 @@ void BroadcastOpHandle::BroadcastOneVar( Variable *out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - int dst_id = - BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device; + int dst_id = out_var_handle->place().device; auto &bkcl_ctx = bkcl_ctxs_->at(dst_id); @@ -232,7 +230,7 @@ void BroadcastOpHandle::InitOutputValue( PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound( "Variable %s is not found 
in scopes.", out_var_handle->name())); - if (is_gpu_place(in_tensor.place())) { + if (platform::is_gpu_place(in_tensor.place())) { PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true, platform::errors::PreconditionNotMet( "Places of input and output must be all on GPU.")); diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index bcdd6129230b0..59614e89c1344 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -46,8 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast(gc_)) { - platform::CUDADeviceGuard guard( - BOOST_GET_CONST(platform::CUDAPlace, place).device); + platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); @@ -72,7 +71,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( EagerDeletionOpHandle::~EagerDeletionOpHandle() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (event_) { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); + auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); @@ -85,8 +84,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { void EagerDeletionOpHandle::InitCUDA() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int dev_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device; + int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif } diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index b65d4e4fcd55a..af1b73f40be53 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/device_memory_aligment.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(skip_fused_all_reduce_check, false, ""); @@ -102,7 +103,7 @@ void FusedAllReduceOpHandle::RunImpl() { gpuStream_t compute_stream{nullptr}; if (FLAGS_allreduce_record_one_event) { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]); + auto gpu_place = platform::CUDAPlace(places_[0].GetDeviceId()); compute_stream = platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream(); auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); @@ -291,7 +292,7 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace( var, platform::errors::NotFound( "The variable '%s' is not found in local scope.", var_name)); auto &lod_tensor = var->Get(); - if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) { + if (!platform::is_same_place(lod_tensor.place(), places_.at(scope_idx))) { return true; } } diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index a5787ac39665c..db3eaece3569f 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -354,7 +354,7 @@ void CheckVarHasNanOrInf(const 
std::string& op_type, float* cpu_data = new float[tensor->numel()]; memory::Copy(platform::CPUPlace(), static_cast(cpu_data), - BOOST_GET_CONST(platform::XPUPlace, tensor->place()), + tensor->place(), static_cast(tensor->data()), tensor->numel() * sizeof(float)); bool flag = false; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 8255707654416..bf38a56dc9372 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -132,7 +132,7 @@ void TensorCheckerVisitor::apply( auto* dev_ctx = reinterpret_cast( platform::DeviceContextPool::Instance().Get(tensor_.place())); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, tensor_.place()).device; + int dev_id = tensor_.place().device; PADDLE_ENFORCE_EQ( (dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()), true, platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d", diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 324d39ed8bb77..09372a8ba05b0 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -102,7 +102,7 @@ class NCCLOpHandleBase : public OpHandleBase { } for (auto& p : dev_ctxes_) { - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; + int dev_id = p.first.device; if (inter_events_.find(dev_id) != inter_events_.end()) { continue; } @@ -133,7 +133,7 @@ class NCCLOpHandleBase : public OpHandleBase { platform::errors::InvalidArgument( "The argument run_order_ must be >= 0, but got %d.", run_order_)); auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto& nccl_ctx = flat_nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -181,7 +181,7 @@ class NCCLOpHandleBase : public OpHandleBase { void InterReduce(platform::Place place, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -213,7 +213,7 @@ class NCCLOpHandleBase : public OpHandleBase { PADDLE_ENFORCE_NOT_NULL( nccl_ctxs_, platform::errors::NotFound( "Can't get exter %d nccl contexts.", run_order_)); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -246,7 +246,7 @@ class NCCLOpHandleBase : public OpHandleBase { void InterBroadCast(platform::Place place, void* sendbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index fe21a62efd087..faaeeaeecb11f 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -47,7 +47,7 @@ 
OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { void OpHandleBase::InitCUDA() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (auto &p : dev_ctxes_) { - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; + int dev_id = p.first.device; platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -61,9 +61,7 @@ void OpHandleBase::InitCUDA() { for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); if (out_var_handle) { - int dev_id = - BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place()) - .device; + int dev_id = out_var_handle->place().device; out_var_handle->SetGenerateEvent(events_.at(dev_id)); } } @@ -74,7 +72,7 @@ void OpHandleBase::InitCUDA() { "Operator %s should have only one dev_ctx, but got %d.", Name(), dev_ctxes_.size())); auto &place = dev_ctxes_.begin()->first; - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); if (out_var_handle) { @@ -109,7 +107,7 @@ void OpHandleBase::InitXPU() { platform::errors::InvalidArgument( "%s should have only one dev_ctx.", Name())); auto &place = dev_ctxes_.begin()->first; - int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + int dev_id = place.device; platform::SetXPUDeviceId(dev_id); for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); @@ -309,7 +307,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; + auto dev_id = p.first.device; auto *cuda_dev_ctx = static_cast(p.second); VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id; #ifdef PADDLE_WITH_HIP @@ -332,8 +330,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, } else { auto *ctx = dev_ctxes_.at(p); auto *cuda_ctx = static_cast(ctx); - cuda_ctx->RecordEvent( - events_.at(BOOST_GET_CONST(platform::CUDAPlace, p).device), callback); + cuda_ctx->RecordEvent(events_.at(p.device), callback); } #else callback(); diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 35834fe5d7480..51063f68d4cbd 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -45,7 +45,7 @@ static std::vector> SeparateMultiDevicesGraph( for (auto &op : op_handles) { auto &dev_ctx = op->DeviceContext(); auto &p = dev_ctx.begin()->first; - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device; + int dev_id = p.device; auto &dev_dummys = graphs[dev_id]->Get(kGraphDepVars); graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release()); diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 196f7a3d4a4bf..6493ef540ccbe 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" PADDLE_DEFINE_EXPORTED_bool( @@ -125,7 +126,8 @@ void 
ReduceOpHandle::RunImpl() { // TODO(gongwb): add cpu support if (collective_context.endpoints_.size() <= 1 || - is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) { + platform::is_cpu_place(in_places[0]) || + platform::is_cpu_place(t_out_p)) { GatherLocalSelectedRowsFunctor functor( in_selected_rows, in_places, dev_ctxes_, t_out_p, out_var->GetMutable()); @@ -172,13 +174,13 @@ void ReduceOpHandle::RunImpl() { out_var_handle->place(), pre_in.type()); auto out_p = out_var_handle->place(); - int root_id = BOOST_GET_CONST(platform::CUDAPlace, out_p).device; + int root_id = out_p.device; std::vector> all_reduce_calls; for (size_t i = 0; i < var_scopes.size(); ++i) { auto &p = in_places[i]; auto &lod_tensor = *lod_tensors[i]; - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device; + int dev_id = p.device; auto &nccl_ctx = nccl_ctxs_->at(dev_id); void *buffer = const_cast(lod_tensor.data()); @@ -218,13 +220,13 @@ void ReduceOpHandle::RunImpl() { out_var_handle->place(), pre_in.type()); auto out_p = out_var_handle->place(); - int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device; + int root_id = out_p.device; std::vector> all_reduce_calls; for (size_t i = 0; i < var_scopes.size(); ++i) { auto &p = in_places[i]; auto &lod_tensor = *lod_tensors[i]; - int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device; + int dev_id = p.device; auto &bkcl_ctx = bkcl_ctxs_->at(dev_id); void *buffer = const_cast(lod_tensor.data()); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 1e3cd4f0aa77c..a2f7cc6fcecbf 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -61,8 +61,8 @@ struct ScaleLossGradFunctor { } else if (platform::is_xpu_place(place_)) { #if defined(PADDLE_WITH_XPU) OutT cast_coeff = static_cast(coeff_); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), out_data, - platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_)); + memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff, + SizeOfType(out_dtype_)); VLOG(10) << place_ << "RUN Scale loss grad op"; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -73,9 +73,8 @@ struct ScaleLossGradFunctor { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data, - platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_), - stream); + memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff, + SizeOfType(out_dtype_), stream); VLOG(10) << place_ << "RUN Scale loss grad op"; #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index aa942415fb404..3d6322b8c4179 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -86,8 +86,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( void ShareTensorBufferOpHandle::InitCUDA() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int dev_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device; + int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif } diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc 
b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index ed485ed587c0b..1ab944720f8f4 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -165,7 +165,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { in_numel)); out_numel = (out_numel == 0) ? static_cast(out.numel()) : out_numel; - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false); auto &nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 56c88e9d25a91..8207855501384 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -106,9 +106,12 @@ struct EnforceShapeAndDTypeEQVisitor { void operator()(const LoDTensor& src) { auto& tensor = dst_->Get(); - PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(), - platform::errors::PreconditionNotMet( - "The place type of the two variables is not equal.")); + PADDLE_ENFORCE_EQ( + src.place().GetType(), tensor.place().GetType(), + platform::errors::PreconditionNotMet( + "The place type of the two variables is not equal. The src place " + "is %s, but the dst place is %s", + src.place().DebugString(), tensor.place().DebugString())); PADDLE_ENFORCE_EQ(src.type(), tensor.type(), platform::errors::PreconditionNotMet( "The dtype of the two variables is not equal.")); @@ -127,9 +130,12 @@ struct EnforceShapeAndDTypeEQVisitor { void operator()(const SelectedRows& src) { auto& selected_rows = dst_->Get(); - PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(), - platform::errors::PreconditionNotMet( - "The place type of the two variables is not equal.")); + PADDLE_ENFORCE_EQ( + src.place().GetType(), selected_rows.place().GetType(), + platform::errors::PreconditionNotMet( + "The place type of the two variables is not equal. 
The src place " + "is %s, but the dst place is %s", + src.place().DebugString(), selected_rows.place().DebugString())); PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(), platform::errors::PreconditionNotMet( "The dtype of the two variables is not equal.")); diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 95913664961b3..ef705aae1572b 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -138,7 +138,7 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { // init device, DLDevice type with device_type and device_id auto place = tensor.place(); - t_.device = boost::apply_visitor(internal::DLDeviceVisitor(), place); + t_.device = paddle::platform::VisitPlace(place, internal::DLDeviceVisitor()); // init dtype t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 9b8bdebe706eb..8639caf4dac90 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -63,8 +63,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) { CHECK_EQ(0, dl_tensor.device.device_id); } else if (platform::is_gpu_place(place)) { CHECK_EQ(kDLGPU, dl_tensor.device.device_type); - CHECK_EQ(BOOST_GET_CONST(platform::CUDAPlace, place).device, - dl_tensor.device.device_id); + CHECK_EQ(place.device, dl_tensor.device.device_id); } else if (platform::is_cuda_pinned_place(place)) { CHECK_EQ(kDLCPUPinned, dl_tensor.device.device_type); CHECK_EQ(0, dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index d669f2ab11d6c..5596aba52131b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -72,7 +72,7 @@ Executor::~Executor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_, this); + platform::ClearMKLDNNCache(place_, this); #endif } @@ -443,31 +443,26 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, if (platform::is_gpu_place(place_)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new UnsafeFastGPUGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); + gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); } else { - gc.reset(new DefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); + gc.reset(new DefaultStreamGarbageCollector(place_, max_memory_size)); } #else PADDLE_THROW( platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle")); #endif } else if (platform::is_cpu_place(place_)) { - gc.reset(new CPUGarbageCollector( - BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size)); + gc.reset(new CPUGarbageCollector(place_, max_memory_size)); } else if (platform::is_xpu_place(place_)) { #ifdef PADDLE_WITH_XPU - gc.reset(new XPUGarbageCollector( - BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size)); + gc.reset(new XPUGarbageCollector(place_, max_memory_size)); #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); #endif } else if (platform::is_ipu_place(place_)) { #ifdef PADDLE_WITH_IPU - gc.reset(new IPUGarbageCollector( - BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size)); + gc.reset(new 
IPUGarbageCollector(place_, max_memory_size)); #else PADDLE_THROW( platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle")); @@ -476,16 +471,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #ifdef PADDLE_WITH_ASCEND_CL if (IsFastEagerDeletionModeEnabled()) { VLOG(4) << "Use unsafe fast gc for NPU."; - gc.reset(new NPUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Please set FLAGS_fast_eager_deletion_mode=true to use " "GarbageCollector on NPU.")); // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. VLOG(4) << "Use default stream gc for NPU."; - gc.reset(new NPUDefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size)); } #else PADDLE_THROW( @@ -494,11 +487,9 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, } else if (platform::is_mlu_place(place_)) { #ifdef PADDLE_WITH_MLU if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new MLUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size)); + gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size)); } else { - gc.reset(new MLUDefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size)); + gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size)); } #else PADDLE_THROW( diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 0ef77a0387671..aea479ed0b214 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -137,8 +137,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, const int expand_embed_dim, const int64_t total_length) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); float** gpu_values = reinterpret_cast(buf_value->ptr()); @@ -203,8 +202,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, int total_len) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512), @@ -225,8 +223,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, const int hidden_size, const int expand_embed_dim, const int64_t total_length, const int batch_size) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; for (int i = 1; i < slot_lengths_lod.size(); i++) { diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index f42b0395eaf49..6f7009f4d5143 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -45,7 +45,7 @@ void 
BoxWrapper::PullSparseCase(const paddle::platform::Place& place, } else if (platform::is_gpu_place(place)) { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + int device_id = place.GetDeviceId(); LoDTensor& total_keys_tensor = keys_tensor[device_id]; uint64_t* total_keys = reinterpret_cast( total_keys_tensor.mutable_data({total_length, 1}, place)); @@ -131,7 +131,7 @@ void BoxWrapper::PushSparseGradCase( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + int device_id = place.GetDeviceId(); LoDTensor& cached_total_keys_tensor = keys_tensor[device_id]; uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); @@ -143,8 +143,7 @@ void BoxWrapper::PushSparseGradCase( push_boxps_timer.Start(); int ret = boxps_ptr_->PushSparseGPU( total_keys, reinterpret_cast(total_grad_values_gpu), - static_cast(total_length), - BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId()); + static_cast(total_length), place.GetDeviceId()); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "PushSparseGPU failed in BoxPS.")); push_boxps_timer.Pause(); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index f90027556342d..4fddfca5d805a 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -764,8 +764,7 @@ void FleetWrapper::PushDenseVarsAsync( LoDTensor* pin_tensor = pin_var->GetMutable(); float* pin_g = pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); - memory::Copy(platform::CUDAPinnedPlace(), pin_g, - BOOST_GET_CONST(platform::CUDAPlace, place), g_data, + memory::Copy(platform::CUDAPinnedPlace(), pin_g, place, g_data, sizeof(float) * count, stream); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); @@ -821,8 +820,7 @@ void FleetWrapper::PushDenseVarsAsync( LoDTensor* pin_tensor = pin_var->GetMutable(); float* pin_g = pin_tensor->mutable_data(tensor->dims(), platform::CPUPlace()); - memory::Copy(platform::CPUPlace(), pin_g, - BOOST_GET_CONST(platform::XPUPlace, place), g_data, + memory::Copy(platform::CPUPlace(), pin_g, place, g_data, sizeof(float) * count); float* g = pin_g; diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 66f0d116f2412..5b54aa03bb30a 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -116,14 +116,12 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, tensor->numel() * SizeOfType(tensor->type())); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - memory::Copy(platform::CPUPlace(), data_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), + memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), tensor->data(), tensor->numel() * SizeOfType(tensor->type()), nullptr); #endif #ifdef PADDLE_WITH_XPU - memory::Copy(platform::CPUPlace(), data_ptr, - BOOST_GET_CONST(platform::XPUPlace, tensor->place()), + memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), tensor->data(), tensor->numel() * SizeOfType(tensor->type())); #endif } @@ -158,8 
+156,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->mutable_data(place, ToVarType(req_var.data_type())); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, - platform::CPUPlace(), req_var.data().data(), + memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), tensor->numel() * SizeOfType(tensor->type()), stream); #else memcpy(tensor_data, req_var.data().data(), @@ -197,8 +194,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->mutable_data(place, ToVarType(req_var.data_type())); #ifdef PADDLE_WITH_XPU - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data, - platform::CPUPlace(), req_var.data().data(), + memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), tensor->numel() * SizeOfType(tensor->type())); #else memcpy(tensor_data, req_var.data().data(), diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 2b712d8cc5db8..31a30f72e3aa6 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -791,7 +791,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, "Warning:: CPUPlace is not supported in GpuPs now.")); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; uint64_t* total_keys = reinterpret_cast( @@ -859,7 +859,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); } else if (platform::is_gpu_place(place)) { - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index b4fa09653a391..6a78a617b1fef 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -113,8 +113,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, const int hidden_size, const int64_t total_length) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); float** gpu_values = reinterpret_cast(buf_value->ptr()); @@ -132,8 +131,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, const int64_t* gpu_len, int slot_num, int total_len) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>( origin_keys, total_keys, gpu_len, slot_num, total_len); @@ -148,8 +146,7 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, const int64_t total_length, const int batch_size) { auto stream = 
dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; for (int i = 1; i < slot_lengths_lod.size(); i++) { diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index b2d976fea0476..22f77be850555 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -101,7 +101,7 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, } StreamGarbageCollector::~StreamGarbageCollector() { - auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace()); + auto place = this->dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(place.device); platform::GpuStreamSync(stream_); platform::GpuDestroyStream(stream_); @@ -186,7 +186,7 @@ MLUStreamGarbageCollector::MLUStreamGarbageCollector( } MLUStreamGarbageCollector::~MLUStreamGarbageCollector() { - auto place = BOOST_GET_CONST(platform::MLUPlace, this->dev_ctx_->GetPlace()); + auto place = this->dev_ctx_->GetPlace(); platform::MLUDeviceGuard guard(place.device); PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_)); PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_)); diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 69a4a180a9071..a4e582c8fed13 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -46,8 +46,8 @@ void SetMicroId(paddle::framework::Scope* scope, temp_ptr_float[0] = micro_id; auto stream = reinterpret_cast(*dev_ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, - platform::CPUPlace(), reinterpret_cast(temp_ptr), + memory::Copy(place, tensor_data, platform::CPUPlace(), + reinterpret_cast(temp_ptr), tensor->numel() * framework::SizeOfType(tensor->type()), stream); #endif diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 3ed886e874db0..01430781c64cd 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -117,12 +117,12 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { #ifdef PADDLE_WITH_CUDA auto stream = copy_streams_[num]; auto event = events_[num]; - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + auto dev_id = place.device; platform::CUDADeviceGuard guard(dev_id); #endif #ifdef PADDLE_WITH_XPU - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto dev_id = place.device; platform::XPUDeviceGuard guard(dev_id); #endif @@ -173,13 +173,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, thread_tensor->mutable_data(root_tensor->dims(), thread_place); T* root_ptr = root_tensor->data(); if (platform::is_cpu_place(root_tensor->place())) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, - platform::CPUPlace(), root_ptr, + memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr, sizeof(T) * root_tensor->numel(), stream); } else { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, - BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()), - root_ptr, sizeof(T) * root_tensor->numel(), stream); + memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr, + sizeof(T) * root_tensor->numel(), stream); } } #endif @@ -193,13 
+191,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, thread_tensor->mutable_data(root_tensor->dims(), thread_place); T* root_ptr = root_tensor->data(); if (platform::is_cpu_place(root_tensor->place())) { - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, - platform::CPUPlace(), root_ptr, + memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr, sizeof(T) * root_tensor->numel()); } else { - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, - BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()), - root_ptr, sizeof(T) * root_tensor->numel()); + memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr, + sizeof(T) * root_tensor->numel()); } } #endif @@ -286,7 +282,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { (context->ops_).push_back(local_op_ptr); } #ifdef PADDLE_WITH_CUDA - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + auto dev_id = place.device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); @@ -336,15 +332,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, _ForEachDataType_(MergeCallback); if (!platform::is_cpu_place(thread_tensor->place())) { #ifdef PADDLE_WITH_CUDA - auto dev_id = - BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; + auto dev_id = thread_tensor->place().device; platform::CUDADeviceGuard guard(dev_id); cudaMemset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); #endif #ifdef PADDLE_WITH_XPU auto place = thread_tensor->place(); - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto dev_id = place.device; platform::XPUDeviceGuard guard(dev_id); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -364,15 +359,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, merge_var); if (!platform::is_cpu_place(root_tensor->place())) { #ifdef PADDLE_WITH_CUDA - auto dev_id = - BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; + auto dev_id = root_tensor->place().device; platform::CUDADeviceGuard guard(dev_id); cudaMemset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); #endif #ifdef PADDLE_WITH_XPU auto place = root_tensor->place(); - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto dev_id = place.device; platform::XPUDeviceGuard guard(dev_id); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -442,7 +436,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, (context->ops_).push_back(local_op_ptr); } #ifdef PADDLE_WITH_CUDA - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + auto dev_id = place.device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 1199d251d2a18..2c10a68188eb4 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -67,7 +67,7 @@ Graph *Pass::Apply(Graph *graph) const { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // Passes can change params, tensors, so caching need to be discarded - ClearMKLDNNCache(paddle::platform::CPUPlace()); + platform::ClearMKLDNNCache(paddle::platform::CPUPlace()); #endif VLOG(10) << "finish to apply pass " << Type() << " to graph"; 
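// Illustrative sketch (not part of the diff): the pattern applied throughout the
// files above assumes the unified platform::Place (now an alias of pten::Place)
// exposes the device id directly, so BOOST_GET_CONST(platform::CUDAPlace, place).device
// and friends reduce to plain member access. Names below mirror the patch.
//
//   platform::Place place = platform::CUDAPlace(0);   // typed place converts to Place
//   int dev_id = place.GetDeviceId();                 // no BOOST_GET_CONST needed
//   if (platform::is_gpu_place(place)) {
//     platform::CUDADeviceGuard guard(place.device);  // .device remains accessible
//   }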
return graph; diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index d1aee6cb2f662..0fd67efc177b3 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -32,10 +32,8 @@ namespace framework { inline paddle::optional OptionalCUDAPlace( const paddle::memory::allocation::AllocationPtr &gpu_) { - return gpu_ == nullptr - ? paddle::none - : paddle::optional( - BOOST_GET_CONST(platform::CUDAPlace, gpu_->place())); + return gpu_ == nullptr ? paddle::none + : paddle::optional(gpu_->place()); } // Vector implements the std::vector interface, and can get Data or @@ -369,11 +367,11 @@ class Vector { // get cuda ptr. immutable const T *CUDAData(platform::Place place) const { { + platform::CUDAPlace p(place.GetDeviceId()); auto &mtx = m_.Data().Mutex(); std::lock_guard guard(mtx); auto cuda_place = m_.Data().CUDAPlace(); - if (cuda_place == paddle::none || - cuda_place == BOOST_GET(platform::CUDAPlace, place)) { + if (cuda_place == paddle::none || cuda_place == p) { return m_.Data().CUDAData(place); } } @@ -385,11 +383,11 @@ class Vector { // get cuda ptr. mutable T *CUDAMutableData(platform::Place place) { { + platform::CUDAPlace p(place.GetDeviceId()); auto &mtx = m_.Data().Mutex(); std::lock_guard guard(mtx); auto cuda_place = m_.Data().CUDAPlace(); - if (cuda_place == paddle::none || - cuda_place == BOOST_GET(platform::CUDAPlace, place)) { + if (cuda_place == paddle::none || cuda_place == p) { return m_.MutableData()->CUDAMutableData(place); } } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 9bd6aba3ea842..ece4815858640 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -131,7 +131,7 @@ NaiveExecutor::~NaiveExecutor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_, this); + platform::ClearMKLDNNCache(place_, this); #endif } diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 8df8db35592bb..95eee77d36288 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -43,7 +43,7 @@ class ProfilerGuard { void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); #endif diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc index 4965f7b720c1d..7dac6a092d245 100644 --- a/paddle/fluid/framework/op_kernel_type.cc +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -22,7 +22,7 @@ namespace framework { size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { int cur_loc = 0; - int place = key.place_.which(); + int place = static_cast(key.place_.GetType()); cur_loc += OpKernelType::kPlaceBits; int data_type = static_cast(key.data_type_) << cur_loc; diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 2979750fba792..3879a7957600d 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -27,7 +27,7 @@ TEST(OpKernelType, ToString) { LibraryType::kCUDNN); 
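// Illustrative sketch (not part of the diff) for the op_kernel_type changes nearby:
// the unified Place is assumed to report its kind via GetType() -- an enum value that
// replaces boost::variant::which() in OpKernelType::Hash -- and to print as
// "Place(cpu)" / "Place(gpu:0)", which is what the updated test strings expect.
//
//   platform::Place cpu = platform::CPUPlace();
//   platform::Place gpu = platform::CUDAPlace(0);
//   int place_tag = static_cast<int>(gpu.GetType());  // feeds the kernel-type hash
//   std::string s = gpu.DebugString();                 // "Place(gpu:0)" (assumed format)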
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), - "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type[" + "data_type[float]:data_layout[NCHW]:place[Place(cpu)]:library_type[" "CUDNN]"); using CUDAPlace = paddle::platform::CUDAPlace; @@ -35,7 +35,7 @@ TEST(OpKernelType, ToString) { LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" - "CUDAPlace(0)]:library_" + "Place(gpu:0)]:library_" "type[CUDNN]"); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ff12edb72c06a..e3f0fbbdfdc4a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -210,7 +210,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with CUDA support.", place)); #else - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + auto dev_id = place.device; platform::SetDeviceId(dev_id); #endif } else if (platform::is_xpu_place(place)) { @@ -220,7 +220,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with XPU support.", place)); #else - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto dev_id = place.device; platform::SetXPUDeviceId(dev_id); #endif } else if (platform::is_npu_place(place)) { @@ -230,7 +230,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with NPU support.", place)); #else - auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + auto dev_id = place.device; platform::SetNPUDeviceId(dev_id); #endif } else if (platform::is_mlu_place(place)) { @@ -240,7 +240,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with MLU support.", place)); #else - auto dev_id = BOOST_GET_CONST(platform::MLUPlace, place).device; + auto dev_id = place.device; platform::SetMLUDeviceId(dev_id); #endif } @@ -1330,7 +1330,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { } #endif #ifdef PADDLE_WITH_XPU - if (is_xpu_place(expected_kernel_key.place_) && + if (platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || paddle::platform::is_in_xpu_black_list(type_))) { @@ -1343,7 +1343,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && - is_npu_place(expected_kernel_key.place_)) { + platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "missing NPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; @@ -1353,7 +1353,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_MLU if (kernel_iter == kernels.end() && - is_mlu_place(expected_kernel_key.place_)) { + platform::is_mlu_place(expected_kernel_key.place_)) { VLOG(3) << "missing MLU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 9a38a2d5d6fe8..d6c1c4cb6acc0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -500,11 +500,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { if 
(platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new UnsafeFastGPUGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); + gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size)); } else { - gc.reset(new StreamGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); + gc.reset(new StreamGarbageCollector(place, max_memory_size)); } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; #else @@ -515,11 +513,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } else if (platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new MLUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size)); + gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size)); } else { - gc.reset(new MLUStreamGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size)); + gc.reset(new MLUStreamGarbageCollector(place, max_memory_size)); } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; #else @@ -529,8 +525,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { #endif } else if (platform::is_xpu_place(place)) { #if defined(PADDLE_WITH_XPU) - gc.reset(new XPUGarbageCollector( - BOOST_GET_CONST(platform::XPUPlace, place), max_memory_size)); + gc.reset(new XPUGarbageCollector(place, max_memory_size)); VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -538,8 +533,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (platform::is_cpu_place(place)) { - gc.reset(new CPUGarbageCollector( - BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size)); + gc.reset(new CPUGarbageCollector(place, max_memory_size)); VLOG(10) << "Created GarbageCollector at " << place; } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -609,10 +603,9 @@ void InitP2P(const std::vector &places) { std::vector devices; for (int i = 0; i < count; i++) { - if (!is_gpu_place(places[i])) return; + if (!platform::is_gpu_place(places[i])) return; - platform::CUDAPlace device = - BOOST_GET_CONST(platform::CUDAPlace, places[i]); + platform::CUDAPlace device = places[i]; devices.push_back(device.GetDeviceId()); } @@ -655,9 +648,9 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { - PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), - platform::errors::Unavailable( - "NPU is not supported in ParallelExecutor")); + PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]), + true, platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor.")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index b13aaadc81661..62d6ba0973547 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -135,13 +135,11 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { LoDTensor* tensor = var->GetMutable(); float* w = 
tensor->data(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, - platform::CUDAPinnedPlace(), pin_w, + memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), pin_w, sizeof(float) * tensor->numel(), copy_streams_[i]); #endif #ifdef PADDLE_WITH_XPU - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w, - platform::CPUPlace(), pin_w, + memory::Copy(places_[i], w, platform::CPUPlace(), pin_w, sizeof(float) * tensor->numel()); #endif } diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 64d8332e22327..1f821720d64d2 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -224,23 +224,20 @@ void SectionWorker::TrainFiles() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new UnsafeFastGPUGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); + gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); } } #elif defined(PADDLE_WITH_ASCEND_CL) if (IsFastEagerDeletionModeEnabled()) { VLOG(4) << "Use unsafe fast gc for NPU."; - gc.reset(new NPUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Please set FLAGS_fast_eager_deletion_mode=true to use " "GarbageCollector on NPU.")); // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. VLOG(4) << "Use default stream gc for NPU."; - gc.reset(new NPUDefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size)); } #endif } // max_memory_size >= 0 diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 3e4beb9498cf7..3634ccca95126 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -25,13 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace platform { -class DeviceContext; -class Place; -} // namespace platform -} // namespace paddle +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 84334417dc7da..4298b159ead52 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -76,34 +76,28 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, #endif if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_IPU else if (platform::is_ipu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && platform::is_ipu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_ipu_place(src_place) && platform::is_ipu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && platform::is_xpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_xpu_place(src_place) && platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -111,8 +105,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, << dst_place; return; } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else { PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -124,9 +117,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, platform::is_cpu_place(dst_place)) { auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, - stream); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { @@ -136,13 +127,11 @@ void TensorCopyImpl(const TENSOR& src, const 
platform::Place& dst_place, npu_pinned_tensor.Resize(src.dims()); auto npu_pinned_ptr = npu_pinned_tensor.mutable_data(npu_pinned_place, src.type()); - memory::Copy(npu_pinned_place, npu_pinned_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); // 2. async copy npu pinned tensor -> npu tensor memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - npu_pinned_place, npu_pinned_ptr, size, + dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size, reinterpret_cast(ctx).stream()); // 3. record event @@ -165,22 +154,19 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, - stream); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); } else if (platform::is_npu_pinned_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { /* npu_pinned->npu */ - auto src_npu_pinned_place = - BOOST_GET_CONST(platform::NPUPinnedPlace, src_place); - auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, dst_place); + auto src_npu_pinned_place = src_place; + auto dst_npu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Device context place mismatch. When copying Tensor " "data from NPU Pinned memory to NPU memory, current " "device context place should be NPU.")); - auto ctx_npu_place = BOOST_GET_CONST(platform::NPUPlace, ctx_place); + auto ctx_npu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_npu_place, ctx_npu_place, platform::errors::PreconditionNotMet( "The target NPU device and current device context do " @@ -194,16 +180,15 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_npu_place(src_place) && // NOLINT platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */ - auto src_npu_place = BOOST_GET_CONST(platform::NPUPlace, src_place); - auto dst_npu_pinned_place = - BOOST_GET_CONST(platform::NPUPinnedPlace, dst_place); + auto src_npu_place = src_place; + auto dst_npu_pinned_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Device context place mismatch. 
When copying Tensor " "data from NPU memory to NPU Pinned memory, current " "device context place should be NPU.")); - auto ctx_npu_place = BOOST_GET_CONST(platform::NPUPlace, ctx_place); + auto ctx_npu_place = ctx_place; PADDLE_ENFORCE_EQ(src_place, ctx_npu_place, platform::errors::PreconditionNotMet( "The source NPU device and current device context do " @@ -223,32 +208,27 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cpu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, platform::errors::Unavailable( "Source place and context place do not match, source " @@ -260,15 +240,15 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_cpu_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, platform::errors::Unavailable( "Destination place and context place do not match, " @@ -280,16 +260,15 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_cuda_pinned_place = - BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cuda_pinned_place = dst_place; auto ctx_place = ctx.GetPlace(); 
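// Illustrative sketch (not part of the diff) of the copy-path simplification in
// tensor_util.cc: memory::Copy is assumed to accept the unified Place for both
// endpoints, and a Place carrying a GPU id still converts to platform::CUDAPlace
// where a typed place is required, so the device-match checks below keep working.
//
//   void CopyGpuToHost(const platform::Place& src_place, const void* src_ptr,
//                      void* dst_ptr, size_t size) {
//     platform::CUDAPlace src_gpu = src_place;               // Place -> CUDAPlace
//     memory::Copy(platform::CPUPlace(), dst_ptr, src_gpu,   // dst/src passed directly
//                  src_ptr, size, /*stream=*/nullptr);
//   }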
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Device context place mismatch. When copying Tensor " "data from GPU memory to CUDA Pinned memory, current " "device context place should be GPU.")); - auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, platform::errors::PreconditionNotMet( "The source GPU device and current device context do " @@ -303,16 +282,15 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_cuda_pinned_place = - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_cuda_pinned_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Device context place mismatch. When copying Tensor " "data from CUDA Pinned memory to GPU memory, current " "device context place should be GPU.")); - auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, platform::errors::PreconditionNotMet( "The target GPU device and current device context do " @@ -326,8 +304,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx_place), true, @@ -362,24 +340,24 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto src_mlu_place = BOOST_GET_CONST(platform::MLUPlace, src_place); - auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); + auto src_mlu_place = src_place; + auto dst_cpu_place = dst_place; auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_mlu_place(dst_place)) { - auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); - auto dst_mlu_place = BOOST_GET_CONST(platform::MLUPlace, dst_place); + auto src_cpu_place = src_place; + auto dst_mlu_place = dst_place; auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (platform::is_mlu_place(src_place) && // NOLINT platform::is_mlu_place(dst_place)) { - auto src_mlu_place = BOOST_GET_CONST(platform::MLUPlace, src_place); - auto dst_mlu_place = BOOST_GET_CONST(platform::MLUPlace, dst_place); + auto src_mlu_place = src_place; + auto dst_mlu_place = dst_place; auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream); @@ -451,18 +429,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && 
platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_IPU else if (platform::is_ipu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_ipu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -471,13 +446,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { @@ -486,12 +459,9 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); - platform::XPUPlace xpu_dst_place = - BOOST_GET_CONST(platform::XPUPlace, dst_place); - platform::XPUPlace xpu_src_place = - BOOST_GET_CONST(platform::XPUPlace, src_place); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + platform::XPUPlace xpu_dst_place = dst_place; + platform::XPUPlace xpu_src_place = src_place; if (xpu_dst_place.device == xpu_src_place.device) { auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place); xpu_ctx->Wait(); @@ -505,15 +475,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { /* cpu -> npu*/ - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_npu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { /* npu -> npu*/ @@ -522,9 +488,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } - 
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( @@ -534,50 +498,42 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cpu_place = dst_place; memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_cpu_place = src_place; + auto dst_gpu_place = dst_place; memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_gpu_place = dst_place; memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_pinned_place = - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_pinned_place = src_place; + auto dst_gpu_place = dst_place; memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr); } @@ -589,15 +545,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), 
dst_ptr, - BOOST_GET_CONST(platform::MLUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_mlu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_mlu_place(src_place) && // NOLINT platform::is_mlu_place(dst_place)) { @@ -606,9 +558,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } - memory::Copy(BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::MLUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( @@ -1015,8 +965,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - BOOST_GET_CONST(platform::CUDAPlace, tensor.place()), + memory::Copy(cpu, buf.get(), tensor.place(), reinterpret_cast(data), size_to_write, gpu_dev_ctx.stream()); gpu_dev_ctx.Wait(); @@ -1038,8 +987,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - BOOST_GET_CONST(platform::XPUPlace, tensor.place()), + memory::Copy(cpu, buf.get(), tensor.place(), reinterpret_cast(data), size_to_write); xpu_dev_ctx.Wait(); os.write(buf.get(), size_to_write); @@ -1060,8 +1008,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - BOOST_GET_CONST(platform::MLUPlace, tensor.place()), + memory::Copy(cpu, buf.get(), tensor.place(), reinterpret_cast(data), size_to_write, mlu_dev_ctx.stream()); mlu_dev_ctx.Wait(); @@ -1083,8 +1030,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - BOOST_GET_CONST(platform::NPUPlace, tensor.place()), + memory::Copy(cpu, buf.get(), tensor.place(), reinterpret_cast(data), size_to_write, npu_dev_ctx.stream()); npu_dev_ctx.Wait(); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 355be39baa2a5..3cb3c733f4042 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -153,14 +153,12 @@ void TensorFromArray(const T* src, const size_t& array_size, auto size = array_size * sizeof(T); if (platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, - src_ptr, size, + dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -176,8 +174,7 @@ void 
TensorFromArray(const T* src, const size_t& array_size, // 2. async copy npu pinned tensor -> npu tensor memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - npu_pinned_place, npu_pinned_ptr, size, + dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size, reinterpret_cast(ctx).stream()); // 3. record event @@ -205,14 +202,12 @@ void TensorFromVector(const std::vector& src, auto size = src.size() * sizeof(T); if (platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, - src_ptr, size, + dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -233,8 +228,7 @@ void TensorFromVector(const std::vector& src, // 2. async copy npu pinned tensor -> npu tensor memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - npu_pinned_place, npu_pinned_ptr, size, + dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size, reinterpret_cast(ctx).stream()); // 3. record event @@ -252,8 +246,7 @@ void TensorFromVector(const std::vector& src, #ifdef PADDLE_WITH_MLU if (platform::is_mlu_place(dst_place)) { memory::Copy( - BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, src_place, - src_ptr, size, + dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -280,14 +273,12 @@ inline void TensorFromVector(const std::vector& src, auto size = src.size() * sizeof(bool); if (platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, - src_ptr, size, + dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -303,8 +294,7 @@ inline void TensorFromVector(const std::vector& src, // 2. async copy npu pinned tensor -> npu tensor memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - npu_pinned_place, npu_pinned_ptr, size, + dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size, reinterpret_cast(ctx).stream()); // 3. 
record event @@ -362,37 +352,29 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, auto dst_ptr = static_cast(dst->data()); if (platform::is_cpu_place(src.place())) { - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), - src_ptr, size, + dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) else if (platform::is_xpu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, - size, nullptr); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), - src_ptr, size, + dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -412,37 +394,29 @@ inline void TensorToVector(const Tensor& src, auto dst_ptr = static_cast(array); if (platform::is_cpu_place(src.place())) { - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), - src_ptr, size, + dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) else if (platform::is_xpu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, - size, nullptr); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), - src_ptr, size, + dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -467,8 +441,7 @@ void TensorToVector(const Tensor& src, std::vector* dst) { "The input tensor should be CPU device, but actually it is in %s.", src.place())); - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } template <> @@ -488,8 +461,7 @@ inline void TensorToVector(const Tensor& src, std::vector* dst) { "The input tensor should be CPU device, but actually it is in %s.", src.place())); - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src.place()), 
src_ptr, size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); for (unsigned int i = 0; i < src.numel(); i++) { (*dst)[i] = static_cast(array[i]); diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 2072c41673aaf..f08dd59e39206 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -86,7 +86,7 @@ void BKCLParallelContext::Init() { } BcastBKCLId(bkcl_ids, 0); - int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + int xpu_id = place_.device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id @@ -111,7 +111,7 @@ void BKCLParallelContext::InitWithRingID(int ring_id) { } BcastBKCLId(bkcl_ids, 0); - int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + int xpu_id = place_.device; VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id << " ring id: " << ring_id; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index d98609273a61f..2056b8622052b 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -78,7 +78,7 @@ class TensorAddFunctor : public boost::static_visitor<> { TensorAddFunctor(int64_t numel, const T* x, T* y) : numel_(numel), x_(x), y_(y) {} - void operator()(const platform::CPUPlace& place) { + void operator()(const platform::CPUPlace& place) const { platform::CPUDeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); auto blas = operators::math::GetBlas(*ctx); @@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> { } #ifdef PADDLE_WITH_XPU - void operator()(const platform::XPUPlace& place) { + void operator()(const platform::XPUPlace& place) const { using XPUType = typename XPUTypeTrait::Type; platform::XPUDeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -100,7 +100,7 @@ class TensorAddFunctor : public boost::static_visitor<> { r, XPUAPIErrorMsg[r])); } #else - void operator()(const platform::XPUPlace& place) { + void operator()(const platform::XPUPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -109,7 +109,7 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void operator()(const platform::CUDAPlace& place) { + void operator()(const platform::CUDAPlace& place) const { platform::CUDADeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -117,7 +117,7 @@ class TensorAddFunctor : public boost::static_visitor<> { blas.AXPY(numel_, 1., x_, y_); } #else - void operator()(const platform::CUDAPlace& place) { + void operator()(const platform::CUDAPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -126,7 +126,7 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_MLU - void operator()(const platform::MLUPlace& place) { + void operator()(const platform::MLUPlace& place) const { // TODO(fwg): SUPPORT it PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place 
(%s) " @@ -134,7 +134,7 @@ class TensorAddFunctor : public boost::static_visitor<> { place)); } #else - void operator()(const platform::MLUPlace& place) { + void operator()(const platform::MLUPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -143,7 +143,7 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_ASCEND_CL - void operator()(const platform::NPUPlace& place) { + void operator()(const platform::NPUPlace& place) const { // TODO(zhiqiu): SUPPORT it PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " @@ -151,7 +151,7 @@ class TensorAddFunctor : public boost::static_visitor<> { place)); } #else - void operator()(const platform::NPUPlace& place) { + void operator()(const platform::NPUPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -159,21 +159,21 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif - void operator()(const platform::NPUPinnedPlace& place) { + void operator()(const platform::NPUPinnedPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } // there is NO blas in CUDAPinnedPlace - void operator()(const platform::CUDAPinnedPlace& place) { + void operator()(const platform::CUDAPinnedPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } // there is NO support in IPUPlace - void operator()(const platform::IPUPlace& place) { + void operator()(const platform::IPUPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -183,7 +183,7 @@ class TensorAddFunctor : public boost::static_visitor<> { private: int64_t numel_; const T* x_; - T* y_; + mutable T* y_; }; #ifdef PADDLE_WITH_XPU @@ -248,7 +248,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { TensorAddFunctor func( \ numel, src_tensor.data(), \ dst_tensor->mutable_data(place)); \ - boost::apply_visitor(func, place); \ + platform::VisitPlace(place, func); \ return; \ } diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 818b2f424b6af..7292c0f82fced 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -86,7 +86,7 @@ void HCCLParallelContext::Init() { } BcastHCCLId(hccl_ids, 0, server_fd); - int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device; + int npu_id = place_.device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { VLOG(0) << "init hccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id @@ -96,10 +96,10 @@ void HCCLParallelContext::Init() { &hccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, npu_id, ring_id); - compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::NPUPlace, place_).device)); - comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::NPUPlace, place_).device)); + compute_events_.emplace_back( + platform::NpuEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + 
platform::NpuEventResourcePool::Instance().New(place_.device)); } } @@ -117,7 +117,7 @@ void HCCLParallelContext::InitWithRingID(int ring_id) { } BcastHCCLId(hccl_ids, 0, server_fd); - int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device; + int npu_id = place_.device; VLOG(0) << "init hccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id << " ring id: " << ring_id; @@ -125,10 +125,10 @@ void HCCLParallelContext::InitWithRingID(int ring_id) { platform::HCCLCommContext::Instance().CreateHCCLComm( &hccl_ids[0], strategy_.nranks_, strategy_.local_rank_, npu_id, ring_id); - compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::NPUPlace, place_).device)); - comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::NPUPlace, place_).device)); + compute_events_.emplace_back( + platform::NpuEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::NpuEventResourcePool::Instance().New(place_.device)); } void HCCLParallelContext::AllReduceByStream(const framework::Variable &src, diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index cc7fcf455a13d..d2c63d5b21008 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -193,7 +193,7 @@ void VarBase::ClearGradient(bool set_to_zero) { grad_var_->MutableVar()->GetMutable(); if (grad_t->mutable_value()->IsInitialized()) { #ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place()); + if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place()); #endif grad_t->mutable_rows()->clear(); grad_t->mutable_value()->clear(); @@ -211,7 +211,7 @@ void VarBase::ClearGradient(bool set_to_zero) { grad_t->clear(); } #ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place()); + if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place()); #endif } } diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 1b50c515635d2..066d0db134817 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -77,7 +77,7 @@ void NCCLParallelContext::Init() { } BcastNCCLId(nccl_ids, 0, server_fd); - int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + int gpu_id = place_.device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id @@ -88,10 +88,9 @@ void NCCLParallelContext::Init() { ring_id); compute_events_.emplace_back( - platform::CudaEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)); - comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + platform::CudaEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::CudaEventResourcePool::Instance().New(place_.device)); } } @@ -111,7 +110,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { } BcastNCCLId(nccl_ids, 0, server_fd); - int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + int gpu_id = place_.device; VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; @@ -119,10 +118,10 @@ void 
NCCLParallelContext::InitWithRingID(int ring_id) { platform::NCCLCommContext::Instance().CreateComm( &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); - compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)); - comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + compute_events_.emplace_back( + platform::CudaEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::CudaEventResourcePool::Instance().New(place_.device)); } void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 15a278c2e6464..6474f3c07fa16 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -194,7 +194,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_XPU - if (is_xpu_place(expected_kernel_key.place_) && + if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()))) { @@ -207,7 +207,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && - is_npu_place(expected_kernel_key.place_)) { + paddle::platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "missing NPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; @@ -217,7 +217,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_MLU if (kernel_iter == kernels.end() && - is_mlu_place(expected_kernel_key.place_)) { + paddle::platform::is_mlu_place(expected_kernel_key.place_)) { VLOG(3) << "missing MLU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index beddbd5d12008..0c9bedf3dca32 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -835,7 +835,7 @@ void Reducer::MarkGroupReady(size_t group_index) { // thrown in comm_pool_. 
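For readers tracking the mechanical change here: pten::Place carries both the allocation type and the device index, which is why the BOOST_GET_CONST casts collapse to direct member access such as place_.device. A minimal illustrative sketch (not part of the patch; constructors and accessors follow the memcpy.cc changes later in this diff):

#include "paddle/pten/common/place.h"

int main() {
  // A device-less place and a device-bound place.
  pten::Place cpu(pten::AllocationType::CPU);
  pten::Place gpu0(pten::AllocationType::GPU, /*device_id=*/0);

  // The device index lives on the place itself, so the old
  // BOOST_GET_CONST(platform::CUDAPlace, place).device pattern becomes:
  int dev_id = gpu0.device;  // also exposed as gpu0.GetDeviceId()
  bool is_gpu = gpu0.GetType() == pten::AllocationType::GPU;

  return (dev_id == 0 && is_gpu &&
          cpu.GetType() == pten::AllocationType::CPU) ? 0 : 1;
}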
auto next_group = next_group_; comm_pool_->enqueue([this, run_order, next_group, &group] { - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + auto dev_id = place_.device; platform::SetXPUDeviceId(dev_id); FusedAllReduceSchedule(run_order, group, next_group); { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7ed9f08906a73..f4e535de108a6 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -87,8 +87,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( std::unique_ptr gc; if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gc.reset(new framework::DefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place), 0)); + gc.reset(new framework::DefaultStreamGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else @@ -98,8 +97,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( #endif } else if (platform::is_cuda_pinned_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gc.reset(new framework::CUDAPinnedGarbageCollector( - BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0)); + gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else @@ -110,8 +108,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( #endif } else if (platform::is_xpu_place(place)) { #if defined(PADDLE_WITH_XPU) - gc.reset(new framework::XPUGarbageCollector( - BOOST_GET_CONST(platform::XPUPlace, place), 0)); + gc.reset(new framework::XPUGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -119,14 +116,12 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (platform::is_cpu_place(place)) { - gc.reset(new framework::CPUGarbageCollector( - BOOST_GET_CONST(platform::CPUPlace, place), 0)); + gc.reset(new framework::CPUGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; } else if (platform::is_npu_place(place)) { #if defined(PADDLE_WITH_ASCEND_CL) // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. 
- gc.reset(new framework::NPUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place), 0)); + gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -135,8 +130,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( #endif } else if (platform::is_mlu_place(place)) { #if defined(PADDLE_WITH_MLU) - gc.reset(new framework::MLUDefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place), 0)); + gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -197,31 +191,28 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, try { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device); + platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU if use CUDAPlace.")); #endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU - platform::SetXPUDeviceId( - BOOST_GET_CONST(platform::XPUPlace, place).device); + platform::SetXPUDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif } else if (platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL - platform::SetNPUDeviceId( - BOOST_GET_CONST(platform::NPUPlace, place).device); + platform::SetNPUDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU if use NPUPlace.")); #endif } else if (platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU - platform::SetMLUDeviceId( - BOOST_GET_CONST(platform::MLUPlace, place).device); + platform::SetMLUDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with MLU if use MLUPlace.")); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d4b680288e347..a86329a2b2b25 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -127,7 +127,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto dst_gpu_place = place; memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), pt.data.data(), pt.data.length(), dev_ctx->stream()); @@ -137,7 +137,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, #endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU - auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place); + auto dst_xpu_place = place; memory::Copy(dst_xpu_place, static_cast(input_ptr), platform::CPUPlace(), pt.data.data(), pt.data.length()); #else @@ -954,14 +954,14 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( // model. 
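The predictor and tracer code in this area now branches on the generic place and reads the device id straight from it instead of casting to a concrete place type. A caller-side sketch of that pattern, assuming the platform::is_*_place helpers used throughout this patch and the platform place header added in best_fit_allocator.h below:

#include <string>
#include "paddle/fluid/platform/place.h"

// Map a generic place to a short human-readable tag, reading the device id
// directly from the place instead of casting to CUDAPlace/XPUPlace/NPUPlace.
std::string DescribePlace(const paddle::platform::Place& place) {
  if (paddle::platform::is_cpu_place(place)) {
    return "cpu";
  } else if (paddle::platform::is_gpu_place(place)) {
    return "gpu:" + std::to_string(static_cast<int>(place.GetDeviceId()));
  } else if (paddle::platform::is_xpu_place(place)) {
    return "xpu:" + std::to_string(static_cast<int>(place.GetDeviceId()));
  } else if (paddle::platform::is_npu_place(place)) {
    return "npu:" + std::to_string(static_cast<int>(place.GetDeviceId()));
  }
  return "other";
}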
res->SetPlace(PaddlePlace::kCPU); } else { - auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + auto xpu_place = place_; res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); } } else if (platform::is_npu_place(place_)) { - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); + auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); } else { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); + auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } return res; @@ -993,14 +993,14 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( // model. res->SetPlace(PaddlePlace::kCPU); } else { - auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + auto xpu_place = place_; res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); } } else if (platform::is_npu_place(place_)) { - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); + auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); } else { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); + auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } return res; @@ -1050,7 +1050,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (stream != nullptr) { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_); + auto gpu_place = place_; auto *dev_ctx = reinterpret_cast( pool.Get(gpu_place)); dev_ctx->SetThreadLocalStream(stream); @@ -1065,7 +1065,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_); + auto gpu_place = place_; auto *dev_ctx = static_cast( pool.Get(gpu_place)); #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c1a0cb4be4429..d1f49b84f0679 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -243,7 +243,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); + auto dst_gpu_place = place_; memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length(), dev_ctx->stream()); @@ -253,7 +253,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, #endif } else if (platform::is_xpu_place(place_)) { #ifdef PADDLE_WITH_XPU - auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + auto dst_xpu_place = place_; memory::Copy(dst_xpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length()); @@ -267,7 +267,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); - auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); + auto dst_npu_place = place_; memory::Copy(dst_npu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length(), dev_ctx->stream()); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc 
b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 2f2f4c0ead760..13b07a8e8fb7b 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -253,7 +253,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, t_place); + auto gpu_place = t_place; auto *dev_ctx = static_cast( pool.Get(gpu_place)); paddle::memory::Copy(paddle::platform::CPUPlace(), @@ -280,7 +280,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #endif } else if (place_ == PlaceType::kXPU) { #ifdef PADDLE_WITH_XPU - auto xpu_place = BOOST_GET_CONST(paddle::platform::XPUPlace, t_place); + auto xpu_place = t_place; paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), xpu_place, t_data, ele_num * sizeof(T)); @@ -293,7 +293,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #ifdef PADDLE_WITH_ASCEND_CL paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto npu_place = BOOST_GET_CONST(paddle::platform::NPUPlace, t_place); + auto npu_place = t_place; auto *dev_ctx = static_cast( pool.Get(npu_place)); paddle::memory::Copy(paddle::platform::CPUPlace(), diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 0d5cd29a0c579..27e3417933806 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -134,7 +134,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, "Lite::MemoryCopy CPU->GPU is not yet implemented.")); } else if (platform::is_gpu_place(dst_place) && platform::is_gpu_place(src_place)) { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); + auto gpu_place = src_place; memory::Copy( gpu_place, dst_data, gpu_place, src_data, size, static_cast(ctx).stream()); diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 474b4fe3d4522..6615bdf4b138b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -813,8 +813,7 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( } #endif - platform::CUDAPlace cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, place); + platform::CUDAPlace cuda_place(place.GetDeviceId()); return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); } #endif @@ -838,8 +837,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } #endif - platform::CUDAPlace cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, place); + platform::CUDAPlace cuda_place(place.GetDeviceId()); return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); } #endif @@ -859,8 +857,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { } #endif - platform::CUDAPlace cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, place); + platform::CUDAPlace cuda_place(place.GetDeviceId()); return Release(cuda_place, m_->GetDefaultStream(cuda_place)); } #endif @@ -935,7 +932,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, } #endif - platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place); + platform::CUDAPlace p(place.GetDeviceId()); if (LIKELY(size > 0 
&& FLAGS_use_system_allocator == false)) { return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) ->Allocate(size); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 297d876178f3d..15a59fd7ed0c1 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -19,12 +19,7 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" - -namespace paddle { -namespace platform { -class Place; -} // namespace platform -} // namespace paddle +#include "paddle/fluid/platform//place.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index ff9bbf4ab3df8..6000e636dd523 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -34,7 +34,7 @@ namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } void CUDAAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( - BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, + allocation->place(), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. This may be a bug")); platform::RecordedGpuFree(allocation->ptr(), allocation->size(), diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index a6696634c12d4..8a84d9f201ef0 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -144,8 +144,8 @@ class CUDADeviceContextAllocatorPool { } AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { - auto iter = allocators_.find( - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace())); + auto iter = + allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId())); PADDLE_ENFORCE_NE( iter, allocators_.end(), platform::errors::NotFound("No allocator found for CUDAPlace.")); diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index 2ae2cf20ee6d4..17e0cc614d168 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -103,7 +103,7 @@ bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; } void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( - BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, + allocation->place(), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. 
This may be a bug")); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index ffe7ccf9190be..91358b688040a 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -26,6 +26,7 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" +#include "paddle/pten/common/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -791,7 +792,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { namespace allocation { pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { - void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); + void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size)); auto *tmp_alloc = new Allocation(ptr, size, place_); platform::MemEvenRecorder::Instance().PushMemRecord( static_cast(tmp_alloc), place_, size); @@ -799,16 +800,16 @@ pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { } void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) { - boost::apply_visitor( - legacy::FreeVisitor(allocation->ptr(), allocation->size()), - allocation->place()); + paddle::platform::VisitPlace( + allocation->place(), + legacy::FreeVisitor(allocation->ptr(), allocation->size())); platform::MemEvenRecorder::Instance().PopMemRecord( static_cast(allocation), place_); delete allocation; } uint64_t NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) { - return boost::apply_visitor(legacy::ReleaseVisitor(), place); + return paddle::platform::VisitPlace(place, legacy::ReleaseVisitor()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index d9fa7ec27fdde..a17c15c35d758 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -24,7 +24,7 @@ namespace allocation { bool NPUAllocator::IsAllocThreadSafe() const { return true; } void NPUAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( - BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, + allocation->place(), place_, platform::errors::PermissionDenied( "NPU memory is freed in incorrect device. 
This may be a bug")); platform::RecordedNPUFree(allocation->ptr(), allocation->size(), diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 05c6a7adaff8b..66ded146f047d 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -164,8 +164,7 @@ void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) { uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { std::lock_guard lock_guard(allocator_map_lock_); - std::vector& allocators = - allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)]; + std::vector& allocators = allocator_map_[place]; uint64_t released_size = 0; for (StreamSafeCUDAAllocator* allocator : allocators) { released_size += allocator->ProcessUnfreedAllocationsWithRelease(); @@ -192,7 +191,7 @@ uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { return underlying_allocator_->Release(place_); } -std::map> +std::map> StreamSafeCUDAAllocator::allocator_map_; SpinLock StreamSafeCUDAAllocator::allocator_map_lock_; diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index f54cdc749611a..7a89e0f6095a2 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -65,7 +65,7 @@ class StreamSafeCUDAAllocator : public Allocator { void ProcessUnfreedAllocations(); uint64_t ProcessUnfreedAllocationsWithRelease(); - static std::map> + static std::map> allocator_map_; static SpinLock allocator_map_lock_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc index 98af151007594..f125670a598bc 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc @@ -23,8 +23,7 @@ ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p) if (platform::is_gpu_place(place_)) { buddy_allocator_.reset(new memory::detail::BuddyAllocator( std::unique_ptr( - new memory::detail::GPUAllocator( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)), + new memory::detail::GPUAllocator(place_.device)), platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); } else { PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 153e19a9f1450..f804c2af53916 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/pten/common/place.h" namespace paddle { namespace memory { @@ -29,6 +30,7 @@ void Copy(platform::CPUPlace, void* dst, VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); } + #ifdef PADDLE_WITH_IPU template <> void Copy(platform::IPUPlace dst_place, @@ -54,6 +56,61 @@ void Copy(platform::IPUPlace dst_place, if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } + +// NOTE: only for CPUPlace and IPUPlace. 
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU && + dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num); + } +} + +// NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). +template <> +void Copy(pten::IPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + if (src_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_src; + return Copy(dst_place, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + return Copy(dst_place, dst, place_src, src, num); + } +} + +// NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::IPUPlace src_place, + const void* src, size_t num) { + if (dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst; + return Copy(place_dst, dst, src_place, src, num); + } else if (dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, src_place, src, num); + } +} #endif #ifdef PADDLE_WITH_XPU @@ -92,6 +149,34 @@ void Copy(platform::XPUPlace dst_place, } platform::MemcpySyncD2D(dst, dst_place, src, src_place, num); } + +// NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace). +template <> +void Copy(pten::XPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + if (src_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_src; + return Copy(dst_place, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::XPU) { + platform::XPUPlace place_src(src_place.GetDeviceId()); + return Copy(dst_place, dst, place_src, src, num); + } +} + +// NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::XPUPlace src_place, + const void* src, size_t num) { + if (dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst; + return Copy(place_dst, dst, src_place, src, num); + } else if (dst_place.GetType() == pten::AllocationType::XPU) { + platform::XPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, src_place, src, num); + } +} #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -272,6 +357,128 @@ void Copy( } } +// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. 
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, aclrtStream stream) { + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::NPU) { + platform::NPUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::NPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::NPUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::NPU && + dst_place.GetType() == pten::AllocationType::NPU) { + platform::NPUPlace place_src(src_place.GetDeviceId()); + platform::NPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + platform::CPUPlace place_src; + platform::NPUPinnedPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst; + platform::NPUPinnedPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + platform::NPUPinnedPlace place_dst; + platform::NPUPinnedPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::NPU) { + platform::NPUPinnedPlace place_src; + platform::NPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::NPU && + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + platform::NPUPinnedPlace place_dst; + platform::NPUPlace place_src(src_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, aclrtStream stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). 
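Each block of concrete-place specializations that follows is a thin forwarding wrapper: it rebuilds the generic pten::Place from the concrete place and calls the single Place-based implementation, so existing call sites that still pass CPUPlace, NPUPlace, etc. keep compiling while the dispatch logic lives in one function. Reduced to its shape (illustrative sketch only; CopyImpl and CopyFromCpu are placeholder names, and only the CPU branch is filled in):

#include <cstring>
#include "paddle/pten/common/place.h"

// The one real implementation: dispatch on the generic places' GetType().
void CopyImpl(const pten::Place& dst_place, void* dst,
              const pten::Place& src_place, const void* src, size_t num) {
  if (src_place.GetType() == pten::AllocationType::CPU &&
      dst_place.GetType() == pten::AllocationType::CPU) {
    std::memcpy(dst, src, num);
  }
  // ... other AllocationType combinations elided ...
}

// Thin wrapper for call sites that still pass a concrete place: rebuild the
// generic place and forward, exactly as the specializations in this file do.
void CopyFromCpu(const pten::Place& dst_place, void* dst,
                 const pten::CPUPlace& src_place, const void* src, size_t num) {
  CopyImpl(dst_place, dst, pten::Place(src_place.GetType()), src, num);
}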
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + +// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) +template <> +void Copy(pten::NPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, aclrtStream stream) { + Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, + src_place, src, num, stream); +} + +// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::NPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + Copy(dst_place, dst, + pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, + stream); +} + +// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) +template <> +void Copy(pten::NPUPinnedPlace dst_place, + void* dst, pten::Place src_place, + const void* src, size_t num, + aclrtStream stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::NPUPinnedPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (NPUPinnedPlace) +template <> +void Copy(pten::NPUPinnedPlace dst_place, + void* dst, pten::Place src_place, + const void* src, size_t num) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); +} + +// NOTE: only for (NPUPinnedPlace) -> (CPUPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::NPUPinnedPlace src_place, + const void* src, size_t num) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, nullptr); +} #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -490,6 +697,128 @@ void Copy( } } +// NOTE: only for CPUPlace、CUDAPlace and CUDAPinnedPlace. 
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, gpuStream_t stream) { + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::GPU) { + platform::CUDAPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::GPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CUDAPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::GPU && + dst_place.GetType() == pten::AllocationType::GPU) { + platform::CUDAPlace place_src(src_place.GetDeviceId()); + platform::CUDAPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + platform::CPUPlace place_src; + platform::CUDAPinnedPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst; + platform::CUDAPinnedPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + platform::CUDAPinnedPlace place_dst; + platform::CUDAPinnedPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::GPU) { + platform::CUDAPinnedPlace place_src; + platform::CUDAPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::GPU && + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + platform::CUDAPinnedPlace place_dst; + platform::CUDAPlace place_src(src_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace). +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, gpuStream_t stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). 
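From the caller's side, the GPU overloads are reached with plain pten::Place values, the same way the predictor and operator changes earlier in this patch do it. A hedged usage sketch, assuming a CUDA build and the DeviceContextPool/stream pattern already used above:

#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"

// Copy a host buffer onto GPU 0 through the unified Place-based Copy.
void HostToDevice(void* gpu_dst, const std::vector<float>& host_src) {
  pten::Place dst(pten::AllocationType::GPU, /*device_id=*/0);
  pten::Place src(pten::AllocationType::CPU);

  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto* ctx =
      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(dst));

  // Same call shape as before, but no BOOST_GET_CONST cast on either place.
  paddle::memory::Copy(dst, gpu_dst, src, host_src.data(),
                       host_src.size() * sizeof(float), ctx->stream());
}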
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + gpuStream_t stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + +// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace) +template <> +void Copy(pten::GPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, gpuStream_t stream) { + Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, + src_place, src, num, stream); +} + +// NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::GPUPlace src_place, + const void* src, size_t num, + gpuStream_t stream) { + Copy(dst_place, dst, + pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, + stream); +} + +// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace) +template <> +void Copy(pten::GPUPinnedPlace dst_place, + void* dst, pten::Place src_place, + const void* src, size_t num, + gpuStream_t stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::GPUPinnedPlace src_place, + const void* src, size_t num, + gpuStream_t stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CUDAPinnedPlace) +template <> +void Copy(pten::GPUPinnedPlace dst_place, + void* dst, pten::Place src_place, + const void* src, size_t num) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); +} + +// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::GPUPinnedPlace src_place, + const void* src, size_t num) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, nullptr); +} #endif #ifdef PADDLE_WITH_MLU @@ -586,7 +915,130 @@ void Copy(platform::MLUPlace dst_place, } } +// NOTE: only for CPUPlace and MLUPlace. 
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, mluStream stream) { + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::MLU) { + platform::MLUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::MLU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::MLUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::MLU && + dst_place.GetType() == pten::AllocationType::MLU) { + platform::MLUPlace place_src(src_place.GetDeviceId()); + platform::MLUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace) +template <> +void Copy(pten::MLUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, mluStream stream) { + Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, + src_place, src, num, stream); +} + +// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::MLUPlace src_place, + const void* src, size_t num, + mluStream stream) { + Copy(dst_place, dst, + pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, + stream); +} + #endif // PADDLE_WITH_MLU +// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace. 
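This stream-less path is what lets TensorToVector at the top of this patch forward src.place() unchanged. A reduced sketch of that call shape, using a hypothetical helper and only the host-reachable combinations this overload covers (CPU, XPU, pinned):

#include <cstddef>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/pten/common/place.h"

// Copy `count` elements located at `src_ptr` on `src_place` into a host
// vector, mirroring how TensorToVector now passes the tensor's place through.
template <typename T>
std::vector<T> ToHostVector(const void* src_ptr, const pten::Place& src_place,
                            size_t count) {
  std::vector<T> out(count);
  pten::Place dst_place(pten::AllocationType::CPU);
  paddle::memory::Copy(dst_place, out.data(), src_place, src_ptr,
                       count * sizeof(T));
  return out;
}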
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + if (UNLIKELY(num == 0)) return; + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + std::memcpy(dst, src, num); + } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + std::memcpy(dst, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::CPU) { + std::memcpy(dst, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + std::memcpy(dst, src, num); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + std::memcpy(dst, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::CPU) { + std::memcpy(dst, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + std::memcpy(dst, src, num); + } +#endif +#ifdef PADDLE_WITH_XPU + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::XPU) { + platform::XPUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::XPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::XPUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::XPU && + dst_place.GetType() == pten::AllocationType::XPU) { + platform::XPUPlace place_src(src_place.GetDeviceId()); + platform::XPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num); + } +#endif +} + +// NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num); +} + +// NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace). 
+template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num); +} + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 2776fe9c13132..0ac29e6d3ada7 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -16,12 +16,6 @@ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -namespace paddle { -namespace platform { -struct CUDAPlace; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { using framework::Tensor; diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu index 173e24b2f1450..32c90ff8fdc10 100644 --- a/paddle/fluid/operators/allclose_op.cu +++ b/paddle/fluid/operators/allclose_op.cu @@ -25,8 +25,7 @@ struct GetTensorValue { const framework::Tensor& tensor) const { const T* data = tensor.data(); T value; - const auto gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + const auto gpu_place = dev_ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); return value; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index f8c0426d7b1fb..2f6977b9e2da2 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -117,9 +117,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); } int64_t total_num = h_starts[xs_size]; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), - dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); // copy each tensor's data address to device auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); @@ -134,8 +133,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { h_xs[i] = xs[i]->data(); h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, - cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_xs, cpu_place, h_xs, + 2 * xs_size * sizeof(T*), dev_ctx.stream()); // Launch Kernel int threads_per_block = std::min(static_cast(1024), total_num); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 5d5e13e848a75..979ae5c508c6b 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -41,8 +41,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { MPDType cpu_scale_data; if (platform::is_xpu_place(scale->place())) { memory::Copy(platform::CPUPlace(), static_cast(&cpu_scale_data), - BOOST_GET_CONST(platform::XPUPlace, scale->place()), - static_cast(scale_data), sizeof(MPDType)); + scale->place(), static_cast(scale_data), + sizeof(MPDType)); } else { cpu_scale_data = (*scale_data); @@ -87,8 +87,7 @@ class CheckFiniteAndUnscaleXPUKernel : 
public framework::OpKernel { dev_ctx.Wait(); } memory::Copy(platform::CPUPlace(), &cpu_found_inf_data, - BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - found_inf_data, sizeof(bool)); + dev_ctx.GetPlace(), found_inf_data, sizeof(bool)); } if (cpu_found_inf_data) { @@ -142,9 +141,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { if (dev_ctx.x_context()->xpu_stream) { dev_ctx.Wait(); } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - found_inf_data, platform::CPUPlace(), &cpu_found_inf_data, - sizeof(bool)); + memory::Copy(dev_ctx.GetPlace(), found_inf_data, platform::CPUPlace(), + &cpu_found_inf_data, sizeof(bool)); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index ba8e2bd15874f..6d9cd96a3fb9a 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -114,9 +114,8 @@ class LazyZeros { for (int i = 0; i < xs_size; i++) { h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), - dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); // copy each tensor of "outs" data address array to device auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); @@ -128,9 +127,8 @@ class LazyZeros { for (size_t i = 0; i < xs_size; ++i) { h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*), - dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_out_addrs, cpu_place, h_out_addrs, + xs_size * sizeof(T*), dev_ctx.stream()); // launch cuda kernel int64_t total_num = h_starts[xs_size]; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 8160368d72ad1..6582be7354f63 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -187,9 +187,7 @@ class LazyZerosNPU { framework::TensorCopy(*x, place, dev_ctx, out); } else if (zero_ptr != dst_ptr) { auto size = out->numel() * framework::SizeOfType(out->type()); - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size, - stream); + memory::Copy(place, dst_ptr, place, zero_ptr, size, stream); } } } diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc index fa7985e186d58..fe03d93f4480f 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc @@ -43,8 +43,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { bool cpu_found_inf_data = false; if (platform::is_xpu_place(found_inf->place())) { memory::Copy(platform::CPUPlace(), - static_cast(&cpu_found_inf_data), - BOOST_GET_CONST(platform::XPUPlace, found_inf->place()), + static_cast(&cpu_found_inf_data), found_inf->place(), static_cast(found_inf_data), sizeof(bool)); } else { cpu_found_inf_data = (*found_inf_data); @@ -97,16 +96,16 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { MPDType cpu_pre_loss_scaling_data; if 
(platform::is_xpu_place(bad_in->place())) { memory::Copy(platform::CPUPlace(), static_cast(&cpu_bad_in_data), - BOOST_GET_CONST(platform::XPUPlace, bad_in->place()), - static_cast(bad_in_data), sizeof(int)); + bad_in->place(), static_cast(bad_in_data), + sizeof(int)); } else { cpu_bad_in_data = (*bad_in_data); } if (platform::is_xpu_place(good_in->place())) { memory::Copy(platform::CPUPlace(), static_cast(&cpu_good_in_data), - BOOST_GET_CONST(platform::XPUPlace, good_in->place()), - static_cast(good_in_data), sizeof(int)); + good_in->place(), static_cast(good_in_data), + sizeof(int)); } else { cpu_good_in_data = (*good_in_data); } @@ -114,7 +113,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { if (platform::is_xpu_place(pre_loss_scaling->place())) { memory::Copy( platform::CPUPlace(), static_cast(&cpu_pre_loss_scaling_data), - BOOST_GET_CONST(platform::XPUPlace, pre_loss_scaling->place()), + pre_loss_scaling->place(), static_cast(pre_loss_scaling_data), sizeof(MPDType)); } else { cpu_pre_loss_scaling_data = (*pre_loss_scaling_data); @@ -146,15 +145,13 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { } } // copy to device - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - bad_out_data, platform::CPUPlace(), &cpu_bad_out_data, - sizeof(int)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - good_out_data, platform::CPUPlace(), &cpu_good_out_data, - sizeof(int)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - updated_loss_scaling_data, platform::CPUPlace(), - &cpu_updated_loss_scaling_data, sizeof(MPDType)); + memory::Copy(dev_ctx.GetPlace(), bad_out_data, platform::CPUPlace(), + &cpu_bad_out_data, sizeof(int)); + memory::Copy(dev_ctx.GetPlace(), good_out_data, platform::CPUPlace(), + &cpu_good_out_data, sizeof(int)); + memory::Copy(dev_ctx.GetPlace(), updated_loss_scaling_data, + platform::CPUPlace(), &cpu_updated_loss_scaling_data, + sizeof(MPDType)); } }; diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 09452b8f68baf..da5ee4dd82b4d 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -25,8 +25,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc index 449ae02ecbc19..5be1beaa3dfb2 100644 --- a/paddle/fluid/operators/assign_op_npu.cc +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -27,8 +27,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/assign_op_xpu.cc b/paddle/fluid/operators/assign_op_xpu.cc index 6255b5d341e09..26c879c3fb612 100644 --- a/paddle/fluid/operators/assign_op_xpu.cc +++ b/paddle/fluid/operators/assign_op_xpu.cc @@ -26,8 +26,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc index 1589f9e8911f3..1adad7837f2f0 100644 --- a/paddle/fluid/operators/assign_value_op.cc +++ b/paddle/fluid/operators/assign_value_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; 
namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct CPUPlace; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 3bffe0a05a8f7..17c0e035f4632 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -25,8 +25,7 @@ void GetAccumulators( auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); - auto cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, in_old_num_accumulates->place()); + auto cuda_place = in_old_num_accumulates->place(); memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place, in_old_num_accumulates->data(), sizeof(int64_t), stream); @@ -44,8 +43,7 @@ void SetAccumulators( auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); auto* out_num_accumulates = ctx.Output("out_num_accumulates"); auto* out_num_updates = ctx.Output("out_num_updates"); - auto cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, out_old_num_accumulates->place()); + auto cuda_place = out_old_num_accumulates->place(); memory::Copy(cuda_place, out_old_num_accumulates->data(), platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu index dde4dd2567b79..030f7cb7d7c33 100644 --- a/paddle/fluid/operators/bernoulli_op.cu +++ b/paddle/fluid/operators/bernoulli_op.cu @@ -57,8 +57,7 @@ class BernoulliOpKernel auto* out_data = out->mutable_data(ctx.GetPlace()); int64_t size = x->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu index 0bfddf8b5f386..43c16d607c2db 100644 --- a/paddle/fluid/operators/cholesky_op.cu +++ b/paddle/fluid/operators/cholesky_op.cu @@ -102,8 +102,7 @@ class CholeskyGPUKernel : public framework::OpKernel { std::vector error_info; // only for checking positive matrix error_info.resize(batch_count); - memory::Copy(platform::CPUPlace(), error_info.data(), - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), error_info.data(), dev_ctx.GetPlace(), info_ptr, sizeof(int) * batch_count, dev_ctx.stream()); for (int i = 0; i < batch_count; ++i) { diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index fad74b81e14e4..2d7800d9997fc 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -306,7 +306,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { num_classes, num_samples)); auto& dev_ctx = ctx.template device_context(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); int batch_size = label->numel(); // Algorithm: @@ -397,8 +397,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + 1) * vec_size; - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int 
device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && (!fix_seed)) { auto seed_offset = gen_cuda->IncrementOffset(offset); diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 226b2c5132318..314a91841bebf 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -33,7 +33,7 @@ class AllReduceOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, + PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true, platform::errors::PreconditionNotMet( "AllReduce op can run on gpu place only for now.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index 229d42e64e4e5..04d028536a9b2 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -34,7 +34,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { "The place of ExecutionContext should be CUDAPlace.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device; + int dev_id = ctx.GetPlace().device; int root_dev_id = ctx.Attr("root"); auto in = ctx.Input("X"); diff --git a/paddle/fluid/operators/collective/broadcast_op_xpu.cc b/paddle/fluid/operators/collective/broadcast_op_xpu.cc index e8566803aecfa..b3d4585da003d 100644 --- a/paddle/fluid/operators/collective/broadcast_op_xpu.cc +++ b/paddle/fluid/operators/collective/broadcast_op_xpu.cc @@ -40,7 +40,7 @@ class BKCLBroadcastOpKernel : public framework::OpKernel { "The place of ExecutionContext should be XPUPlace.")); #if defined(PADDLE_WITH_XPU_BKCL) - int dev_id = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()).device; + int dev_id = ctx.GetPlace().device; int root_dev_id = ctx.Attr("root"); auto in = ctx.Input("X"); diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 8bdbdfac8ffd1..4ea1876da2569 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index ec8d651819502..17b49eda2f804 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index b0aa51f7cfdfd..96da390d45db0 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
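The bernoulli and class_center_sample hunks above collapse BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId() into ctx.GetPlace().GetDeviceId(): the place now carries the device index itself, and that index selects a per-device random generator. A hedged standalone sketch of that lookup pattern follows; Place, Generator and GetDefaultGenerator are simplified stand-ins for the Paddle equivalents.

```cpp
#include <iostream>
#include <map>

// Simplified stand-ins for pten::Place and framework::GetDefaultCUDAGenerator.
struct Place {
  int device = 0;
  int GetDeviceId() const { return device; }
};

struct Generator {
  unsigned long long offset = 0;
  // Returns the new offset after reserving n random states, loosely
  // mirroring Generator::IncrementOffset in the kernels above.
  unsigned long long IncrementOffset(unsigned long long n) { return offset += n; }
};

// One generator per device, keyed by the id read straight off the place.
Generator& GetDefaultGenerator(int device_id) {
  static std::map<int, Generator> pool;
  return pool[device_id];
}

int main() {
  Place gpu_place{1};
  int device_id = gpu_place.GetDeviceId();   // no variant cast needed any more
  auto& gen = GetDefaultGenerator(device_id);
  std::cout << gen.IncrementOffset(16) << "\n";  // 16
}
```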
*/ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index 9d913b12b1376..75a484ef87166 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc index 7935a1f722e55..4eca34fb50707 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc index 2f16a89c217da..bded82229619d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index 3ad078e1c8ff0..c49e72eac2326 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc index 1a78427cd19ee..74acbacf2b94e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc index 92ba00428065b..a1d439cfdae62 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
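The recurring deletions of struct CPUPlace; / struct CUDAPlace; / struct XPUPlace; in the collective-op files above (and in many files below) follow from the aliasing: once platform::CPUPlace and friends name the pten place types rather than classes defined inside paddle::platform, a stray forward declaration of struct CPUPlace; in that namespace would conflict. A tiny sketch of the conflict, assuming the alias is introduced with a using declaration as the commit description suggests:

```cpp
namespace pten {
class CPUPlace {};
}  // namespace pten

namespace platform {
// What the patch (roughly) switches to: the platform name is now an alias.
using CPUPlace = pten::CPUPlace;

// struct CPUPlace;   // ill-formed now: redeclares the alias as a class,
//                    // which is why the operator files drop these lines.
}  // namespace platform

int main() {
  platform::CPUPlace cpu;  // still spelled the same way at call sites
  (void)cpu;
  return 0;
}
```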
*/ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 18c317506c06e..72659282afa60 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -22,7 +22,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index 06e90cdff8045..cfd508be27fb1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc index e4ec538cd2323..bacdf7fb53c35 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index db9a8428e3d03..5820bd318d8bc 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -48,7 +48,7 @@ class CCommInitAllOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, + PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true, platform::errors::PreconditionNotMet( "CCommInitAllOp can run on gpu place only")); diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc index 86044b5ba1c1a..bdd904bf7be7a 100644 --- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -41,7 +41,7 @@ class CCommInitOpAscend : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_npu_place(place), true, + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, platform::errors::PreconditionNotMet( "CCommInitOpAscend can run on npu place only.")); @@ -54,7 +54,7 @@ class CCommInitOpAscend : public framework::OperatorBase { int rank_ids = Attr("rank_ids"); int rank_id = Attr("rank"); int rid = Attr("ring_id"); - int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + int device_id = place.device; if (Attr("device_id") >= 0) { device_id = Attr("device_id"); } diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 9bf86dc926773..56b0017fefe63 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -61,9 +61,10 @@ class CCommInitOp : public framework::OperatorBase { 
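Another recurring edit, visible in allreduce_op.h and the c_comm_init hunks above, is qualifying the place predicates: is_gpu_place(place) becomes platform::is_gpu_place(place), and likewise for is_npu_place and is_xpu_place. A plausible reason is argument-dependent lookup: the old place was a variant over structs living in paddle::platform, so the unqualified call was found via ADL; with the place type now defined under pten, ADL no longer searches paddle::platform and the call must be spelled out. A self-contained illustration, with the namespaces and the kind field as simplifications:

```cpp
#include <iostream>

namespace pten {
struct Place { int kind = 0; };  // stand-in for the new place type
}  // namespace pten

namespace platform {
using Place = pten::Place;  // platform::Place is now just an alias

// The helper still lives in platform, but its argument's real type lives in
// pten, so unqualified calls outside this namespace no longer find it.
inline bool is_gpu_place(const Place& p) { return p.kind == 1; }
}  // namespace platform

int main() {
  platform::Place place{1};
  // is_gpu_place(place);                              // error: ADL searches pten only
  std::cout << platform::is_gpu_place(place) << "\n";  // qualified call: prints 1
}
```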
"PaddlePaddle should be compiled with GPU or XPU.")); #endif - PADDLE_ENFORCE_EQ(is_gpu_place(place) || is_xpu_place(place), true, - platform::errors::PreconditionNotMet( - "CCommInitOp can run on gpu or xpu place only.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(place) || platform::is_xpu_place(place), true, + platform::errors::PreconditionNotMet( + "CCommInitOp can run on gpu or xpu place only.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) @@ -85,7 +86,7 @@ class CCommInitOp : public framework::OperatorBase { rid)); #endif - int device_id = BOOST_GET_CONST(Place, place).device; + int device_id = place.device; if (Attr("device_id") >= 0) { device_id = Attr("device_id"); } diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc index 41a07f9439951..8a5ed7d7bde9b 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc index e03da37360f47..9668c68c7da20 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 6d3af7bb5f258..82a10b24dab36 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc index 77bb96347f943..c2ecf2419a0b5 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc index 83f7fce1ec6b7..7f5b4cd3608ef 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc index 791e58d8493ce..b1136b796699c 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index c06b2683a6bbe..a0e0f8e92bdde 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -187,7 +187,7 @@ class CReduceOpASCENDKernel : public framework::OpKernel { reinterpret_cast(stream))); if (rank_id != root_id) { - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + auto npu_place = place; memory::Copy(npu_place, reinterpret_cast(out->data()), npu_place, reinterpret_cast(const_cast(in->data())), diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc index f6c1c5d50e864..a689b9db15aac 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc index 83db107b36faf..c3de32b9fbdb0 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc index e7e770e8ffdca..f6def80a19076 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc index e59ec85fefd13..b7f521b371ac8 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc index 39c8716a92a36..a4a651be3c5ee 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc index a0ec4d2a99cd7..ec928bd6a095d 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 72339bbd48752..8a4c1979adbbf 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -59,7 +59,7 @@ class CSyncCalcStreamKernel : public framework::OpKernel { #elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(is_npu_place(place), true, + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, platform::errors::PreconditionNotMet( "Sync stream op can run on npu place only for now.")); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 21bad096c2d49..893cc90762f33 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -61,8 +61,8 @@ template class CSyncCommStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto place = ctx.GetPlace(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto place = ctx.GetPlace(); int ring_id = ctx.Attr("ring_id"); auto stream = platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); @@ -70,9 +70,12 @@ class CSyncCommStreamKernel : public framework::OpKernel { platform::GpuStreamSync(stream); #elif defined(PADDLE_WITH_ASCEND_CL) - PADDLE_ENFORCE_EQ(is_npu_place(place), true, + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, platform::errors::PreconditionNotMet( - "Sync stream op can run on npu place only for now.")); + "Sync comm stream op can run on npu place only for " + "now, but we got %s, please check the environment.", + place.DebugString())); int ring_id = ctx.Attr("ring_id"); auto stream = platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index dfa4dcd0fac59..b15e33417a05b 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -35,9 +35,11 @@ class CWaitCommOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "wait_comm op can run on gpu place only for now.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(place), true, + platform::errors::PreconditionNotMet( + "wait_comm op can run on gpu place only for now, but got %s", + place.DebugString())); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index e038617bf3d6a..7ca0a087d909b 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -37,9 +37,10 @@ class CWaitComputeOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { PADDLE_ENFORCE_EQ( - is_gpu_place(place), true, + platform::is_gpu_place(place), true, 
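Besides the qualification, the c_sync_comm_stream and c_wait_comm hunks above also enrich the precondition messages with the offending place via place.DebugString(). A small standalone sketch of that message style; Place, DebugString and EnforceGpuPlace are illustrative stand-ins for the PADDLE_ENFORCE_EQ machinery.

```cpp
#include <cstdio>
#include <string>

// Illustrative stand-ins; the real check is PADDLE_ENFORCE_EQ with
// platform::errors::PreconditionNotMet and pten::Place::DebugString().
struct Place {
  bool is_gpu = false;
  int device = 0;
  std::string DebugString() const {
    return is_gpu ? "Place(gpu:" + std::to_string(device) + ")" : "Place(cpu)";
  }
};

void EnforceGpuPlace(const Place& place, const char* op_name) {
  if (!place.is_gpu) {
    // New message style: say what was required and which place was actually got.
    std::printf("%s op can run on gpu place only for now, but got %s\n",
                op_name, place.DebugString().c_str());
  }
}

int main() { EnforceGpuPlace(Place{false, 0}, "wait_comm"); }
```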
platform::errors::PreconditionNotMet( - "wait_compute op can run on gpu place only for now.")); + "wait_compute op can run on gpu place only for now, but got %s", + place.DebugString())); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = Attr("ring_id"); diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index a03e4165755dd..882d74a0d51d8 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/pten/common/place.h" namespace paddle { namespace operators { @@ -92,8 +93,8 @@ class CompareOp : public framework::OperatorWithKernel { if (force_cpu) { kt.place_ = platform::CPUPlace(); } else { - if (ctx.Input("X")->place().type() != - typeid(platform::CUDAPinnedPlace)) { + if (ctx.Input("X")->place().GetType() != + pten::AllocationType::GPUPINNED) { kt.place_ = ctx.Input("X")->place(); } else { kt.place_ = ctx.GetPlace(); diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index 29132f2930acb..9bb9e481034bd 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -27,8 +27,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 6f696afa23886..8adf556b4cd3d 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -26,7 +26,7 @@ limitations under the License. 
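The compare_op hunk above swaps a typeid comparison against platform::CUDAPinnedPlace for an enum check on the place, place().GetType() != pten::AllocationType::GPUPINNED, pulling in paddle/pten/common/place.h for the enum. Below is a standalone sketch of that kernel-place decision; the types and the helper function are simplified stand-ins.

```cpp
#include <iostream>

// Simplified stand-ins: the real enum is pten::AllocationType and the real
// check is tensor.place().GetType() != pten::AllocationType::GPUPINNED.
enum class AllocationType { CPU, GPU, GPUPINNED };

struct Place {
  AllocationType type = AllocationType::CPU;
  AllocationType GetType() const { return type; }
};

// Mirrors CompareOp::GetExpectedKernelType: keep the input's place unless the
// input lives in pinned host memory, in which case fall back to the context place.
Place PickKernelPlace(const Place& input_place, const Place& ctx_place) {
  if (input_place.GetType() != AllocationType::GPUPINNED) {
    return input_place;
  }
  return ctx_place;
}

int main() {
  Place pinned{AllocationType::GPUPINNED};
  Place gpu{AllocationType::GPU};
  std::cout << (PickKernelPlace(pinned, gpu).GetType() == AllocationType::GPU)
            << "\n";  // 1: pinned input falls back to the context place
}
```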
*/ namespace paddle { namespace platform { class CUDADeviceContext; -struct CUDAPlace; + } // namespace platform } // namespace paddle @@ -68,10 +68,8 @@ void weight_to_tensor(const platform::Place &place, gpuStream_t stream, const T *in_data = weight_list[i]->data(); auto in_size = weight_list[i]->numel(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()), - weight_data + weight_offset, - BOOST_GET_CONST(platform::CUDAPlace, weight_list[i]->place()), - in_data, in_size * sizeof(T), stream); + memory::Copy(weight->place(), weight_data + weight_offset, + weight_list[i]->place(), in_data, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -88,10 +86,8 @@ void weight_to_tensor_list(const platform::Place &place, gpuStream_t stream, T *weight_grad_data = (*weight_grad)[i]->mutable_data(place); const T *src = weight_data + weight_offset; - memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, (*weight_grad)[i]->place()), - weight_grad_data, BOOST_GET_CONST(platform::CUDAPlace, weight->place()), - src, in_size * sizeof(T), stream); + memory::Copy((*weight_grad)[i]->place(), weight_grad_data, weight->place(), + src, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -176,8 +172,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int seed = ctx.Attr("seed"); if (!is_test) { - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed == 0) { // If perform `manual_seed` in python and inner seed is not specified diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu index 82ed0bd444de9..2b69db7d24a12 100644 --- a/paddle/fluid/operators/cumprod_op.cu +++ b/paddle/fluid/operators/cumprod_op.cu @@ -225,7 +225,7 @@ class CumprodGradOpCUDAKernel : public framework::OpKernel { const auto *y_data = y->data(); const auto *dy_data = dy->data(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto place = ctx.GetPlace(); const auto &dev_ctx = ctx.template device_context(); auto *dx_data = dx->mutable_data(place); diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h index a964cfb3d7bea..d8c3c1febdcf3 100644 --- a/paddle/fluid/operators/cumprod_op.h +++ b/paddle/fluid/operators/cumprod_op.h @@ -101,7 +101,7 @@ class CumprodGradOpCPUKernel : public framework::OpKernel { auto* out_data = out->data(); auto* d_x_data = d_x->mutable_data(context.GetPlace()); - auto place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto place = context.GetPlace(); const auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 6489c1f9784cf..eeb2c7692b5d5 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -245,7 +245,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { int bytes = roi_batch_id_list.numel() * sizeof(int); auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); @@ -516,7 +516,7 @@ class DeformablePSROIPoolGradCUDAKernel : public 
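The cudnn_lstm weight_to_tensor hunks above keep the same flattening logic and only drop the two BOOST_GET_CONST casts around memory::Copy: each parameter tensor is copied into one flat buffer at a running offset. A host-only sketch of that loop, with std::memcpy standing in for the device-to-device memory::Copy on a stream:

```cpp
#include <cstring>
#include <iostream>
#include <vector>

// Host-only sketch of the weight_to_tensor loop: concatenate a list of
// parameter buffers into one flat buffer, advancing a running offset.
int main() {
  std::vector<std::vector<float>> weight_list = {{1, 2}, {3, 4, 5}, {6}};
  std::vector<float> flat(6);

  std::size_t weight_offset = 0;
  for (const auto& w : weight_list) {
    // Real code: memory::Copy(weight->place(), flat + offset,
    //                         w->place(), w->data(), bytes, stream);
    std::memcpy(flat.data() + weight_offset, w.data(), w.size() * sizeof(float));
    weight_offset += w.size();
  }

  for (float v : flat) std::cout << v << ' ';  // 1 2 3 4 5 6
  std::cout << '\n';
}
```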
framework::OpKernel { int bytes = roi_batch_id_list.numel() * sizeof(int); auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index aee468e05e182..7583bdabc3015 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -26,9 +26,7 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct CPUPlace; -} // namespace platform +namespace platform {} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index c12dd9e6d218a..016be54eeb7b4 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct CPUPlace; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 7df0f85523bc6..5470d44202590 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -31,11 +31,11 @@ struct StridedMemcpyFunctor { const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { - auto& cpu_place = BOOST_GET_CONST(platform::CPUPlace, place); + auto& cpu_place = place; memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T), @@ -55,11 +55,11 @@ struct StridedMemcpyFunctor { const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { - auto& cpu_place = BOOST_GET_CONST(platform::CPUPlace, place); + auto& cpu_place = place; memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 6f5137be62011..c6754f62cc74e 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -68,7 +68,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0, sizeof(T) * 8, ctx.stream()); // Allocate temporary storage - auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto place = ctx.GetPlace(); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); // Run sorting operation @@ -274,7 +274,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, dim3 
threads(kThreadsPerBlock); const T *boxes = proposals.data(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto place = ctx.GetPlace(); auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 6e5fa1e293353..22dc606df9df5 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -183,8 +183,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel { auto dev_var = memory::Alloc(device_ctx, bytes); float* dev_var_data = reinterpret_cast(dev_var->ptr()); auto cplace = platform::CPUPlace(); - const auto gplace = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + const auto gplace = context.GetPlace(); memory::Copy(gplace, dev_var_data, cplace, &variance[0], bytes, device_ctx.stream()); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index bd5703022db90..60cb16ce6c047 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -85,7 +85,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { roi_batch_id_list.mutable_data(platform::CPUPlace()); int index = 0; int lod_size; - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); auto multi_rois_num = ctx.MultiInput("MultiLevelRoIsNum"); for (size_t i = 0; i < roi_ins.size(); ++i) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 1df7dcbe670c0..a9a6dcea1bbe5 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -135,7 +135,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, max_level, min_level, roi_batch_id_list_gpu.data(), sub_lod_list_data, target_lvls_data, pixel_offset); - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); Tensor index_in_t; int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index e8ab628db16bd..2de06e06d9ad3 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -67,7 +67,7 @@ static std::pair ProposalForOneImage( proposals.data(), im_info.data(), min_size, pre_nms_num, keep_num_t.data(), keep_index.data()); int keep_num; - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, keep_num_t.data(), sizeof(int), ctx.stream()); ctx.Wait(); @@ -169,7 +169,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { T *rpn_rois_data = rpn_rois->data(); T *rpn_roi_probs_data = rpn_roi_probs->data(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); auto cpu_place = platform::CPUPlace(); int64_t num_proposals = 0; diff --git 
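In the proposal kernels above, keep_num is copied from device memory to a host int on the compute stream and the code calls ctx.Wait() before reading it; passing the place directly changes nothing about that ordering. The sketch below uses a std::thread as a loose, purely illustrative stand-in for the stream to show why the wait has to precede the read.

```cpp
#include <iostream>
#include <thread>
#include <vector>

// Loose stand-in for the keep_num pattern: the "copy" happens asynchronously
// (std::thread models the CUDA stream), and the host must wait (join, standing
// in for ctx.Wait()) before it may read the scalar.
int main() {
  std::vector<int> keep_num_on_device = {128};  // stand-in for keep_num_t on GPU
  int keep_num = 0;

  std::thread stream([&] { keep_num = keep_num_on_device[0]; });  // async D2H copy
  stream.join();  // ctx.Wait(): only now is keep_num safe to use

  std::cout << "keep " << keep_num << " proposals\n";
}
```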
a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 6244827f685ba..cc2d4578e3eb1 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -68,7 +68,7 @@ static std::pair ProposalForOneImage( proposals.data(), im_shape.data(), min_size, pre_nms_num, keep_num_t.data(), keep_index.data(), false, pixel_offset); int keep_num; - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, keep_num_t.data(), sizeof(int), ctx.stream()); ctx.Wait(); @@ -172,7 +172,7 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { T *rpn_rois_data = rpn_rois->data(); T *rpn_roi_probs_data = rpn_roi_probs->data(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); auto cpu_place = platform::CPUPlace(); int64_t num_proposals = 0; diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 23bd6af6bd2e8..bfe4742c4b3c3 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -104,7 +104,7 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { int bytes = sizeof(int) * anchors.size(); auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); const auto cplace = platform::CPUPlace(); memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream()); diff --git a/paddle/fluid/operators/dirichlet_op.cu b/paddle/fluid/operators/dirichlet_op.cu index 3e1d523ae0e15..63f9c7339bfc5 100644 --- a/paddle/fluid/operators/dirichlet_op.cu +++ b/paddle/fluid/operators/dirichlet_op.cu @@ -76,8 +76,7 @@ struct DirichletSampler { auto& dev_ctx = ctx.device_context(); // init state, seed & offset for all threads - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto p_gen = framework::GetDefaultCUDAGenerator(device_id); auto seed_and_offset = p_gen->IncrementOffset(10); // hard-coded offset auto seed = seed_and_offset.first; diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h index c6305e5ba73e8..695cb6e0ef2de 100644 --- a/paddle/fluid/operators/distribution_helper.h +++ b/paddle/fluid/operators/distribution_helper.h @@ -164,8 +164,7 @@ void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, T *out_data = out->mutable_data(dev_ctx.GetPlace()); auto size = out->numel(); - int64_t device_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); + int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); size_t block_size = 256; diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h index d426876c18fa5..94c8513086c20 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op.h +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h @@ -272,8 +272,7 @@ class DlnneEngineOp : public framework::OperatorBase { fluid_t->Resize(framework::make_ddim(out_shapes[bind_index])); int32_t dtype; - 
output_buffers[bind_index] = fluid_t->mutable_data( - BOOST_GET_CONST(platform::CPUPlace, dev_place)); + output_buffers[bind_index] = fluid_t->mutable_data(dev_place); dtype = 0; cpu_output_buffers[bind_index] = output_buffers[bind_index]; // malloc(data_bytes); diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index f2038d12528c4..33fa7a092768c 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -25,8 +25,7 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, uint64_t* increment) { - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); + int device_id = dev_ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (seed) { diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index f28fa4d6338d7..3096795f3eaf0 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -135,8 +135,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { if (normalized) { distance = distance / n; } - memory::Copy(BOOST_GET_CONST(Place, ctx.GetPlace()), out + num, - platform::CPUPlace(), &distance, sizeof(T), stream); + memory::Copy(ctx.GetPlace(), out + num, platform::CPUPlace(), &distance, + sizeof(T), stream); } else { framework::Tensor dist_t; dist_t.Resize({m + 1, n + 1}); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index d66d6b66a0582..216178f7d8938 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -34,7 +34,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index b28f713256526..b876438a1941f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -29,7 +29,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index e0686e815459a..cc27bab720057 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -27,7 +27,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 1448520eca18f..3a1951999546e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -27,7 +27,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc 
b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc index 2ac3aa6ebd3e3..bb116c9c65ac0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -29,7 +29,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc index d564cc3717f5e..eddbfd3b15ea4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc @@ -24,7 +24,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 810f78ce80827..f5290a69bbda1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -34,7 +34,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index f035e46d1d082..0d889ef26c954 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/platform/place.h" #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 4f41ecf04cf43..6c51df5c61ef3 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -20,7 +20,6 @@ class ExecutionContext; } // namespace framework namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index c037daba0ee3f..d8b8c2728987e 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -20,7 +20,6 @@ class ExecutionContext; } // namespace framework namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 2acf1e0fcd7aa..397a50f2bc69c 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -20,7 +20,6 @@ class ExecutionContext; } // namespace framework namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc 
b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index be8dad62c3c05..e7bb73340b841 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -20,7 +20,6 @@ class ExecutionContext; } // namespace framework namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h index 8bfb566d496d0..be5aded3521c9 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h +++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h @@ -85,11 +85,11 @@ class TestElementwiseOpGradGrad { auto src = feed_datas_[in_name].data(); auto src_place = platform::CPUPlace(); if (platform::is_cpu_place(place_)) { - auto dst_place = BOOST_GET_CONST(platform::CPUPlace, place_); + auto dst_place = place_; memory::Copy(dst_place, dst, src_place, src, bytes); } else if (platform::is_gpu_place(place_)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto dst_place = BOOST_GET_CONST(platform::CUDAPlace, place_); + auto dst_place = place_; memory::Copy(dst_place, dst, src_place, src, bytes, nullptr); #else PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index e9f31f8ddd698..64530f31abab9 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -93,9 +93,7 @@ class ExpandNPUKernel : public framework::OpKernel { (out0->numel() == in0->numel()) ? true : false; if (is_expand_times_all_one) { - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), - out0->mutable_data(place), - BOOST_GET_CONST(platform::NPUPlace, place), in0->data(), + memory::Copy(place, out0->mutable_data(place), place, in0->data(), in0->numel() * sizeof(T), stream); if (out_dims != in_dims) { out0->Resize(out_dims); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index b95bbc775a0d7..9c4f61ecdf1bc 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -377,7 +377,7 @@ struct FindRangeAbsMaxFunctor { const framework::Tensor& last_scale, const framework::Tensor& iter, const int window_size, framework::Tensor* scales_arr, framework::Tensor* out_scale) { - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); T* scale_arr = scales_arr->mutable_data(gpu_place); T* out_scale_data = out_scale->mutable_data(gpu_place); @@ -414,7 +414,7 @@ struct FindMovingAverageAbsMaxFunctor { const framework::Tensor& in_state, const T* cur_scale, const float rate, framework::Tensor* out_state, framework::Tensor* out_accum, framework::Tensor* out_scale) { - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); T accum; T state; diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.cu b/paddle/fluid/operators/fill_diagonal_tensor_op.cu index 834964079fd39..256c9c3d75c0d 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.cu +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.cu @@ -101,7 +101,7 @@ class FillDiagonalTensorCUDAKernel : public framework::OpKernel { Tensor tensor_tmp; int64_t *memory_block_cu = tensor_tmp.mutable_data({2 
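test_elementwise_op_grad_grad.h above keeps its two-level dispatch: a runtime branch on the place kind plus a compile-time guard so the GPU copy only exists in CUDA/HIP builds, and with the unified place both branches now just bind place_ directly. A compact standalone sketch of that shape; the macro names are kept, everything else is a stand-in.

```cpp
#include <cstddef>
#include <cstring>
#include <iostream>
#include <stdexcept>

// Stand-ins for the feed loop in TestElementwiseOpGradGrad::Setup.
enum class AllocationType { CPU, GPU };
struct Place { AllocationType type = AllocationType::CPU; };

void FeedInput(const Place& place, float* dst, const float* src, std::size_t bytes) {
  if (place.type == AllocationType::CPU) {
    std::memcpy(dst, src, bytes);  // memory::Copy(cpu, dst, cpu, src, bytes)
  } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    // Device path: memory::Copy(place, dst, CPUPlace(), src, bytes, nullptr)
    std::memcpy(dst, src, bytes);  // host stand-in so the sketch runs anywhere
#else
    throw std::invalid_argument("GPU place in a CPU-only build");
#endif
  }
}

int main() {
  float src[2] = {1.f, 2.f}, dst[2] = {0.f, 0.f};
  FeedInput(Place{AllocationType::CPU}, dst, src, sizeof(src));
  std::cout << dst[0] << " " << dst[1] << "\n";  // 1 2
}
```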
+ fill_dims[0]}, ctx.GetPlace()); - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); memory::Copy(gpu_place, memory_block_cu, platform::CPUPlace(), memory_block.data(), sizeof(int64_t) * (2 + fill_dims[0]), stream); @@ -159,8 +159,7 @@ class FillDiagonalTensorGradCUDAKernel : public framework::OpKernel { Tensor tensor_tmp; int64_t *memory_block_cu = tensor_tmp.mutable_data({2 + matrows}, ctx.GetPlace()); - const auto gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); memory::Copy(gpu_place, memory_block_cu, platform::CPUPlace(), memory_block.data(), sizeof(int64_t) * (2 + matrows), stream); diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu index 2391d4b907a60..09893cb3f4b2c 100644 --- a/paddle/fluid/operators/flip_op.cu +++ b/paddle/fluid/operators/flip_op.cu @@ -56,7 +56,7 @@ class FlipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); auto cplace = platform::CPUPlace(); auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 3972c60e8347b..4aa8b65635e7a 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -150,7 +150,7 @@ class FusedDropoutHelper { LaunchResidualDropoutBiasGrad( d_out, mask, dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); - auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto cuda_place = ctx.GetPlace(); memory::Copy(cuda_place, d_residual, cuda_place, d_out, rows_ * cols_ * sizeof(T), ctx.stream()); } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index b27b70dc9dc0c..c6205863103ff 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -182,7 +182,7 @@ void LaunchLayernormResidualDropoutBias( LayerNormParamType *var, const platform::CUDADeviceContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { - auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto cuda_place = ctx.GetPlace(); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index d984ad1a27768..2f5ec839fc2c7 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -141,7 +141,7 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { if (residual == dst) return; - auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto cuda_place = ctx.GetPlace(); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); if (!is_test) { diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc 
b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 1fa4225934d39..786f5b4e07798 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -15,12 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace platform { -struct CUDAPlace; -} // namespace platform -} // namespace paddle +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 700de8074ff8a..8386896027fa0 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -115,7 +115,7 @@ template void GPUGatherNd(const framework::ExecutionContext& context, const Tensor& input, const Tensor& index, Tensor* output) { const auto& ctx = context.template device_context(); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); auto cplace = platform::CPUPlace(); auto index_dims = index.dims(); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index ef0e000b25efd..e43ffdae903f5 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -69,8 +69,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel { int64_t size = tensor->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { @@ -106,8 +105,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { diff --git a/paddle/fluid/operators/gaussian_random_op_xpu.cc b/paddle/fluid/operators/gaussian_random_op_xpu.cc index 5d3ba84b05f5e..5a1ac46f615d2 100644 --- a/paddle/fluid/operators/gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/gaussian_random_op_xpu.cc @@ -41,9 +41,8 @@ class XPUGaussianRandomKernel : public framework::OpKernel { for (int64_t i = 0; i < size; ++i) { data_cpu[i] = dist(*engine); } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), data, - platform::CPUPlace(), reinterpret_cast(data_cpu.get()), - size * sizeof(T)); + memory::Copy(context.GetPlace(), data, platform::CPUPlace(), + reinterpret_cast(data_cpu.get()), size * sizeof(T)); } }; diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index edd7f8a7cf553..ce3c8ac51c76a 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -17,7 +17,7 @@ limitations under the License. 
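Two things happen in the hunks above: fusion_transpose_flatten_concat_op.cu.cc replaces its local place forward declaration with an include of the place header, and gaussian_random_op_xpu.cc keeps its host-staging strategy, sampling on the CPU and then handing the whole buffer plus the XPU place to memory::Copy. A host-only sketch of that staging pattern, with a std::vector standing in for the device allocation:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <memory>
#include <random>
#include <vector>

// Host-only sketch of the XPU gaussian_random path: sample into a CPU staging
// buffer, then copy the whole buffer to the device in one call. std::copy
// stands in for memory::Copy(xpu_place, data, CPUPlace(), staging, bytes).
int main() {
  const std::int64_t size = 4;
  std::mt19937_64 engine(42);
  std::normal_distribution<float> dist(0.0f, 1.0f);

  std::unique_ptr<float[]> data_cpu(new float[size]);
  for (std::int64_t i = 0; i < size; ++i) data_cpu[i] = dist(engine);

  std::vector<float> device_buffer(size);  // stand-in for the XPU allocation
  std::copy(data_cpu.get(), data_cpu.get() + size, device_buffer.begin());

  std::cout << device_buffer[0] << " ... " << device_buffer[size - 1] << "\n";
}
```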
*/ namespace paddle { namespace platform { class CUDADeviceContext; -struct CUDAPlace; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index d3edf72449537..51d912f451b92 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -132,8 +132,7 @@ struct GumbleNoiseGenerator { thrust::counting_iterator index_sequence_begin(0); // generate gumbel noise - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy()) { auto seed_offset = gen_cuda->IncrementOffset(1); diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 47b480c11c28f..2b8fdcb4d1067 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct CPUPlace; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index e727f6ceb56f7..16320aa26bd20 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -26,7 +26,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu index 77295414eb903..09710ba0c6957 100644 --- a/paddle/fluid/operators/isclose_op.cu +++ b/paddle/fluid/operators/isclose_op.cu @@ -25,8 +25,7 @@ struct GetTensorValue { const framework::Tensor& tensor) const { const T* data = tensor.data(); T value; - const auto gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + const auto gpu_place = dev_ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); return value; diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 1ac1c26796cf3..753b34484e411 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -28,7 +28,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index c676a3e57fff9..d3391fddd3026 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -34,7 +34,6 @@ class OverflowKernel; } // namespace operators namespace platform { class CPUDeviceContext; -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h index b2cf92f23e8cc..943f310f62df7 100644 --- a/paddle/fluid/operators/jit/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -23,8 +23,8 @@ namespace jit { struct KernelKey { struct Hash { size_t operator()(const KernelKey& key) const { - int place = key.place_.which(); // less than 2^8 - int type = static_cast(key.type_) << 8; // less than 2^(32-8) + int place = static_cast(key.place_.GetType()); // less than 2^8 + int type = static_cast(key.type_) << 8; 
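// [Editorial sketch, not part of the original patch] The old hash keyed the
// kernel place on the variant index (key.place_.which()); with the unified
// pten::Place the same low-8-bit slot is filled from the place's
// AllocationType enum instead. Assuming the enum values stay below 2^8, the
// combined key is roughly:
//   int place = static_cast<int>(key.place_.GetType());   // low 8 bits
//   int type  = static_cast<int>(key.type_) << 8;         // remaining bits
//   size_t h  = std::hash<int>()(place + type);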
// less than 2^(32-8) std::hash hasher; return hasher(place + type); } diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 231ff941278c7..27341fdc84349 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -26,7 +26,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 5aa546cbcc21a..89c84d9e14377 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -160,7 +160,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_num); - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto gpu_place = context.GetPlace(); // TODO(yuyang18): Strange code here. memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 317f9eeb94f39..44a6151f1b6ce 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -162,7 +162,7 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_num); - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto gpu_place = context.GetPlace(); if (ids->type() == framework::proto::VarType::INT32) { InputTypeCovert< diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index a71b900f14f8e..46f93abd22122 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -149,9 +149,8 @@ void BatchedOrmqr( // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( @@ -189,9 +188,8 @@ void BatchedOrmqr( // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 21839c263e4a8..da448fbd35a9f 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -413,7 +413,7 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, batchsize = std::max(static_cast(batchsize), 1); arange(dev_ctx, &rowtensor, dim, batchsize, H); auto idtptr = rowtensor.data(); - if (is_gpu_place(dev_ctx.GetPlace())) { + if (platform::is_gpu_place(dev_ctx.GetPlace())) { framework::TensorCopy(rowtensor, dev_ctx.GetPlace(), &rt_dev); idtptr = rt_dev.data(); } diff --git a/paddle/fluid/operators/masked_select_op_xpu.cc b/paddle/fluid/operators/masked_select_op_xpu.cc index dbf8793b5cb6f..8dbc5bcfc347a 100644 --- a/paddle/fluid/operators/masked_select_op_xpu.cc +++ 
b/paddle/fluid/operators/masked_select_op_xpu.cc @@ -47,8 +47,7 @@ class MaskedSelectXPUKernel : public framework::OpKernel { mask->numel()), "nonzero_count "); memory::Copy(platform::CPUPlace(), static_cast(&out_size_cpu), - BOOST_GET_CONST(platform::XPUPlace, mask->place()), - static_cast(out_size), sizeof(int32_t)); + mask->place(), static_cast(out_size), sizeof(int32_t)); framework::DDim out_dim{out_size_cpu}; out->Resize(out_dim); diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 347d9e6c2b9a8..45effd404cfb3 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -58,7 +58,7 @@ class ConcatFunctor { out_cols += t_cols; input_cols[i] = t_cols; } - auto cpu_place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto cpu_place = context.GetPlace(); // computation auto output_data = output->data(); @@ -109,7 +109,7 @@ class SplitFunctor { input_cols += t_cols; output_cols[i] = t_cols; } - auto cpu_place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto cpu_place = context.GetPlace(); // computation for (int k = 0; k < input_rows; ++k) { @@ -140,8 +140,7 @@ class ConcatFunctor { void operator()(const platform::XPUDeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - int dev_id = - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId(); + int dev_id = context.GetPlace().GetDeviceId(); platform::XPUDeviceGuard guard(dev_id); int num = input.size(); @@ -179,8 +178,7 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, const int axis, std::vector* outputs) { - int dev_id = - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId(); + int dev_id = context.GetPlace().GetDeviceId(); platform::XPUDeviceGuard guard(dev_id); auto& ins = ref_inputs; @@ -225,8 +223,7 @@ class ConcatFunctor { void operator()(const platform::NPUDeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - int dev_id = - BOOST_GET_CONST(platform::NPUPlace, context.GetPlace()).GetDeviceId(); + int dev_id = context.GetPlace().GetDeviceId(); platform::NPUDeviceGuard guard(dev_id); std::vector names; @@ -270,7 +267,7 @@ class SplitFunctor { input_cols += t_cols; output_cols[i] = t_cols; } - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, context.GetPlace()); + auto npu_place = context.GetPlace(); // computation for (int k = 0; k < input_rows; ++k) { diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 6892f7ce4e503..5b99a62d78d2a 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -289,9 +289,9 @@ class ConcatFunctor { tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*)); auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph(inputs_data, in_num); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_data->ptr(), platform::CPUPlace(), restored, - in_num * sizeof(T*), context.stream()); + memory::Copy(context.GetPlace(), tmp_dev_ins_data->ptr(), + platform::CPUPlace(), restored, in_num * sizeof(T*), + context.stream()); dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); } @@ -318,8 +318,8 @@ class ConcatFunctor { auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( inputs_col, inputs_col_num); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, 
context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, + memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(), + platform::CPUPlace(), restored, inputs_col_num * sizeof(int64_t), context.stream()); int64_t* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); @@ -420,9 +420,9 @@ class SplitFunctor { tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*)); auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph(outputs_data, o_num); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_outs_data->ptr(), platform::CPUPlace(), restored, - o_num * sizeof(T*), context.stream()); + memory::Copy(context.GetPlace(), tmp_dev_outs_data->ptr(), + platform::CPUPlace(), restored, o_num * sizeof(T*), + context.stream()); dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); } @@ -448,8 +448,8 @@ class SplitFunctor { memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( outputs_cols, outputs_cols_num); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, + memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(), + platform::CPUPlace(), restored, outputs_cols_num * sizeof(int64_t), context.stream()); int64_t* dev_outs_col_data = reinterpret_cast(tmp_dev_ins_col_data->ptr()); diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index b24f5d40e8dca..2cfff0ae88ff6 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -211,8 +211,7 @@ struct MatrixEighFunctor { info_ptr); } int error_info = 0; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info_ptr, sizeof(int), dev_ctx.stream()); CheckEighResult(i, error_info); } diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index ec21524b0b880..6ca3abe0f05a5 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -220,7 +220,8 @@ void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value) { TensorSetConstantWithPlace func(context, tensor, value); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - tensor->place().apply_visitor(func); + // tensor->place().apply_visitor(func); + paddle::platform::VisitPlace(tensor->place(), func); #else func(platform::CPUPlace()); #endif diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 378f0426ddfb7..6e2547145cfed 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -98,8 +98,7 @@ struct TransposeNormal { auto* out_ptr = out->data(); // copy in_stride, out_stride, axis to gpu device - const platform::CUDAPlace& cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + const platform::CUDAPlace& cuda_place = context.GetPlace(); platform::CPUPlace cpu_place = platform::CPUPlace(); size_t size = 3 * rank * sizeof(int64_t); auto cpu_buf_holder = memory::Alloc(cpu_place, size); diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 4c0eb592e8c17..40293ad725b93 100644 --- 
a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -100,9 +100,8 @@ struct TensorSetConstantXPU { int numel = tensor_->numel(); std::unique_ptr data_cpu(new T[numel]); std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast(value_)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), begin, - platform::CPUPlace(), static_cast(data_cpu.get()), - numel * sizeof(T)); + memory::Copy(place_, begin, platform::CPUPlace(), + static_cast(data_cpu.get()), numel * sizeof(T)); } framework::Tensor* tensor_; U value_; diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc index 7d03f9590357e..0b6a097d09d15 100644 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc @@ -45,10 +45,9 @@ class MatrixInverseFunctor { // Copy all elements of input matrix A to a temporary memory space to // avoid being overriden by getrf. tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(boost::get(context.GetPlace()), - tmp_gpu_mat_data->ptr(), - boost::get(context.GetPlace()), - a.data(), a.numel() * sizeof(T), context.stream()); + memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), + context.GetPlace(), a.data(), a.numel() * sizeof(T), + context.stream()); gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); } @@ -61,9 +60,8 @@ class MatrixInverseFunctor { // Copy the addresses of A and A_inv from host to device. memory::allocation::AllocationPtr tmp_gpu_ptrs_data = memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(boost::get(context.GetPlace()), - tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(), - static_cast(cpu_ptrs.data()), + memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), + platform::CPUPlace(), static_cast(cpu_ptrs.data()), cpu_ptrs.size() * sizeof(T*), context.stream()); T** gpu_inv_ptrs = reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; @@ -102,8 +100,7 @@ class MatrixInverseFunctor { reinterpret_cast(tmp_gpu_ptrs_data->ptr()), gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); } - memory::Copy(platform::CPUPlace(), info.data(), - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), gpu_info_ptr, sizeof(int) * batch_size, context.stream()); for (int i = 0; i < batch_size; ++i) { PADDLE_ENFORCE_EQ(info[i], 0, diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index 4e5601248c1a2..f0b41f98dc0cd 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -92,9 +92,8 @@ class MatrixSolveFunctor { // Copy the addresses of A and tmp_b from host to device. 
memory::allocation::AllocationPtr tmp_gpu_ptrs_data = memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(), - static_cast(cpu_ptrs.data()), + memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), + platform::CPUPlace(), static_cast(cpu_ptrs.data()), cpu_ptrs.size() * sizeof(T*), context.stream()); T** gpu_tmp_b_ptrs = @@ -122,8 +121,7 @@ class MatrixSolveFunctor { gpu_pivot_ptr, gpu_info_ptr, batch_size); // check whether BatchedGETRF is executed successfully or not - memory::Copy(platform::CPUPlace(), info.data(), - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), gpu_info_ptr, sizeof(int) * batch_size, context.stream()); for (int i = 0; i < batch_size; ++i) { PADDLE_ENFORCE_EQ(info[i], 0, @@ -207,9 +205,8 @@ class TriangularSolveFunctor { // Copy the addresses of A and tmp_b from host to device. memory::allocation::AllocationPtr tmp_gpu_ptrs_data = memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(), - static_cast(cpu_ptrs.data()), + memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), + platform::CPUPlace(), static_cast(cpu_ptrs.data()), cpu_ptrs.size() * sizeof(T*), context.stream()); const T** gpu_a_ptrs = diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 37dafa5c4908f..67176f26b079f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -79,14 +79,11 @@ struct SelectedRowsAdd { auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, out_place), out_data, - BOOST_GET_CONST(platform::CPUPlace, in1_place), in1_data, + memory::Copy(out_place, out_data, in1_place, in1_data, in1_value.numel() * sizeof(T)); auto* in2_data = in2_value.data(); - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, out_place), - out_data + in1_value.numel(), - BOOST_GET_CONST(platform::CPUPlace, in2_place), in2_data, + memory::Copy(out_place, out_data + in1_value.numel(), in2_place, in2_data, in2_value.numel() * sizeof(T)); } }; @@ -188,9 +185,7 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, in2_place), - in2_data + input2_offset, - BOOST_GET_CONST(platform::CPUPlace, in1_place), in1_data, + memory::Copy(in2_place, in2_data + input2_offset, in1_place, in1_data, in1_value.numel() * sizeof(T)); } }; @@ -455,9 +450,7 @@ struct MergeAdd { for (auto* in : inputs) { auto* in_data = in->value().data(); auto in_numel = in->rows().size() * input_width; - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, out_place), - out_data + copied_numel, - BOOST_GET_CONST(platform::CPUPlace, in_place), in_data, + memory::Copy(out_place, out_data + copied_numel, in_place, in_data, in_numel * sizeof(T)); copied_numel += in_numel; } diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 0e04c37ed2b12..654a5653cbed1 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -82,14 +82,11 @@ struct SelectedRowsAdd { platform::errors::InvalidArgument( "The 
running enviroment is not on the GPU place.")); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data, - BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, + memory::Copy(out_place, out_data, in1_place, in1_data, in1_value.numel() * sizeof(T), context.stream()); auto* in2_data = in2_value.data(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), - out_data + in1_value.numel(), - BOOST_GET_CONST(platform::CUDAPlace, in2_place), in2_data, + memory::Copy(out_place, out_data + in1_value.numel(), in2_place, in2_data, in2_value.numel() * sizeof(T), context.stream()); } }; @@ -218,9 +215,7 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, in2_place), - in2_data + input2_offset, - BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, + memory::Copy(in2_place, in2_data + input2_offset, in1_place, in1_data, in1_value.numel() * sizeof(T), context.stream()); } }; diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index 97ab2c5f52ac2..af5df27207ace 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -90,7 +90,7 @@ class Tree2ColFunctor { framework::Tensor *patch, int max_depth) { std::vector> tr; auto feature_dims = node_features.dims(); - auto cpu_place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto cpu_place = context.GetPlace(); math::SetConstant constant; int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); @@ -143,7 +143,7 @@ class Col2TreeFunctor { int max_depth) { std::vector> tr; auto output_dims = out_grad.dims(); - auto cpu_place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto cpu_place = context.GetPlace(); math::SetConstant constant; int64_t output_size = output_dims[1]; size_t grad_elem_size = 3 * static_cast(output_size); diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index d9b787b6df33d..4f3ab31916558 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -52,7 +52,7 @@ class Tree2ColFunctor { const framework::Tensor& node_features, framework::Tensor* patch, int max_depth) { std::vector> tr; - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto gpu_place = context.GetPlace(); auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto feature_dims = node_features.dims(); @@ -124,7 +124,7 @@ class Col2TreeFunctor { const framework::Tensor& patch_grad, framework::Tensor* embedding_grad, int max_depth) { std::vector> tr; - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto gpu_place = context.GetPlace(); auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto output_dims = patch_grad.dims(); diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index 757c780b4ea53..87c8abc1c432e 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -178,8 +178,7 @@ void MatrixRankGPUKernel::GesvdjBatched( U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), 
dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, @@ -220,8 +219,7 @@ void MatrixRankGPUKernel::GesvdjBatched( info, gesvdj_params)); // check the error info int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, @@ -259,8 +257,7 @@ void MatrixRankGPUKernel::SyevjBatched( lwork, info, params)); int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, @@ -297,8 +294,7 @@ void MatrixRankGPUKernel::SyevjBatched( handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, lwork, info, params)); int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index c48fc79326fa6..5a0afb68d63f1 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -53,7 +53,7 @@ class MeanCUDAKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); if (rank == 0) { // scalar - auto gpu_place = BOOST_GET(platform::CUDAPlace, place); + auto gpu_place = place; memory::Copy(gpu_place, out_data, gpu_place, in_data, numel * sizeof(T), stream); return; diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc index 1521265e1b3a9..53bc658af61b2 100644 --- a/paddle/fluid/operators/mean_op_xpu.cc +++ b/paddle/fluid/operators/mean_op_xpu.cc @@ -64,9 +64,7 @@ class MeanGradXPUKernel : public framework::OpKernel { const T* dy = OG->data(); T dy0_value; xpu_wait(dev_ctx.x_context()->xpu_stream); - memory::Copy(platform::CPUPlace(), &dy0_value, - BOOST_GET_CONST(platform::XPUPlace, OG->place()), dy, - sizeof(T)); + memory::Copy(platform::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T)); float dy0_fp32 = static_cast(dy0_value); dy0_fp32 = dy0_fp32 / static_cast(IG->numel()); diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 1eb8d09c783b0..1aaa4c2367938 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -24,8 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 0e27ec0dc75b7..4e0f353a7a36c 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -24,8 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index 56eee13cb060a..d1eeff0b0572c 100644 --- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -27,8 +27,6 @@ namespace imperative { class OpBase; } // 
namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 7031b96a50b9e..de71312d78df9 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -75,11 +75,9 @@ class AccuracyXPUKernel : public framework::OpKernel { int64_t* label_int64_host = reinterpret_cast(std::malloc(label_int64_size)); dev_ctx.Wait(); - memory::Copy(platform::CPUPlace(), indices_int64_host, - BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), indices_int64_host, ctx.GetPlace(), indices_data, indices_int64_size); - memory::Copy(platform::CPUPlace(), label_int64_host, - BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), label_int64_host, ctx.GetPlace(), label_data, label_int64_size); for (size_t i = 0; i < num_samples; ++i) { label_int32_host[i] = label_int64_host[i]; @@ -88,12 +86,10 @@ class AccuracyXPUKernel : public framework::OpKernel { indices_int64_host[i * class_dim + j]; } } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - indices_int32_device, platform::CPUPlace(), indices_int32_host, - indices_int32_size); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - label_int32_device, platform::CPUPlace(), label_int32_host, - label_int32_size); + memory::Copy(ctx.GetPlace(), indices_int32_device, platform::CPUPlace(), + indices_int32_host, indices_int32_size); + memory::Copy(ctx.GetPlace(), label_int32_device, platform::CPUPlace(), + label_int32_host, label_int32_size); int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, label_int32_device, num_samples, class_dim, correct_data, total_data, accuracy_data); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 505e322310caf..5a212bcacae50 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -44,8 +44,7 @@ class MultiplexGPUKernel : public framework::OpKernel { TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + platform::CUDAPlace place = ctx.GetPlace(); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( @@ -89,8 +88,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + platform::CUDAPlace place = ctx.GetPlace(); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h index c0f24a2034a15..1d0a009edeedc 100644 --- a/paddle/fluid/operators/multiplex_op.h +++ b/paddle/fluid/operators/multiplex_op.h @@ -42,8 +42,7 @@ class MultiplexCPUKernel : public framework::OpKernel { auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto index = ids->data(); - platform::CPUPlace place = - BOOST_GET_CONST(platform::CPUPlace, ctx.GetPlace()); + platform::CPUPlace place = ctx.GetPlace(); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; 
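// [Editorial note, not part of the original patch] The place assignments in
// these multiplex kernels keep concrete types on the left-hand side
// (platform::CUDAPlace above, platform::CPUPlace here), so they rely on the
// value returned by ctx.GetPlace() -- now a pten::Place -- converting to
// those classes. A rough picture of what is assumed to be available:
//   pten::Place p = ctx.GetPlace();
//   platform::CPUPlace cpu_place = p;    // assumed Place -> CPUPlace conversion
//   platform::CUDAPlace gpu_place = p;   // assumed conversion keeping the device id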
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( @@ -83,8 +82,7 @@ class MultiplexGradCPUKernel : public framework::OpKernel { auto rows = d_ins[idx]->dims()[0]; auto cols = d_ins[idx]->numel() / rows; auto* index = ids->data(); - platform::CPUPlace place = - BOOST_GET_CONST(platform::CPUPlace, ctx.GetPlace()); + platform::CPUPlace place = ctx.GetPlace(); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index f319ce159f6dd..1e4bf925cc2f8 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -68,8 +68,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto reduction_op_ = str_to_nccl_red_type(reduction); // device id - int gpu_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int gpu_id = ctx.GetPlace().GetDeviceId(); int idx = comm->GetCommId(gpu_id); VLOG(3) << "gpu : " << " invoke allreduce. send " << x->numel() << " recv " @@ -100,8 +99,7 @@ class NCCLReduceKernel : public framework::OpKernel { auto reduction_op_ = str_to_nccl_red_type(reduction); // device id - int gpu_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int gpu_id = ctx.GetPlace().GetDeviceId(); int idx = comm->GetCommId(gpu_id); T* recvbuffer = nullptr; if (root == gpu_id) { @@ -130,8 +128,7 @@ class NCCLBcastKernel : public framework::OpKernel { int root = ctx.Attr("root"); auto* comm = ctx.Input("Communicator"); // device id - int gpu_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int gpu_id = ctx.GetPlace().GetDeviceId(); int idx = comm->GetCommId(gpu_id); if (idx == root) { auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index 779a45daf7a78..322e84ae8b9c2 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -118,8 +118,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { in_data.emplace_back(in_vars[i]->data()); auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_in_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), platform::CPUPlace(), reinterpret_cast(in_data.data()), in_data.size() * sizeof(T *), dev_ctx.stream()); @@ -188,8 +187,7 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { } auto tmp_out_array = memory::Alloc(dev_ctx, out_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_out_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), platform::CPUPlace(), reinterpret_cast(out_data.data()), out_data.size() * sizeof(T *), dev_ctx.stream()); diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index c401a222c3aa2..63d140d6769b8 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -115,8 +115,8 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { if (!in_data.empty()) { auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_in_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), + 
platform::CPUPlace(), reinterpret_cast(in_data.data()), in_data.size() * sizeof(T *), dev_ctx.stream()); @@ -191,8 +191,8 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { auto tmp_out_array = memory::Alloc(dev_ctx, out_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_out_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), + platform::CPUPlace(), reinterpret_cast(out_data.data()), out_data.size() * sizeof(T *), dev_ctx.stream()); diff --git a/paddle/fluid/operators/poisson_op.cu b/paddle/fluid/operators/poisson_op.cu index 3f18eb994e145..ef2f6d4665554 100644 --- a/paddle/fluid/operators/poisson_op.cu +++ b/paddle/fluid/operators/poisson_op.cu @@ -61,8 +61,7 @@ class PoissonKernel const T* x_data = x->data(); T* out_data = out->mutable_data(ctx.GetPlace()); auto size = x->numel(); - int64_t device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int64_t device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); auto seed_offset = gen_cuda->IncrementOffset(20); diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index a21f565dae71d..71aaf08c5256a 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -246,7 +246,7 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel { int bytes = rois_batch_id_list.numel() * sizeof(int); auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); memory::Copy(gplace, roi_id_data, cplace, rois_batch_id_data, bytes, dev_ctx.stream()); @@ -322,7 +322,7 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { int bytes = rois_batch_id_list.numel() * sizeof(int); auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); memory::Copy(gplace, roi_id_data, cplace, rois_batch_id_data, bytes, dev_ctx.stream()); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index dc4bc36d34f22..3a361360e2ed7 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -97,8 +97,7 @@ void InitTensorsOnClient(framework::Scope* scope, int64_t rows_numel, float* temp_ptr = temp_vec.data(); memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, place), - reinterpret_cast(micro_id_ptr), platform::CPUPlace(), + place, reinterpret_cast(micro_id_ptr), platform::CPUPlace(), reinterpret_cast(temp_ptr), micro_id_var->numel() * framework::SizeOfType(micro_id_var->type()), stream); @@ -109,8 +108,7 @@ void InitTensorsOnClient(framework::Scope* scope, int64_t rows_numel, std::vector x_vec; for (int64_t i = 0; i < rows_numel; ++i) x_vec.push_back(1.0); float* x_vec_ptr = x_vec.data(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), - reinterpret_cast(x_ptr), platform::CPUPlace(), + memory::Copy(place, reinterpret_cast(x_ptr), platform::CPUPlace(), reinterpret_cast(x_vec_ptr), x_var->numel() * framework::SizeOfType(x_var->type()), stream); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index 
5a0d1a700417c..efdcc59a5c49e 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -203,8 +203,7 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel { "input(X) is %d and %d respectively.", rois_batch_size, batch_size)); std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), rois_num_data, sizeof(int) * rois_batch_size, 0); int rois_num_count = 0; for (int i = 0; i < rois_batch_size; ++i) { @@ -295,8 +294,7 @@ class GPUPSROIPoolGradOpKernel : public framework::OpKernel { auto* rois_num_t = ctx.Input("RoisNum"); rois_batch_size = rois_num_t->numel(); std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), rois_num_t->data(), sizeof(int) * rois_batch_size, 0); int start = 0; for (int n = 0; n < rois_batch_size; ++n) { diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index af5ebdc53126a..9eddb03828b5d 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -122,12 +122,9 @@ class QrGPUKernel : public framework::OpKernel { auto new_qr_data = new_qr.mutable_data(context.GetPlace()); auto new_qr_stride = m * m; for (int i = 0; i < batch_size; ++i) { - memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - (new_qr_data + i * new_qr_stride), - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - (qr_data + i * qr_stride), qr_stride * sizeof(math::Real), - dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), + dev_ctx.GetPlace(), (qr_data + i * qr_stride), + qr_stride * sizeof(math::Real), dev_ctx.stream()); } BatchedOrgqr( dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, tau_data, @@ -171,9 +168,8 @@ void BatchedGeqrf( // Do we need synchronized here? // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( @@ -205,9 +201,8 @@ void BatchedGeqrf( // Do we need synchronized here? // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( @@ -239,9 +234,8 @@ void BatchedOrgqr( // Do we need synchronized here? // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( @@ -273,9 +267,8 @@ void BatchedOrgqr( // Do we need synchronized here? 
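// [Editorial sketch, not part of the original patch] The device-to-host
// status checks in these cusolver wrappers all take the same shape after the
// change: the device-side place is handed to memory::Copy as-is instead of
// being unwrapped with BOOST_GET_CONST first. Schematically:
//   int info_h = 0;
//   memory::Copy(platform::CPUPlace(), &info_h,   // dst: host
//                dev_ctx.GetPlace(), info_d,      // src: unified device place
//                sizeof(int), dev_ctx.stream());
//   PADDLE_ENFORCE_EQ(info_h, 0, ...);            // zero means success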
// check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/range_op_xpu.cc b/paddle/fluid/operators/range_op_xpu.cc index 1d4de77978180..2f142a626c5f2 100644 --- a/paddle/fluid/operators/range_op_xpu.cc +++ b/paddle/fluid/operators/range_op_xpu.cc @@ -50,9 +50,8 @@ class XPURangeKernel : public framework::OpKernel { out_cpu_data_ptr[i] = value; value += step; } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), - static_cast(out_data), platform::CPUPlace(), - static_cast(out_cpu_data_ptr), + memory::Copy(context.GetPlace(), static_cast(out_data), + platform::CPUPlace(), static_cast(out_cpu_data_ptr), out->numel() * sizeof(T)); } }; diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 01f5b4c732712..a0eca3d9b09d9 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -26,7 +26,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 3c0c8ad1cafce..4aad78f1c49cf 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -41,7 +41,7 @@ BufferedReader::BufferedReader( VLOG(1) << "BufferedReader"; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_) && !pin_memory) { - int dev_idx = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + int dev_idx = place_.device; compute_stream_ = ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() .Get(place_))) @@ -56,7 +56,7 @@ BufferedReader::BufferedReader( #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(place_)) { - int dev_idx = BOOST_GET_CONST(platform::NPUPlace, place_).device; + int dev_idx = place_.device; compute_stream_ = ((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance() .Get(place_))) @@ -119,8 +119,7 @@ void BufferedReader::ReadAsync(size_t i) { // cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cuda lib into device, it will cost hundreds of MB of GPU memory. // If we don't set Device here, which will use CUDAPlace(0) default. 
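// [Editorial note, not part of the original patch] Device ids are now read
// directly from the unified place, via the .device member or GetDeviceId(),
// without first extracting a concrete CUDAPlace/NPUPlace. The reader setup
// below therefore reduces to roughly:
//   platform::SetDeviceId(place_.device);       // CUDA branch
//   platform::SetNPUDeviceId(place_.device);    // NPU branch
// The sketch assumes, based only on the usage in this patch, that pten::Place
// exposes the device index alongside its AllocationType.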
- platform::SetDeviceId( - BOOST_GET_CONST(platform::CUDAPlace, place_).device); + platform::SetDeviceId(place_.device); for (size_t i = 0; i < cpu.size(); ++i) { if (platform::is_cpu_place(cpu[i].place())) { cuda[i].Resize(cpu[i].dims()); @@ -130,8 +129,7 @@ void BufferedReader::ReadAsync(size_t i) { auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); - memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], - BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), + memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], cpu[i].place(), cpu[i].data(), size); cuda[i].set_lod(cpu[i].lod()); @@ -158,8 +156,7 @@ void BufferedReader::ReadAsync(size_t i) { // NOTE(zjl): cudaStreamWaitEvent() must be called after all // cuda[i].mutable_data() is called, since some ops release // cuda memory immediately without waiting cuda kernel ends - platform::SetDeviceId( - BOOST_GET_CONST(platform::CUDAPlace, place_).device); + platform::SetDeviceId(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_[i].get(), compute_stream_)); @@ -180,25 +177,21 @@ void BufferedReader::ReadAsync(size_t i) { auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); if (platform::is_cuda_pinned_place(cpu_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, cpu_place), - cpu_ptr, size, stream_.get()); + memory::Copy(place_, gpu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); } else if ((platform::is_gpu_place(cpu_place))) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, - BOOST_GET_CONST(platform::CUDAPlace, cpu_place), - cpu_ptr, size, stream_.get()); + memory::Copy(place_, gpu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); } else { platform::CUDAPinnedPlace cuda_pinned_place; framework::LoDTensor cuda_pinned_tensor; cuda_pinned_tensor.Resize(cpu[i].dims()); auto cuda_pinned_ptr = cuda_pinned_tensor.mutable_data( cuda_pinned_place, cpu[i].type()); - memory::Copy(cuda_pinned_place, cuda_pinned_ptr, - BOOST_GET_CONST(platform::CPUPlace, cpu_place), - cpu_ptr, size); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, - cuda_pinned_place, cuda_pinned_ptr, size, - stream_.get()); + memory::Copy(cuda_pinned_place, cuda_pinned_ptr, cpu_place, cpu_ptr, + size); + memory::Copy(place_, gpu_ptr, cuda_pinned_place, cuda_pinned_ptr, + size, stream_.get()); platform::GpuStreamSync(stream_.get()); } @@ -231,8 +224,7 @@ void BufferedReader::ReadAsync(size_t i) { npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type())); } - platform::SetNPUDeviceId( - BOOST_GET_CONST(platform::NPUPlace, place_).device); + platform::SetNPUDeviceId(place_.device); platform::NPUEventRecord(events_[i].get(), compute_stream_); platform::NPUStreamWaitEvent(stream_.get(), events_[i].get()); @@ -244,13 +236,11 @@ void BufferedReader::ReadAsync(size_t i) { auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); if ((platform::is_npu_place(cpu_place))) { - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr, - BOOST_GET_CONST(platform::NPUPlace, cpu_place), cpu_ptr, - size, stream_.get()); + memory::Copy(place_, npu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); } else { - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr, - BOOST_GET_CONST(platform::CPUPlace, cpu_place), cpu_ptr, - size, stream_.get()); + memory::Copy(place_, npu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); platform::NPUStreamSync(stream_.get()); } 
npu[i].set_lod(cpu[i].lod()); diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 44db3f3a33563..d4a68260a6b98 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -47,12 +47,12 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { platform::Place place; if (place_str == "AUTO") { place = dev_place; - } else if (place_str == "CPUPLACE") { + } else if (place_str == "PLACE(CPU)") { place = platform::CPUPlace(); } else { place_str = place_str.substr(0, place_str.length() - 1); std::istringstream sin(place_str); - sin.seekg(std::string("CUDAPLACE(").size(), std::ios::beg); + sin.seekg(std::string("PLACE(GPU:").size(), std::ios::beg); size_t num; sin >> num; place = platform::CUDAPlace(static_cast(num)); @@ -79,7 +79,7 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { std::unordered_set enum_range; constexpr size_t kMaxCUDADevs = 128; for (size_t i = 0; i < kMaxCUDADevs; ++i) { - enum_range.insert(string::Sprintf("CUDAPLACE(%d)", i)); + enum_range.insert(string::Sprintf("PLACE(GPU:%d)", i)); } enum_range.insert("CPUPLACE"); enum_range.insert("AUTO"); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index 2d7cce68e8171..0a5d54e72c845 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -25,7 +25,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 10095bc955047..955cf8d4448c1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -25,7 +25,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index f288fce753802..fa3800dd3c9e4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -25,7 +25,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index f27cd6b125b32..50df75d9ad3fd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -23,7 +23,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 9e4cc8e213c61..562a5719d74d9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -25,7 +25,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 7f61794fbb11b..9782ce28da4af 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -72,7 +72,7 @@ class ReduceSumGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto dims = context.Attr>("dim"); - if (context.GetPlace().type() == typeid(platform::CPUPlace) && + if (context.GetPlace().GetType() == platform::CPUPlace().GetType() && dims.size() == 1) { int in_dtype = context.Attr("in_dtype"); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 47b8da70adbac..9e343517e3fbf 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -31,8 +31,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index de4847ddc4590..80a0ef10fa150 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -256,10 +256,8 @@ void weight_to_tensor(const platform::Place &place, gpuStream_t stream, const T *in_data = weight_list[i]->data(); auto in_size = weight_list[i]->numel(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()), - weight_data + weight_offset, - BOOST_GET_CONST(platform::CUDAPlace, weight_list[i]->place()), - in_data, in_size * sizeof(T), stream); + memory::Copy(weight->place(), weight_data + weight_offset, + weight_list[i]->place(), in_data, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -276,10 +274,8 @@ void weight_to_tensor_list(const platform::Place &place, gpuStream_t stream, T *weight_grad_data = (*weight_grad)[i]->mutable_data(place); const T *src = weight_data + weight_offset; - memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, (*weight_grad)[i]->place()), - weight_grad_data, BOOST_GET_CONST(platform::CUDAPlace, weight->place()), - src, in_size * sizeof(T), stream); + memory::Copy((*weight_grad)[i]->place(), weight_grad_data, weight->place(), + src, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -295,10 +291,8 @@ void weight_list_to_tensor(const platform::Place &place, gpuStream_t stream, for (size_t i = 0; i < tensor_list.size(); ++i) { const T *in_data = tensor_list[i].data(); auto in_size = tensor_list[i].numel(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight_whole->place()), - weight_data + weight_offset, - BOOST_GET_CONST(platform::CUDAPlace, tensor_list[i].place()), - in_data, in_size * sizeof(T), stream); + memory::Copy(weight_whole->place(), weight_data + weight_offset, + tensor_list[i].place(), in_data, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -430,8 +424,7 @@ class RNNCudnnKernel : public framework::OpKernel { bool is_test = ctx.Attr("is_test"); int seed = ctx.Attr("seed"); if (!is_test) { - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed == 0) { // If perform `manual_seed` in python and inner seed is not specified diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 3b25676fb0c36..520023229fe1b 
100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -271,7 +271,7 @@ class GPUROIAlignOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto gplace = ctx.GetPlace(); if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); @@ -365,7 +365,7 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto gplace = ctx.GetPlace(); if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index f35cf06e5f704..7764e52c2f6da 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -48,7 +48,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.template device_context(); - auto xplace = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()); + auto xplace = ctx.GetPlace(); int rois_batch_size = 0; int* cpu_lod = nullptr; if (ctx.HasInput("RoisNum")) { @@ -157,7 +157,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); auto& dev_ctx = ctx.template device_context(); - auto xplace = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()); + auto xplace = ctx.GetPlace(); int rois_batch_size = 0; int* cpu_lod = nullptr; diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 0a4a076c6caae..16a8e2bf586a7 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -156,7 +156,7 @@ class GPUROIPoolOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto gplace = ctx.GetPlace(); if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); @@ -244,7 +244,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto gplace = ctx.GetPlace(); if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 04e4dc62b039b..c130dbb35a0da 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -261,7 +261,7 @@ class RunProgramOpKernel : public framework::OpKernel { VLOG(2) << "The number of sub scopes after forward: " << out_scope_vec->front()->kids().size(); #ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) DontClearMKLDNNCache(ctx.GetPlace()); + if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace()); #endif } }; diff --git 
a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index 6c7a0a8886ef0..98311ff404b47 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -221,7 +221,7 @@ void GPUScatterNdAdd(const framework::ExecutionContext& context, // put output_dims int CUDA // gplace and cplace const auto& ctx = context.template device_context(); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); auto cplace = platform::CPUPlace(); std::vector v_output_dims(output_dims_size); diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 4ca75bcf76e51..2154b08ae86fe 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -37,8 +37,7 @@ class GPUSeedKernel : public framework::OpKernel { out, static_cast(seed)); } else { auto *out_data = out->mutable_data(context.GetPlace()); - auto target_gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto target_gpu_place = context.GetPlace(); auto stream = context.cuda_device_context().stream(); memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, sizeof(int), stream); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h index 307bf4010f7ff..4f180a31ce518 100644 --- a/paddle/fluid/operators/segment_pool_op.h +++ b/paddle/fluid/operators/segment_pool_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/common/place.h" namespace paddle { namespace operators { @@ -48,7 +49,7 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { return; } - bool cpu_place = context.GetPlace().type() == typeid(platform::CPUPlace); + bool cpu_place = context.GetPlace().GetType() == pten::AllocationType::CPU; if (cpu_place) { auto dims = input->dims(); auto* segment_ids = segment->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index f63fa5be7f496..4c9faa1875df6 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -98,8 +98,7 @@ static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context, auto out_data = out->data(); auto x_data = x.data(); - const auto& gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + const auto& gpu_place = context.GetPlace(); int x_item_length = x.numel() / x.dims()[0]; int out_offset = 0; diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 41e6d2d40061e..ec3e04e71faf0 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -28,7 +28,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index ceba0dfddf0f5..96ac2c7a1bd08 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -142,8 +142,6 @@ class SplitOpKernel : public framework::OpKernel { } } - auto place = ctx.GetPlace(); - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { outs[j]->mutable_data(ctx.GetPlace()); diff --git 
a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc index b23c58f16925d..5e570572c35e1 100644 --- a/paddle/fluid/operators/split_op_npu.cc +++ b/paddle/fluid/operators/split_op_npu.cc @@ -45,7 +45,6 @@ class SplitNPUKernel : public framework::OpKernel { } std::vector outputs; - auto place = ctx.GetPlace(); for (size_t j = 0; j < outs.size(); ++j) { outs[j]->mutable_data(ctx.GetPlace()); outputs.push_back(*outs[j]); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 5b3f03445d352..2cebe0e320e7e 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -64,8 +64,7 @@ class StackGPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto tmp_x_data = memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_x_data->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), platform::CPUPlace(), reinterpret_cast(x_datas.data()), x_datas.size() * sizeof(T*), dev_ctx.stream()); @@ -169,8 +168,7 @@ class StackGradGPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto tmp_out_data = memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_out_data->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), platform::CPUPlace(), reinterpret_cast(outputs.data()), outputs.size() * sizeof(T*), dev_ctx.stream()); diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 159450aa178d1..c92d468f3462c 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -94,18 +94,18 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, for (int64_t i = 0; i < before; ++i) { if (platform::is_cpu_place(place)) { - auto& cpu_place = BOOST_GET_CONST(platform::CPUPlace, place); + auto& cpu_place = place; memory::Copy(cpu_place, dst + i * dst_after, cpu_place, src + i * src_after, sizeof(T) * size); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(ctx); memory::Copy(gpu_place, dst + i * dst_after, gpu_place, src + i * src_after, sizeof(T) * size, cuda_ctx.stream()); #elif defined(PADDLE_WITH_ASCEND_CL) - auto& npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + auto& npu_place = place; auto& npu_ctx = reinterpret_cast(ctx); memory::Copy(npu_place, dst + i * dst_after, npu_place, src + i * src_after, sizeof(T) * size, npu_ctx.stream()); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 6034cda50c32a..4288e9415aa86 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -196,8 +196,8 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { auto tmp_sr_in_out_array = memory::Alloc(dev_ctx, sr_in_out_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_sr_in_out_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_sr_in_out_array->ptr(), + platform::CPUPlace(), reinterpret_cast(sr_in_out_data.data()), sr_in_out_data.size() * sizeof(T *), dev_ctx.stream()); @@ -214,8 +214,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { if 
(!in_data.empty()) { auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_in_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), platform::CPUPlace(), reinterpret_cast(in_data.data()), in_data.size() * sizeof(T *), dev_ctx.stream()); diff --git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index 0a7ed093ad0b8..f17e92e47b731 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -108,8 +108,7 @@ void SvdGPUKernel::GesvdjBatched( info, gesvdj_params)); // check the error info int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, @@ -151,8 +150,7 @@ void SvdGPUKernel::GesvdjBatched( info, gesvdj_params)); // check the error info int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index a0cda54b31b4c..558f5f2a3128f 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -120,7 +120,7 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, : std::min(summarize_, print_tensor.numel()); const T* data = nullptr; framework::LoDTensor cpu_tensor; - if (is_cpu_place(print_tensor.place())) { + if (paddle::platform::is_cpu_place(print_tensor.place())) { data = print_tensor.data(); } else { platform::CPUPlace cpu_place; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 5ebf67587f3cb..9357eb4b2295a 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -388,8 +388,7 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( max_batch_size_, workspace_size_, precision_mode_, - calib_res->calib_.get(), - BOOST_GET_CONST(platform::CUDAPlace, dev_place).device)); + calib_res->calib_.get(), dev_place.device)); VLOG(3) << "start the calib trt engine thread"; PrepareTRTEngine(scope, calib_res->engine_.get()); })); @@ -567,8 +566,8 @@ class TensorRTEngineOp : public framework::OperatorBase { "than the number of bindings, but got binding " "index = %d, number of bindings = %d.", bind_index, num_bindings)); - buffers[bind_index] = static_cast(fluid_t->mutable_data( - BOOST_GET_CONST(platform::CUDAPlace, dev_place))); + buffers[bind_index] = + static_cast(fluid_t->mutable_data(dev_place)); output_index += 1; } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index aaed8e5b62584..5e530a5bb5248 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -100,8 +100,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - int device_id 
= - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index b2ff91a37451e..803b61fbe813f 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -48,9 +48,8 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { data_cpu[i] = truncated_normal(dist(*engine)); } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), data, - platform::CPUPlace(), reinterpret_cast(data_cpu.get()), - size * sizeof(T)); + memory::Copy(context.GetPlace(), data, platform::CPUPlace(), + reinterpret_cast(data_cpu.get()), size * sizeof(T)); } }; diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h index 2d968260f0a58..365dc9547a2d6 100644 --- a/paddle/fluid/operators/unbind_op.h +++ b/paddle/fluid/operators/unbind_op.h @@ -43,9 +43,6 @@ class UnbindOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); auto in_dims = in->dims(); - - auto place = ctx.GetPlace(); - axis = axis < 0 ? in_dims.size() + axis : axis; std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index bf82af865a1eb..a5231354eb47e 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -120,8 +120,7 @@ class GPUUniformRandomInplaceKernel : public framework::OpKernel { T diag_val = static_cast(ctx.Attr("diag_val")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { auto seed_offset = gen_cuda->IncrementOffset(1); diff --git a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc index 24b1459a09510..fe43bb4ec60ca 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc @@ -59,9 +59,8 @@ class XPUUniformRandomInplaceKernel : public framework::OpKernel { data_cpu[pos] = diag_val; } } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), data, - platform::CPUPlace(), reinterpret_cast(data_cpu.get()), - size * sizeof(T)); + memory::Copy(ctx.GetPlace(), data, platform::CPUPlace(), + reinterpret_cast(data_cpu.get()), size * sizeof(T)); } }; @@ -77,8 +76,7 @@ class XPUUniformRandomInplaceGradKernel : public framework::OpKernel { for (int64_t i = 0; i < size; ++i) { data_cpu[i] = T(0); } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), data, - platform::CPUPlace(), + memory::Copy(ctx.GetPlace(), data, platform::CPUPlace(), reinterpret_cast(data_cpu.get()), size * sizeof(T)); } } diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 97288b2b1fa7c..440c9b786b69c 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -141,8 +141,7 @@ class 
GPUUniformRandomKernel : public framework::OpKernel { T diag_val = static_cast(context.Attr("diag_val")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { auto seed_offset = gen_cuda->IncrementOffset(1); diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc index d8b82ad5f863e..fed0accd8a14c 100644 --- a/paddle/fluid/operators/uniform_random_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_op_xpu.cc @@ -91,9 +91,8 @@ class XPUUniformRandomKernel : public framework::OpKernel { } } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), data, - platform::CPUPlace(), reinterpret_cast(data_cpu.get()), - size * sizeof(T)); + memory::Copy(ctx.GetPlace(), data, platform::CPUPlace(), + reinterpret_cast(data_cpu.get()), size * sizeof(T)); } }; diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu index feb8e83864e84..50b856bfe9841 100644 --- a/paddle/fluid/operators/where_index_op.cu +++ b/paddle/fluid/operators/where_index_op.cu @@ -128,14 +128,12 @@ class CUDAWhereIndexKernel : public framework::OpKernel { for (int i = rank - 2; i >= 0; i--) { h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; } - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_stride_array, platform::CPUPlace(), h_stride_array, - rank * sizeof(int64_t), dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_stride_array, platform::CPUPlace(), + h_stride_array, rank * sizeof(int64_t), dev_ctx.stream()); // get total ture number and set output size // the last element of cub::InclusiveSum is the total number - memory::Copy(platform::CPUPlace(), h_total_true_num, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), h_total_true_num, dev_ctx.GetPlace(), d_true_num_array + numel - 1, sizeof(int64_t), dev_ctx.stream()); dev_ctx.Wait(); diff --git a/paddle/fluid/operators/where_index_op_xpu.cc b/paddle/fluid/operators/where_index_op_xpu.cc index 53ddefbbe0cab..d80a266846e95 100644 --- a/paddle/fluid/operators/where_index_op_xpu.cc +++ b/paddle/fluid/operators/where_index_op_xpu.cc @@ -44,8 +44,8 @@ class WhereIndexXPUKernel : public framework::OpKernel { ret, XPUAPIErrorMsg[ret])); memory::Copy(platform::CPUPlace(), static_cast(&true_num_cpu), - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), - static_cast(true_num), sizeof(int32_t)); + context.GetPlace(), static_cast(true_num), + sizeof(int32_t)); out->Resize( framework::make_ddim({static_cast(true_num_cpu), rank})); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 517b4a28a690f..9e0a0cb5f8d35 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -55,7 +55,7 @@ ELSE() cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() -cc_library(place SRCS place.cc DEPS enforce boost) +cc_library(place SRCS place.cc DEPS enforce boost pten_place) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(device) @@ -122,7 +122,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # memcpy depends on device_context, 
here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS}) cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 7d2ea57545d08..dd2dc9a40799e 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -32,9 +32,7 @@ class NCCLCommImpl : public NCCLComm { void set_rank(int rank) { rank_ = rank; } int rank() const override { return rank_; } - int device_id() const override { - return BOOST_GET_CONST(CUDAPlace, dev_ctx_->GetPlace()).device; - } + int device_id() const override { return dev_ctx_->GetPlace().device; } void set_comm(ncclComm_t comm) { comm_ = comm; } ncclComm_t comm() const override { return comm_; } @@ -246,9 +244,7 @@ class BKCLCommImpl : public BKCLComm { void set_rank(int rank) { rank_ = rank; } int rank() const override { return rank_; } - int device_id() const override { - return BOOST_GET_CONST(XPUPlace, dev_ctx_->GetPlace()).device; - } + int device_id() const override { return dev_ctx_->GetPlace().device; } void set_comm(BKCLContext_t comm) { comm_ = comm; } BKCLContext_t comm() const override { return comm_; } diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 2fdc462a693ec..62a07669259a4 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -115,7 +115,7 @@ class NCCLCommContext { // retrieve a communicator by the ring id and place NCCLComm* Get(int ring_id, Place place) const { - return Get(ring_id, BOOST_GET_CONST(CUDAPlace, place).device); + return Get(ring_id, place.device); } private: @@ -212,7 +212,7 @@ class HCCLCommContext { // retrieve a communicator by the ring id and place HCCLComm* Get(int ring_id, Place place) const { - return Get(ring_id, BOOST_GET_CONST(NPUPlace, place).device); + return Get(ring_id, place.device); } private: @@ -317,7 +317,7 @@ class BKCLCommContext { // retrieve a communicator by the ring id and place BKCLComm* Get(int ring_id, Place place) const { - return Get(ring_id, BOOST_GET_CONST(XPUPlace, place).device); + return Get(ring_id, place.device); } private: diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index f26116749077e..261916b2555be 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -89,9 +89,7 @@ struct NCCLContext { gpuStream_t stream() const { return ctx_->stream(); } ncclComm_t comm() const { return comm_; } - int device_id() const { - return BOOST_GET_CONST(platform::CUDAPlace, ctx_->GetPlace()).device; - } + int device_id() const { return ctx_->GetPlace().device; } }; struct NCCLContextMap { @@ -106,7 +104,7 @@ struct NCCLContextMap { "The NCCL place should not be empty.")); order_.reserve(places.size()); for (auto &p : places) { - int dev_id = BOOST_GET_CONST(CUDAPlace, p).device; + int dev_id = p.device; 
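// --- Illustrative sketch (not part of the patch) ---------------------------
// The hunks above can drop BOOST_GET_CONST because pten::Place is no longer a
// boost::variant: it is a single tagged struct, so `p.device` and
// `p.GetType()` are valid for every backend. Below is a minimal,
// self-contained model of that design; DemoAllocationType, DemoPlace and
// is_gpu are hypothetical stand-ins, not the real pten API.
#include <cstdint>
#include <iostream>

enum class DemoAllocationType : int8_t { UNDEFINED = 0, CPU, GPU, XPU, NPU };

struct DemoPlace {
  DemoAllocationType alloc_type{DemoAllocationType::UNDEFINED};
  int8_t device{0};  // shared field, valid for any backend -> no variant get

  DemoAllocationType GetType() const { return alloc_type; }
  int8_t GetDeviceId() const { return device; }
};

// Predicate in the style of platform::is_gpu_place after the migration:
// a plain enum comparison instead of boost::apply_visitor.
inline bool is_gpu(const DemoPlace& p) {
  return p.GetType() == DemoAllocationType::GPU;
}

int main() {
  DemoPlace gpu1{DemoAllocationType::GPU, 1};
  std::cout << std::boolalpha << is_gpu(gpu1) << " dev=" << int(gpu1.device)
            << "\n";  // prints: true dev=1
}
// ---------------------------------------------------------------------------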
order_.emplace_back(dev_id); contexts_.emplace(dev_id, NCCLContext(dev_id)); } @@ -155,12 +153,10 @@ struct NCCLContextMap { CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } CUDADeviceContext *DevCtx(platform::Place p) const { - return DevCtx(BOOST_GET_CONST(CUDAPlace, p).device); + return DevCtx(p.device); } - const NCCLContext &at(platform::Place p) const { - return this->at(BOOST_GET_CONST(CUDAPlace, p).device); - } + const NCCLContext &at(platform::Place p) const { return this->at(p.device); } const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } @@ -259,7 +255,7 @@ class NCCLCommunicator { for (int ring_id = 0; ring_id < nrings; ++ring_id) { for (size_t p = 0; p < places.size(); ++p) { int rank = trainer_id * places.size() + p; - int dev_id = BOOST_GET_CONST(CUDAPlace, places[p]).device; + int dev_id = places[p].device; auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); NCCLCommContext::Instance().AssignNCCLComm(ctx.comm_, nranks, rank, dev_id, ring_id); diff --git a/paddle/fluid/platform/device/mlu/device_context_allocator.h b/paddle/fluid/platform/device/mlu/device_context_allocator.h index 408016c0f0d99..2be960ef4ae41 100644 --- a/paddle/fluid/platform/device/mlu/device_context_allocator.h +++ b/paddle/fluid/platform/device/mlu/device_context_allocator.h @@ -128,8 +128,7 @@ class MLUDeviceContextAllocatorPool { } AllocationPtr Alloc(const platform::MLUDeviceContext &dev_ctx, size_t size) { - auto iter = allocators_.find( - BOOST_GET_CONST(platform::MLUPlace, dev_ctx.GetPlace())); + auto iter = allocators_.find(dev_ctx.GetPlace()); PADDLE_ENFORCE_NE( iter, allocators_.end(), platform::errors::NotFound("No allocator found for MLUPlace.")); diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index 69cea31446680..c2338fff02926 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -85,9 +85,7 @@ struct HCCLContext { aclrtStream stream() const { return ctx_->stream(); } HcclComm comm() const { return comm_; } - int device_id() const { - return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device; - } + int device_id() const { return ctx_->GetPlace().device; } }; struct HCCLContextMap { @@ -102,7 +100,7 @@ struct HCCLContextMap { "The HCCL place should not be empty.")); order_.reserve(places.size()); for (auto &p : places) { - int dev_id = BOOST_GET_CONST(NPUPlace, p).device; + int dev_id = p.device; order_.emplace_back(dev_id); contexts_.emplace(dev_id, HCCLContext(dev_id)); } @@ -151,13 +149,9 @@ struct HCCLContextMap { NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } - NPUDeviceContext *DevCtx(platform::Place p) const { - return DevCtx(BOOST_GET_CONST(NPUPlace, p).device); - } + NPUDeviceContext *DevCtx(platform::Place p) const { return DevCtx(p.device); } - const HCCLContext &at(platform::Place p) const { - return this->at(BOOST_GET_CONST(NPUPlace, p).device); - } + const HCCLContext &at(platform::Place p) const { return this->at(p.device); } const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } @@ -257,7 +251,7 @@ class HCCLCommunicator { for (int ring_id = 0; ring_id < nrings; ++ring_id) { for (size_t p = 0; p < places.size(); ++p) { int rank = trainer_id * places.size() + p; - int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device; + int dev_id = places[p].device; auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); 
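// --- Illustrative sketch (not part of the patch) ---------------------------
// A minimal model of the context-map lookups rewritten in the helpers above
// (NCCLContextMap::DevCtx / HCCLContextMap::at): with the unified place type
// the map key is simply `place.device`, so no per-backend BOOST_GET_CONST
// cast is needed before the lookup. DemoPlace, DemoCtx and CtxMap are
// hypothetical simplified types, not the Paddle classes.
#include <iostream>
#include <map>
#include <string>

struct DemoPlace { int device{0}; };
struct DemoCtx { std::string name; };

class CtxMap {
 public:
  explicit CtxMap(int n) {
    for (int i = 0; i < n; ++i)
      ctxs_.emplace(i, DemoCtx{"ctx" + std::to_string(i)});
  }
  // Before the migration this overload had to unwrap a CUDAPlace/NPUPlace/...
  // variant; now it forwards the shared `device` field directly.
  const DemoCtx& at(const DemoPlace& p) const { return at(p.device); }
  const DemoCtx& at(int dev_id) const { return ctxs_.at(dev_id); }

 private:
  std::map<int, DemoCtx> ctxs_;
};

int main() {
  CtxMap map(2);
  std::cout << map.at(DemoPlace{1}).name << "\n";  // prints: ctx1
}
// ---------------------------------------------------------------------------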
HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank, dev_id, ring_id); diff --git a/paddle/fluid/platform/device/npu/npu_collective_helper.cc b/paddle/fluid/platform/device/npu/npu_collective_helper.cc index 4d1f444411f71..cdec3519a23f3 100644 --- a/paddle/fluid/platform/device/npu/npu_collective_helper.cc +++ b/paddle/fluid/platform/device/npu/npu_collective_helper.cc @@ -31,9 +31,7 @@ class HCCLCommImpl : public HCCLComm { void set_rank(int rank) { rank_ = rank; } int rank() const override { return rank_; } - int device_id() const override { - return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device; - } + int device_id() const override { return dev_ctx_->GetPlace().device; } ~HCCLCommImpl() { PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_)); diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index c049da3b33566..39d2b9ffa9b1d 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -149,9 +149,8 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { npu_pinned_tensor.mutable_data({1}, npu_pinned_place); *npu_pinned_ptr = val; - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()), - tensor->data(), npu_pinned_place, npu_pinned_ptr, sizeof(T), - GetCurrentNPUStream()); + memory::Copy(tensor->place(), tensor->data(), npu_pinned_place, + npu_pinned_ptr, sizeof(T), GetCurrentNPUStream()); auto npu_pinned_allocator = static_cast( diff --git a/paddle/fluid/platform/device/npu/npu_stream.cc b/paddle/fluid/platform/device/npu/npu_stream.cc index e86b30f3244c0..0b15a0d937e82 100644 --- a/paddle/fluid/platform/device/npu/npu_stream.cc +++ b/paddle/fluid/platform/device/npu/npu_stream.cc @@ -24,7 +24,7 @@ bool NPUStream::Init(const Place& place) { platform::errors::InvalidArgument( "NPU stream must be created using npu place.")); place_ = place; - NPUDeviceGuard guard(BOOST_GET_CONST(NPUPlace, place_).device); + NPUDeviceGuard guard(place_.device); NPUStreamCreate(&stream_); callback_manager_.reset(new StreamCallbackManager(stream_)); VLOG(3) << "NPUStream Init stream: " << stream_; @@ -32,7 +32,7 @@ bool NPUStream::Init(const Place& place) { } void NPUStream::Destroy() { - NPUDeviceGuard guard(BOOST_GET_CONST(NPUPlace, place_).device); + NPUDeviceGuard guard(place_.device); Wait(); WaitCallback(); if (stream_) { diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h index d9ffbfe011f91..24fd8b5faa4e9 100644 --- a/paddle/fluid/platform/device/xpu/bkcl_helper.h +++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h @@ -58,9 +58,7 @@ struct BKCLContext { BKCLContext_t comm() const { return comm_; } - int device_id() const { - return BOOST_GET_CONST(platform::XPUPlace, ctx_->GetPlace()).device; - } + int device_id() const { return ctx_->GetPlace().device; } }; struct InitBKCLPara { @@ -104,7 +102,7 @@ struct BKCLContextMap { "The BKCL place should not be empty.")); order_.reserve(places_.size()); for (auto &p : places_) { - int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device; + int dev_id = p.device; order_.emplace_back(dev_id); contexts_.emplace(dev_id, BKCLContext(dev_id)); } @@ -165,13 +163,9 @@ struct BKCLContextMap { XPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } - XPUDeviceContext *DevCtx(platform::Place p) const { - return DevCtx(BOOST_GET_CONST(platform::XPUPlace, p).device); - } + XPUDeviceContext *DevCtx(platform::Place p) 
const { return DevCtx(p.device); } - const BKCLContext &at(platform::Place p) const { - return this->at(BOOST_GET_CONST(platform::XPUPlace, p).device); - } + const BKCLContext &at(platform::Place p) const { return this->at(p.device); } const BKCLContext &at(int dev_id) const { return contexts_.at(dev_id); } diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 018ba1bce163b..220bebb9e6b05 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -12,11 +12,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { -class XPUPlace; /***** Version Management *****/ //! Get the version of XPU Driver diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 7561830fc76c1..448559a9edfee 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -23,8 +23,7 @@ namespace platform { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { auto& ops = get_kl1_ops(); - auto v = - get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device); + auto v = get_xpu_version(type.place_.device); if (v == XPU2) { ops = get_kl2_ops(); } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index b2f444c30c248..effd67fa5c967 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -10,6 +10,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" +#include #include #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -140,7 +141,7 @@ inline void EmplaceDeviceContext( map_ptr->emplace(p, std::async(std::launch::deferred, [=] { // lazy evaluation. i.e., only create device context at // first `Get` - return PtrType(new DevCtx(BOOST_GET_CONST(PlaceType, p))); + return PtrType(new DevCtx(p)); })); } @@ -157,14 +158,19 @@ DeviceContextPool::DeviceContextPool( } for (auto& p : set) { if (platform::is_cpu_place(p)) { + platform::CPUPlace place; #ifdef PADDLE_WITH_MKLDNN - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext(&device_contexts_, + place); #else - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext(&device_contexts_, + place); #endif } else if (platform::is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - EmplaceDeviceContext(&device_contexts_, p); + platform::CUDAPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW( platform::errors::Unimplemented("CUDAPlace is not supported. Please " @@ -172,8 +178,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_cuda_pinned_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::CUDAPinnedPlace place; EmplaceDeviceContext( - &device_contexts_, p); + &device_contexts_, place); #else PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported. 
Please re-compile with WITH_GPU " @@ -181,7 +188,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_xpu_place(p)) { #ifdef PADDLE_WITH_XPU - EmplaceDeviceContext(&device_contexts_, p); + platform::XPUPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW( platform::errors::Unimplemented("XPUPlace is not supported. Please " @@ -189,7 +198,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_mlu_place(p)) { #ifdef PADDLE_WITH_MLU - EmplaceDeviceContext(&device_contexts_, p); + platform::MLUPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW( platform::errors::Unimplemented("MLUPlace is not supported. Please " @@ -197,7 +208,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_ipu_place(p)) { #ifdef PADDLE_WITH_IPU - EmplaceDeviceContext(&device_contexts_, p); + platform::IPUPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW( platform::errors::Unimplemented("IPUPlace is not supported. Please " @@ -205,7 +218,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_npu_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL - EmplaceDeviceContext(&device_contexts_, p); + platform::NPUPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported. Please " @@ -213,8 +228,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_npu_pinned_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL + platform::NPUPinnedPlace place; EmplaceDeviceContext( - &device_contexts_, p); + &device_contexts_, place); #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPinnedPlace is not supported. Please re-compile with " diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index bc842ef9c74de..0a6b3917fbc21 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -26,7 +26,7 @@ struct CUDADeviceEventWrapper { platform::errors::PreconditionNotMet( "Required device shall be CUDAPlace, but received %d. 
", place)); - device_id_ = BOOST_GET_CONST(platform::CUDAPlace, place).device; + device_id_ = place.device; PADDLE_ENFORCE_GT( device_id_, -1, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index ff11bfd62c138..73847ce24aa72 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -643,8 +643,7 @@ class DeviceTracerImpl : public DeviceTracer { event->set_place(proto::MemEvent::CPUPlace); } else if (platform::is_gpu_place(r.place)) { event->set_place(proto::MemEvent::CUDAPlace); - event->set_device_id( - BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId()); + event->set_device_id(r.place.GetDeviceId()); } else if (platform::is_cuda_pinned_place(r.place)) { event->set_place(proto::MemEvent::CUDAPinnedPlace); } else if (platform::is_npu_place(r.place)) { diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 6251a28823ac3..e73e3736f64b4 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -24,89 +24,62 @@ PADDLE_DEFINE_EXPORTED_bool( namespace paddle { namespace platform { -namespace detail { - -class PlacePrinter : public boost::static_visitor<> { - public: - explicit PlacePrinter(std::ostream &os) : os_(os) {} - void operator()(const CPUPlace &) { os_ << "CPUPlace"; } - void operator()(const CUDAPlace &p) { - os_ << "CUDAPlace(" << p.device << ")"; - } - void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } - void operator()(const MLUPlace &p) { os_ << "MLUPlace(" << p.device << ")"; } - void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; } - void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; } - void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; } - void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } - - private: - std::ostream &os_; -}; - -} // namespace detail - bool is_gpu_place(const Place &p) { - return boost::apply_visitor(IsCUDAPlace(), p); + return p.GetType() == pten::AllocationType::GPU; } bool is_xpu_place(const Place &p) { - return boost::apply_visitor(IsXPUPlace(), p); + return p.GetType() == pten::AllocationType::XPU; } bool is_mlu_place(const Place &p) { - return boost::apply_visitor(IsMLUPlace(), p); + return p.GetType() == pten::AllocationType::MLU; } bool is_npu_place(const Place &p) { - return boost::apply_visitor(IsNPUPlace(), p); + return p.GetType() == pten::AllocationType::NPU; } bool is_ipu_place(const Place &p) { - return boost::apply_visitor(IsIPUPlace(), p); + return p.GetType() == pten::AllocationType::IPU; } bool is_cpu_place(const Place &p) { - return boost::apply_visitor(IsCPUPlace(), p); + return p.GetType() == pten::AllocationType::CPU; } bool is_cuda_pinned_place(const Place &p) { - return boost::apply_visitor(IsCUDAPinnedPlace(), p); + return p.GetType() == pten::AllocationType::GPUPINNED; } bool is_npu_pinned_place(const Place &p) { - return boost::apply_visitor(IsNPUPinnedPlace(), p); + return p.GetType() == pten::AllocationType::NPUPINNED; } bool places_are_same_class(const Place &p1, const Place &p2) { - return p1.which() == p2.which(); + return p1.GetType() == p2.GetType(); } bool is_same_place(const Place &p1, const Place &p2) { if (places_are_same_class(p1, p2)) { - if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) { + if (is_cpu_place(p1) || is_cuda_pinned_place(p1) || + is_npu_pinned_place(p1)) { return true; } else if (is_xpu_place(p1)) { - return 
BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2); + return p1 == p2; } else if (is_mlu_place(p1)) { - return BOOST_GET_CONST(MLUPlace, p1) == BOOST_GET_CONST(MLUPlace, p2); + return p1 == p2; } else if (is_npu_place(p1)) { - return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2); + return p1 == p2; } else if (is_ipu_place(p1)) { - return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2); + return p1 == p2; } else { - return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2); + return p1 == p2; } } else { return false; } } -std::ostream &operator<<(std::ostream &os, const Place &p) { - detail::PlacePrinter printer(os); - boost::apply_visitor(printer, p); - return os; -} - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 886eb05813bd8..80bbeac251810 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -13,229 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include -#include -#include +// #include +// #include +// #include #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" +// #include "paddle/fluid/platform/variant.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/enforce_npu.h" #endif +#include "paddle/pten/common/place.h" namespace paddle { namespace platform { -struct CPUPlace { - // WORKAROUND: for some reason, omitting this constructor - // causes errors with boost 1.59 and OSX - CPUPlace() {} - - // needed for variant equality comparison - inline bool operator==(const CPUPlace &) const { return true; } - inline bool operator!=(const CPUPlace &) const { return false; } - inline bool operator<(const CPUPlace &) const { return false; } -}; - -struct CUDAPlace { - CUDAPlace() : CUDAPlace(0) {} - explicit CUDAPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const CUDAPlace &o) const { - return device == o.device; - } - inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } - inline bool operator<(const CUDAPlace &o) const { return device < o.device; } - - int device; -}; - -struct CUDAPinnedPlace { - CUDAPinnedPlace() {} - - // needed for variant equality comparison - inline bool operator==(const CUDAPinnedPlace &) const { return true; } - inline bool operator!=(const CUDAPinnedPlace &) const { return false; } - inline bool operator<(const CUDAPinnedPlace &) const { return false; } -}; - -// Place for Baidu Kunlun Accelerator -struct XPUPlace { - XPUPlace() : XPUPlace(0) {} - explicit XPUPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const XPUPlace &o) const { return device == o.device; } - inline bool operator!=(const XPUPlace &o) const { return !(*this == o); } - inline bool operator<(const XPUPlace &o) const { return device < o.device; } - - int device; -}; - -struct NPUPlace { - NPUPlace() : NPUPlace(0) {} - explicit NPUPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const NPUPlace &o) const { return device == o.device; } - inline bool operator!=(const NPUPlace &o) const { return !(*this == o); } - inline bool operator<(const NPUPlace &o) const { return device < 
o.device; } - - int device; -}; - -struct NPUPinnedPlace { - NPUPinnedPlace() {} - - inline bool operator==(const NPUPinnedPlace &) const { return true; } - inline bool operator!=(const NPUPinnedPlace &) const { return false; } - inline bool operator<(const NPUPinnedPlace &) const { return false; } -}; -struct IPUPlace { - IPUPlace() : IPUPlace(0) {} - explicit IPUPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const IPUPlace &o) const { return device == o.device; } - inline bool operator!=(const IPUPlace &o) const { return !(*this == o); } - inline bool operator<(const IPUPlace &o) const { return device < o.device; } - - int device; -}; - -struct MLUPlace { - MLUPlace() : MLUPlace(0) {} - explicit MLUPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const MLUPlace &o) const { return device == o.device; } - inline bool operator!=(const MLUPlace &o) const { return !(*this == o); } - inline bool operator<(const MLUPlace &o) const { return device < o.device; } - - int device; -}; - -struct IsCUDAPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return true; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; - -struct IsCPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return true; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; - -struct IsCUDAPinnedPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } -}; - -struct IsXPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return true; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; - -struct IsNPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool 
operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return true; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; - -struct IsNPUPinnedPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return true; } -}; - -struct IsMLUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return true; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; -struct IsIPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return true; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } -}; - -class Place : public boost::variant { - private: - using PlaceBase = - boost::variant; - - public: - Place() = default; - Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT - Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT - Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT - Place(const MLUPlace &mlu_place) : PlaceBase(mlu_place) {} // NOLINT - Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {} // NOLINT - Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT - Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT - : PlaceBase(cuda_pinned_place) {} - Place(const NPUPinnedPlace &npu_pinned_place) // NOLINT - : PlaceBase(npu_pinned_place) {} - - bool operator<(const Place &place) const { - return PlaceBase::operator<(static_cast(place)); - } - bool operator==(const Place &place) const { - return PlaceBase::operator==(static_cast(place)); - } -}; +using Place = pten::Place; +using CPUPlace = pten::CPUPlace; +using CUDAPlace = pten::GPUPlace; +using CUDAPinnedPlace = pten::GPUPinnedPlace; +using NPUPlace = pten::NPUPlace; +using NPUPinnedPlace = pten::NPUPinnedPlace; +using XPUPlace = pten::XPUPlace; +using IPUPlace = pten::IPUPlace; +using MLUPlace = pten::MLUPlace; using PlaceList = std::vector; @@ -250,94 +50,84 @@ bool is_npu_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const 
Place &); -std::ostream &operator<<(std::ostream &, const Place &); - template -struct PlaceVisitorWrapper - : public boost::static_visitor { - const Visitor &visitor_; - explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {} - - typename Visitor::result_type operator()(const CPUPlace &cpu) const { - return visitor_(cpu); - } - - typename Visitor::result_type operator()(const XPUPlace &xpu) const { -#ifdef PADDLE_WITH_XPU - return visitor_(xpu); +typename Visitor::result_type VisitPlace(const Place &place, + const Visitor &visitor) { + switch (place.GetType()) { + case pten::AllocationType::GPU: { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::CUDAPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with XPU. Cannot visit xpu device")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit cuda_pinned")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()(const NPUPlace &npu) const { -#ifdef PADDLE_WITH_ASCEND - return visitor_(npu); + } + case pten::AllocationType::GPUPINNED: { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::CUDAPinnedPlace p; + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with NPU. Cannot visit npu device")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit cuda_pinned")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()( - const NPUPinnedPlace &npu_pinned) const { -#ifdef PADDLE_WITH_ASCEND_CL - return visitor_(npu_pinned); + } + case pten::AllocationType::XPU: { +#ifdef PADDLE_WITH_XPU + platform::XPUPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with NPU. Cannot visit npu_pinned")); - return typename Visitor::result_type(); + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Paddle is not compiled with XPU. Cannot visit xpu device")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()(const MLUPlace &mlu) const { -#ifdef PADDLE_WITH_MLU - return visitor_(mlu); + } + case pten::AllocationType::NPU: { +#ifdef PADDLE_WITH_ASCEND_CL + platform::NPUPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with MLU. Cannot visit mlu device")); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. Cannot visit npu_pinned")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()(const IPUPlace &ipu) const { -#ifdef PADDLE_WITH_IPU - return visitor_(ipu); + } + case pten::AllocationType::NPUPINNED: { +#ifdef PADDLE_WITH_ASCEND_CL + platform::NPUPinnedPlace p; + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with IPU. Cannot visit ipu device")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. 
Cannot visit npu_pinned")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()(const CUDAPlace &cuda) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return visitor_(cuda); + } + case pten::AllocationType::IPU: { +#ifdef PADDLE_WITH_IPU + platform::IPUPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda device")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with IPU. Cannot visit ipu device")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()( - const CUDAPinnedPlace &cuda_pinned) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return visitor_(cuda_pinned); + } + case pten::AllocationType::MLU: { +#ifdef PADDLE_WITH_MLU + platform::MLUPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda_pinned")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with MLU. Cannot visit mlu device")); #endif + } + default: { + platform::CPUPlace p; + return visitor(p); + } } -}; - -template -typename Visitor::result_type VisitPlace(const Place &place, - const Visitor &visitor) { - return boost::apply_visitor(PlaceVisitorWrapper(visitor), place); } } // namespace platform diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc index ba19f14fb8f87..4fccb0eda70fd 100644 --- a/paddle/fluid/platform/place_test.cc +++ b/paddle/fluid/platform/place_test.cc @@ -47,21 +47,21 @@ TEST(Place, Print) { { std::stringstream ss; ss << paddle::platform::XPUPlace(1); - EXPECT_EQ("XPUPlace(1)", ss.str()); + EXPECT_EQ("Place(xpu:1)", ss.str()); } { std::stringstream ss; ss << paddle::platform::MLUPlace(1); - EXPECT_EQ("MLUPlace(1)", ss.str()); + EXPECT_EQ("Place(mlu:1)", ss.str()); } { std::stringstream ss; ss << paddle::platform::CUDAPlace(1); - EXPECT_EQ("CUDAPlace(1)", ss.str()); + EXPECT_EQ("Place(gpu:1)", ss.str()); } { std::stringstream ss; ss << paddle::platform::CPUPlace(); - EXPECT_EQ("CPUPlace", ss.str()); + EXPECT_EQ("Place(cpu)", ss.str()); } } diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 742d267b59543..5697bbee0bb92 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -27,7 +27,7 @@ bool CUDAStream::Init(const Place& place, const Priority& priority, platform::errors::InvalidArgument( "Cuda stream must be created using cuda place.")); place_ = place; - CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); + CUDADeviceGuard guard(place_.device); if (priority == Priority::kHigh) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( @@ -53,7 +53,7 @@ bool CUDAStream::Init(const Place& place, const Priority& priority, } void CUDAStream::Destroy() { - CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); + CUDADeviceGuard guard(place_.device); Wait(); WaitCallback(); if (stream_ && owned_stream_) { diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 102bc9f162b0f..3439f96984d99 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -108,25 +108,20 @@ void 
InitEagerTensorWithNumpyValue(EagerTensorObject* self, paddle::platform::Place place = impl_ptr->place(); paddle::framework::LoDTensor temp_tensor = paddle::framework::LoDTensor(); if (platform::is_cpu_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else if (platform::is_xpu_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else if (platform::is_gpu_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else if (platform::is_cuda_pinned_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::CUDAPinnedPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::NPUPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 4f22e83ac626f..3650b44ed0a85 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -170,24 +170,18 @@ static void InitVarBaseAndTensor( auto *tensor = self->MutableVar()->GetMutable(); VLOG(4) << "zero_copy: " << zero_copy; if (platform::is_cpu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_xpu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_gpu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_cuda_pinned_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::CUDAPinnedPlace, place), - zero_copy); + SetTensorFromPyArray(tensor, array, place, + zero_copy); } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::NPUPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_mlu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::MLUPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3eabf255ccbac..63f1e817137d4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -372,7 +372,7 @@ static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { template static inline int PlaceIndex(const PlaceType &p) { - return static_cast(paddle::platform::Place(p).which()); + return static_cast(paddle::platform::Place(p).GetType()); } static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) { @@ -2050,26 +2050,11 @@ All 
parameter, weight, gradient are variables in Paddle. }) .def("is_mlu_place", [](platform::Place &self) { return platform::is_mlu_place(self); }) - .def("gpu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::CUDAPlace, self).device; - }) - .def("xpu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::XPUPlace, self).device; - }) - .def("npu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::NPUPlace, self).device; - }) - .def("ipu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::IPUPlace, self).device; - }) - .def("mlu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::MLUPlace, self).device; - }) + .def("gpu_device_id", [](platform::Place &self) { return self.device; }) + .def("xpu_device_id", [](platform::Place &self) { return self.device; }) + .def("npu_device_id", [](platform::Place &self) { return self.device; }) + .def("ipu_device_id", [](platform::Place &self) { return self.device; }) + .def("mlu_device_id", [](platform::Place &self) { return self.device; }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 1fe6686919453..5fe361b148c41 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -223,27 +223,27 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { } else if (platform::is_xpu_place(self.place())) { #ifdef PADDLE_WITH_XPU const T *a = self.data(); - auto p = BOOST_GET_CONST(platform::XPUPlace, self.place()); + auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T)); #endif } else if (platform::is_gpu_place(self.place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const T *a = self.data(); - auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); + auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); #endif } else if (platform::is_mlu_place(self.place())) { #ifdef PADDLE_WITH_MLU const T *a = self.data(); - auto p = BOOST_GET_CONST(platform::MLUPlace, self.place()); + auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); #endif } else if (platform::is_npu_place(self.place())) { #if defined(PADDLE_WITH_ASCEND_CL) const T *a = self.data(); - auto p = BOOST_GET_CONST(platform::NPUPlace, self.place()); + auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); #endif @@ -264,27 +264,27 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { self->mutable_data(self->place())[offset] = elem; } else if (platform::is_xpu_place(self->place())) { #ifdef PADDLE_WITH_XPU - auto p = BOOST_GET_CONST(platform::XPUPlace, self->place()); + auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T)); #endif } else if (platform::is_gpu_place(self->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place()); + auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); #endif } else if (platform::is_mlu_place(self->place())) { #ifdef PADDLE_WITH_MLU - auto p = BOOST_GET_CONST(platform::MLUPlace, self->place()); + auto p = 
self->place(); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); #endif } else if (platform::is_npu_place(self->place())) { #if defined(PADDLE_WITH_ASCEND_CL) - auto p = BOOST_GET_CONST(platform::NPUPlace, self->place()); + auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); @@ -318,11 +318,9 @@ void SetTensorFromPyArrayT( // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. platform::Place tmp_place = place; - platform::XPUDeviceGuard guard( - BOOST_GET_CONST(platform::XPUPlace, tmp_place).device); + platform::XPUDeviceGuard guard(tmp_place.device); auto dst = self->mutable_data(place); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, tmp_place), - static_cast(dst), platform::CPUPlace(), + memory::Copy(tmp_place, static_cast(dst), platform::CPUPlace(), static_cast(array.data()), array.nbytes()); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -347,8 +345,7 @@ void SetTensorFromPyArrayT( } else if (paddle::platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL platform::Place tmp_place = place; - platform::NPUDeviceGuard guard( - BOOST_GET_CONST(platform::NPUPlace, tmp_place).device); + platform::NPUDeviceGuard guard(tmp_place.device); auto dst = self->mutable_data(place); platform::NPUMemcpySync(dst, array.data(), array.nbytes(), ACL_MEMCPY_HOST_TO_DEVICE); @@ -363,8 +360,7 @@ void SetTensorFromPyArrayT( } else if (paddle::platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU platform::Place tmp_place = place; - platform::MLUDeviceGuard guard( - BOOST_GET_CONST(platform::MLUPlace, tmp_place).device); + platform::MLUDeviceGuard guard(tmp_place.device); auto dst = self->mutable_data(place); paddle::platform::MLUMemcpyH2DSync(dst, array.data(), array.nbytes()); #else @@ -377,9 +373,7 @@ void SetTensorFromPyArrayT( if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. 
- platform::Place tmp_place = place; - platform::CUDADeviceGuard guard( - BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device); + platform::CUDADeviceGuard guard(place.device); auto dst = self->mutable_data(place); #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(), @@ -460,7 +454,6 @@ void _sliceCompute(const framework::Tensor *in, framework::Tensor *out, const std::vector &axes, const std::vector &starts) { auto &eigen_place = *ctx.eigen_device(); - auto place = in->place(); auto out_dims = out->dims(); auto in_dims = in->dims(); @@ -551,26 +544,21 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, output->Resize(ddim); auto place = self.place(); if (platform::is_cpu_place(place)) { - output->mutable_data(BOOST_GET_CONST(platform::CPUPlace, place), - self.type()); + output->mutable_data(place, self.type()); } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU - output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place), - self.type()); + output->mutable_data(place, self.type()); #endif } else if (platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU - output->mutable_data(BOOST_GET_CONST(platform::MLUPlace, place), - self.type()); + output->mutable_data(place, self.type()); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cuda_pinned_place(place)) { - output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place), - self.type()); + output->mutable_data(place, self.type()); } else if ((platform::is_gpu_place(place))) { - output->mutable_data(BOOST_GET_CONST(platform::CUDAPlace, place), - self.type()); + output->mutable_data(place, self.type()); } #endif } @@ -789,7 +777,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "or double free would occur")); size_t copy_bytes = sizeof_dtype * numel; - auto p = BOOST_GET_CONST(platform::XPUPlace, tensor.place()); + auto p = tensor.place(); paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, copy_bytes); return py_arr; @@ -812,7 +800,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "or double free would occur")); size_t copy_bytes = sizeof_dtype * numel; - auto p = BOOST_GET_CONST(platform::CUDAPlace, tensor.place()); + auto p = tensor.place(); paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, copy_bytes, nullptr); return py_arr; @@ -835,7 +823,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "or double free would occur")); size_t copy_bytes = sizeof_dtype * numel; - auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place()); + auto p = tensor.place(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &ctx = *pool.Get(tensor.place()); paddle::memory::Copy( @@ -863,7 +851,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "or double free would occur")); size_t copy_bytes = sizeof_dtype * numel; - auto p = BOOST_GET_CONST(platform::MLUPlace, tensor.place()); + auto p = tensor.place(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &ctx = *pool.Get(tensor.place()); paddle::memory::Copy( diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index b22d2d65a439c..a6e2c4d103769 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -34,6 +34,7 @@ using gpuStream_t = hipStream_t; #include "paddle/pten/common/backend.h" #include 
"paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/common/place.h" namespace pten { class TensorBase; @@ -43,9 +44,7 @@ namespace paddle { namespace framework { class DDim; } -namespace platform { -class Place; -} + namespace experimental { class Tensor; @@ -229,7 +228,7 @@ class PADDLE_API Tensor final { * * @return paddle::platform::Place */ - paddle::platform::Place inner_place() const; + pten::Place inner_place() const; /** * @brief Determine whether the tensor device is CPU diff --git a/paddle/pten/common/place.cc b/paddle/pten/common/place.cc index 2d33bb508af44..e2cb934f0a1c5 100644 --- a/paddle/pten/common/place.cc +++ b/paddle/pten/common/place.cc @@ -23,20 +23,20 @@ namespace pten { const char *AllocationTypeStr(AllocationType type) { switch (type) { - case AllocationType::UNDEF: - return "undef"; + case AllocationType::UNDEFINED: + return "undefined"; case AllocationType::CPU: return "cpu"; case AllocationType::GPU: return "gpu"; case AllocationType::GPUPINNED: - return "gpu pinned"; + return "gpu_pinned"; case AllocationType::XPU: return "xpu"; case AllocationType::NPU: return "npu"; case AllocationType::NPUPINNED: - return "npu pinned"; + return "npu_pinned"; case AllocationType::IPU: return "ipu"; case AllocationType::MLU: diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h index 24d24305202cf..75f1f4de9984c 100644 --- a/paddle/pten/common/place.h +++ b/paddle/pten/common/place.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace pten { enum class AllocationType : int8_t { - UNDEF = 0, + UNDEFINED = 0, CPU = 1, GPU = 2, GPUPINNED = 3, @@ -30,12 +30,12 @@ enum class AllocationType : int8_t { MLU = 8, }; -const char *AllocationTypeStr(AllocationType type); +const char* AllocationTypeStr(AllocationType type); /// \brief The place is used to specify where the data is stored. class Place { public: - Place() : device(0), alloc_type_(AllocationType::UNDEF) {} + Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {} explicit Place(AllocationType type, int8_t id) : device(id), alloc_type_(type) {} @@ -53,60 +53,110 @@ class Place { std::string DebugString() const; + inline bool operator==(const Place& rhs) const { + if (alloc_type_ != rhs.GetType()) { + return false; + } + if (alloc_type_ == AllocationType::CPU || + alloc_type_ == AllocationType::GPUPINNED || + alloc_type_ == AllocationType::NPUPINNED) { + return true; + } + return device == rhs.GetDeviceId(); + } + inline bool operator!=(const Place& rhs) const { return !(*this == rhs); } + inline bool operator<(const Place& rhs) const { + if (alloc_type_ != rhs.GetType()) { + return static_cast(alloc_type_) < static_cast(rhs.GetType()); + } + return device < rhs.GetDeviceId(); + } + public: // TODO(wilber): Just because of backward compatibility, it needs to be // changed to private in the future. 
diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h
index 24d24305202cf..75f1f4de9984c 100644
--- a/paddle/pten/common/place.h
+++ b/paddle/pten/common/place.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace pten {

 enum class AllocationType : int8_t {
-  UNDEF = 0,
+  UNDEFINED = 0,
   CPU = 1,
   GPU = 2,
   GPUPINNED = 3,
@@ -30,12 +30,12 @@ enum class AllocationType : int8_t {
   MLU = 8,
 };

-const char *AllocationTypeStr(AllocationType type);
+const char* AllocationTypeStr(AllocationType type);

 /// \brief The place is used to specify where the data is stored.
 class Place {
  public:
-  Place() : device(0), alloc_type_(AllocationType::UNDEF) {}
+  Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {}

   explicit Place(AllocationType type, int8_t id)
       : device(id), alloc_type_(type) {}
@@ -53,60 +53,110 @@ class Place {

   std::string DebugString() const;

+  inline bool operator==(const Place& rhs) const {
+    if (alloc_type_ != rhs.GetType()) {
+      return false;
+    }
+    if (alloc_type_ == AllocationType::CPU ||
+        alloc_type_ == AllocationType::GPUPINNED ||
+        alloc_type_ == AllocationType::NPUPINNED) {
+      return true;
+    }
+    return device == rhs.GetDeviceId();
+  }
+  inline bool operator!=(const Place& rhs) const { return !(*this == rhs); }
+  inline bool operator<(const Place& rhs) const {
+    if (alloc_type_ != rhs.GetType()) {
+      return static_cast<int>(alloc_type_) < static_cast<int>(rhs.GetType());
+    }
+    return device < rhs.GetDeviceId();
+  }
+
  public:
   // TODO(wilber): Just because of backward compatibility, it needs to be
   // changed to private in the future.
-  int8_t device;
+  int8_t device{0};

  private:
-  AllocationType alloc_type_;
+  AllocationType alloc_type_{AllocationType::UNDEFINED};
 };

 class CPUPlace : public Place {
  public:
-  CPUPlace() : Place(AllocationType::CPU, 0) {}
+  CPUPlace() : Place(AllocationType::CPU) {}
+
+  CPUPlace(const CPUPlace&) = default;
+  CPUPlace(const Place& place) : Place(AllocationType::CPU) {}  // NOLINT
 };

 class GPUPlace : public Place {
  public:
   GPUPlace() : Place(AllocationType::GPU, 0) {}
   explicit GPUPlace(int device_id) : Place(AllocationType::GPU, device_id) {}
+
+  GPUPlace(const GPUPlace&) = default;
+  GPUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::GPU, place.GetDeviceId()) {}
 };

 class GPUPinnedPlace : public Place {
  public:
   GPUPinnedPlace() : Place(AllocationType::GPUPINNED) {}
+
+  GPUPinnedPlace(const GPUPinnedPlace&) = default;
+  GPUPinnedPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::GPUPINNED) {}
 };

 class XPUPlace : public Place {
  public:
   XPUPlace() : Place(AllocationType::XPU, 0) {}
   explicit XPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {}
+
+  XPUPlace(const XPUPlace&) = default;
+  XPUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::XPU, place.GetDeviceId()) {}
 };

 class NPUPlace : public Place {
  public:
   NPUPlace() : Place(AllocationType::NPU, 0) {}
-  explicit NPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {}
+  explicit NPUPlace(int device_id) : Place(AllocationType::NPU, device_id) {}
+
+  NPUPlace(const NPUPlace&) = default;
+  NPUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::NPU, place.GetDeviceId()) {}
 };

 class NPUPinnedPlace : public Place {
  public:
   NPUPinnedPlace() : Place(AllocationType::NPUPINNED) {}
+
+  NPUPinnedPlace(const NPUPinnedPlace&) = default;
+  NPUPinnedPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::NPUPINNED) {}
 };

 class IPUPlace : public Place {
  public:
-  IPUPlace() : Place(AllocationType::XPU, 0) {}
-  explicit IPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {}
+  IPUPlace() : Place(AllocationType::IPU, 0) {}
+  explicit IPUPlace(int device_id) : Place(AllocationType::IPU, device_id) {}
+
+  IPUPlace(const IPUPlace&) = default;
+  IPUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::IPU, place.GetDeviceId()) {}
 };

 class MLUPlace : public Place {
  public:
   MLUPlace() : Place(AllocationType::MLU, 0) {}
   explicit MLUPlace(int device_id) : Place(AllocationType::MLU, device_id) {}
+
+  MLUPlace(const MLUPlace&) = default;
+  MLUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::MLU, place.GetDeviceId()) {}
 };

-std::ostream &operator<<(std::ostream &, const Place &);
+std::ostream& operator<<(std::ostream&, const Place&);

 }  // namespace pten
diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc
index f3c4156fcddf0..28623b539d847 100644
--- a/paddle/pten/kernels/cpu/copy_kernel.cc
+++ b/paddle/pten/kernels/cpu/copy_kernel.cc
@@ -53,11 +53,7 @@ void Copy(const Context& dev_ctx,

   if (paddle::platform::is_cpu_place(src_place) &&
       paddle::platform::is_cpu_place(dst_place)) {
-    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place),
-                         dst_ptr,
-                         BOOST_GET_CONST(paddle::platform::CPUPlace, src_place),
-                         src_ptr,
-                         size);
+    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 }
diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu
index e03c538e38682..77a345d7a0f7c 100644
--- a/paddle/pten/kernels/funcs/transpose.cu
+++
b/paddle/pten/kernels/funcs/transpose.cu @@ -64,8 +64,7 @@ struct TransposeNormal { auto* out_ptr = out->mutable_data(); // copy in_stride, out_stride, axis to gpu device - const paddle::platform::CUDAPlace& cuda_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, dev_ctx.GetPlace()); + const paddle::platform::CUDAPlace& cuda_place = dev_ctx.GetPlace(); paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); size_t size = 3 * rank * sizeof(int64_t); auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu index 877a06ce33e5d..7eeef85f0f3e6 100644 --- a/paddle/pten/kernels/gpu/copy_kernel.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -58,33 +58,17 @@ void Copy(const Context& dev_ctx, if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy( - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT paddle::platform::is_cpu_place(dst_place)) { - paddle::memory::Copy( - BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy( - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::CPUPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_cpu_place(dst_place)) { - auto src_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place); - auto dst_cpu_place = BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), @@ -92,8 +76,7 @@ void Copy(const Context& dev_ctx, paddle::platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto ctx_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, paddle::platform::errors::Unavailable( @@ -110,9 +93,8 @@ void Copy(const Context& dev_ctx, dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { - auto src_cpu_place = BOOST_GET_CONST(paddle::platform::CPUPlace, src_place); - auto dst_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place); + auto src_cpu_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), @@ -120,8 +102,7 @@ void Copy(const Context& dev_ctx, paddle::platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto ctx_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place); + auto 
ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, paddle::platform::errors::Unavailable( @@ -138,10 +119,8 @@ void Copy(const Context& dev_ctx, dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_cuda_pinned_place(dst_place)) { - auto src_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place); - auto dst_cuda_pinned_place = - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cuda_pinned_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), true, @@ -149,8 +128,7 @@ void Copy(const Context& dev_ctx, "Device context place mismatch. When copying Tensor " "data from GPU memory to CUDA Pinned memory, current " "device context place should be GPU.")); - auto ctx_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, paddle::platform::errors::PreconditionNotMet( @@ -168,10 +146,8 @@ void Copy(const Context& dev_ctx, dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { - auto src_cuda_pinned_place = - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place); - auto dst_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place); + auto src_cuda_pinned_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), true, @@ -179,8 +155,7 @@ void Copy(const Context& dev_ctx, "Device context place mismatch. 
When copying Tensor " "data from CUDA Pinned memory to GPU memory, current " "device context place should be GPU.")); - auto ctx_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, paddle::platform::errors::PreconditionNotMet( @@ -198,10 +173,8 @@ void Copy(const Context& dev_ctx, dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { - auto src_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place); - auto dst_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 5abc40c75d17f..a024495a9ff0f 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -1540,8 +1540,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, const GPUContext &ctx, DX_OP dx_op, DY_OP dy_op) { - const auto gplace = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); auto cplace = paddle::platform::CPUPlace(); const T *x_data = x.data(); const T *y_data = y.data(); diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc index 190eb39e22ecd..f464a4926d3b5 100644 --- a/paddle/pten/kernels/xpu/copy_kernel.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -50,18 +50,10 @@ void Copy(const Context& dev_ctx, if (paddle::platform::is_xpu_place(src_place) && // NOLINT paddle::platform::is_cpu_place(dst_place)) { - paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::XPUPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_cpu_place(src_place) && paddle::platform::is_xpu_place(dst_place)) { - paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::XPUPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::CPUPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_xpu_place(src_place) && paddle::platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -69,11 +61,7 @@ void Copy(const Context& dev_ctx, << dst_place; return; } - paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::XPUPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::XPUPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else { PADDLE_THROW(paddle::platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); diff --git a/paddle/pten/tests/common/test_place.cc b/paddle/pten/tests/common/test_place.cc index 0bbd8f1d42273..39a5cdef6b580 100644 --- a/paddle/pten/tests/common/test_place.cc +++ b/paddle/pten/tests/common/test_place.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/pten/common/place.h" +#include // NOLINT #include "gtest/gtest.h" namespace pten { @@ -21,7 +22,7 @@ namespace tests { TEST(PtenPlace, place) { pten::Place place; - EXPECT_EQ(place.GetType(), pten::AllocationType::UNDEF); + EXPECT_EQ(place.GetType(), pten::AllocationType::UNDEFINED); place.Reset(pten::AllocationType::GPU, 1); EXPECT_EQ(place.GetType(), pten::AllocationType::GPU); @@ -47,6 +48,34 @@ TEST(Place, gpu_place) { pten::GPUPinnedPlace place2; EXPECT_EQ(place2.GetType(), pten::AllocationType::GPUPINNED); std::cout << "gpu pinned place repr: " << place2 << std::endl; + + EXPECT_NE(place2, pten::CPUPlace()); +} + +TEST(Place, convert_place) { + pten::Place base_place(pten::AllocationType::CPU); + pten::CPUPlace cpu_place = base_place; + EXPECT_EQ(cpu_place.GetType(), base_place.GetType()); + base_place.Reset(pten::AllocationType::GPU, 2); + pten::GPUPlace gpu_place = base_place; + EXPECT_EQ(gpu_place.GetType(), base_place.GetType()); + EXPECT_EQ(gpu_place.GetDeviceId(), base_place.GetDeviceId()); + pten::Place place = gpu_place; + EXPECT_EQ(gpu_place.GetType(), place.GetType()); + EXPECT_EQ(gpu_place.GetDeviceId(), place.GetDeviceId()); + place = cpu_place; + EXPECT_EQ(cpu_place.GetType(), place.GetType()); + + std::map maps; + maps[pten::CPUPlace()] = 1; + maps[pten::GPUPlace(0)] = 2; + maps[pten::GPUPlace(1)] = 3; + maps[pten::GPUPlace(2)] = 4; + maps[pten::GPUPlace(3)] = 5; + maps[pten::GPUPinnedPlace()] = 6; + for (auto iter = maps.begin(); iter != maps.end(); ++iter) { + std::cout << iter->first << ":" << iter->second << std::endl; + } } } // namespace tests diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 9630462b4963a..e84c11e8601c1 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -689,7 +689,7 @@ def test_properties(self): tensor.persistable = False self.assertEqual(tensor.persistable, False) self.assertTrue(tensor.place.is_cpu_place()) - self.assertEqual(tensor._place_str, 'CPUPlace') + self.assertEqual(tensor._place_str, 'Place(cpu)') self.assertEqual(tensor.stop_gradient, True) tensor.stop_gradient = False self.assertEqual(tensor.stop_gradient, False) diff --git a/python/paddle/fluid/tests/unittests/test_memcpy_op.py b/python/paddle/fluid/tests/unittests/test_memcpy_op.py index d6efe4d471efd..623c43f5b75f3 100755 --- a/python/paddle/fluid/tests/unittests/test_memcpy_op.py +++ b/python/paddle/fluid/tests/unittests/test_memcpy_op.py @@ -202,7 +202,7 @@ class TestMemcpyApi(unittest.TestCase): def test_api(self): a = paddle.ones([1024, 1024]) b = paddle.tensor.creation._memcpy(a, paddle.CUDAPinnedPlace()) - self.assertEqual(b.place.__repr__(), "CUDAPinnedPlace") + self.assertEqual(b.place.__repr__(), "Place(gpu_pinned)") self.assertTrue(np.array_equal(a.numpy(), b.numpy())) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index c4c4edbbb9335..c74dd24b78bac 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -68,19 +68,19 @@ def _test_place(place): np.array_equal(x.grad.numpy(), np.array([2.4]).astype('float32'))) y = x.cpu() - self.assertEqual(y.place.__repr__(), "CPUPlace") + self.assertEqual(y.place.__repr__(), "Place(cpu)") if core.is_compiled_with_cuda(): y = x.pin_memory() - self.assertEqual(y.place.__repr__(), "CUDAPinnedPlace") 
+ self.assertEqual(y.place.__repr__(), "Place(gpu_pinned)") y = x.cuda() y = x.cuda(None) - self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") y = x.cuda(device_id=0) - self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") y = x.cuda(blocking=False) - self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") y = x.cuda(blocking=True) - self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") with self.assertRaises(ValueError): y = x.cuda("test") @@ -271,17 +271,17 @@ def test_to_tensor_change_place(self): with paddle.fluid.dygraph.guard(core.CPUPlace()): a = paddle.to_tensor(a_np, place=paddle.CUDAPinnedPlace()) a = paddle.to_tensor(a) - self.assertEqual(a.place.__repr__(), "CPUPlace") + self.assertEqual(a.place.__repr__(), "Place(cpu)") with paddle.fluid.dygraph.guard(core.CUDAPlace(0)): a = paddle.to_tensor(a_np, place=paddle.CUDAPinnedPlace()) a = paddle.to_tensor(a) - self.assertEqual(a.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(a.place.__repr__(), "Place(gpu:0)") with paddle.fluid.dygraph.guard(core.CUDAPlace(0)): a = paddle.to_tensor(a_np, place=paddle.CPUPlace()) a = paddle.to_tensor(a, place=paddle.CUDAPinnedPlace()) - self.assertEqual(a.place.__repr__(), "CUDAPinnedPlace") + self.assertEqual(a.place.__repr__(), "Place(gpu_pinned)") def test_to_tensor_with_lodtensor(self): if core.is_compiled_with_cuda(): @@ -297,7 +297,7 @@ def test_to_tensor_with_lodtensor(self): lod_tensor.set(a_np, core.CUDAPlace(0)) a = paddle.to_tensor(lod_tensor, place=core.CPUPlace()) self.assertTrue(np.array_equal(a_np, a.numpy())) - self.assertTrue(a.place.__repr__(), "CPUPlace") + self.assertTrue(a.place.__repr__(), "Place(cpu)") def test_to_variable(self): with fluid.dygraph.guard(): @@ -984,7 +984,7 @@ def test_tensor_str(self): paddle.set_printoptions(4, 100, 3) a_str = str(a) - expected = '''Tensor(shape=[10, 20], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[10, 20], dtype=float32, place=Place(cpu), stop_gradient=True, [[0.2727, 0.5489, 0.8655, ..., 0.2916, 0.8525, 0.9000], [0.3806, 0.8996, 0.0928, ..., 0.9535, 0.8378, 0.6409], [0.1484, 0.4038, 0.8294, ..., 0.0148, 0.6520, 0.4250], @@ -1001,7 +1001,7 @@ def test_tensor_str2(self): a = paddle.to_tensor([[1.5111111, 1.0], [0, 0]]) a_str = str(a) - expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, [[1.5111, 1. ], [0. , 0. ]])''' @@ -1013,7 +1013,7 @@ def test_tensor_str3(self): a = paddle.to_tensor([[-1.5111111, 1.0], [0, -0.5]]) a_str = str(a) - expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, [[-1.5111, 1. ], [ 0. 
, -0.5000]])''' @@ -1025,7 +1025,7 @@ def test_tensor_str_scaler(self): a = paddle.to_tensor(np.array(False)) a_str = str(a) - expected = '''Tensor(shape=[], dtype=bool, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True, False)''' self.assertEqual(a_str, expected) @@ -1037,7 +1037,7 @@ def test_tensor_str_shape_with_zero(self): y = paddle.fluid.layers.where(x == 0) a_str = str(y) - expected = '''Tensor(shape=[0, 2], dtype=int64, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[0, 2], dtype=int64, place=Place(cpu), stop_gradient=True, [])''' self.assertEqual(a_str, expected) @@ -1051,7 +1051,7 @@ def test_tensor_str_linewidth(self): precision=4, threshold=1000, edgeitems=3, linewidth=80) a_str = str(x) - expected = '''Tensor(shape=[128], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[128], dtype=float32, place=Place(cpu), stop_gradient=True, [0.3759, 0.0278, 0.2489, 0.3110, 0.9105, 0.7381, 0.1905, 0.4726, 0.2435, 0.9142, 0.3367, 0.7243, 0.7664, 0.9915, 0.2921, 0.1363, 0.8096, 0.2915, 0.9564, 0.9972, 0.2573, 0.2597, 0.3429, 0.2484, 0.9579, 0.7003, 0.4126, @@ -1078,7 +1078,7 @@ def test_tensor_str_linewidth2(self): paddle.set_printoptions(precision=4, linewidth=160, sci_mode=True) a_str = str(x) - expected = '''Tensor(shape=[128], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[128], dtype=float32, place=Place(cpu), stop_gradient=True, [3.7587e-01, 2.7798e-02, 2.4891e-01, 3.1097e-01, 9.1053e-01, 7.3811e-01, 1.9045e-01, 4.7258e-01, 2.4354e-01, 9.1415e-01, 3.3666e-01, 7.2428e-01, 7.6640e-01, 9.9146e-01, 2.9215e-01, 1.3625e-01, 8.0957e-01, 2.9153e-01, 9.5642e-01, 9.9718e-01, 2.5732e-01, 2.5973e-01, 3.4292e-01, 2.4841e-01, 9.5794e-01, 7.0029e-01, 4.1260e-01, 4.2737e-01, 7.3788e-03, 9.6863e-01, 9.9102e-01, 1.4416e-02, 6.5640e-01, 2.9318e-01, 7.1136e-01, 9.3008e-01,