From c48a9ad56e69a5d27d1b36df8c731c9c32f84d78 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Mon, 17 Jan 2022 13:28:03 +0800
Subject: [PATCH] [Pten] Replace platform::Place to pten::Place. (#38899)

* add pten::Place data structure.
* update ci problem
* fix ci problem
* update
* using platform::Place=pten::Place
* remove BOOST_GET_CONST for CPUPlace and GPUPlace
* compile pass 25%.
* compile pass 45%
* compile pass 60%
* remove boost_get for xpu npu mlu and ipu
* compile pass on cpu and gpu.
* fix compile problem
* fix compile error.
* update
* fix ci problem
* update
* ci approve
* fix ci problem
* fix ci eager test problem
* remove BOOST_GET_CONST
* fix npu compile
---
 .../distributed/fleet_executor/carrier.cc | 4 +-
 .../fluid/distributed/service/brpc_utils.cc | 59 ++-
 .../fluid/distributed/service/heter_client.cc | 3 +-
 .../accumulation/gradient_accumulation.cc | 32 +-
 paddle/fluid/eager/legacy/op_runner.cc | 9 +-
 .../fluid/eager/legacy/prepared_operator.cc | 4 +-
 .../fluid/framework/data_device_transform.cc | 2 +-
 .../framework/details/all_reduce_op_handle.cc | 7 +-
 .../bind_threaded_ssa_graph_executor.cc | 5 +-
 .../fluid/framework/details/bkcl_op_handle.h | 2 +-
 .../framework/details/broadcast_op_handle.cc | 14 +-
 .../details/eager_deletion_op_handle.cc | 8 +-
 .../details/fused_all_reduce_op_handle.cc | 5 +-
 .../framework/details/nan_inf_utils_detail.cc | 2 +-
 .../framework/details/nan_inf_utils_detail.cu | 2 +-
 .../fluid/framework/details/nccl_op_handle.h | 10 +-
 .../fluid/framework/details/op_handle_base.cc | 15 +-
 .../details/parallel_ssa_graph_executor.cc | 2 +-
 .../framework/details/reduce_op_handle.cc | 12 +-
 .../details/scale_loss_grad_op_handle.cc | 9 +-
 .../details/share_tensor_buffer_op_handle.cc | 3 +-
 .../details/sparse_all_reduce_op_handle.cc | 2 +-
 .../framework/details/variable_visitor.cc | 18 +-
 paddle/fluid/framework/dlpack_tensor.cc | 2 +-
 paddle/fluid/framework/dlpack_tensor_test.cc | 3 +-
 paddle/fluid/framework/executor.cc | 29 +-
 paddle/fluid/framework/fleet/box_wrapper.cu | 9 +-
 .../fluid/framework/fleet/box_wrapper_impl.h | 7 +-
 paddle/fluid/framework/fleet/fleet_wrapper.cc | 6 +-
 paddle/fluid/framework/fleet/heter_wrapper.cc | 12 +-
 .../fluid/framework/fleet/ps_gpu_wrapper.cc | 4 +-
 .../fluid/framework/fleet/ps_gpu_wrapper.cu | 9 +-
 paddle/fluid/framework/garbage_collector.cc | 4 +-
 .../fluid/framework/heter_section_worker.cc | 4 +-
 paddle/fluid/framework/heterxpu_trainer.cc | 34 +-
 paddle/fluid/framework/ir/pass.cc | 2 +-
 paddle/fluid/framework/mixed_vector.h | 14 +-
 paddle/fluid/framework/naive_executor.cc | 2 +-
 .../fluid/framework/new_executor/profiler.h | 2 +-
 paddle/fluid/framework/op_kernel_type.cc | 2 +-
 paddle/fluid/framework/op_kernel_type_test.cc | 4 +-
 paddle/fluid/framework/operator.cc | 14 +-
 paddle/fluid/framework/parallel_executor.cc | 29 +-
 paddle/fluid/framework/pull_dense_worker.cc | 6 +-
 paddle/fluid/framework/section_worker.cc | 9 +-
 paddle/fluid/framework/selected_rows.h | 8 +-
 paddle/fluid/framework/tensor_util.cc | 194 +++----
 paddle/fluid/framework/tensor_util.h | 72 +--
 paddle/fluid/imperative/bkcl_context.cc | 4 +-
 .../fluid/imperative/gradient_accumulator.cc | 28 +-
 paddle/fluid/imperative/hccl_context.cc | 20 +-
 paddle/fluid/imperative/layer.cc | 4 +-
 paddle/fluid/imperative/nccl_context.cc | 19 +-
 paddle/fluid/imperative/prepared_operator.cc | 6 +-
 paddle/fluid/imperative/reducer.cc | 2 +-
 paddle/fluid/imperative/tracer.cc | 29 +-
 .../fluid/inference/api/analysis_predictor.cc | 20 +-
 paddle/fluid/inference/api/api_impl.cc | 6 +-
.../inference/api/details/zero_copy_tensor.cc | 6 +- paddle/fluid/inference/lite/tensor_utils.cc | 2 +- .../memory/allocation/allocator_facade.cc | 11 +- .../memory/allocation/best_fit_allocator.h | 7 +- .../fluid/memory/allocation/cuda_allocator.cc | 2 +- .../cuda_device_context_allocator.h | 4 +- .../allocation/cuda_virtual_mem_allocator.cc | 2 +- .../allocation/naive_best_fit_allocator.cc | 11 +- .../fluid/memory/allocation/npu_allocator.cc | 2 +- .../allocation/stream_safe_cuda_allocator.cc | 5 +- .../allocation/stream_safe_cuda_allocator.h | 2 +- .../allocation/thread_local_allocator.cc | 3 +- paddle/fluid/memory/memcpy.cc | 452 ++++++++++++++++++ .../fluid/operators/activation_cudnn_op.cu.cc | 6 - paddle/fluid/operators/allclose_op.cu | 3 +- .../amp/check_finite_and_unscale_op.cu | 9 +- .../amp/check_finite_and_unscale_op_xpu.cc | 12 +- .../operators/amp/update_loss_scaling_op.cu | 10 +- .../amp/update_loss_scaling_op_npu.cc | 4 +- .../amp/update_loss_scaling_op_xpu.cc | 29 +- paddle/fluid/operators/assign_op.cc | 2 - paddle/fluid/operators/assign_op_npu.cc | 2 - paddle/fluid/operators/assign_op_xpu.cc | 2 - paddle/fluid/operators/assign_value_op.cc | 3 - .../fluid/operators/average_accumulates_op.cu | 6 +- paddle/fluid/operators/bernoulli_op.cu | 3 +- paddle/fluid/operators/cholesky_op.cu | 3 +- .../fluid/operators/class_center_sample_op.cu | 5 +- .../fluid/operators/collective/allreduce_op.h | 2 +- .../operators/collective/broadcast_op.cu.cc | 2 +- .../operators/collective/broadcast_op_xpu.cc | 2 +- .../collective/c_allreduce_max_op.cc | 1 - .../collective/c_allreduce_max_op.cu.cc | 2 +- .../collective/c_allreduce_max_op_xpu.cc | 2 +- .../collective/c_allreduce_min_op.cc | 1 - .../collective/c_allreduce_min_op.cu.cc | 2 +- .../collective/c_allreduce_min_op_xpu.cc | 2 +- .../collective/c_allreduce_prod_op.cc | 1 - .../collective/c_allreduce_prod_op.cu.cc | 2 +- .../collective/c_allreduce_prod_op_xpu.cc | 2 +- .../collective/c_allreduce_sum_op.cc | 1 - .../collective/c_allreduce_sum_op.cu.cc | 2 +- .../collective/c_allreduce_sum_op_xpu.cc | 2 +- .../collective/c_comm_init_all_op.cc | 2 +- .../collective/c_comm_init_hccl_op.cc | 4 +- .../operators/collective/c_comm_init_op.cc | 9 +- .../operators/collective/c_reduce_max_op.cc | 1 - .../collective/c_reduce_max_op.cu.cc | 2 +- .../collective/c_reduce_max_op_xpu.cc | 2 +- .../operators/collective/c_reduce_min_op.cc | 1 - .../collective/c_reduce_min_op.cu.cc | 2 +- .../collective/c_reduce_min_op_xpu.cc | 2 +- .../fluid/operators/collective/c_reduce_op.h | 2 +- .../operators/collective/c_reduce_prod_op.cc | 1 - .../collective/c_reduce_prod_op.cu.cc | 2 +- .../collective/c_reduce_prod_op_xpu.cc | 2 +- .../operators/collective/c_reduce_sum_op.cc | 1 - .../collective/c_reduce_sum_op.cu.cc | 2 +- .../collective/c_reduce_sum_op_xpu.cc | 2 +- .../collective/c_sync_calc_stream_op.cc | 2 +- .../collective/c_sync_comm_stream_op.cc | 9 +- .../operators/collective/c_wait_comm_op.cc | 8 +- .../operators/collective/c_wait_compute_op.cc | 5 +- .../fluid/operators/controlflow/compare_op.cc | 5 +- .../operators/controlflow/fetch_v2_op.cc | 2 - paddle/fluid/operators/cudnn_lstm_op.cu.cc | 17 +- paddle/fluid/operators/cumprod_op.cu | 2 +- paddle/fluid/operators/cumprod_op.h | 2 +- .../operators/deformable_psroi_pooling_op.cu | 4 +- .../fluid/operators/dequantize_abs_max_op.cc | 4 +- paddle/fluid/operators/dequantize_log_op.cc | 3 - .../fluid/operators/detail/strided_memcpy.h | 8 +- .../fluid/operators/detection/bbox_util.cu.h | 4 +- 
.../fluid/operators/detection/box_coder_op.cu | 3 +- .../detection/collect_fpn_proposals_op.cu | 2 +- .../detection/distribute_fpn_proposals_op.cu | 2 +- .../detection/generate_proposals_op.cu | 4 +- .../detection/generate_proposals_v2_op.cu | 4 +- .../fluid/operators/detection/yolo_box_op.cu | 2 +- paddle/fluid/operators/dirichlet_op.cu | 3 +- paddle/fluid/operators/distribution_helper.h | 3 +- .../fluid/operators/dlnne/dlnne_engine_op.h | 3 +- paddle/fluid/operators/dropout_impl_util.h | 3 +- paddle/fluid/operators/edit_distance_op.cu | 4 +- .../elementwise/elementwise_add_op.cc | 1 - .../elementwise/elementwise_floordiv_op.cc | 1 - .../elementwise/elementwise_max_op.cc | 1 - .../elementwise/elementwise_min_op.cc | 1 - .../elementwise/elementwise_mod_op.cc | 1 - .../elementwise/elementwise_pow_op.cc | 1 - .../elementwise/elementwise_sub_op.cc | 1 - .../elementwise/elementwise_sub_op.h | 1 + .../mkldnn/elementwise_add_mkldnn_op.cc | 1 - .../mkldnn/elementwise_div_mkldnn_op.cc | 1 - .../mkldnn/elementwise_mul_mkldnn_op.cc | 1 - .../mkldnn/elementwise_sub_mkldnn_op.cc | 1 - .../test_elementwise_op_grad_grad.h | 4 +- paddle/fluid/operators/expand_op_npu.cc | 4 +- paddle/fluid/operators/fake_quantize_op.cu | 4 +- .../operators/fill_diagonal_tensor_op.cu | 5 +- paddle/fluid/operators/flip_op.cu | 2 +- .../operators/fused/fused_dropout_helper.h | 2 +- .../fused_layernorm_residual_dropout_bias.h | 2 +- .../fused/fused_residual_dropout_bias.h | 2 +- .../fusion_transpose_flatten_concat_op.cu.cc | 7 +- paddle/fluid/operators/gather.cu.h | 2 +- paddle/fluid/operators/gaussian_random_op.cu | 6 +- .../fluid/operators/gaussian_random_op_xpu.cc | 5 +- paddle/fluid/operators/gru_op.cu.cc | 2 +- paddle/fluid/operators/gumbel_softmax_op.cu | 3 +- paddle/fluid/operators/hash_op.cc | 3 - paddle/fluid/operators/increment_op.cc | 1 - paddle/fluid/operators/isclose_op.cu | 3 +- paddle/fluid/operators/isfinite_op.cc | 1 - paddle/fluid/operators/isfinite_v2_op.cc | 1 - paddle/fluid/operators/jit/kernel_key.h | 4 +- paddle/fluid/operators/label_smooth_op.cc | 1 - paddle/fluid/operators/lookup_table_op.cu | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 2 +- paddle/fluid/operators/lstsq_op.cu | 10 +- paddle/fluid/operators/lu_op.h | 2 +- .../fluid/operators/masked_select_op_xpu.cc | 3 +- .../fluid/operators/math/concat_and_split.cc | 15 +- .../fluid/operators/math/concat_and_split.cu | 20 +- .../operators/math/eigen_values_vectors.h | 3 +- paddle/fluid/operators/math/math_function.cc | 3 +- paddle/fluid/operators/math/math_function.cu | 3 +- paddle/fluid/operators/math/math_function.h | 5 +- .../fluid/operators/math/matrix_inverse.cu.cc | 15 +- .../fluid/operators/math/matrix_solve.cu.cc | 13 +- .../operators/math/selected_rows_functor.cc | 15 +- .../operators/math/selected_rows_functor.cu | 11 +- paddle/fluid/operators/math/tree2col.cc | 4 +- paddle/fluid/operators/math/tree2col.cu | 4 +- paddle/fluid/operators/matrix_rank_op.cu | 12 +- paddle/fluid/operators/mean_op.cu | 2 +- paddle/fluid/operators/mean_op_xpu.cc | 4 +- paddle/fluid/operators/memcpy_d2h_op.cc | 2 - paddle/fluid/operators/memcpy_h2d_op.cc | 2 - paddle/fluid/operators/memcpy_op.cc | 2 - .../operators/metrics/accuracy_op_xpu.cc | 16 +- paddle/fluid/operators/multiplex_op.cu | 6 +- paddle/fluid/operators/multiplex_op.h | 6 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 9 +- paddle/fluid/operators/partial_concat_op.cu | 6 +- paddle/fluid/operators/partial_sum_op.cu | 8 +- paddle/fluid/operators/poisson_op.cu | 3 +- 
paddle/fluid/operators/prroi_pool_op.cu | 4 +- .../pscore/send_and_recv_op_gpu_test.cc | 6 +- paddle/fluid/operators/psroi_pool_op.cu | 6 +- paddle/fluid/operators/qr_op.cu | 29 +- paddle/fluid/operators/range_op_xpu.cc | 5 +- paddle/fluid/operators/rank_loss_op.cc | 1 - .../fluid/operators/reader/buffered_reader.cc | 46 +- .../reader/create_double_buffer_reader_op.cc | 6 +- .../operators/reduce_ops/frobenius_norm_op.cc | 1 - .../operators/reduce_ops/reduce_all_op.cc | 1 - .../operators/reduce_ops/reduce_any_op.cc | 1 - .../operators/reduce_ops/reduce_prod_op.cc | 1 - .../operators/reduce_ops/reduce_sum_op.cc | 1 - .../operators/reduce_ops/reduce_sum_op.h | 2 +- paddle/fluid/operators/reshape_op.cc | 2 - paddle/fluid/operators/rnn_op.cu.cc | 21 +- paddle/fluid/operators/roi_align_op.cu | 4 +- paddle/fluid/operators/roi_align_op_xpu.cc | 4 +- paddle/fluid/operators/roi_pool_op.cu | 4 +- paddle/fluid/operators/run_program_op.h | 2 +- paddle/fluid/operators/scatter.cu.h | 2 +- paddle/fluid/operators/seed_op.cu | 3 +- paddle/fluid/operators/segment_pool_op.h | 3 +- .../sequence_ops/sequence_expand_op.cu | 3 +- paddle/fluid/operators/set_value_op.cc | 1 - paddle/fluid/operators/split_op.h | 2 - paddle/fluid/operators/split_op_npu.cc | 1 - paddle/fluid/operators/stack_op.cu | 6 +- paddle/fluid/operators/strided_memcpy.h | 6 +- paddle/fluid/operators/sum_op.cu | 7 +- paddle/fluid/operators/svd_op.cu | 6 +- paddle/fluid/operators/tensor_formatter.cc | 2 +- .../operators/tensorrt/tensorrt_engine_op.h | 7 +- .../operators/truncated_gaussian_random_op.cu | 3 +- .../truncated_gaussian_random_op_xpu.cc | 5 +- paddle/fluid/operators/unbind_op.h | 3 - .../operators/uniform_random_inplace_op.cu | 3 +- .../uniform_random_inplace_op_xpu.cc | 8 +- paddle/fluid/operators/uniform_random_op.cu | 3 +- .../fluid/operators/uniform_random_op_xpu.cc | 5 +- paddle/fluid/operators/where_index_op.cu | 8 +- paddle/fluid/operators/where_index_op_xpu.cc | 4 +- paddle/fluid/platform/CMakeLists.txt | 4 +- paddle/fluid/platform/collective_helper.cc | 8 +- paddle/fluid/platform/collective_helper.h | 6 +- .../fluid/platform/device/gpu/nccl_helper.h | 14 +- .../device/mlu/device_context_allocator.h | 3 +- .../fluid/platform/device/npu/hccl_helper.h | 16 +- .../device/npu/npu_collective_helper.cc | 4 +- .../fluid/platform/device/npu/npu_op_runner.h | 5 +- .../fluid/platform/device/npu/npu_stream.cc | 4 +- .../fluid/platform/device/xpu/bkcl_helper.h | 14 +- paddle/fluid/platform/device/xpu/xpu_info.h | 2 +- .../fluid/platform/device/xpu/xpu_op_list.cc | 3 +- paddle/fluid/platform/device_context.cc | 36 +- paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/device_tracer.cc | 3 +- paddle/fluid/platform/place.cc | 59 +-- paddle/fluid/platform/place.h | 362 +++----------- paddle/fluid/platform/place_test.cc | 8 +- paddle/fluid/platform/stream/cuda_stream.cc | 4 +- paddle/fluid/pybind/eager.cc | 25 +- paddle/fluid/pybind/imperative.cc | 20 +- paddle/fluid/pybind/pybind.cc | 27 +- paddle/fluid/pybind/tensor_py.h | 56 +-- paddle/pten/api/include/tensor.h | 7 +- paddle/pten/common/place.cc | 8 +- paddle/pten/common/place.h | 70 ++- paddle/pten/kernels/cpu/copy_kernel.cc | 6 +- paddle/pten/kernels/funcs/transpose.cu | 3 +- paddle/pten/kernels/gpu/copy_kernel.cu | 61 +-- paddle/pten/kernels/gpu/elementwise.h | 3 +- paddle/pten/kernels/xpu/copy_kernel.cc | 18 +- paddle/pten/tests/common/test_place.cc | 31 +- .../tests/unittests/test_egr_python_api.py | 2 +- .../fluid/tests/unittests/test_memcpy_op.py | 2 +- 
.../fluid/tests/unittests/test_var_base.py | 34 +- 282 files changed, 1466 insertions(+), 1613 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 3e198dc3eeea4..56d8da3eca4b5 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -221,8 +221,8 @@ static std::shared_ptr GetGC( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { - gc.reset(new framework::UnsafeFastGPUGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); + gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, + max_memory_size)); } } #endif diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index db55c9ad438a7..4d9f84fdc6e0f 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -106,13 +106,12 @@ void SerializeLodTensor(framework::Variable* var, iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA - char* temp_ptr = - new char[tensor->numel() * framework::SizeOfType(tensor->type())]; + char* temp_ptr = new char[tensor->numel() * + framework::SizeOfType(tensor->type())]; // NOLINT auto stream = reinterpret_cast(ctx).stream(); memory::Copy( - platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(), tensor->numel() * framework::SizeOfType(tensor->type()), stream); auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); @@ -148,13 +147,12 @@ void SerializeSelectedRows(framework::Variable* var, iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA - char* temp_ptr = - new char[tensor->numel() * framework::SizeOfType(tensor->type())]; + char* temp_ptr = new char[tensor->numel() * + framework::SizeOfType(tensor->type())]; // NOLINT auto stream = reinterpret_cast(ctx).stream(); memory::Copy( - platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(), tensor->numel() * framework::SizeOfType(tensor->type()), stream); auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); @@ -204,7 +202,7 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg, } void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, - butil::IOBufBytesIterator& io_buffer_itr, + butil::IOBufBytesIterator& io_buffer_itr, // NOLINT const platform::DeviceContext& ctx) { const auto place = ctx.GetPlace(); framework::LoDTensor* tensor = var->GetMutable(); @@ -229,30 +227,30 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, // IO Buffer if (platform::is_cpu_place(place)) { - unsigned long data_len; - io_buffer_itr.copy_and_forward((void*)(&data_len), 8); + unsigned long data_len; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(tensor_data, data_len); } else if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA - unsigned long data_len; - char* temp_ptr = - new char[tensor->numel() * framework::SizeOfType(tensor->type())]; - 
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); - io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); + unsigned long data_len; // NOLINT + char* temp_ptr = new char[tensor->numel() * + framework::SizeOfType(tensor->type())]; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT + io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, - platform::CPUPlace(), (void*)temp_ptr, - tensor->numel() * framework::SizeOfType(tensor->type()), - stream); + memory::Copy( + place, tensor_data, platform::CPUPlace(), (void*)temp_ptr, // NOLINT + tensor->numel() * framework::SizeOfType(tensor->type()), stream); delete[] temp_ptr; #endif } } -void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, - butil::IOBufBytesIterator& io_buffer_itr, - const platform::DeviceContext& ctx) { +void DeserializeSelectedRows( + framework::Variable* var, const VarMsg& msg, + butil::IOBufBytesIterator& io_buffer_itr, // NOLINT + const platform::DeviceContext& ctx) { const auto place = ctx.GetPlace(); auto* slr = var->GetMutable(); framework::Tensor* tensor = slr->mutable_value(); @@ -269,20 +267,19 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, tensor->mutable_data(place, VarMessageToVarType(msg.data_type())); // IO Buffer if (platform::is_cpu_place(place)) { - unsigned long data_len; - io_buffer_itr.copy_and_forward((void*)(&data_len), 8); + unsigned long data_len; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(tensor_data, data_len); } else if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA - char* temp_ptr = - new char[tensor->numel() * framework::SizeOfType(tensor->type())]; - unsigned long data_len; - io_buffer_itr.copy_and_forward((void*)(&data_len), 8); + char* temp_ptr = new char[tensor->numel() * + framework::SizeOfType(tensor->type())]; // NOLINT + unsigned long data_len; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(temp_ptr, data_len); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, - platform::CPUPlace(), temp_ptr, + memory::Copy(place, tensor_data, platform::CPUPlace(), temp_ptr, tensor->numel() * framework::SizeOfType(tensor->type()), stream); delete[] temp_ptr; diff --git a/paddle/fluid/distributed/service/heter_client.cc b/paddle/fluid/distributed/service/heter_client.cc index 13016d60515dd..95023704f9d51 100644 --- a/paddle/fluid/distributed/service/heter_client.cc +++ b/paddle/fluid/distributed/service/heter_client.cc @@ -44,8 +44,7 @@ int GetMicroId(const platform::DeviceContext& ctx, auto stream = reinterpret_cast(ctx).stream(); memory::Copy( - platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(), tensor->numel() * framework::SizeOfType(tensor->type()), stream); float* temp_ptr_float = reinterpret_cast(temp_ptr); micro_id = static_cast(temp_ptr_float[0]); diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc index 1f66596a0b578..ffd76c5bda621 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -43,7 +43,7 @@ class 
TensorAddFunctor : public boost::static_visitor<> { TensorAddFunctor(int64_t numel, const T* x, T* y) : numel_(numel), x_(x), y_(y) {} - void operator()(const paddle::platform::CPUPlace& place) { + void operator()(const paddle::platform::CPUPlace& place) const { paddle::platform::CPUDeviceContext* ctx = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); @@ -56,7 +56,7 @@ class TensorAddFunctor : public boost::static_visitor<> { // TODO(jiabin): Support xpu here from gradient_accumulator.cc #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void operator()(const paddle::platform::CUDAPlace& place) { + void operator()(const paddle::platform::CUDAPlace& place) const { paddle::platform::CUDADeviceContext* ctx = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); @@ -66,7 +66,7 @@ class TensorAddFunctor : public boost::static_visitor<> { blas.AXPY(numel_, 1., x_, y_); } #else - void operator()(const paddle::platform::CUDAPlace& place) { + void operator()(const paddle::platform::CUDAPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -76,7 +76,7 @@ class TensorAddFunctor : public boost::static_visitor<> { // TODO(jiabin): Support Npu here from gradient_accumulator.cc // there is NO blas in CUDAPinnedPlace - void operator()(const paddle::platform::CUDAPinnedPlace& place) { + void operator()(const paddle::platform::CUDAPinnedPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -84,14 +84,14 @@ class TensorAddFunctor : public boost::static_visitor<> { } #ifdef PADDLE_WITH_ASCEND_CL - void operator()(const paddle::platform::NPUPlace& place) { + void operator()(const paddle::platform::NPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } #else - void operator()(const paddle::platform::NPUPlace& place) { + void operator()(const paddle::platform::NPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -100,14 +100,14 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_XPU - void operator()(const paddle::platform::XPUPlace& place) { + void operator()(const paddle::platform::XPUPlace& place) const { paddle::platform::XPUDeviceContext* ctx = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); xpu::add(ctx->x_context(), x_, y_, y_, static_cast(numel_)); } #else - void operator()(const paddle::platform::XPUPlace& place) { + void operator()(const paddle::platform::XPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -116,14 +116,14 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_MLU - void operator()(const paddle::platform::MLUPlace& place) { + void operator()(const paddle::platform::MLUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } #else - void operator()(const paddle::platform::MLUPlace& place) { + void operator()(const paddle::platform::MLUPlace& place) const { 
PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -132,14 +132,14 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_IPU - void operator()(const paddle::platform::IPUPlace& place) { + void operator()(const paddle::platform::IPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } #else - void operator()(const paddle::platform::IPUPlace& place) { + void operator()(const paddle::platform::IPUPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -147,7 +147,7 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif - void operator()(const paddle::platform::NPUPinnedPlace& place) { + void operator()(const paddle::platform::NPUPinnedPlace& place) const { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -157,7 +157,7 @@ class TensorAddFunctor : public boost::static_visitor<> { private: int64_t numel_; const T* x_; - T* y_; + mutable T* y_; }; template @@ -218,7 +218,7 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { if (data_type == paddle::framework::DataTypeTrait::DataType()) { \ TensorAddFunctor func(numel, src_tensor->data(), \ dst_tensor->mutable_data()); \ - boost::apply_visitor(func, place); \ + paddle::platform::VisitPlace(place, func); \ return; \ } @@ -294,7 +294,7 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { TensorAddFunctor func( \ numel, src_tensor.data(), \ dst_tensor->mutable_data(place)); \ - boost::apply_visitor(func, place); \ + paddle::platform::VisitPlace(place, func); \ return; \ } diff --git a/paddle/fluid/eager/legacy/op_runner.cc b/paddle/fluid/eager/legacy/op_runner.cc index 4dab96c53eca4..305d66d134c36 100644 --- a/paddle/fluid/eager/legacy/op_runner.cc +++ b/paddle/fluid/eager/legacy/op_runner.cc @@ -150,24 +150,21 @@ void RunOp(const std::string& type, const NameTensorMap& ins, VLOG(6) << "Get Device id"; if (paddle::platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::platform::SetDeviceId( - BOOST_GET_CONST(paddle::platform::CUDAPlace, place).device); + paddle::platform::SetDeviceId(place.device); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU if use CUDAPlace.")); #endif } else if (paddle::platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU - paddle::platform::SetXPUDeviceId( - BOOST_GET_CONST(paddle::platform::XPUPlace, place).device); + paddle::platform::SetXPUDeviceId(place.device); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif } else if (paddle::platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL - paddle::platform::SetNPUDeviceId( - BOOST_GET_CONST(paddle::platform::NPUPlace, place).device); + paddle::platform::SetNPUDeviceId(place.device); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU if use NPUPlace.")); diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index fbf2d678740ab..bd7e5c549872d 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ 
b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -116,7 +116,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_XPU - if (is_xpu_place(expected_kernel_key.place_) && + if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()))) { @@ -129,7 +129,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs, #endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && - is_npu_place(expected_kernel_key.place_)) { + paddle::platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "missing NPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index f447a00f37c80..d06f5a0227af7 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -22,7 +22,7 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, << " dst_place: " << dst_place; PADDLE_ENFORCE_NE( - in.place().which(), dst_place.which(), + in.place().GetType(), dst_place.GetType(), platform::errors::Unavailable("Currently, model parallelism is only " "supported between CPU and CUDA.")); diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index f93202769dbd0..633963d1793d3 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -181,7 +182,7 @@ void AllReduceOpHandle::AllReduceFunc( const framework::proto::VarType::Type &dtype, int64_t numel, const std::vector &places, const std::vector &out_var_names) { - if (is_gpu_place(places[0])) { + if (platform::is_gpu_place(places[0])) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::InvalidArgument( @@ -200,7 +201,7 @@ void AllReduceOpHandle::AllReduceFunc( PADDLE_THROW( platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif - } else if (is_xpu_place(places[0])) { + } else if (platform::is_xpu_place(places[0])) { #if defined(PADDLE_WITH_XPU_BKCL) PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_, platform::errors::InvalidArgument( @@ -286,7 +287,7 @@ void AllReduceOpHandle::NCCLAllReduceFunc( void AllReduceOpHandle::SyncNCCLAllReduce() { if (FLAGS_sync_nccl_allreduce) { for (auto &p : places_) { - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device; + int dev_id = p.device; auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_); auto &nccl_ctx = nccl_ctxs->at(dev_id); diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc index 6ce1eac2e30d2..0d8f71a7555ec 100644 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc +++ 
b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc @@ -46,7 +46,7 @@ BindThreadedSSAGraphExecutor::BindThreadedSSAGraphExecutor( } int index = 0; for (uint32_t i = 0; i < places.size(); i++) { - int id = BOOST_GET_CONST(platform::XPUPlace, places_[i]).device; + int id = places_[i].device; if (place_to_index_.find(id) == place_to_index_.end()) { place_to_index_[id] = index; index++; @@ -145,8 +145,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream( RunMultiDeviceOpAsync(cur_op, op_deps.get(), ready_ops); continue; } else { - cur_place = - BOOST_GET_CONST(platform::XPUPlace, dev_ctxes_.begin()->first); + cur_place = dev_ctxes_.begin()->first; int cur_index = place_to_index_[cur_place.device]; RunOpAsyncMainStream(cur_op, op_deps.get(), ready_ops, cur_index); } diff --git a/paddle/fluid/framework/details/bkcl_op_handle.h b/paddle/fluid/framework/details/bkcl_op_handle.h index f863cb123a8af..1a098f06f08f9 100644 --- a/paddle/fluid/framework/details/bkcl_op_handle.h +++ b/paddle/fluid/framework/details/bkcl_op_handle.h @@ -85,7 +85,7 @@ class BKCLOpHandleBase : public OpHandleBase { platform::errors::InvalidArgument( "The argument run_order_ must be >= 0, but got %d.", run_order_)); auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_); - int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + int dev_id = place.device; auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id); auto comm = bkcl_ctx.comm_; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 01dc5a45146f1..e8fa500e094b3 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -83,8 +84,7 @@ void BroadcastOpHandle::BroadcastOneVar( } else if (platform::is_gpu_place(in_tensor.place())) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) VarHandle *out_handle = nullptr; - int root_id = - BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device; + int root_id = in_tensor.place().device; std::vector> broadcast_calls; int type = platform::ToNCCLDataType(in_tensor.type()); @@ -94,8 +94,7 @@ void BroadcastOpHandle::BroadcastOneVar( Variable *out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - int dst_id = - BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place()).device; + int dst_id = out_var_handle->place().device; auto &nccl_ctx = nccl_ctxs_->at(dst_id); @@ -145,7 +144,7 @@ void BroadcastOpHandle::BroadcastOneVar( } else { #if defined(PADDLE_WITH_XPU_BKCL) VarHandle *out_handle = nullptr; - int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device; + int root_id = in_tensor.place().device; std::vector> broadcast_calls; int type = platform::ToBKCLDataType(in_tensor.type()); @@ -155,8 +154,7 @@ void BroadcastOpHandle::BroadcastOneVar( Variable *out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - int dst_id = - BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device; + int dst_id = out_var_handle->place().device; auto &bkcl_ctx = bkcl_ctxs_->at(dst_id); @@ -232,7 +230,7 @@ void BroadcastOpHandle::InitOutputValue( PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound( "Variable %s is not found 
in scopes.", out_var_handle->name())); - if (is_gpu_place(in_tensor.place())) { + if (platform::is_gpu_place(in_tensor.place())) { PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true, platform::errors::PreconditionNotMet( "Places of input and output must be all on GPU.")); diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index bcdd6129230b0..59614e89c1344 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -46,8 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast(gc_)) { - platform::CUDADeviceGuard guard( - BOOST_GET_CONST(platform::CUDAPlace, place).device); + platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); @@ -72,7 +71,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( EagerDeletionOpHandle::~EagerDeletionOpHandle() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (event_) { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); + auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); @@ -85,8 +84,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { void EagerDeletionOpHandle::InitCUDA() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int dev_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device; + int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif } diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index b65d4e4fcd55a..af1b73f40be53 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/device_memory_aligment.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(skip_fused_all_reduce_check, false, ""); @@ -102,7 +103,7 @@ void FusedAllReduceOpHandle::RunImpl() { gpuStream_t compute_stream{nullptr}; if (FLAGS_allreduce_record_one_event) { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]); + auto gpu_place = platform::CUDAPlace(places_[0].GetDeviceId()); compute_stream = platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream(); auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); @@ -291,7 +292,7 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace( var, platform::errors::NotFound( "The variable '%s' is not found in local scope.", var_name)); auto &lod_tensor = var->Get(); - if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) { + if (!platform::is_same_place(lod_tensor.place(), places_.at(scope_idx))) { return true; } } diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index a5787ac39665c..db3eaece3569f 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -354,7 +354,7 @@ void CheckVarHasNanOrInf(const 
std::string& op_type, float* cpu_data = new float[tensor->numel()]; memory::Copy(platform::CPUPlace(), static_cast(cpu_data), - BOOST_GET_CONST(platform::XPUPlace, tensor->place()), + tensor->place(), static_cast(tensor->data()), tensor->numel() * sizeof(float)); bool flag = false; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 8255707654416..bf38a56dc9372 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -132,7 +132,7 @@ void TensorCheckerVisitor::apply( auto* dev_ctx = reinterpret_cast( platform::DeviceContextPool::Instance().Get(tensor_.place())); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, tensor_.place()).device; + int dev_id = tensor_.place().device; PADDLE_ENFORCE_EQ( (dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()), true, platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d", diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 324d39ed8bb77..09372a8ba05b0 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -102,7 +102,7 @@ class NCCLOpHandleBase : public OpHandleBase { } for (auto& p : dev_ctxes_) { - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; + int dev_id = p.first.device; if (inter_events_.find(dev_id) != inter_events_.end()) { continue; } @@ -133,7 +133,7 @@ class NCCLOpHandleBase : public OpHandleBase { platform::errors::InvalidArgument( "The argument run_order_ must be >= 0, but got %d.", run_order_)); auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto& nccl_ctx = flat_nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -181,7 +181,7 @@ class NCCLOpHandleBase : public OpHandleBase { void InterReduce(platform::Place place, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -213,7 +213,7 @@ class NCCLOpHandleBase : public OpHandleBase { PADDLE_ENFORCE_NOT_NULL( nccl_ctxs_, platform::errors::NotFound( "Can't get exter %d nccl contexts.", run_order_)); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -246,7 +246,7 @@ class NCCLOpHandleBase : public OpHandleBase { void InterBroadCast(platform::Place place, void* sendbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index fe21a62efd087..faaeeaeecb11f 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -47,7 +47,7 @@ 
OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { void OpHandleBase::InitCUDA() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (auto &p : dev_ctxes_) { - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; + int dev_id = p.first.device; platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -61,9 +61,7 @@ void OpHandleBase::InitCUDA() { for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); if (out_var_handle) { - int dev_id = - BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place()) - .device; + int dev_id = out_var_handle->place().device; out_var_handle->SetGenerateEvent(events_.at(dev_id)); } } @@ -74,7 +72,7 @@ void OpHandleBase::InitCUDA() { "Operator %s should have only one dev_ctx, but got %d.", Name(), dev_ctxes_.size())); auto &place = dev_ctxes_.begin()->first; - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); if (out_var_handle) { @@ -109,7 +107,7 @@ void OpHandleBase::InitXPU() { platform::errors::InvalidArgument( "%s should have only one dev_ctx.", Name())); auto &place = dev_ctxes_.begin()->first; - int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + int dev_id = place.device; platform::SetXPUDeviceId(dev_id); for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); @@ -309,7 +307,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; + auto dev_id = p.first.device; auto *cuda_dev_ctx = static_cast(p.second); VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id; #ifdef PADDLE_WITH_HIP @@ -332,8 +330,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, } else { auto *ctx = dev_ctxes_.at(p); auto *cuda_ctx = static_cast(ctx); - cuda_ctx->RecordEvent( - events_.at(BOOST_GET_CONST(platform::CUDAPlace, p).device), callback); + cuda_ctx->RecordEvent(events_.at(p.device), callback); } #else callback(); diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 35834fe5d7480..51063f68d4cbd 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -45,7 +45,7 @@ static std::vector> SeparateMultiDevicesGraph( for (auto &op : op_handles) { auto &dev_ctx = op->DeviceContext(); auto &p = dev_ctx.begin()->first; - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device; + int dev_id = p.device; auto &dev_dummys = graphs[dev_id]->Get(kGraphDepVars); graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release()); diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 196f7a3d4a4bf..6493ef540ccbe 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" PADDLE_DEFINE_EXPORTED_bool( @@ -125,7 +126,8 @@ void 
ReduceOpHandle::RunImpl() { // TODO(gongwb): add cpu support if (collective_context.endpoints_.size() <= 1 || - is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) { + platform::is_cpu_place(in_places[0]) || + platform::is_cpu_place(t_out_p)) { GatherLocalSelectedRowsFunctor functor( in_selected_rows, in_places, dev_ctxes_, t_out_p, out_var->GetMutable()); @@ -172,13 +174,13 @@ void ReduceOpHandle::RunImpl() { out_var_handle->place(), pre_in.type()); auto out_p = out_var_handle->place(); - int root_id = BOOST_GET_CONST(platform::CUDAPlace, out_p).device; + int root_id = out_p.device; std::vector> all_reduce_calls; for (size_t i = 0; i < var_scopes.size(); ++i) { auto &p = in_places[i]; auto &lod_tensor = *lod_tensors[i]; - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device; + int dev_id = p.device; auto &nccl_ctx = nccl_ctxs_->at(dev_id); void *buffer = const_cast(lod_tensor.data()); @@ -218,13 +220,13 @@ void ReduceOpHandle::RunImpl() { out_var_handle->place(), pre_in.type()); auto out_p = out_var_handle->place(); - int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device; + int root_id = out_p.device; std::vector> all_reduce_calls; for (size_t i = 0; i < var_scopes.size(); ++i) { auto &p = in_places[i]; auto &lod_tensor = *lod_tensors[i]; - int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device; + int dev_id = p.device; auto &bkcl_ctx = bkcl_ctxs_->at(dev_id); void *buffer = const_cast(lod_tensor.data()); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 1e3cd4f0aa77c..a2f7cc6fcecbf 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -61,8 +61,8 @@ struct ScaleLossGradFunctor { } else if (platform::is_xpu_place(place_)) { #if defined(PADDLE_WITH_XPU) OutT cast_coeff = static_cast(coeff_); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), out_data, - platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_)); + memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff, + SizeOfType(out_dtype_)); VLOG(10) << place_ << "RUN Scale loss grad op"; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -73,9 +73,8 @@ struct ScaleLossGradFunctor { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data, - platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_), - stream); + memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff, + SizeOfType(out_dtype_), stream); VLOG(10) << place_ << "RUN Scale loss grad op"; #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index aa942415fb404..3d6322b8c4179 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -86,8 +86,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( void ShareTensorBufferOpHandle::InitCUDA() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int dev_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device; + int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif } diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc 
b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index ed485ed587c0b..1ab944720f8f4 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -165,7 +165,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { in_numel)); out_numel = (out_numel == 0) ? static_cast(out.numel()) : out_numel; - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + int dev_id = place.device; auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false); auto &nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 56c88e9d25a91..8207855501384 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -106,9 +106,12 @@ struct EnforceShapeAndDTypeEQVisitor { void operator()(const LoDTensor& src) { auto& tensor = dst_->Get(); - PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(), - platform::errors::PreconditionNotMet( - "The place type of the two variables is not equal.")); + PADDLE_ENFORCE_EQ( + src.place().GetType(), tensor.place().GetType(), + platform::errors::PreconditionNotMet( + "The place type of the two variables is not equal. The src place " + "is %s, but the dst place is %s", + src.place().DebugString(), tensor.place().DebugString())); PADDLE_ENFORCE_EQ(src.type(), tensor.type(), platform::errors::PreconditionNotMet( "The dtype of the two variables is not equal.")); @@ -127,9 +130,12 @@ struct EnforceShapeAndDTypeEQVisitor { void operator()(const SelectedRows& src) { auto& selected_rows = dst_->Get(); - PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(), - platform::errors::PreconditionNotMet( - "The place type of the two variables is not equal.")); + PADDLE_ENFORCE_EQ( + src.place().GetType(), selected_rows.place().GetType(), + platform::errors::PreconditionNotMet( + "The place type of the two variables is not equal. 
The src place " + "is %s, but the dst place is %s", + src.place().DebugString(), selected_rows.place().DebugString())); PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(), platform::errors::PreconditionNotMet( "The dtype of the two variables is not equal.")); diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 95913664961b3..ef705aae1572b 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -138,7 +138,7 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { // init device, DLDevice type with device_type and device_id auto place = tensor.place(); - t_.device = boost::apply_visitor(internal::DLDeviceVisitor(), place); + t_.device = paddle::platform::VisitPlace(place, internal::DLDeviceVisitor()); // init dtype t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 9b8bdebe706eb..8639caf4dac90 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -63,8 +63,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) { CHECK_EQ(0, dl_tensor.device.device_id); } else if (platform::is_gpu_place(place)) { CHECK_EQ(kDLGPU, dl_tensor.device.device_type); - CHECK_EQ(BOOST_GET_CONST(platform::CUDAPlace, place).device, - dl_tensor.device.device_id); + CHECK_EQ(place.device, dl_tensor.device.device_id); } else if (platform::is_cuda_pinned_place(place)) { CHECK_EQ(kDLCPUPinned, dl_tensor.device.device_type); CHECK_EQ(0, dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index d669f2ab11d6c..5596aba52131b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -72,7 +72,7 @@ Executor::~Executor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_, this); + platform::ClearMKLDNNCache(place_, this); #endif } @@ -443,31 +443,26 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, if (platform::is_gpu_place(place_)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new UnsafeFastGPUGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); + gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); } else { - gc.reset(new DefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); + gc.reset(new DefaultStreamGarbageCollector(place_, max_memory_size)); } #else PADDLE_THROW( platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle")); #endif } else if (platform::is_cpu_place(place_)) { - gc.reset(new CPUGarbageCollector( - BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size)); + gc.reset(new CPUGarbageCollector(place_, max_memory_size)); } else if (platform::is_xpu_place(place_)) { #ifdef PADDLE_WITH_XPU - gc.reset(new XPUGarbageCollector( - BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size)); + gc.reset(new XPUGarbageCollector(place_, max_memory_size)); #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); #endif } else if (platform::is_ipu_place(place_)) { #ifdef PADDLE_WITH_IPU - gc.reset(new IPUGarbageCollector( - BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size)); + gc.reset(new 
IPUGarbageCollector(place_, max_memory_size)); #else PADDLE_THROW( platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle")); @@ -476,16 +471,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #ifdef PADDLE_WITH_ASCEND_CL if (IsFastEagerDeletionModeEnabled()) { VLOG(4) << "Use unsafe fast gc for NPU."; - gc.reset(new NPUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Please set FLAGS_fast_eager_deletion_mode=true to use " "GarbageCollector on NPU.")); // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. VLOG(4) << "Use default stream gc for NPU."; - gc.reset(new NPUDefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size)); } #else PADDLE_THROW( @@ -494,11 +487,9 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, } else if (platform::is_mlu_place(place_)) { #ifdef PADDLE_WITH_MLU if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new MLUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size)); + gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size)); } else { - gc.reset(new MLUDefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size)); + gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size)); } #else PADDLE_THROW( diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 0ef77a0387671..aea479ed0b214 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -137,8 +137,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, const int expand_embed_dim, const int64_t total_length) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); float** gpu_values = reinterpret_cast(buf_value->ptr()); @@ -203,8 +202,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, int total_len) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512), @@ -225,8 +223,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, const int hidden_size, const int expand_embed_dim, const int64_t total_length, const int batch_size) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; for (int i = 1; i < slot_lengths_lod.size(); i++) { diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index f42b0395eaf49..6f7009f4d5143 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -45,7 +45,7 @@ void 
BoxWrapper::PullSparseCase(const paddle::platform::Place& place, } else if (platform::is_gpu_place(place)) { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + int device_id = place.GetDeviceId(); LoDTensor& total_keys_tensor = keys_tensor[device_id]; uint64_t* total_keys = reinterpret_cast( total_keys_tensor.mutable_data({total_length, 1}, place)); @@ -131,7 +131,7 @@ void BoxWrapper::PushSparseGradCase( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + int device_id = place.GetDeviceId(); LoDTensor& cached_total_keys_tensor = keys_tensor[device_id]; uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); @@ -143,8 +143,7 @@ void BoxWrapper::PushSparseGradCase( push_boxps_timer.Start(); int ret = boxps_ptr_->PushSparseGPU( total_keys, reinterpret_cast(total_grad_values_gpu), - static_cast(total_length), - BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId()); + static_cast(total_length), place.GetDeviceId()); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "PushSparseGPU failed in BoxPS.")); push_boxps_timer.Pause(); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index f90027556342d..4fddfca5d805a 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -764,8 +764,7 @@ void FleetWrapper::PushDenseVarsAsync( LoDTensor* pin_tensor = pin_var->GetMutable(); float* pin_g = pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); - memory::Copy(platform::CUDAPinnedPlace(), pin_g, - BOOST_GET_CONST(platform::CUDAPlace, place), g_data, + memory::Copy(platform::CUDAPinnedPlace(), pin_g, place, g_data, sizeof(float) * count, stream); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); @@ -821,8 +820,7 @@ void FleetWrapper::PushDenseVarsAsync( LoDTensor* pin_tensor = pin_var->GetMutable(); float* pin_g = pin_tensor->mutable_data(tensor->dims(), platform::CPUPlace()); - memory::Copy(platform::CPUPlace(), pin_g, - BOOST_GET_CONST(platform::XPUPlace, place), g_data, + memory::Copy(platform::CPUPlace(), pin_g, place, g_data, sizeof(float) * count); float* g = pin_g; diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 66f0d116f2412..5b54aa03bb30a 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -116,14 +116,12 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, tensor->numel() * SizeOfType(tensor->type())); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - memory::Copy(platform::CPUPlace(), data_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), + memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), tensor->data(), tensor->numel() * SizeOfType(tensor->type()), nullptr); #endif #ifdef PADDLE_WITH_XPU - memory::Copy(platform::CPUPlace(), data_ptr, - BOOST_GET_CONST(platform::XPUPlace, tensor->place()), + memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), tensor->data(), tensor->numel() * SizeOfType(tensor->type())); #endif } @@ -158,8 
+156,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->mutable_data(place, ToVarType(req_var.data_type())); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, - platform::CPUPlace(), req_var.data().data(), + memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), tensor->numel() * SizeOfType(tensor->type()), stream); #else memcpy(tensor_data, req_var.data().data(), @@ -197,8 +194,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->mutable_data(place, ToVarType(req_var.data_type())); #ifdef PADDLE_WITH_XPU - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data, - platform::CPUPlace(), req_var.data().data(), + memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), tensor->numel() * SizeOfType(tensor->type())); #else memcpy(tensor_data, req_var.data().data(), diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 2b712d8cc5db8..31a30f72e3aa6 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -791,7 +791,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, "Warning:: CPUPlace is not supported in GpuPs now.")); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; uint64_t* total_keys = reinterpret_cast( @@ -859,7 +859,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); } else if (platform::is_gpu_place(place)) { - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index b4fa09653a391..6a78a617b1fef 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -113,8 +113,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, const int hidden_size, const int64_t total_length) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); float** gpu_values = reinterpret_cast(buf_value->ptr()); @@ -132,8 +131,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, const int64_t* gpu_len, int slot_num, int total_len) { auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>( origin_keys, total_keys, gpu_len, slot_num, total_len); @@ -148,8 +146,7 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, const int64_t total_length, const int batch_size) { auto stream = 
dynamic_cast( - platform::DeviceContextPool::Instance().Get( - BOOST_GET_CONST(platform::CUDAPlace, place))) + platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; for (int i = 1; i < slot_lengths_lod.size(); i++) { diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index b2d976fea0476..22f77be850555 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -101,7 +101,7 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, } StreamGarbageCollector::~StreamGarbageCollector() { - auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace()); + auto place = this->dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(place.device); platform::GpuStreamSync(stream_); platform::GpuDestroyStream(stream_); @@ -186,7 +186,7 @@ MLUStreamGarbageCollector::MLUStreamGarbageCollector( } MLUStreamGarbageCollector::~MLUStreamGarbageCollector() { - auto place = BOOST_GET_CONST(platform::MLUPlace, this->dev_ctx_->GetPlace()); + auto place = this->dev_ctx_->GetPlace(); platform::MLUDeviceGuard guard(place.device); PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_)); PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_)); diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 69a4a180a9071..a4e582c8fed13 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -46,8 +46,8 @@ void SetMicroId(paddle::framework::Scope* scope, temp_ptr_float[0] = micro_id; auto stream = reinterpret_cast(*dev_ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, - platform::CPUPlace(), reinterpret_cast(temp_ptr), + memory::Copy(place, tensor_data, platform::CPUPlace(), + reinterpret_cast(temp_ptr), tensor->numel() * framework::SizeOfType(tensor->type()), stream); #endif diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 3ed886e874db0..01430781c64cd 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -117,12 +117,12 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { #ifdef PADDLE_WITH_CUDA auto stream = copy_streams_[num]; auto event = events_[num]; - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + auto dev_id = place.device; platform::CUDADeviceGuard guard(dev_id); #endif #ifdef PADDLE_WITH_XPU - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto dev_id = place.device; platform::XPUDeviceGuard guard(dev_id); #endif @@ -173,13 +173,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, thread_tensor->mutable_data(root_tensor->dims(), thread_place); T* root_ptr = root_tensor->data(); if (platform::is_cpu_place(root_tensor->place())) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, - platform::CPUPlace(), root_ptr, + memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr, sizeof(T) * root_tensor->numel(), stream); } else { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, - BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()), - root_ptr, sizeof(T) * root_tensor->numel(), stream); + memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr, + sizeof(T) * root_tensor->numel(), stream); } } #endif @@ -193,13 
+191,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, thread_tensor->mutable_data(root_tensor->dims(), thread_place); T* root_ptr = root_tensor->data(); if (platform::is_cpu_place(root_tensor->place())) { - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, - platform::CPUPlace(), root_ptr, + memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr, sizeof(T) * root_tensor->numel()); } else { - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, - BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()), - root_ptr, sizeof(T) * root_tensor->numel()); + memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr, + sizeof(T) * root_tensor->numel()); } } #endif @@ -286,7 +282,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { (context->ops_).push_back(local_op_ptr); } #ifdef PADDLE_WITH_CUDA - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + auto dev_id = place.device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); @@ -336,15 +332,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, _ForEachDataType_(MergeCallback); if (!platform::is_cpu_place(thread_tensor->place())) { #ifdef PADDLE_WITH_CUDA - auto dev_id = - BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; + auto dev_id = thread_tensor->place().device; platform::CUDADeviceGuard guard(dev_id); cudaMemset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); #endif #ifdef PADDLE_WITH_XPU auto place = thread_tensor->place(); - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto dev_id = place.device; platform::XPUDeviceGuard guard(dev_id); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -364,15 +359,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, merge_var); if (!platform::is_cpu_place(root_tensor->place())) { #ifdef PADDLE_WITH_CUDA - auto dev_id = - BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; + auto dev_id = root_tensor->place().device; platform::CUDADeviceGuard guard(dev_id); cudaMemset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); #endif #ifdef PADDLE_WITH_XPU auto place = root_tensor->place(); - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto dev_id = place.device; platform::XPUDeviceGuard guard(dev_id); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -442,7 +436,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, (context->ops_).push_back(local_op_ptr); } #ifdef PADDLE_WITH_CUDA - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + auto dev_id = place.device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 1199d251d2a18..2c10a68188eb4 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -67,7 +67,7 @@ Graph *Pass::Apply(Graph *graph) const { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // Passes can change params, tensors, so caching need to be discarded - ClearMKLDNNCache(paddle::platform::CPUPlace()); + platform::ClearMKLDNNCache(paddle::platform::CPUPlace()); #endif VLOG(10) << "finish to apply pass " << Type() << " to graph"; 
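// Illustrative sketch (not part of the diff): the pattern applied throughout the
// files above assumes the unified platform::Place (now an alias of pten::Place)
// exposes the device id directly, so BOOST_GET_CONST(platform::CUDAPlace, place).device
// and friends reduce to plain member access. Names below mirror the patch.
//
//   platform::Place place = platform::CUDAPlace(0);   // typed place converts to Place
//   int dev_id = place.GetDeviceId();                 // no BOOST_GET_CONST needed
//   if (platform::is_gpu_place(place)) {
//     platform::CUDADeviceGuard guard(place.device);  // .device remains accessible
//   }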
return graph; diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index d1aee6cb2f662..0fd67efc177b3 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -32,10 +32,8 @@ namespace framework { inline paddle::optional OptionalCUDAPlace( const paddle::memory::allocation::AllocationPtr &gpu_) { - return gpu_ == nullptr - ? paddle::none - : paddle::optional( - BOOST_GET_CONST(platform::CUDAPlace, gpu_->place())); + return gpu_ == nullptr ? paddle::none + : paddle::optional(gpu_->place()); } // Vector implements the std::vector interface, and can get Data or @@ -369,11 +367,11 @@ class Vector { // get cuda ptr. immutable const T *CUDAData(platform::Place place) const { { + platform::CUDAPlace p(place.GetDeviceId()); auto &mtx = m_.Data().Mutex(); std::lock_guard guard(mtx); auto cuda_place = m_.Data().CUDAPlace(); - if (cuda_place == paddle::none || - cuda_place == BOOST_GET(platform::CUDAPlace, place)) { + if (cuda_place == paddle::none || cuda_place == p) { return m_.Data().CUDAData(place); } } @@ -385,11 +383,11 @@ class Vector { // get cuda ptr. mutable T *CUDAMutableData(platform::Place place) { { + platform::CUDAPlace p(place.GetDeviceId()); auto &mtx = m_.Data().Mutex(); std::lock_guard guard(mtx); auto cuda_place = m_.Data().CUDAPlace(); - if (cuda_place == paddle::none || - cuda_place == BOOST_GET(platform::CUDAPlace, place)) { + if (cuda_place == paddle::none || cuda_place == p) { return m_.MutableData()->CUDAMutableData(place); } } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 9bd6aba3ea842..ece4815858640 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -131,7 +131,7 @@ NaiveExecutor::~NaiveExecutor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_, this); + platform::ClearMKLDNNCache(place_, this); #endif } diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 8df8db35592bb..95eee77d36288 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -43,7 +43,7 @@ class ProfilerGuard { void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); #endif diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc index 4965f7b720c1d..7dac6a092d245 100644 --- a/paddle/fluid/framework/op_kernel_type.cc +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -22,7 +22,7 @@ namespace framework { size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { int cur_loc = 0; - int place = key.place_.which(); + int place = static_cast(key.place_.GetType()); cur_loc += OpKernelType::kPlaceBits; int data_type = static_cast(key.data_type_) << cur_loc; diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 2979750fba792..3879a7957600d 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -27,7 +27,7 @@ TEST(OpKernelType, ToString) { LibraryType::kCUDNN); 
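// Illustrative sketch (not part of the diff) for the op_kernel_type changes nearby:
// the unified Place is assumed to report its kind via GetType() -- an enum value that
// replaces boost::variant::which() in OpKernelType::Hash -- and to print as
// "Place(cpu)" / "Place(gpu:0)", which is what the updated test strings expect.
//
//   platform::Place cpu = platform::CPUPlace();
//   platform::Place gpu = platform::CUDAPlace(0);
//   int place_tag = static_cast<int>(gpu.GetType());  // feeds the kernel-type hash
//   std::string s = gpu.DebugString();                 // "Place(gpu:0)" (assumed format)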
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), - "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type[" + "data_type[float]:data_layout[NCHW]:place[Place(cpu)]:library_type[" "CUDNN]"); using CUDAPlace = paddle::platform::CUDAPlace; @@ -35,7 +35,7 @@ TEST(OpKernelType, ToString) { LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" - "CUDAPlace(0)]:library_" + "Place(gpu:0)]:library_" "type[CUDNN]"); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ff12edb72c06a..e3f0fbbdfdc4a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -210,7 +210,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with CUDA support.", place)); #else - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + auto dev_id = place.device; platform::SetDeviceId(dev_id); #endif } else if (platform::is_xpu_place(place)) { @@ -220,7 +220,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with XPU support.", place)); #else - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto dev_id = place.device; platform::SetXPUDeviceId(dev_id); #endif } else if (platform::is_npu_place(place)) { @@ -230,7 +230,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with NPU support.", place)); #else - auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + auto dev_id = place.device; platform::SetNPUDeviceId(dev_id); #endif } else if (platform::is_mlu_place(place)) { @@ -240,7 +240,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with MLU support.", place)); #else - auto dev_id = BOOST_GET_CONST(platform::MLUPlace, place).device; + auto dev_id = place.device; platform::SetMLUDeviceId(dev_id); #endif } @@ -1330,7 +1330,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { } #endif #ifdef PADDLE_WITH_XPU - if (is_xpu_place(expected_kernel_key.place_) && + if (platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || paddle::platform::is_in_xpu_black_list(type_))) { @@ -1343,7 +1343,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && - is_npu_place(expected_kernel_key.place_)) { + platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "missing NPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; @@ -1353,7 +1353,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_MLU if (kernel_iter == kernels.end() && - is_mlu_place(expected_kernel_key.place_)) { + platform::is_mlu_place(expected_kernel_key.place_)) { VLOG(3) << "missing MLU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 9a38a2d5d6fe8..d6c1c4cb6acc0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -500,11 +500,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { if 
(platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new UnsafeFastGPUGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); + gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size)); } else { - gc.reset(new StreamGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); + gc.reset(new StreamGarbageCollector(place, max_memory_size)); } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; #else @@ -515,11 +513,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } else if (platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new MLUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size)); + gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size)); } else { - gc.reset(new MLUStreamGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size)); + gc.reset(new MLUStreamGarbageCollector(place, max_memory_size)); } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; #else @@ -529,8 +525,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { #endif } else if (platform::is_xpu_place(place)) { #if defined(PADDLE_WITH_XPU) - gc.reset(new XPUGarbageCollector( - BOOST_GET_CONST(platform::XPUPlace, place), max_memory_size)); + gc.reset(new XPUGarbageCollector(place, max_memory_size)); VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -538,8 +533,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (platform::is_cpu_place(place)) { - gc.reset(new CPUGarbageCollector( - BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size)); + gc.reset(new CPUGarbageCollector(place, max_memory_size)); VLOG(10) << "Created GarbageCollector at " << place; } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -609,10 +603,9 @@ void InitP2P(const std::vector &places) { std::vector devices; for (int i = 0; i < count; i++) { - if (!is_gpu_place(places[i])) return; + if (!platform::is_gpu_place(places[i])) return; - platform::CUDAPlace device = - BOOST_GET_CONST(platform::CUDAPlace, places[i]); + platform::CUDAPlace device = places[i]; devices.push_back(device.GetDeviceId()); } @@ -655,9 +648,9 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { - PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), - platform::errors::Unavailable( - "NPU is not supported in ParallelExecutor")); + PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]), + true, platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor.")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index b13aaadc81661..62d6ba0973547 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -135,13 +135,11 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { LoDTensor* tensor = var->GetMutable(); float* w = 
tensor->data(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, - platform::CUDAPinnedPlace(), pin_w, + memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), pin_w, sizeof(float) * tensor->numel(), copy_streams_[i]); #endif #ifdef PADDLE_WITH_XPU - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w, - platform::CPUPlace(), pin_w, + memory::Copy(places_[i], w, platform::CPUPlace(), pin_w, sizeof(float) * tensor->numel()); #endif } diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 64d8332e22327..1f821720d64d2 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -224,23 +224,20 @@ void SectionWorker::TrainFiles() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new UnsafeFastGPUGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); + gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); } } #elif defined(PADDLE_WITH_ASCEND_CL) if (IsFastEagerDeletionModeEnabled()) { VLOG(4) << "Use unsafe fast gc for NPU."; - gc.reset(new NPUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Please set FLAGS_fast_eager_deletion_mode=true to use " "GarbageCollector on NPU.")); // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. VLOG(4) << "Use default stream gc for NPU."; - gc.reset(new NPUDefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size)); } #endif } // max_memory_size >= 0 diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 3e4beb9498cf7..3634ccca95126 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -25,13 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace platform { -class DeviceContext; -class Place; -} // namespace platform -} // namespace paddle +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 84334417dc7da..4298b159ead52 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -76,34 +76,28 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, #endif if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_IPU else if (platform::is_ipu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && platform::is_ipu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_ipu_place(src_place) && platform::is_ipu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && platform::is_xpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_xpu_place(src_place) && platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -111,8 +105,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, << dst_place; return; } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else { PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -124,9 +117,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, platform::is_cpu_place(dst_place)) { auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, - stream); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { @@ -136,13 +127,11 @@ void TensorCopyImpl(const TENSOR& src, const 
platform::Place& dst_place, npu_pinned_tensor.Resize(src.dims()); auto npu_pinned_ptr = npu_pinned_tensor.mutable_data(npu_pinned_place, src.type()); - memory::Copy(npu_pinned_place, npu_pinned_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); // 2. async copy npu pinned tensor -> npu tensor memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - npu_pinned_place, npu_pinned_ptr, size, + dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size, reinterpret_cast(ctx).stream()); // 3. record event @@ -165,22 +154,19 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, - stream); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); } else if (platform::is_npu_pinned_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { /* npu_pinned->npu */ - auto src_npu_pinned_place = - BOOST_GET_CONST(platform::NPUPinnedPlace, src_place); - auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, dst_place); + auto src_npu_pinned_place = src_place; + auto dst_npu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Device context place mismatch. When copying Tensor " "data from NPU Pinned memory to NPU memory, current " "device context place should be NPU.")); - auto ctx_npu_place = BOOST_GET_CONST(platform::NPUPlace, ctx_place); + auto ctx_npu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_npu_place, ctx_npu_place, platform::errors::PreconditionNotMet( "The target NPU device and current device context do " @@ -194,16 +180,15 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_npu_place(src_place) && // NOLINT platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */ - auto src_npu_place = BOOST_GET_CONST(platform::NPUPlace, src_place); - auto dst_npu_pinned_place = - BOOST_GET_CONST(platform::NPUPinnedPlace, dst_place); + auto src_npu_place = src_place; + auto dst_npu_pinned_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Device context place mismatch. 
When copying Tensor " "data from NPU memory to NPU Pinned memory, current " "device context place should be NPU.")); - auto ctx_npu_place = BOOST_GET_CONST(platform::NPUPlace, ctx_place); + auto ctx_npu_place = ctx_place; PADDLE_ENFORCE_EQ(src_place, ctx_npu_place, platform::errors::PreconditionNotMet( "The source NPU device and current device context do " @@ -223,32 +208,27 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cpu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, platform::errors::Unavailable( "Source place and context place do not match, source " @@ -260,15 +240,15 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_cpu_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, platform::errors::Unavailable( "Destination place and context place do not match, " @@ -280,16 +260,15 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_cuda_pinned_place = - BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cuda_pinned_place = dst_place; auto ctx_place = ctx.GetPlace(); 
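// Illustrative sketch (not part of the diff) of the copy-path simplification in
// tensor_util.cc: memory::Copy is assumed to accept the unified Place for both
// endpoints, and a Place carrying a GPU id still converts to platform::CUDAPlace
// where a typed place is required, so the device-match checks below keep working.
//
//   void CopyGpuToHost(const platform::Place& src_place, const void* src_ptr,
//                      void* dst_ptr, size_t size) {
//     platform::CUDAPlace src_gpu = src_place;               // Place -> CUDAPlace
//     memory::Copy(platform::CPUPlace(), dst_ptr, src_gpu,   // dst/src passed directly
//                  src_ptr, size, /*stream=*/nullptr);
//   }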
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Device context place mismatch. When copying Tensor " "data from GPU memory to CUDA Pinned memory, current " "device context place should be GPU.")); - auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, platform::errors::PreconditionNotMet( "The source GPU device and current device context do " @@ -303,16 +282,15 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_cuda_pinned_place = - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_cuda_pinned_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true, platform::errors::PreconditionNotMet( "Device context place mismatch. When copying Tensor " "data from CUDA Pinned memory to GPU memory, current " "device context place should be GPU.")); - auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, platform::errors::PreconditionNotMet( "The target GPU device and current device context do " @@ -326,8 +304,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx_place), true, @@ -362,24 +340,24 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto src_mlu_place = BOOST_GET_CONST(platform::MLUPlace, src_place); - auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); + auto src_mlu_place = src_place; + auto dst_cpu_place = dst_place; auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_mlu_place(dst_place)) { - auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); - auto dst_mlu_place = BOOST_GET_CONST(platform::MLUPlace, dst_place); + auto src_cpu_place = src_place; + auto dst_mlu_place = dst_place; auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (platform::is_mlu_place(src_place) && // NOLINT platform::is_mlu_place(dst_place)) { - auto src_mlu_place = BOOST_GET_CONST(platform::MLUPlace, src_place); - auto dst_mlu_place = BOOST_GET_CONST(platform::MLUPlace, dst_place); + auto src_mlu_place = src_place; + auto dst_mlu_place = dst_place; auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream); @@ -451,18 +429,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && 
platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_IPU else if (platform::is_ipu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_ipu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -471,13 +446,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { @@ -486,12 +459,9 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); - platform::XPUPlace xpu_dst_place = - BOOST_GET_CONST(platform::XPUPlace, dst_place); - platform::XPUPlace xpu_src_place = - BOOST_GET_CONST(platform::XPUPlace, src_place); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + platform::XPUPlace xpu_dst_place = dst_place; + platform::XPUPlace xpu_src_place = src_place; if (xpu_dst_place.device == xpu_src_place.device) { auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place); xpu_ctx->Wait(); @@ -505,15 +475,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { /* cpu -> npu*/ - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_npu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { /* npu -> npu*/ @@ -522,9 +488,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } - 
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( @@ -534,50 +498,42 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CUDAPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cpu_place = dst_place; memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_cpu_place = src_place; + auto dst_gpu_place = dst_place; memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_gpu_place = dst_place; memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_gpu_place(dst_place)) { - auto src_pinned_place = - BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); + auto src_pinned_place = src_place; + auto dst_gpu_place = dst_place; memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr); } @@ -589,15 +545,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), 
dst_ptr, - BOOST_GET_CONST(platform::MLUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_mlu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else if (platform::is_mlu_place(src_place) && // NOLINT platform::is_mlu_place(dst_place)) { @@ -606,9 +558,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } - memory::Copy(BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::MLUPlace, src_place), src_ptr, size, - nullptr); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( @@ -1015,8 +965,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - BOOST_GET_CONST(platform::CUDAPlace, tensor.place()), + memory::Copy(cpu, buf.get(), tensor.place(), reinterpret_cast(data), size_to_write, gpu_dev_ctx.stream()); gpu_dev_ctx.Wait(); @@ -1038,8 +987,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - BOOST_GET_CONST(platform::XPUPlace, tensor.place()), + memory::Copy(cpu, buf.get(), tensor.place(), reinterpret_cast(data), size_to_write); xpu_dev_ctx.Wait(); os.write(buf.get(), size_to_write); @@ -1060,8 +1008,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - BOOST_GET_CONST(platform::MLUPlace, tensor.place()), + memory::Copy(cpu, buf.get(), tensor.place(), reinterpret_cast(data), size_to_write, mlu_dev_ctx.stream()); mlu_dev_ctx.Wait(); @@ -1083,8 +1030,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - BOOST_GET_CONST(platform::NPUPlace, tensor.place()), + memory::Copy(cpu, buf.get(), tensor.place(), reinterpret_cast(data), size_to_write, npu_dev_ctx.stream()); npu_dev_ctx.Wait(); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 355be39baa2a5..3cb3c733f4042 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -153,14 +153,12 @@ void TensorFromArray(const T* src, const size_t& array_size, auto size = array_size * sizeof(T); if (platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, - src_ptr, size, + dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -176,8 +174,7 @@ void 
TensorFromArray(const T* src, const size_t& array_size, // 2. async copy npu pinned tensor -> npu tensor memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - npu_pinned_place, npu_pinned_ptr, size, + dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size, reinterpret_cast(ctx).stream()); // 3. record event @@ -205,14 +202,12 @@ void TensorFromVector(const std::vector& src, auto size = src.size() * sizeof(T); if (platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, - src_ptr, size, + dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -233,8 +228,7 @@ void TensorFromVector(const std::vector& src, // 2. async copy npu pinned tensor -> npu tensor memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - npu_pinned_place, npu_pinned_ptr, size, + dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size, reinterpret_cast(ctx).stream()); // 3. record event @@ -252,8 +246,7 @@ void TensorFromVector(const std::vector& src, #ifdef PADDLE_WITH_MLU if (platform::is_mlu_place(dst_place)) { memory::Copy( - BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, src_place, - src_ptr, size, + dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -280,14 +273,12 @@ inline void TensorFromVector(const std::vector& src, auto size = src.size() * sizeof(bool); if (platform::is_cpu_place(dst_place)) { - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, - src_ptr, size, + dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -303,8 +294,7 @@ inline void TensorFromVector(const std::vector& src, // 2. async copy npu pinned tensor -> npu tensor memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - npu_pinned_place, npu_pinned_ptr, size, + dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size, reinterpret_cast(ctx).stream()); // 3. 
record event @@ -362,37 +352,29 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, auto dst_ptr = static_cast(dst->data()); if (platform::is_cpu_place(src.place())) { - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), - src_ptr, size, + dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) else if (platform::is_xpu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, - size, nullptr); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), - src_ptr, size, + dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -412,37 +394,29 @@ inline void TensorToVector(const Tensor& src, auto dst_ptr = static_cast(array); if (platform::is_cpu_place(src.place())) { - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), - src_ptr, size, + dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) else if (platform::is_xpu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr, - size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, - size, nullptr); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), - src_ptr, size, + dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif @@ -467,8 +441,7 @@ void TensorToVector(const Tensor& src, std::vector* dst) { "The input tensor should be CPU device, but actually it is in %s.", src.place())); - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } template <> @@ -488,8 +461,7 @@ inline void TensorToVector(const Tensor& src, std::vector* dst) { "The input tensor should be CPU device, but actually it is in %s.", src.place())); - memory::Copy(dst_place, dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src.place()), 
src_ptr, size); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); for (unsigned int i = 0; i < src.numel(); i++) { (*dst)[i] = static_cast(array[i]); diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 2072c41673aaf..f08dd59e39206 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -86,7 +86,7 @@ void BKCLParallelContext::Init() { } BcastBKCLId(bkcl_ids, 0); - int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + int xpu_id = place_.device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id @@ -111,7 +111,7 @@ void BKCLParallelContext::InitWithRingID(int ring_id) { } BcastBKCLId(bkcl_ids, 0); - int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + int xpu_id = place_.device; VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id << " ring id: " << ring_id; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index d98609273a61f..2056b8622052b 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -78,7 +78,7 @@ class TensorAddFunctor : public boost::static_visitor<> { TensorAddFunctor(int64_t numel, const T* x, T* y) : numel_(numel), x_(x), y_(y) {} - void operator()(const platform::CPUPlace& place) { + void operator()(const platform::CPUPlace& place) const { platform::CPUDeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); auto blas = operators::math::GetBlas(*ctx); @@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> { } #ifdef PADDLE_WITH_XPU - void operator()(const platform::XPUPlace& place) { + void operator()(const platform::XPUPlace& place) const { using XPUType = typename XPUTypeTrait::Type; platform::XPUDeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -100,7 +100,7 @@ class TensorAddFunctor : public boost::static_visitor<> { r, XPUAPIErrorMsg[r])); } #else - void operator()(const platform::XPUPlace& place) { + void operator()(const platform::XPUPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -109,7 +109,7 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void operator()(const platform::CUDAPlace& place) { + void operator()(const platform::CUDAPlace& place) const { platform::CUDADeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -117,7 +117,7 @@ class TensorAddFunctor : public boost::static_visitor<> { blas.AXPY(numel_, 1., x_, y_); } #else - void operator()(const platform::CUDAPlace& place) { + void operator()(const platform::CUDAPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -126,7 +126,7 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_MLU - void operator()(const platform::MLUPlace& place) { + void operator()(const platform::MLUPlace& place) const { // TODO(fwg): SUPPORT it PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place 
(%s) " @@ -134,7 +134,7 @@ class TensorAddFunctor : public boost::static_visitor<> { place)); } #else - void operator()(const platform::MLUPlace& place) { + void operator()(const platform::MLUPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -143,7 +143,7 @@ class TensorAddFunctor : public boost::static_visitor<> { #endif #ifdef PADDLE_WITH_ASCEND_CL - void operator()(const platform::NPUPlace& place) { + void operator()(const platform::NPUPlace& place) const { // TODO(zhiqiu): SUPPORT it PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " @@ -151,7 +151,7 @@ class TensorAddFunctor : public boost::static_visitor<> { place)); } #else - void operator()(const platform::NPUPlace& place) { + void operator()(const platform::NPUPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -159,21 +159,21 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif - void operator()(const platform::NPUPinnedPlace& place) { + void operator()(const platform::NPUPinnedPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } // there is NO blas in CUDAPinnedPlace - void operator()(const platform::CUDAPinnedPlace& place) { + void operator()(const platform::CUDAPinnedPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", place)); } // there is NO support in IPUPlace - void operator()(const platform::IPUPlace& place) { + void operator()(const platform::IPUPlace& place) const { PADDLE_THROW(platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " "is not supported in imperative mode", @@ -183,7 +183,7 @@ class TensorAddFunctor : public boost::static_visitor<> { private: int64_t numel_; const T* x_; - T* y_; + mutable T* y_; }; #ifdef PADDLE_WITH_XPU @@ -248,7 +248,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { TensorAddFunctor func( \ numel, src_tensor.data(), \ dst_tensor->mutable_data(place)); \ - boost::apply_visitor(func, place); \ + platform::VisitPlace(place, func); \ return; \ } diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 818b2f424b6af..7292c0f82fced 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -86,7 +86,7 @@ void HCCLParallelContext::Init() { } BcastHCCLId(hccl_ids, 0, server_fd); - int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device; + int npu_id = place_.device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { VLOG(0) << "init hccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id @@ -96,10 +96,10 @@ void HCCLParallelContext::Init() { &hccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, npu_id, ring_id); - compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::NPUPlace, place_).device)); - comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::NPUPlace, place_).device)); + compute_events_.emplace_back( + platform::NpuEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + 
platform::NpuEventResourcePool::Instance().New(place_.device)); } } @@ -117,7 +117,7 @@ void HCCLParallelContext::InitWithRingID(int ring_id) { } BcastHCCLId(hccl_ids, 0, server_fd); - int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device; + int npu_id = place_.device; VLOG(0) << "init hccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id << " ring id: " << ring_id; @@ -125,10 +125,10 @@ void HCCLParallelContext::InitWithRingID(int ring_id) { platform::HCCLCommContext::Instance().CreateHCCLComm( &hccl_ids[0], strategy_.nranks_, strategy_.local_rank_, npu_id, ring_id); - compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::NPUPlace, place_).device)); - comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::NPUPlace, place_).device)); + compute_events_.emplace_back( + platform::NpuEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::NpuEventResourcePool::Instance().New(place_.device)); } void HCCLParallelContext::AllReduceByStream(const framework::Variable &src, diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index cc7fcf455a13d..d2c63d5b21008 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -193,7 +193,7 @@ void VarBase::ClearGradient(bool set_to_zero) { grad_var_->MutableVar()->GetMutable(); if (grad_t->mutable_value()->IsInitialized()) { #ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place()); + if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place()); #endif grad_t->mutable_rows()->clear(); grad_t->mutable_value()->clear(); @@ -211,7 +211,7 @@ void VarBase::ClearGradient(bool set_to_zero) { grad_t->clear(); } #ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place()); + if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place()); #endif } } diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 1b50c515635d2..066d0db134817 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -77,7 +77,7 @@ void NCCLParallelContext::Init() { } BcastNCCLId(nccl_ids, 0, server_fd); - int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + int gpu_id = place_.device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id @@ -88,10 +88,9 @@ void NCCLParallelContext::Init() { ring_id); compute_events_.emplace_back( - platform::CudaEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)); - comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + platform::CudaEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::CudaEventResourcePool::Instance().New(place_.device)); } } @@ -111,7 +110,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { } BcastNCCLId(nccl_ids, 0, server_fd); - int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + int gpu_id = place_.device; VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; @@ -119,10 +118,10 @@ void 
NCCLParallelContext::InitWithRingID(int ring_id) { platform::NCCLCommContext::Instance().CreateComm( &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); - compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)); - comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + compute_events_.emplace_back( + platform::CudaEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::CudaEventResourcePool::Instance().New(place_.device)); } void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 15a278c2e6464..6474f3c07fa16 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -194,7 +194,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_XPU - if (is_xpu_place(expected_kernel_key.place_) && + if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()))) { @@ -207,7 +207,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && - is_npu_place(expected_kernel_key.place_)) { + paddle::platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "missing NPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; @@ -217,7 +217,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_MLU if (kernel_iter == kernels.end() && - is_mlu_place(expected_kernel_key.place_)) { + paddle::platform::is_mlu_place(expected_kernel_key.place_)) { VLOG(3) << "missing MLU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index beddbd5d12008..0c9bedf3dca32 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -835,7 +835,7 @@ void Reducer::MarkGroupReady(size_t group_index) { // thrown in comm_pool_. 
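For readers tracking the mechanical change here: pten::Place carries both the allocation type and the device index, which is why the BOOST_GET_CONST casts collapse to direct member access such as place_.device. A minimal illustrative sketch (not part of the patch; constructors and accessors follow the memcpy.cc changes later in this diff):

#include "paddle/pten/common/place.h"

int main() {
  // A device-less place and a device-bound place.
  pten::Place cpu(pten::AllocationType::CPU);
  pten::Place gpu0(pten::AllocationType::GPU, /*device_id=*/0);

  // The device index lives on the place itself, so the old
  // BOOST_GET_CONST(platform::CUDAPlace, place).device pattern becomes:
  int dev_id = gpu0.device;  // also exposed as gpu0.GetDeviceId()
  bool is_gpu = gpu0.GetType() == pten::AllocationType::GPU;

  return (dev_id == 0 && is_gpu &&
          cpu.GetType() == pten::AllocationType::CPU) ? 0 : 1;
}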
auto next_group = next_group_; comm_pool_->enqueue([this, run_order, next_group, &group] { - auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + auto dev_id = place_.device; platform::SetXPUDeviceId(dev_id); FusedAllReduceSchedule(run_order, group, next_group); { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7ed9f08906a73..f4e535de108a6 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -87,8 +87,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( std::unique_ptr gc; if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gc.reset(new framework::DefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::CUDAPlace, place), 0)); + gc.reset(new framework::DefaultStreamGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else @@ -98,8 +97,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( #endif } else if (platform::is_cuda_pinned_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gc.reset(new framework::CUDAPinnedGarbageCollector( - BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0)); + gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else @@ -110,8 +108,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( #endif } else if (platform::is_xpu_place(place)) { #if defined(PADDLE_WITH_XPU) - gc.reset(new framework::XPUGarbageCollector( - BOOST_GET_CONST(platform::XPUPlace, place), 0)); + gc.reset(new framework::XPUGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -119,14 +116,12 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (platform::is_cpu_place(place)) { - gc.reset(new framework::CPUGarbageCollector( - BOOST_GET_CONST(platform::CPUPlace, place), 0)); + gc.reset(new framework::CPUGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; } else if (platform::is_npu_place(place)) { #if defined(PADDLE_WITH_ASCEND_CL) // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. 
- gc.reset(new framework::NPUUnsafeFastGarbageCollector( - BOOST_GET_CONST(platform::NPUPlace, place), 0)); + gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -135,8 +130,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( #endif } else if (platform::is_mlu_place(place)) { #if defined(PADDLE_WITH_MLU) - gc.reset(new framework::MLUDefaultStreamGarbageCollector( - BOOST_GET_CONST(platform::MLUPlace, place), 0)); + gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -197,31 +191,28 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, try { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device); + platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU if use CUDAPlace.")); #endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU - platform::SetXPUDeviceId( - BOOST_GET_CONST(platform::XPUPlace, place).device); + platform::SetXPUDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif } else if (platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL - platform::SetNPUDeviceId( - BOOST_GET_CONST(platform::NPUPlace, place).device); + platform::SetNPUDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU if use NPUPlace.")); #endif } else if (platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU - platform::SetMLUDeviceId( - BOOST_GET_CONST(platform::MLUPlace, place).device); + platform::SetMLUDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with MLU if use MLUPlace.")); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d4b680288e347..a86329a2b2b25 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -127,7 +127,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto dst_gpu_place = place; memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), pt.data.data(), pt.data.length(), dev_ctx->stream()); @@ -137,7 +137,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, #endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU - auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place); + auto dst_xpu_place = place; memory::Copy(dst_xpu_place, static_cast(input_ptr), platform::CPUPlace(), pt.data.data(), pt.data.length()); #else @@ -954,14 +954,14 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( // model. 
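The predictor and tracer code in this area now branches on the generic place and reads the device id straight from it instead of casting to a concrete place type. A caller-side sketch of that pattern, assuming the platform::is_*_place helpers used throughout this patch and the platform place header added in best_fit_allocator.h below:

#include <string>
#include "paddle/fluid/platform/place.h"

// Map a generic place to a short human-readable tag, reading the device id
// directly from the place instead of casting to CUDAPlace/XPUPlace/NPUPlace.
std::string DescribePlace(const paddle::platform::Place& place) {
  if (paddle::platform::is_cpu_place(place)) {
    return "cpu";
  } else if (paddle::platform::is_gpu_place(place)) {
    return "gpu:" + std::to_string(static_cast<int>(place.GetDeviceId()));
  } else if (paddle::platform::is_xpu_place(place)) {
    return "xpu:" + std::to_string(static_cast<int>(place.GetDeviceId()));
  } else if (paddle::platform::is_npu_place(place)) {
    return "npu:" + std::to_string(static_cast<int>(place.GetDeviceId()));
  }
  return "other";
}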
res->SetPlace(PaddlePlace::kCPU); } else { - auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + auto xpu_place = place_; res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); } } else if (platform::is_npu_place(place_)) { - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); + auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); } else { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); + auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } return res; @@ -993,14 +993,14 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( // model. res->SetPlace(PaddlePlace::kCPU); } else { - auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + auto xpu_place = place_; res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); } } else if (platform::is_npu_place(place_)) { - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); + auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); } else { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); + auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } return res; @@ -1050,7 +1050,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (stream != nullptr) { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_); + auto gpu_place = place_; auto *dev_ctx = reinterpret_cast( pool.Get(gpu_place)); dev_ctx->SetThreadLocalStream(stream); @@ -1065,7 +1065,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_); + auto gpu_place = place_; auto *dev_ctx = static_cast( pool.Get(gpu_place)); #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c1a0cb4be4429..d1f49b84f0679 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -243,7 +243,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); - auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); + auto dst_gpu_place = place_; memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length(), dev_ctx->stream()); @@ -253,7 +253,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, #endif } else if (platform::is_xpu_place(place_)) { #ifdef PADDLE_WITH_XPU - auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + auto dst_xpu_place = place_; memory::Copy(dst_xpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length()); @@ -267,7 +267,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); - auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); + auto dst_npu_place = place_; memory::Copy(dst_npu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length(), dev_ctx->stream()); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc 
b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 2f2f4c0ead760..13b07a8e8fb7b 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -253,7 +253,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, t_place); + auto gpu_place = t_place; auto *dev_ctx = static_cast( pool.Get(gpu_place)); paddle::memory::Copy(paddle::platform::CPUPlace(), @@ -280,7 +280,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #endif } else if (place_ == PlaceType::kXPU) { #ifdef PADDLE_WITH_XPU - auto xpu_place = BOOST_GET_CONST(paddle::platform::XPUPlace, t_place); + auto xpu_place = t_place; paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), xpu_place, t_data, ele_num * sizeof(T)); @@ -293,7 +293,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #ifdef PADDLE_WITH_ASCEND_CL paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto npu_place = BOOST_GET_CONST(paddle::platform::NPUPlace, t_place); + auto npu_place = t_place; auto *dev_ctx = static_cast( pool.Get(npu_place)); paddle::memory::Copy(paddle::platform::CPUPlace(), diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 0d5cd29a0c579..27e3417933806 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -134,7 +134,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, "Lite::MemoryCopy CPU->GPU is not yet implemented.")); } else if (platform::is_gpu_place(dst_place) && platform::is_gpu_place(src_place)) { - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); + auto gpu_place = src_place; memory::Copy( gpu_place, dst_data, gpu_place, src_data, size, static_cast(ctx).stream()); diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 474b4fe3d4522..6615bdf4b138b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -813,8 +813,7 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( } #endif - platform::CUDAPlace cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, place); + platform::CUDAPlace cuda_place(place.GetDeviceId()); return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); } #endif @@ -838,8 +837,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } #endif - platform::CUDAPlace cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, place); + platform::CUDAPlace cuda_place(place.GetDeviceId()); return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); } #endif @@ -859,8 +857,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { } #endif - platform::CUDAPlace cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, place); + platform::CUDAPlace cuda_place(place.GetDeviceId()); return Release(cuda_place, m_->GetDefaultStream(cuda_place)); } #endif @@ -935,7 +932,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, } #endif - platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place); + platform::CUDAPlace p(place.GetDeviceId()); if (LIKELY(size > 0 
&& FLAGS_use_system_allocator == false)) { return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) ->Allocate(size); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 297d876178f3d..15a59fd7ed0c1 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -19,12 +19,7 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" - -namespace paddle { -namespace platform { -class Place; -} // namespace platform -} // namespace paddle +#include "paddle/fluid/platform//place.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index ff9bbf4ab3df8..6000e636dd523 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -34,7 +34,7 @@ namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } void CUDAAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( - BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, + allocation->place(), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. This may be a bug")); platform::RecordedGpuFree(allocation->ptr(), allocation->size(), diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index a6696634c12d4..8a84d9f201ef0 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -144,8 +144,8 @@ class CUDADeviceContextAllocatorPool { } AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { - auto iter = allocators_.find( - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace())); + auto iter = + allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId())); PADDLE_ENFORCE_NE( iter, allocators_.end(), platform::errors::NotFound("No allocator found for CUDAPlace.")); diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index 2ae2cf20ee6d4..17e0cc614d168 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -103,7 +103,7 @@ bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; } void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( - BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, + allocation->place(), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. 
This may be a bug")); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index ffe7ccf9190be..91358b688040a 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -26,6 +26,7 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" +#include "paddle/pten/common/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -791,7 +792,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { namespace allocation { pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { - void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); + void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size)); auto *tmp_alloc = new Allocation(ptr, size, place_); platform::MemEvenRecorder::Instance().PushMemRecord( static_cast(tmp_alloc), place_, size); @@ -799,16 +800,16 @@ pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { } void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) { - boost::apply_visitor( - legacy::FreeVisitor(allocation->ptr(), allocation->size()), - allocation->place()); + paddle::platform::VisitPlace( + allocation->place(), + legacy::FreeVisitor(allocation->ptr(), allocation->size())); platform::MemEvenRecorder::Instance().PopMemRecord( static_cast(allocation), place_); delete allocation; } uint64_t NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) { - return boost::apply_visitor(legacy::ReleaseVisitor(), place); + return paddle::platform::VisitPlace(place, legacy::ReleaseVisitor()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index d9fa7ec27fdde..a17c15c35d758 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -24,7 +24,7 @@ namespace allocation { bool NPUAllocator::IsAllocThreadSafe() const { return true; } void NPUAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( - BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, + allocation->place(), place_, platform::errors::PermissionDenied( "NPU memory is freed in incorrect device. 
This may be a bug")); platform::RecordedNPUFree(allocation->ptr(), allocation->size(), diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 05c6a7adaff8b..66ded146f047d 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -164,8 +164,7 @@ void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) { uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { std::lock_guard lock_guard(allocator_map_lock_); - std::vector& allocators = - allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)]; + std::vector& allocators = allocator_map_[place]; uint64_t released_size = 0; for (StreamSafeCUDAAllocator* allocator : allocators) { released_size += allocator->ProcessUnfreedAllocationsWithRelease(); @@ -192,7 +191,7 @@ uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { return underlying_allocator_->Release(place_); } -std::map> +std::map> StreamSafeCUDAAllocator::allocator_map_; SpinLock StreamSafeCUDAAllocator::allocator_map_lock_; diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index f54cdc749611a..7a89e0f6095a2 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -65,7 +65,7 @@ class StreamSafeCUDAAllocator : public Allocator { void ProcessUnfreedAllocations(); uint64_t ProcessUnfreedAllocationsWithRelease(); - static std::map> + static std::map> allocator_map_; static SpinLock allocator_map_lock_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc index 98af151007594..f125670a598bc 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc @@ -23,8 +23,7 @@ ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p) if (platform::is_gpu_place(place_)) { buddy_allocator_.reset(new memory::detail::BuddyAllocator( std::unique_ptr( - new memory::detail::GPUAllocator( - BOOST_GET_CONST(platform::CUDAPlace, place_).device)), + new memory::detail::GPUAllocator(place_.device)), platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); } else { PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 153e19a9f1450..f804c2af53916 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/pten/common/place.h" namespace paddle { namespace memory { @@ -29,6 +30,7 @@ void Copy(platform::CPUPlace, void* dst, VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); } + #ifdef PADDLE_WITH_IPU template <> void Copy(platform::IPUPlace dst_place, @@ -54,6 +56,61 @@ void Copy(platform::IPUPlace dst_place, if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } + +// NOTE: only for CPUPlace and IPUPlace. 
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU && + dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num); + } +} + +// NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). +template <> +void Copy(pten::IPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + if (src_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_src; + return Copy(dst_place, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + return Copy(dst_place, dst, place_src, src, num); + } +} + +// NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::IPUPlace src_place, + const void* src, size_t num) { + if (dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst; + return Copy(place_dst, dst, src_place, src, num); + } else if (dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, src_place, src, num); + } +} #endif #ifdef PADDLE_WITH_XPU @@ -92,6 +149,34 @@ void Copy(platform::XPUPlace dst_place, } platform::MemcpySyncD2D(dst, dst_place, src, src_place, num); } + +// NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace). +template <> +void Copy(pten::XPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + if (src_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_src; + return Copy(dst_place, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::XPU) { + platform::XPUPlace place_src(src_place.GetDeviceId()); + return Copy(dst_place, dst, place_src, src, num); + } +} + +// NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::XPUPlace src_place, + const void* src, size_t num) { + if (dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst; + return Copy(place_dst, dst, src_place, src, num); + } else if (dst_place.GetType() == pten::AllocationType::XPU) { + platform::XPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, src_place, src, num); + } +} #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -272,6 +357,128 @@ void Copy( } } +// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. 
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, aclrtStream stream) { + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::NPU) { + platform::NPUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::NPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::NPUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::NPU && + dst_place.GetType() == pten::AllocationType::NPU) { + platform::NPUPlace place_src(src_place.GetDeviceId()); + platform::NPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + platform::CPUPlace place_src; + platform::NPUPinnedPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst; + platform::NPUPinnedPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + platform::NPUPinnedPlace place_dst; + platform::NPUPinnedPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::NPU) { + platform::NPUPinnedPlace place_src; + platform::NPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::NPU && + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + platform::NPUPinnedPlace place_dst; + platform::NPUPlace place_src(src_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, aclrtStream stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). 
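Each block of concrete-place specializations that follows is a thin forwarding wrapper: it rebuilds the generic pten::Place from the concrete place and calls the single Place-based implementation, so existing call sites that still pass CPUPlace, NPUPlace, etc. keep compiling while the dispatch logic lives in one function. Reduced to its shape (illustrative sketch only; CopyImpl and CopyFromCpu are placeholder names, and only the CPU branch is filled in):

#include <cstring>
#include "paddle/pten/common/place.h"

// The one real implementation: dispatch on the generic places' GetType().
void CopyImpl(const pten::Place& dst_place, void* dst,
              const pten::Place& src_place, const void* src, size_t num) {
  if (src_place.GetType() == pten::AllocationType::CPU &&
      dst_place.GetType() == pten::AllocationType::CPU) {
    std::memcpy(dst, src, num);
  }
  // ... other AllocationType combinations elided ...
}

// Thin wrapper for call sites that still pass a concrete place: rebuild the
// generic place and forward, exactly as the specializations in this file do.
void CopyFromCpu(const pten::Place& dst_place, void* dst,
                 const pten::CPUPlace& src_place, const void* src, size_t num) {
  CopyImpl(dst_place, dst, pten::Place(src_place.GetType()), src, num);
}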
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + +// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) +template <> +void Copy(pten::NPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, aclrtStream stream) { + Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, + src_place, src, num, stream); +} + +// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::NPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + Copy(dst_place, dst, + pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, + stream); +} + +// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) +template <> +void Copy(pten::NPUPinnedPlace dst_place, + void* dst, pten::Place src_place, + const void* src, size_t num, + aclrtStream stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::NPUPinnedPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (NPUPinnedPlace) +template <> +void Copy(pten::NPUPinnedPlace dst_place, + void* dst, pten::Place src_place, + const void* src, size_t num) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); +} + +// NOTE: only for (NPUPinnedPlace) -> (CPUPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::NPUPinnedPlace src_place, + const void* src, size_t num) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, nullptr); +} #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -490,6 +697,128 @@ void Copy( } } +// NOTE: only for CPUPlace、CUDAPlace and CUDAPinnedPlace. 
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, gpuStream_t stream) { + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::GPU) { + platform::CUDAPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::GPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CUDAPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::GPU && + dst_place.GetType() == pten::AllocationType::GPU) { + platform::CUDAPlace place_src(src_place.GetDeviceId()); + platform::CUDAPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + platform::CPUPlace place_src; + platform::CUDAPinnedPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst; + platform::CUDAPinnedPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + platform::CUDAPinnedPlace place_dst; + platform::CUDAPinnedPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::GPU) { + platform::CUDAPinnedPlace place_src; + platform::CUDAPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::GPU && + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + platform::CUDAPinnedPlace place_dst; + platform::CUDAPlace place_src(src_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace). +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, gpuStream_t stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). 
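From the caller's side, the GPU overloads are reached with plain pten::Place values, the same way the predictor and operator changes earlier in this patch do it. A hedged usage sketch, assuming a CUDA build and the DeviceContextPool/stream pattern already used above:

#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"

// Copy a host buffer onto GPU 0 through the unified Place-based Copy.
void HostToDevice(void* gpu_dst, const std::vector<float>& host_src) {
  pten::Place dst(pten::AllocationType::GPU, /*device_id=*/0);
  pten::Place src(pten::AllocationType::CPU);

  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto* ctx =
      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(dst));

  // Same call shape as before, but no BOOST_GET_CONST cast on either place.
  paddle::memory::Copy(dst, gpu_dst, src, host_src.data(),
                       host_src.size() * sizeof(float), ctx->stream());
}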
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + gpuStream_t stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + +// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace) +template <> +void Copy(pten::GPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, gpuStream_t stream) { + Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, + src_place, src, num, stream); +} + +// NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::GPUPlace src_place, + const void* src, size_t num, + gpuStream_t stream) { + Copy(dst_place, dst, + pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, + stream); +} + +// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace) +template <> +void Copy(pten::GPUPinnedPlace dst_place, + void* dst, pten::Place src_place, + const void* src, size_t num, + gpuStream_t stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::GPUPinnedPlace src_place, + const void* src, size_t num, + gpuStream_t stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CUDAPinnedPlace) +template <> +void Copy(pten::GPUPinnedPlace dst_place, + void* dst, pten::Place src_place, + const void* src, size_t num) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); +} + +// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::GPUPinnedPlace src_place, + const void* src, size_t num) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, nullptr); +} #endif #ifdef PADDLE_WITH_MLU @@ -586,7 +915,130 @@ void Copy(platform::MLUPlace dst_place, } } +// NOTE: only for CPUPlace and MLUPlace. 
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, mluStream stream) { + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::MLU) { + platform::MLUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::MLU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::MLUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::MLU && + dst_place.GetType() == pten::AllocationType::MLU) { + platform::MLUPlace place_src(src_place.GetDeviceId()); + platform::MLUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace) +template <> +void Copy(pten::MLUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, mluStream stream) { + Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, + src_place, src, num, stream); +} + +// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace) +template <> +void Copy(pten::Place dst_place, void* dst, + pten::MLUPlace src_place, + const void* src, size_t num, + mluStream stream) { + Copy(dst_place, dst, + pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, + stream); +} + #endif // PADDLE_WITH_MLU +// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace. 
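This stream-less path is what lets TensorToVector at the top of this patch forward src.place() unchanged. A reduced sketch of that call shape, using a hypothetical helper and only the host-reachable combinations this overload covers (CPU, XPU, pinned):

#include <cstddef>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/pten/common/place.h"

// Copy `count` elements located at `src_ptr` on `src_place` into a host
// vector, mirroring how TensorToVector now passes the tensor's place through.
template <typename T>
std::vector<T> ToHostVector(const void* src_ptr, const pten::Place& src_place,
                            size_t count) {
  std::vector<T> out(count);
  pten::Place dst_place(pten::AllocationType::CPU);
  paddle::memory::Copy(dst_place, out.data(), src_place, src_ptr,
                       count * sizeof(T));
  return out;
}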
+template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + if (UNLIKELY(num == 0)) return; + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; + if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::CPU) { + std::memcpy(dst, src, num); + } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + std::memcpy(dst, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::CPU) { + std::memcpy(dst, src, num); + } else if (src_place.GetType() == pten::AllocationType::GPUPINNED && + dst_place.GetType() == pten::AllocationType::GPUPINNED) { + std::memcpy(dst, src, num); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + std::memcpy(dst, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::CPU) { + std::memcpy(dst, src, num); + } else if (src_place.GetType() == pten::AllocationType::NPUPINNED && + dst_place.GetType() == pten::AllocationType::NPUPINNED) { + std::memcpy(dst, src, num); + } +#endif +#ifdef PADDLE_WITH_XPU + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CPUPlace place_dst, place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::XPU) { + platform::XPUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::XPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::XPUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::XPU && + dst_place.GetType() == pten::AllocationType::XPU) { + platform::XPUPlace place_src(src_place.GetDeviceId()); + platform::XPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num); + } +#endif +} + +// NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num); +} + +// NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace). 
+template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num); +} + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 2776fe9c13132..0ac29e6d3ada7 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -16,12 +16,6 @@ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -namespace paddle { -namespace platform { -struct CUDAPlace; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { using framework::Tensor; diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu index 173e24b2f1450..32c90ff8fdc10 100644 --- a/paddle/fluid/operators/allclose_op.cu +++ b/paddle/fluid/operators/allclose_op.cu @@ -25,8 +25,7 @@ struct GetTensorValue { const framework::Tensor& tensor) const { const T* data = tensor.data(); T value; - const auto gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + const auto gpu_place = dev_ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); return value; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index f8c0426d7b1fb..2f6977b9e2da2 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -117,9 +117,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); } int64_t total_num = h_starts[xs_size]; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), - dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); // copy each tensor's data address to device auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); @@ -134,8 +133,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { h_xs[i] = xs[i]->data(); h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, - cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_xs, cpu_place, h_xs, + 2 * xs_size * sizeof(T*), dev_ctx.stream()); // Launch Kernel int threads_per_block = std::min(static_cast(1024), total_num); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 5d5e13e848a75..979ae5c508c6b 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -41,8 +41,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { MPDType cpu_scale_data; if (platform::is_xpu_place(scale->place())) { memory::Copy(platform::CPUPlace(), static_cast(&cpu_scale_data), - BOOST_GET_CONST(platform::XPUPlace, scale->place()), - static_cast(scale_data), sizeof(MPDType)); + scale->place(), static_cast(scale_data), + sizeof(MPDType)); } else { cpu_scale_data = (*scale_data); @@ -87,8 +87,7 @@ class CheckFiniteAndUnscaleXPUKernel : 
public framework::OpKernel { dev_ctx.Wait(); } memory::Copy(platform::CPUPlace(), &cpu_found_inf_data, - BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - found_inf_data, sizeof(bool)); + dev_ctx.GetPlace(), found_inf_data, sizeof(bool)); } if (cpu_found_inf_data) { @@ -142,9 +141,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { if (dev_ctx.x_context()->xpu_stream) { dev_ctx.Wait(); } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - found_inf_data, platform::CPUPlace(), &cpu_found_inf_data, - sizeof(bool)); + memory::Copy(dev_ctx.GetPlace(), found_inf_data, platform::CPUPlace(), + &cpu_found_inf_data, sizeof(bool)); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index ba8e2bd15874f..6d9cd96a3fb9a 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -114,9 +114,8 @@ class LazyZeros { for (int i = 0; i < xs_size; i++) { h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), - dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); // copy each tensor of "outs" data address array to device auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); @@ -128,9 +127,8 @@ class LazyZeros { for (size_t i = 0; i < xs_size; ++i) { h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*), - dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_out_addrs, cpu_place, h_out_addrs, + xs_size * sizeof(T*), dev_ctx.stream()); // launch cuda kernel int64_t total_num = h_starts[xs_size]; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 8160368d72ad1..6582be7354f63 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -187,9 +187,7 @@ class LazyZerosNPU { framework::TensorCopy(*x, place, dev_ctx, out); } else if (zero_ptr != dst_ptr) { auto size = out->numel() * framework::SizeOfType(out->type()); - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size, - stream); + memory::Copy(place, dst_ptr, place, zero_ptr, size, stream); } } } diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc index fa7985e186d58..fe03d93f4480f 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc @@ -43,8 +43,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { bool cpu_found_inf_data = false; if (platform::is_xpu_place(found_inf->place())) { memory::Copy(platform::CPUPlace(), - static_cast(&cpu_found_inf_data), - BOOST_GET_CONST(platform::XPUPlace, found_inf->place()), + static_cast(&cpu_found_inf_data), found_inf->place(), static_cast(found_inf_data), sizeof(bool)); } else { cpu_found_inf_data = (*found_inf_data); @@ -97,16 +96,16 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { MPDType cpu_pre_loss_scaling_data; if 
(platform::is_xpu_place(bad_in->place())) { memory::Copy(platform::CPUPlace(), static_cast(&cpu_bad_in_data), - BOOST_GET_CONST(platform::XPUPlace, bad_in->place()), - static_cast(bad_in_data), sizeof(int)); + bad_in->place(), static_cast(bad_in_data), + sizeof(int)); } else { cpu_bad_in_data = (*bad_in_data); } if (platform::is_xpu_place(good_in->place())) { memory::Copy(platform::CPUPlace(), static_cast(&cpu_good_in_data), - BOOST_GET_CONST(platform::XPUPlace, good_in->place()), - static_cast(good_in_data), sizeof(int)); + good_in->place(), static_cast(good_in_data), + sizeof(int)); } else { cpu_good_in_data = (*good_in_data); } @@ -114,7 +113,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { if (platform::is_xpu_place(pre_loss_scaling->place())) { memory::Copy( platform::CPUPlace(), static_cast(&cpu_pre_loss_scaling_data), - BOOST_GET_CONST(platform::XPUPlace, pre_loss_scaling->place()), + pre_loss_scaling->place(), static_cast(pre_loss_scaling_data), sizeof(MPDType)); } else { cpu_pre_loss_scaling_data = (*pre_loss_scaling_data); @@ -146,15 +145,13 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { } } // copy to device - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - bad_out_data, platform::CPUPlace(), &cpu_bad_out_data, - sizeof(int)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - good_out_data, platform::CPUPlace(), &cpu_good_out_data, - sizeof(int)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), - updated_loss_scaling_data, platform::CPUPlace(), - &cpu_updated_loss_scaling_data, sizeof(MPDType)); + memory::Copy(dev_ctx.GetPlace(), bad_out_data, platform::CPUPlace(), + &cpu_bad_out_data, sizeof(int)); + memory::Copy(dev_ctx.GetPlace(), good_out_data, platform::CPUPlace(), + &cpu_good_out_data, sizeof(int)); + memory::Copy(dev_ctx.GetPlace(), updated_loss_scaling_data, + platform::CPUPlace(), &cpu_updated_loss_scaling_data, + sizeof(MPDType)); } }; diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 09452b8f68baf..da5ee4dd82b4d 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -25,8 +25,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc index 449ae02ecbc19..5be1beaa3dfb2 100644 --- a/paddle/fluid/operators/assign_op_npu.cc +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -27,8 +27,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/assign_op_xpu.cc b/paddle/fluid/operators/assign_op_xpu.cc index 6255b5d341e09..26c879c3fb612 100644 --- a/paddle/fluid/operators/assign_op_xpu.cc +++ b/paddle/fluid/operators/assign_op_xpu.cc @@ -26,8 +26,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc index 1589f9e8911f3..1adad7837f2f0 100644 --- a/paddle/fluid/operators/assign_value_op.cc +++ b/paddle/fluid/operators/assign_value_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; 
namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct CPUPlace; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 3bffe0a05a8f7..17c0e035f4632 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -25,8 +25,7 @@ void GetAccumulators( auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); - auto cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, in_old_num_accumulates->place()); + auto cuda_place = in_old_num_accumulates->place(); memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place, in_old_num_accumulates->data(), sizeof(int64_t), stream); @@ -44,8 +43,7 @@ void SetAccumulators( auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); auto* out_num_accumulates = ctx.Output("out_num_accumulates"); auto* out_num_updates = ctx.Output("out_num_updates"); - auto cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, out_old_num_accumulates->place()); + auto cuda_place = out_old_num_accumulates->place(); memory::Copy(cuda_place, out_old_num_accumulates->data(), platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu index dde4dd2567b79..030f7cb7d7c33 100644 --- a/paddle/fluid/operators/bernoulli_op.cu +++ b/paddle/fluid/operators/bernoulli_op.cu @@ -57,8 +57,7 @@ class BernoulliOpKernel auto* out_data = out->mutable_data(ctx.GetPlace()); int64_t size = x->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu index 0bfddf8b5f386..43c16d607c2db 100644 --- a/paddle/fluid/operators/cholesky_op.cu +++ b/paddle/fluid/operators/cholesky_op.cu @@ -102,8 +102,7 @@ class CholeskyGPUKernel : public framework::OpKernel { std::vector error_info; // only for checking positive matrix error_info.resize(batch_count); - memory::Copy(platform::CPUPlace(), error_info.data(), - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), error_info.data(), dev_ctx.GetPlace(), info_ptr, sizeof(int) * batch_count, dev_ctx.stream()); for (int i = 0; i < batch_count; ++i) { diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index fad74b81e14e4..2d7800d9997fc 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -306,7 +306,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { num_classes, num_samples)); auto& dev_ctx = ctx.template device_context(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); int batch_size = label->numel(); // Algorithm: @@ -397,8 +397,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + 1) * vec_size; - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int 
device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && (!fix_seed)) { auto seed_offset = gen_cuda->IncrementOffset(offset); diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 226b2c5132318..314a91841bebf 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -33,7 +33,7 @@ class AllReduceOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, + PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true, platform::errors::PreconditionNotMet( "AllReduce op can run on gpu place only for now.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index 229d42e64e4e5..04d028536a9b2 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -34,7 +34,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { "The place of ExecutionContext should be CUDAPlace.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device; + int dev_id = ctx.GetPlace().device; int root_dev_id = ctx.Attr("root"); auto in = ctx.Input("X"); diff --git a/paddle/fluid/operators/collective/broadcast_op_xpu.cc b/paddle/fluid/operators/collective/broadcast_op_xpu.cc index e8566803aecfa..b3d4585da003d 100644 --- a/paddle/fluid/operators/collective/broadcast_op_xpu.cc +++ b/paddle/fluid/operators/collective/broadcast_op_xpu.cc @@ -40,7 +40,7 @@ class BKCLBroadcastOpKernel : public framework::OpKernel { "The place of ExecutionContext should be XPUPlace.")); #if defined(PADDLE_WITH_XPU_BKCL) - int dev_id = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()).device; + int dev_id = ctx.GetPlace().device; int root_dev_id = ctx.Attr("root"); auto in = ctx.Input("X"); diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 8bdbdfac8ffd1..4ea1876da2569 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index ec8d651819502..17b49eda2f804 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index b0aa51f7cfdfd..96da390d45db0 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
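The bernoulli and class_center_sample hunks above collapse BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId() into ctx.GetPlace().GetDeviceId(): the place now carries the device index itself, and that index selects a per-device random generator. A hedged standalone sketch of that lookup pattern follows; Place, Generator and GetDefaultGenerator are simplified stand-ins for the Paddle equivalents.

```cpp
#include <iostream>
#include <map>

// Simplified stand-ins for pten::Place and framework::GetDefaultCUDAGenerator.
struct Place {
  int device = 0;
  int GetDeviceId() const { return device; }
};

struct Generator {
  unsigned long long offset = 0;
  // Returns the new offset after reserving n random states, loosely
  // mirroring Generator::IncrementOffset in the kernels above.
  unsigned long long IncrementOffset(unsigned long long n) { return offset += n; }
};

// One generator per device, keyed by the id read straight off the place.
Generator& GetDefaultGenerator(int device_id) {
  static std::map<int, Generator> pool;
  return pool[device_id];
}

int main() {
  Place gpu_place{1};
  int device_id = gpu_place.GetDeviceId();   // no variant cast needed any more
  auto& gen = GetDefaultGenerator(device_id);
  std::cout << gen.IncrementOffset(16) << "\n";  // 16
}
```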
*/ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index 9d913b12b1376..75a484ef87166 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc index 7935a1f722e55..4eca34fb50707 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc index 2f16a89c217da..bded82229619d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index 3ad078e1c8ff0..c49e72eac2326 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc index 1a78427cd19ee..74acbacf2b94e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc index 92ba00428065b..a1d439cfdae62 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
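The recurring deletions of struct CPUPlace; / struct CUDAPlace; / struct XPUPlace; in the collective-op files above (and in many files below) follow from the aliasing: once platform::CPUPlace and friends name the pten place types rather than classes defined inside paddle::platform, a stray forward declaration of struct CPUPlace; in that namespace would conflict. A tiny sketch of the conflict, assuming the alias is introduced with a using declaration as the commit description suggests:

```cpp
namespace pten {
class CPUPlace {};
}  // namespace pten

namespace platform {
// What the patch (roughly) switches to: the platform name is now an alias.
using CPUPlace = pten::CPUPlace;

// struct CPUPlace;   // ill-formed now: redeclares the alias as a class,
//                    // which is why the operator files drop these lines.
}  // namespace platform

int main() {
  platform::CPUPlace cpu;  // still spelled the same way at call sites
  (void)cpu;
  return 0;
}
```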
*/ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 18c317506c06e..72659282afa60 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -22,7 +22,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index 06e90cdff8045..cfd508be27fb1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc index e4ec538cd2323..bacdf7fb53c35 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index db9a8428e3d03..5820bd318d8bc 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -48,7 +48,7 @@ class CCommInitAllOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, + PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true, platform::errors::PreconditionNotMet( "CCommInitAllOp can run on gpu place only")); diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc index 86044b5ba1c1a..bdd904bf7be7a 100644 --- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -41,7 +41,7 @@ class CCommInitOpAscend : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_npu_place(place), true, + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, platform::errors::PreconditionNotMet( "CCommInitOpAscend can run on npu place only.")); @@ -54,7 +54,7 @@ class CCommInitOpAscend : public framework::OperatorBase { int rank_ids = Attr("rank_ids"); int rank_id = Attr("rank"); int rid = Attr("ring_id"); - int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + int device_id = place.device; if (Attr("device_id") >= 0) { device_id = Attr("device_id"); } diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 9bf86dc926773..56b0017fefe63 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -61,9 +61,10 @@ class CCommInitOp : public framework::OperatorBase { 
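Another recurring edit, visible in allreduce_op.h and the c_comm_init hunks above, is qualifying the place predicates: is_gpu_place(place) becomes platform::is_gpu_place(place), and likewise for is_npu_place and is_xpu_place. A plausible reason is argument-dependent lookup: the old place was a variant over structs living in paddle::platform, so the unqualified call was found via ADL; with the place type now defined under pten, ADL no longer searches paddle::platform and the call must be spelled out. A self-contained illustration, with the namespaces and the kind field as simplifications:

```cpp
#include <iostream>

namespace pten {
struct Place { int kind = 0; };  // stand-in for the new place type
}  // namespace pten

namespace platform {
using Place = pten::Place;  // platform::Place is now just an alias

// The helper still lives in platform, but its argument's real type lives in
// pten, so unqualified calls outside this namespace no longer find it.
inline bool is_gpu_place(const Place& p) { return p.kind == 1; }
}  // namespace platform

int main() {
  platform::Place place{1};
  // is_gpu_place(place);                              // error: ADL searches pten only
  std::cout << platform::is_gpu_place(place) << "\n";  // qualified call: prints 1
}
```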
"PaddlePaddle should be compiled with GPU or XPU.")); #endif - PADDLE_ENFORCE_EQ(is_gpu_place(place) || is_xpu_place(place), true, - platform::errors::PreconditionNotMet( - "CCommInitOp can run on gpu or xpu place only.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(place) || platform::is_xpu_place(place), true, + platform::errors::PreconditionNotMet( + "CCommInitOp can run on gpu or xpu place only.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) @@ -85,7 +86,7 @@ class CCommInitOp : public framework::OperatorBase { rid)); #endif - int device_id = BOOST_GET_CONST(Place, place).device; + int device_id = place.device; if (Attr("device_id") >= 0) { device_id = Attr("device_id"); } diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc index 41a07f9439951..8a5ed7d7bde9b 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc index e03da37360f47..9668c68c7da20 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 6d3af7bb5f258..82a10b24dab36 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc index 77bb96347f943..c2ecf2419a0b5 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc index 83f7fce1ec6b7..7f5b4cd3608ef 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc index 791e58d8493ce..b1136b796699c 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index c06b2683a6bbe..a0e0f8e92bdde 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -187,7 +187,7 @@ class CReduceOpASCENDKernel : public framework::OpKernel { reinterpret_cast(stream))); if (rank_id != root_id) { - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + auto npu_place = place; memory::Copy(npu_place, reinterpret_cast(out->data()), npu_place, reinterpret_cast(const_cast(in->data())), diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc index f6c1c5d50e864..a689b9db15aac 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc index 83db107b36faf..c3de32b9fbdb0 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc index e7e770e8ffdca..f6def80a19076 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc index e59ec85fefd13..b7f521b371ac8 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc @@ -24,7 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc index 39c8716a92a36..a4a651be3c5ee 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -struct CUDAPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc index a0ec4d2a99cd7..ec928bd6a095d 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -struct XPUPlace; + struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 72339bbd48752..8a4c1979adbbf 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -59,7 +59,7 @@ class CSyncCalcStreamKernel : public framework::OpKernel { #elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(is_npu_place(place), true, + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, platform::errors::PreconditionNotMet( "Sync stream op can run on npu place only for now.")); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 21bad096c2d49..893cc90762f33 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -61,8 +61,8 @@ template class CSyncCommStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto place = ctx.GetPlace(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto place = ctx.GetPlace(); int ring_id = ctx.Attr("ring_id"); auto stream = platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); @@ -70,9 +70,12 @@ class CSyncCommStreamKernel : public framework::OpKernel { platform::GpuStreamSync(stream); #elif defined(PADDLE_WITH_ASCEND_CL) - PADDLE_ENFORCE_EQ(is_npu_place(place), true, + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, platform::errors::PreconditionNotMet( - "Sync stream op can run on npu place only for now.")); + "Sync comm stream op can run on npu place only for " + "now, but we got %s, please check the environment.", + place.DebugString())); int ring_id = ctx.Attr("ring_id"); auto stream = platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index dfa4dcd0fac59..b15e33417a05b 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -35,9 +35,11 @@ class CWaitCommOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "wait_comm op can run on gpu place only for now.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(place), true, + platform::errors::PreconditionNotMet( + "wait_comm op can run on gpu place only for now, but got %s", + place.DebugString())); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index e038617bf3d6a..7ca0a087d909b 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -37,9 +37,10 @@ class CWaitComputeOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { PADDLE_ENFORCE_EQ( - is_gpu_place(place), true, + platform::is_gpu_place(place), true, 
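Besides the qualification, the c_sync_comm_stream and c_wait_comm hunks above also enrich the precondition messages with the offending place via place.DebugString(). A small standalone sketch of that message style; Place, DebugString and EnforceGpuPlace are illustrative stand-ins for the PADDLE_ENFORCE_EQ machinery.

```cpp
#include <cstdio>
#include <string>

// Illustrative stand-ins; the real check is PADDLE_ENFORCE_EQ with
// platform::errors::PreconditionNotMet and pten::Place::DebugString().
struct Place {
  bool is_gpu = false;
  int device = 0;
  std::string DebugString() const {
    return is_gpu ? "Place(gpu:" + std::to_string(device) + ")" : "Place(cpu)";
  }
};

void EnforceGpuPlace(const Place& place, const char* op_name) {
  if (!place.is_gpu) {
    // New message style: say what was required and which place was actually got.
    std::printf("%s op can run on gpu place only for now, but got %s\n",
                op_name, place.DebugString().c_str());
  }
}

int main() { EnforceGpuPlace(Place{false, 0}, "wait_comm"); }
```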
platform::errors::PreconditionNotMet( - "wait_compute op can run on gpu place only for now.")); + "wait_compute op can run on gpu place only for now, but got %s", + place.DebugString())); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = Attr("ring_id"); diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index a03e4165755dd..882d74a0d51d8 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/pten/common/place.h" namespace paddle { namespace operators { @@ -92,8 +93,8 @@ class CompareOp : public framework::OperatorWithKernel { if (force_cpu) { kt.place_ = platform::CPUPlace(); } else { - if (ctx.Input("X")->place().type() != - typeid(platform::CUDAPinnedPlace)) { + if (ctx.Input("X")->place().GetType() != + pten::AllocationType::GPUPINNED) { kt.place_ = ctx.Input("X")->place(); } else { kt.place_ = ctx.GetPlace(); diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index 29132f2930acb..9bb9e481034bd 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -27,8 +27,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 6f696afa23886..8adf556b4cd3d 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -26,7 +26,7 @@ limitations under the License. 
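The compare_op hunk above swaps a typeid comparison against platform::CUDAPinnedPlace for an enum check on the place, place().GetType() != pten::AllocationType::GPUPINNED, pulling in paddle/pten/common/place.h for the enum. Below is a standalone sketch of that kernel-place decision; the types and the helper function are simplified stand-ins.

```cpp
#include <iostream>

// Simplified stand-ins: the real enum is pten::AllocationType and the real
// check is tensor.place().GetType() != pten::AllocationType::GPUPINNED.
enum class AllocationType { CPU, GPU, GPUPINNED };

struct Place {
  AllocationType type = AllocationType::CPU;
  AllocationType GetType() const { return type; }
};

// Mirrors CompareOp::GetExpectedKernelType: keep the input's place unless the
// input lives in pinned host memory, in which case fall back to the context place.
Place PickKernelPlace(const Place& input_place, const Place& ctx_place) {
  if (input_place.GetType() != AllocationType::GPUPINNED) {
    return input_place;
  }
  return ctx_place;
}

int main() {
  Place pinned{AllocationType::GPUPINNED};
  Place gpu{AllocationType::GPU};
  std::cout << (PickKernelPlace(pinned, gpu).GetType() == AllocationType::GPU)
            << "\n";  // 1: pinned input falls back to the context place
}
```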
*/ namespace paddle { namespace platform { class CUDADeviceContext; -struct CUDAPlace; + } // namespace platform } // namespace paddle @@ -68,10 +68,8 @@ void weight_to_tensor(const platform::Place &place, gpuStream_t stream, const T *in_data = weight_list[i]->data(); auto in_size = weight_list[i]->numel(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()), - weight_data + weight_offset, - BOOST_GET_CONST(platform::CUDAPlace, weight_list[i]->place()), - in_data, in_size * sizeof(T), stream); + memory::Copy(weight->place(), weight_data + weight_offset, + weight_list[i]->place(), in_data, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -88,10 +86,8 @@ void weight_to_tensor_list(const platform::Place &place, gpuStream_t stream, T *weight_grad_data = (*weight_grad)[i]->mutable_data(place); const T *src = weight_data + weight_offset; - memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, (*weight_grad)[i]->place()), - weight_grad_data, BOOST_GET_CONST(platform::CUDAPlace, weight->place()), - src, in_size * sizeof(T), stream); + memory::Copy((*weight_grad)[i]->place(), weight_grad_data, weight->place(), + src, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -176,8 +172,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int seed = ctx.Attr("seed"); if (!is_test) { - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed == 0) { // If perform `manual_seed` in python and inner seed is not specified diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu index 82ed0bd444de9..2b69db7d24a12 100644 --- a/paddle/fluid/operators/cumprod_op.cu +++ b/paddle/fluid/operators/cumprod_op.cu @@ -225,7 +225,7 @@ class CumprodGradOpCUDAKernel : public framework::OpKernel { const auto *y_data = y->data(); const auto *dy_data = dy->data(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto place = ctx.GetPlace(); const auto &dev_ctx = ctx.template device_context(); auto *dx_data = dx->mutable_data(place); diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h index a964cfb3d7bea..d8c3c1febdcf3 100644 --- a/paddle/fluid/operators/cumprod_op.h +++ b/paddle/fluid/operators/cumprod_op.h @@ -101,7 +101,7 @@ class CumprodGradOpCPUKernel : public framework::OpKernel { auto* out_data = out->data(); auto* d_x_data = d_x->mutable_data(context.GetPlace()); - auto place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto place = context.GetPlace(); const auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 6489c1f9784cf..eeb2c7692b5d5 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -245,7 +245,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { int bytes = roi_batch_id_list.numel() * sizeof(int); auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); @@ -516,7 +516,7 @@ class DeformablePSROIPoolGradCUDAKernel : public 
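The cudnn_lstm weight_to_tensor hunks above keep the same flattening logic and only drop the two BOOST_GET_CONST casts around memory::Copy: each parameter tensor is copied into one flat buffer at a running offset. A host-only sketch of that loop, with std::memcpy standing in for the device-to-device memory::Copy on a stream:

```cpp
#include <cstring>
#include <iostream>
#include <vector>

// Host-only sketch of the weight_to_tensor loop: concatenate a list of
// parameter buffers into one flat buffer, advancing a running offset.
int main() {
  std::vector<std::vector<float>> weight_list = {{1, 2}, {3, 4, 5}, {6}};
  std::vector<float> flat(6);

  std::size_t weight_offset = 0;
  for (const auto& w : weight_list) {
    // Real code: memory::Copy(weight->place(), flat + offset,
    //                         w->place(), w->data(), bytes, stream);
    std::memcpy(flat.data() + weight_offset, w.data(), w.size() * sizeof(float));
    weight_offset += w.size();
  }

  for (float v : flat) std::cout << v << ' ';  // 1 2 3 4 5 6
  std::cout << '\n';
}
```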
framework::OpKernel { int bytes = roi_batch_id_list.numel() * sizeof(int); auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index aee468e05e182..7583bdabc3015 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -26,9 +26,7 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct CPUPlace; -} // namespace platform +namespace platform {} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index c12dd9e6d218a..016be54eeb7b4 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct CPUPlace; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 7df0f85523bc6..5470d44202590 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -31,11 +31,11 @@ struct StridedMemcpyFunctor { const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { - auto& cpu_place = BOOST_GET_CONST(platform::CPUPlace, place); + auto& cpu_place = place; memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T), @@ -55,11 +55,11 @@ struct StridedMemcpyFunctor { const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { - auto& cpu_place = BOOST_GET_CONST(platform::CPUPlace, place); + auto& cpu_place = place; memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 6f5137be62011..c6754f62cc74e 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -68,7 +68,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0, sizeof(T) * 8, ctx.stream()); // Allocate temporary storage - auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto place = ctx.GetPlace(); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); // Run sorting operation @@ -274,7 +274,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, dim3 
threads(kThreadsPerBlock); const T *boxes = proposals.data(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto place = ctx.GetPlace(); auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 6e5fa1e293353..22dc606df9df5 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -183,8 +183,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel { auto dev_var = memory::Alloc(device_ctx, bytes); float* dev_var_data = reinterpret_cast(dev_var->ptr()); auto cplace = platform::CPUPlace(); - const auto gplace = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + const auto gplace = context.GetPlace(); memory::Copy(gplace, dev_var_data, cplace, &variance[0], bytes, device_ctx.stream()); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index bd5703022db90..60cb16ce6c047 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -85,7 +85,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { roi_batch_id_list.mutable_data(platform::CPUPlace()); int index = 0; int lod_size; - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); auto multi_rois_num = ctx.MultiInput("MultiLevelRoIsNum"); for (size_t i = 0; i < roi_ins.size(); ++i) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 1df7dcbe670c0..a9a6dcea1bbe5 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -135,7 +135,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, max_level, min_level, roi_batch_id_list_gpu.data(), sub_lod_list_data, target_lvls_data, pixel_offset); - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); Tensor index_in_t; int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index e8ab628db16bd..2de06e06d9ad3 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -67,7 +67,7 @@ static std::pair ProposalForOneImage( proposals.data(), im_info.data(), min_size, pre_nms_num, keep_num_t.data(), keep_index.data()); int keep_num; - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, keep_num_t.data(), sizeof(int), ctx.stream()); ctx.Wait(); @@ -169,7 +169,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { T *rpn_rois_data = rpn_rois->data(); T *rpn_roi_probs_data = rpn_roi_probs->data(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); auto cpu_place = platform::CPUPlace(); int64_t num_proposals = 0; diff --git 
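In the proposal kernels above, keep_num is copied from device memory to a host int on the compute stream and the code calls ctx.Wait() before reading it; passing the place directly changes nothing about that ordering. The sketch below uses a std::thread as a loose, purely illustrative stand-in for the stream to show why the wait has to precede the read.

```cpp
#include <iostream>
#include <thread>
#include <vector>

// Loose stand-in for the keep_num pattern: the "copy" happens asynchronously
// (std::thread models the CUDA stream), and the host must wait (join, standing
// in for ctx.Wait()) before it may read the scalar.
int main() {
  std::vector<int> keep_num_on_device = {128};  // stand-in for keep_num_t on GPU
  int keep_num = 0;

  std::thread stream([&] { keep_num = keep_num_on_device[0]; });  // async D2H copy
  stream.join();  // ctx.Wait(): only now is keep_num safe to use

  std::cout << "keep " << keep_num << " proposals\n";
}
```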
a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 6244827f685ba..cc2d4578e3eb1 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -68,7 +68,7 @@ static std::pair ProposalForOneImage( proposals.data(), im_shape.data(), min_size, pre_nms_num, keep_num_t.data(), keep_index.data(), false, pixel_offset); int keep_num; - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, keep_num_t.data(), sizeof(int), ctx.stream()); ctx.Wait(); @@ -172,7 +172,7 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { T *rpn_rois_data = rpn_rois->data(); T *rpn_roi_probs_data = rpn_roi_probs->data(); - auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); auto cpu_place = platform::CPUPlace(); int64_t num_proposals = 0; diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 23bd6af6bd2e8..bfe4742c4b3c3 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -104,7 +104,7 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { int bytes = sizeof(int) * anchors.size(); auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); const auto cplace = platform::CPUPlace(); memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream()); diff --git a/paddle/fluid/operators/dirichlet_op.cu b/paddle/fluid/operators/dirichlet_op.cu index 3e1d523ae0e15..63f9c7339bfc5 100644 --- a/paddle/fluid/operators/dirichlet_op.cu +++ b/paddle/fluid/operators/dirichlet_op.cu @@ -76,8 +76,7 @@ struct DirichletSampler { auto& dev_ctx = ctx.device_context(); // init state, seed & offset for all threads - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto p_gen = framework::GetDefaultCUDAGenerator(device_id); auto seed_and_offset = p_gen->IncrementOffset(10); // hard-coded offset auto seed = seed_and_offset.first; diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h index c6305e5ba73e8..695cb6e0ef2de 100644 --- a/paddle/fluid/operators/distribution_helper.h +++ b/paddle/fluid/operators/distribution_helper.h @@ -164,8 +164,7 @@ void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, T *out_data = out->mutable_data(dev_ctx.GetPlace()); auto size = out->numel(); - int64_t device_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); + int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); size_t block_size = 256; diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h index d426876c18fa5..94c8513086c20 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op.h +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h @@ -272,8 +272,7 @@ class DlnneEngineOp : public framework::OperatorBase { fluid_t->Resize(framework::make_ddim(out_shapes[bind_index])); int32_t dtype; - 
output_buffers[bind_index] = fluid_t->mutable_data( - BOOST_GET_CONST(platform::CPUPlace, dev_place)); + output_buffers[bind_index] = fluid_t->mutable_data(dev_place); dtype = 0; cpu_output_buffers[bind_index] = output_buffers[bind_index]; // malloc(data_bytes); diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index f2038d12528c4..33fa7a092768c 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -25,8 +25,7 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, uint64_t* increment) { - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); + int device_id = dev_ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (seed) { diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index f28fa4d6338d7..3096795f3eaf0 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -135,8 +135,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { if (normalized) { distance = distance / n; } - memory::Copy(BOOST_GET_CONST(Place, ctx.GetPlace()), out + num, - platform::CPUPlace(), &distance, sizeof(T), stream); + memory::Copy(ctx.GetPlace(), out + num, platform::CPUPlace(), &distance, + sizeof(T), stream); } else { framework::Tensor dist_t; dist_t.Resize({m + 1, n + 1}); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index d66d6b66a0582..216178f7d8938 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -34,7 +34,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index b28f713256526..b876438a1941f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -29,7 +29,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index e0686e815459a..cc27bab720057 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -27,7 +27,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 1448520eca18f..3a1951999546e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -27,7 +27,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc 
b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc index 2ac3aa6ebd3e3..bb116c9c65ac0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -29,7 +29,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc index d564cc3717f5e..eddbfd3b15ea4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc @@ -24,7 +24,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 810f78ce80827..f5290a69bbda1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -34,7 +34,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index f035e46d1d082..0d889ef26c954 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/platform/place.h" #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 4f41ecf04cf43..6c51df5c61ef3 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -20,7 +20,6 @@ class ExecutionContext; } // namespace framework namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index c037daba0ee3f..d8b8c2728987e 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -20,7 +20,6 @@ class ExecutionContext; } // namespace framework namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 2acf1e0fcd7aa..397a50f2bc69c 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -20,7 +20,6 @@ class ExecutionContext; } // namespace framework namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc 
b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index be8dad62c3c05..e7bb73340b841 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -20,7 +20,6 @@ class ExecutionContext; } // namespace framework namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h index 8bfb566d496d0..be5aded3521c9 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h +++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h @@ -85,11 +85,11 @@ class TestElementwiseOpGradGrad { auto src = feed_datas_[in_name].data(); auto src_place = platform::CPUPlace(); if (platform::is_cpu_place(place_)) { - auto dst_place = BOOST_GET_CONST(platform::CPUPlace, place_); + auto dst_place = place_; memory::Copy(dst_place, dst, src_place, src, bytes); } else if (platform::is_gpu_place(place_)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto dst_place = BOOST_GET_CONST(platform::CUDAPlace, place_); + auto dst_place = place_; memory::Copy(dst_place, dst, src_place, src, bytes, nullptr); #else PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index e9f31f8ddd698..64530f31abab9 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -93,9 +93,7 @@ class ExpandNPUKernel : public framework::OpKernel { (out0->numel() == in0->numel()) ? true : false; if (is_expand_times_all_one) { - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), - out0->mutable_data(place), - BOOST_GET_CONST(platform::NPUPlace, place), in0->data(), + memory::Copy(place, out0->mutable_data(place), place, in0->data(), in0->numel() * sizeof(T), stream); if (out_dims != in_dims) { out0->Resize(out_dims); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index b95bbc775a0d7..9c4f61ecdf1bc 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -377,7 +377,7 @@ struct FindRangeAbsMaxFunctor { const framework::Tensor& last_scale, const framework::Tensor& iter, const int window_size, framework::Tensor* scales_arr, framework::Tensor* out_scale) { - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); T* scale_arr = scales_arr->mutable_data(gpu_place); T* out_scale_data = out_scale->mutable_data(gpu_place); @@ -414,7 +414,7 @@ struct FindMovingAverageAbsMaxFunctor { const framework::Tensor& in_state, const T* cur_scale, const float rate, framework::Tensor* out_state, framework::Tensor* out_accum, framework::Tensor* out_scale) { - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); T accum; T state; diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.cu b/paddle/fluid/operators/fill_diagonal_tensor_op.cu index 834964079fd39..256c9c3d75c0d 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.cu +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.cu @@ -101,7 +101,7 @@ class FillDiagonalTensorCUDAKernel : public framework::OpKernel { Tensor tensor_tmp; int64_t *memory_block_cu = tensor_tmp.mutable_data({2 
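test_elementwise_op_grad_grad.h above keeps its two-level dispatch: a runtime branch on the place kind plus a compile-time guard so the GPU copy only exists in CUDA/HIP builds, and with the unified place both branches now just bind place_ directly. A compact standalone sketch of that shape; the macro names are kept, everything else is a stand-in.

```cpp
#include <cstddef>
#include <cstring>
#include <iostream>
#include <stdexcept>

// Stand-ins for the feed loop in TestElementwiseOpGradGrad::Setup.
enum class AllocationType { CPU, GPU };
struct Place { AllocationType type = AllocationType::CPU; };

void FeedInput(const Place& place, float* dst, const float* src, std::size_t bytes) {
  if (place.type == AllocationType::CPU) {
    std::memcpy(dst, src, bytes);  // memory::Copy(cpu, dst, cpu, src, bytes)
  } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    // Device path: memory::Copy(place, dst, CPUPlace(), src, bytes, nullptr)
    std::memcpy(dst, src, bytes);  // host stand-in so the sketch runs anywhere
#else
    throw std::invalid_argument("GPU place in a CPU-only build");
#endif
  }
}

int main() {
  float src[2] = {1.f, 2.f}, dst[2] = {0.f, 0.f};
  FeedInput(Place{AllocationType::CPU}, dst, src, sizeof(src));
  std::cout << dst[0] << " " << dst[1] << "\n";  // 1 2
}
```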
+ fill_dims[0]}, ctx.GetPlace()); - const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); memory::Copy(gpu_place, memory_block_cu, platform::CPUPlace(), memory_block.data(), sizeof(int64_t) * (2 + fill_dims[0]), stream); @@ -159,8 +159,7 @@ class FillDiagonalTensorGradCUDAKernel : public framework::OpKernel { Tensor tensor_tmp; int64_t *memory_block_cu = tensor_tmp.mutable_data({2 + matrows}, ctx.GetPlace()); - const auto gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gpu_place = ctx.GetPlace(); memory::Copy(gpu_place, memory_block_cu, platform::CPUPlace(), memory_block.data(), sizeof(int64_t) * (2 + matrows), stream); diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu index 2391d4b907a60..09893cb3f4b2c 100644 --- a/paddle/fluid/operators/flip_op.cu +++ b/paddle/fluid/operators/flip_op.cu @@ -56,7 +56,7 @@ class FlipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); auto cplace = platform::CPUPlace(); auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 3972c60e8347b..4aa8b65635e7a 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -150,7 +150,7 @@ class FusedDropoutHelper { LaunchResidualDropoutBiasGrad( d_out, mask, dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); - auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto cuda_place = ctx.GetPlace(); memory::Copy(cuda_place, d_residual, cuda_place, d_out, rows_ * cols_ * sizeof(T), ctx.stream()); } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index b27b70dc9dc0c..c6205863103ff 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -182,7 +182,7 @@ void LaunchLayernormResidualDropoutBias( LayerNormParamType *var, const platform::CUDADeviceContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { - auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto cuda_place = ctx.GetPlace(); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index d984ad1a27768..2f5ec839fc2c7 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -141,7 +141,7 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { if (residual == dst) return; - auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto cuda_place = ctx.GetPlace(); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); if (!is_test) { diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc 
b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 1fa4225934d39..786f5b4e07798 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -15,12 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace platform { -struct CUDAPlace; -} // namespace platform -} // namespace paddle +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 700de8074ff8a..8386896027fa0 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -115,7 +115,7 @@ template void GPUGatherNd(const framework::ExecutionContext& context, const Tensor& input, const Tensor& index, Tensor* output) { const auto& ctx = context.template device_context(); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); auto cplace = platform::CPUPlace(); auto index_dims = index.dims(); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index ef0e000b25efd..e43ffdae903f5 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -69,8 +69,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel { int64_t size = tensor->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { @@ -106,8 +105,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { diff --git a/paddle/fluid/operators/gaussian_random_op_xpu.cc b/paddle/fluid/operators/gaussian_random_op_xpu.cc index 5d3ba84b05f5e..5a1ac46f615d2 100644 --- a/paddle/fluid/operators/gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/gaussian_random_op_xpu.cc @@ -41,9 +41,8 @@ class XPUGaussianRandomKernel : public framework::OpKernel { for (int64_t i = 0; i < size; ++i) { data_cpu[i] = dist(*engine); } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), data, - platform::CPUPlace(), reinterpret_cast(data_cpu.get()), - size * sizeof(T)); + memory::Copy(context.GetPlace(), data, platform::CPUPlace(), + reinterpret_cast(data_cpu.get()), size * sizeof(T)); } }; diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index edd7f8a7cf553..ce3c8ac51c76a 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -17,7 +17,7 @@ limitations under the License. 
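Two things happen in the hunks above: fusion_transpose_flatten_concat_op.cu.cc replaces its local place forward declaration with an include of the place header, and gaussian_random_op_xpu.cc keeps its host-staging strategy, sampling on the CPU and then handing the whole buffer plus the XPU place to memory::Copy. A host-only sketch of that staging pattern, with a std::vector standing in for the device allocation:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <memory>
#include <random>
#include <vector>

// Host-only sketch of the XPU gaussian_random path: sample into a CPU staging
// buffer, then copy the whole buffer to the device in one call. std::copy
// stands in for memory::Copy(xpu_place, data, CPUPlace(), staging, bytes).
int main() {
  const std::int64_t size = 4;
  std::mt19937_64 engine(42);
  std::normal_distribution<float> dist(0.0f, 1.0f);

  std::unique_ptr<float[]> data_cpu(new float[size]);
  for (std::int64_t i = 0; i < size; ++i) data_cpu[i] = dist(engine);

  std::vector<float> device_buffer(size);  // stand-in for the XPU allocation
  std::copy(data_cpu.get(), data_cpu.get() + size, device_buffer.begin());

  std::cout << device_buffer[0] << " ... " << device_buffer[size - 1] << "\n";
}
```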
*/ namespace paddle { namespace platform { class CUDADeviceContext; -struct CUDAPlace; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index d3edf72449537..51d912f451b92 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -132,8 +132,7 @@ struct GumbleNoiseGenerator { thrust::counting_iterator index_sequence_begin(0); // generate gumbel noise - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy()) { auto seed_offset = gen_cuda->IncrementOffset(1); diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 47b480c11c28f..2b8fdcb4d1067 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct CPUPlace; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index e727f6ceb56f7..16320aa26bd20 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -26,7 +26,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu index 77295414eb903..09710ba0c6957 100644 --- a/paddle/fluid/operators/isclose_op.cu +++ b/paddle/fluid/operators/isclose_op.cu @@ -25,8 +25,7 @@ struct GetTensorValue { const framework::Tensor& tensor) const { const T* data = tensor.data(); T value; - const auto gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + const auto gpu_place = dev_ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); return value; diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 1ac1c26796cf3..753b34484e411 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -28,7 +28,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index c676a3e57fff9..d3391fddd3026 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -34,7 +34,6 @@ class OverflowKernel; } // namespace operators namespace platform { class CPUDeviceContext; -struct CPUPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h index b2cf92f23e8cc..943f310f62df7 100644 --- a/paddle/fluid/operators/jit/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -23,8 +23,8 @@ namespace jit { struct KernelKey { struct Hash { size_t operator()(const KernelKey& key) const { - int place = key.place_.which(); // less than 2^8 - int type = static_cast(key.type_) << 8; // less than 2^(32-8) + int place = static_cast(key.place_.GetType()); // less than 2^8 + int type = static_cast(key.type_) << 8; 
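// [Editorial sketch, not part of the original patch] The old hash keyed the
// kernel place on the variant index (key.place_.which()); with the unified
// pten::Place the same low-8-bit slot is filled from the place's
// AllocationType enum instead. Assuming the enum values stay below 2^8, the
// combined key is roughly:
//   int place = static_cast<int>(key.place_.GetType());   // low 8 bits
//   int type  = static_cast<int>(key.type_) << 8;         // remaining bits
//   size_t h  = std::hash<int>()(place + type);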
// less than 2^(32-8) std::hash hasher; return hasher(place + type); } diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 231ff941278c7..27341fdc84349 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -26,7 +26,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 5aa546cbcc21a..89c84d9e14377 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -160,7 +160,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_num); - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto gpu_place = context.GetPlace(); // TODO(yuyang18): Strange code here. memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 317f9eeb94f39..44a6151f1b6ce 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -162,7 +162,7 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_num); - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto gpu_place = context.GetPlace(); if (ids->type() == framework::proto::VarType::INT32) { InputTypeCovert< diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index a71b900f14f8e..46f93abd22122 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -149,9 +149,8 @@ void BatchedOrmqr( // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( @@ -189,9 +188,8 @@ void BatchedOrmqr( // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 21839c263e4a8..da448fbd35a9f 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -413,7 +413,7 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, batchsize = std::max(static_cast(batchsize), 1); arange(dev_ctx, &rowtensor, dim, batchsize, H); auto idtptr = rowtensor.data(); - if (is_gpu_place(dev_ctx.GetPlace())) { + if (platform::is_gpu_place(dev_ctx.GetPlace())) { framework::TensorCopy(rowtensor, dev_ctx.GetPlace(), &rt_dev); idtptr = rt_dev.data(); } diff --git a/paddle/fluid/operators/masked_select_op_xpu.cc b/paddle/fluid/operators/masked_select_op_xpu.cc index dbf8793b5cb6f..8dbc5bcfc347a 100644 --- a/paddle/fluid/operators/masked_select_op_xpu.cc +++ 
b/paddle/fluid/operators/masked_select_op_xpu.cc @@ -47,8 +47,7 @@ class MaskedSelectXPUKernel : public framework::OpKernel { mask->numel()), "nonzero_count "); memory::Copy(platform::CPUPlace(), static_cast(&out_size_cpu), - BOOST_GET_CONST(platform::XPUPlace, mask->place()), - static_cast(out_size), sizeof(int32_t)); + mask->place(), static_cast(out_size), sizeof(int32_t)); framework::DDim out_dim{out_size_cpu}; out->Resize(out_dim); diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 347d9e6c2b9a8..45effd404cfb3 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -58,7 +58,7 @@ class ConcatFunctor { out_cols += t_cols; input_cols[i] = t_cols; } - auto cpu_place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto cpu_place = context.GetPlace(); // computation auto output_data = output->data(); @@ -109,7 +109,7 @@ class SplitFunctor { input_cols += t_cols; output_cols[i] = t_cols; } - auto cpu_place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto cpu_place = context.GetPlace(); // computation for (int k = 0; k < input_rows; ++k) { @@ -140,8 +140,7 @@ class ConcatFunctor { void operator()(const platform::XPUDeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - int dev_id = - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId(); + int dev_id = context.GetPlace().GetDeviceId(); platform::XPUDeviceGuard guard(dev_id); int num = input.size(); @@ -179,8 +178,7 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, const int axis, std::vector* outputs) { - int dev_id = - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId(); + int dev_id = context.GetPlace().GetDeviceId(); platform::XPUDeviceGuard guard(dev_id); auto& ins = ref_inputs; @@ -225,8 +223,7 @@ class ConcatFunctor { void operator()(const platform::NPUDeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - int dev_id = - BOOST_GET_CONST(platform::NPUPlace, context.GetPlace()).GetDeviceId(); + int dev_id = context.GetPlace().GetDeviceId(); platform::NPUDeviceGuard guard(dev_id); std::vector names; @@ -270,7 +267,7 @@ class SplitFunctor { input_cols += t_cols; output_cols[i] = t_cols; } - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, context.GetPlace()); + auto npu_place = context.GetPlace(); // computation for (int k = 0; k < input_rows; ++k) { diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 6892f7ce4e503..5b99a62d78d2a 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -289,9 +289,9 @@ class ConcatFunctor { tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*)); auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph(inputs_data, in_num); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_data->ptr(), platform::CPUPlace(), restored, - in_num * sizeof(T*), context.stream()); + memory::Copy(context.GetPlace(), tmp_dev_ins_data->ptr(), + platform::CPUPlace(), restored, in_num * sizeof(T*), + context.stream()); dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); } @@ -318,8 +318,8 @@ class ConcatFunctor { auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( inputs_col, inputs_col_num); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, 
context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, + memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(), + platform::CPUPlace(), restored, inputs_col_num * sizeof(int64_t), context.stream()); int64_t* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); @@ -420,9 +420,9 @@ class SplitFunctor { tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*)); auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph(outputs_data, o_num); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_outs_data->ptr(), platform::CPUPlace(), restored, - o_num * sizeof(T*), context.stream()); + memory::Copy(context.GetPlace(), tmp_dev_outs_data->ptr(), + platform::CPUPlace(), restored, o_num * sizeof(T*), + context.stream()); dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); } @@ -448,8 +448,8 @@ class SplitFunctor { memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( outputs_cols, outputs_cols_num); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, + memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(), + platform::CPUPlace(), restored, outputs_cols_num * sizeof(int64_t), context.stream()); int64_t* dev_outs_col_data = reinterpret_cast(tmp_dev_ins_col_data->ptr()); diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index b24f5d40e8dca..2cfff0ae88ff6 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -211,8 +211,7 @@ struct MatrixEighFunctor { info_ptr); } int error_info = 0; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info_ptr, sizeof(int), dev_ctx.stream()); CheckEighResult(i, error_info); } diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index ec21524b0b880..6ca3abe0f05a5 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -220,7 +220,8 @@ void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value) { TensorSetConstantWithPlace func(context, tensor, value); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - tensor->place().apply_visitor(func); + // tensor->place().apply_visitor(func); + paddle::platform::VisitPlace(tensor->place(), func); #else func(platform::CPUPlace()); #endif diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 378f0426ddfb7..6e2547145cfed 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -98,8 +98,7 @@ struct TransposeNormal { auto* out_ptr = out->data(); // copy in_stride, out_stride, axis to gpu device - const platform::CUDAPlace& cuda_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + const platform::CUDAPlace& cuda_place = context.GetPlace(); platform::CPUPlace cpu_place = platform::CPUPlace(); size_t size = 3 * rank * sizeof(int64_t); auto cpu_buf_holder = memory::Alloc(cpu_place, size); diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 4c0eb592e8c17..40293ad725b93 100644 --- 
a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -100,9 +100,8 @@ struct TensorSetConstantXPU { int numel = tensor_->numel(); std::unique_ptr data_cpu(new T[numel]); std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast(value_)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), begin, - platform::CPUPlace(), static_cast(data_cpu.get()), - numel * sizeof(T)); + memory::Copy(place_, begin, platform::CPUPlace(), + static_cast(data_cpu.get()), numel * sizeof(T)); } framework::Tensor* tensor_; U value_; diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc index 7d03f9590357e..0b6a097d09d15 100644 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc @@ -45,10 +45,9 @@ class MatrixInverseFunctor { // Copy all elements of input matrix A to a temporary memory space to // avoid being overriden by getrf. tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(boost::get(context.GetPlace()), - tmp_gpu_mat_data->ptr(), - boost::get(context.GetPlace()), - a.data(), a.numel() * sizeof(T), context.stream()); + memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), + context.GetPlace(), a.data(), a.numel() * sizeof(T), + context.stream()); gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); } @@ -61,9 +60,8 @@ class MatrixInverseFunctor { // Copy the addresses of A and A_inv from host to device. memory::allocation::AllocationPtr tmp_gpu_ptrs_data = memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(boost::get(context.GetPlace()), - tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(), - static_cast(cpu_ptrs.data()), + memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), + platform::CPUPlace(), static_cast(cpu_ptrs.data()), cpu_ptrs.size() * sizeof(T*), context.stream()); T** gpu_inv_ptrs = reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; @@ -102,8 +100,7 @@ class MatrixInverseFunctor { reinterpret_cast(tmp_gpu_ptrs_data->ptr()), gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); } - memory::Copy(platform::CPUPlace(), info.data(), - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), gpu_info_ptr, sizeof(int) * batch_size, context.stream()); for (int i = 0; i < batch_size; ++i) { PADDLE_ENFORCE_EQ(info[i], 0, diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index 4e5601248c1a2..f0b41f98dc0cd 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -92,9 +92,8 @@ class MatrixSolveFunctor { // Copy the addresses of A and tmp_b from host to device. 
memory::allocation::AllocationPtr tmp_gpu_ptrs_data = memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(), - static_cast(cpu_ptrs.data()), + memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), + platform::CPUPlace(), static_cast(cpu_ptrs.data()), cpu_ptrs.size() * sizeof(T*), context.stream()); T** gpu_tmp_b_ptrs = @@ -122,8 +121,7 @@ class MatrixSolveFunctor { gpu_pivot_ptr, gpu_info_ptr, batch_size); // check whether BatchedGETRF is executed successfully or not - memory::Copy(platform::CPUPlace(), info.data(), - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), gpu_info_ptr, sizeof(int) * batch_size, context.stream()); for (int i = 0; i < batch_size; ++i) { PADDLE_ENFORCE_EQ(info[i], 0, @@ -207,9 +205,8 @@ class TriangularSolveFunctor { // Copy the addresses of A and tmp_b from host to device. memory::allocation::AllocationPtr tmp_gpu_ptrs_data = memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(), - static_cast(cpu_ptrs.data()), + memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), + platform::CPUPlace(), static_cast(cpu_ptrs.data()), cpu_ptrs.size() * sizeof(T*), context.stream()); const T** gpu_a_ptrs = diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 37dafa5c4908f..67176f26b079f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -79,14 +79,11 @@ struct SelectedRowsAdd { auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, out_place), out_data, - BOOST_GET_CONST(platform::CPUPlace, in1_place), in1_data, + memory::Copy(out_place, out_data, in1_place, in1_data, in1_value.numel() * sizeof(T)); auto* in2_data = in2_value.data(); - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, out_place), - out_data + in1_value.numel(), - BOOST_GET_CONST(platform::CPUPlace, in2_place), in2_data, + memory::Copy(out_place, out_data + in1_value.numel(), in2_place, in2_data, in2_value.numel() * sizeof(T)); } }; @@ -188,9 +185,7 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, in2_place), - in2_data + input2_offset, - BOOST_GET_CONST(platform::CPUPlace, in1_place), in1_data, + memory::Copy(in2_place, in2_data + input2_offset, in1_place, in1_data, in1_value.numel() * sizeof(T)); } }; @@ -455,9 +450,7 @@ struct MergeAdd { for (auto* in : inputs) { auto* in_data = in->value().data(); auto in_numel = in->rows().size() * input_width; - memory::Copy(BOOST_GET_CONST(platform::CPUPlace, out_place), - out_data + copied_numel, - BOOST_GET_CONST(platform::CPUPlace, in_place), in_data, + memory::Copy(out_place, out_data + copied_numel, in_place, in_data, in_numel * sizeof(T)); copied_numel += in_numel; } diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 0e04c37ed2b12..654a5653cbed1 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -82,14 +82,11 @@ struct SelectedRowsAdd { platform::errors::InvalidArgument( "The 
running enviroment is not on the GPU place.")); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data, - BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, + memory::Copy(out_place, out_data, in1_place, in1_data, in1_value.numel() * sizeof(T), context.stream()); auto* in2_data = in2_value.data(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), - out_data + in1_value.numel(), - BOOST_GET_CONST(platform::CUDAPlace, in2_place), in2_data, + memory::Copy(out_place, out_data + in1_value.numel(), in2_place, in2_data, in2_value.numel() * sizeof(T), context.stream()); } }; @@ -218,9 +215,7 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, in2_place), - in2_data + input2_offset, - BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, + memory::Copy(in2_place, in2_data + input2_offset, in1_place, in1_data, in1_value.numel() * sizeof(T), context.stream()); } }; diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index 97ab2c5f52ac2..af5df27207ace 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -90,7 +90,7 @@ class Tree2ColFunctor { framework::Tensor *patch, int max_depth) { std::vector> tr; auto feature_dims = node_features.dims(); - auto cpu_place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto cpu_place = context.GetPlace(); math::SetConstant constant; int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); @@ -143,7 +143,7 @@ class Col2TreeFunctor { int max_depth) { std::vector> tr; auto output_dims = out_grad.dims(); - auto cpu_place = BOOST_GET_CONST(platform::CPUPlace, context.GetPlace()); + auto cpu_place = context.GetPlace(); math::SetConstant constant; int64_t output_size = output_dims[1]; size_t grad_elem_size = 3 * static_cast(output_size); diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index d9b787b6df33d..4f3ab31916558 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -52,7 +52,7 @@ class Tree2ColFunctor { const framework::Tensor& node_features, framework::Tensor* patch, int max_depth) { std::vector> tr; - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto gpu_place = context.GetPlace(); auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto feature_dims = node_features.dims(); @@ -124,7 +124,7 @@ class Col2TreeFunctor { const framework::Tensor& patch_grad, framework::Tensor* embedding_grad, int max_depth) { std::vector> tr; - auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto gpu_place = context.GetPlace(); auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto output_dims = patch_grad.dims(); diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index 757c780b4ea53..87c8abc1c432e 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -178,8 +178,7 @@ void MatrixRankGPUKernel::GesvdjBatched( U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), 
dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, @@ -220,8 +219,7 @@ void MatrixRankGPUKernel::GesvdjBatched( info, gesvdj_params)); // check the error info int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, @@ -259,8 +257,7 @@ void MatrixRankGPUKernel::SyevjBatched( lwork, info, params)); int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, @@ -297,8 +294,7 @@ void MatrixRankGPUKernel::SyevjBatched( handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, lwork, info, params)); int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index c48fc79326fa6..5a0afb68d63f1 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -53,7 +53,7 @@ class MeanCUDAKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); if (rank == 0) { // scalar - auto gpu_place = BOOST_GET(platform::CUDAPlace, place); + auto gpu_place = place; memory::Copy(gpu_place, out_data, gpu_place, in_data, numel * sizeof(T), stream); return; diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc index 1521265e1b3a9..53bc658af61b2 100644 --- a/paddle/fluid/operators/mean_op_xpu.cc +++ b/paddle/fluid/operators/mean_op_xpu.cc @@ -64,9 +64,7 @@ class MeanGradXPUKernel : public framework::OpKernel { const T* dy = OG->data(); T dy0_value; xpu_wait(dev_ctx.x_context()->xpu_stream); - memory::Copy(platform::CPUPlace(), &dy0_value, - BOOST_GET_CONST(platform::XPUPlace, OG->place()), dy, - sizeof(T)); + memory::Copy(platform::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T)); float dy0_fp32 = static_cast(dy0_value); dy0_fp32 = dy0_fp32 / static_cast(IG->numel()); diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 1eb8d09c783b0..1aaa4c2367938 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -24,8 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 0e27ec0dc75b7..4e0f353a7a36c 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -24,8 +24,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index 56eee13cb060a..d1eeff0b0572c 100644 --- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -27,8 +27,6 @@ namespace imperative { class OpBase; } // 
namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 7031b96a50b9e..de71312d78df9 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -75,11 +75,9 @@ class AccuracyXPUKernel : public framework::OpKernel { int64_t* label_int64_host = reinterpret_cast(std::malloc(label_int64_size)); dev_ctx.Wait(); - memory::Copy(platform::CPUPlace(), indices_int64_host, - BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), indices_int64_host, ctx.GetPlace(), indices_data, indices_int64_size); - memory::Copy(platform::CPUPlace(), label_int64_host, - BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), label_int64_host, ctx.GetPlace(), label_data, label_int64_size); for (size_t i = 0; i < num_samples; ++i) { label_int32_host[i] = label_int64_host[i]; @@ -88,12 +86,10 @@ class AccuracyXPUKernel : public framework::OpKernel { indices_int64_host[i * class_dim + j]; } } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - indices_int32_device, platform::CPUPlace(), indices_int32_host, - indices_int32_size); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - label_int32_device, platform::CPUPlace(), label_int32_host, - label_int32_size); + memory::Copy(ctx.GetPlace(), indices_int32_device, platform::CPUPlace(), + indices_int32_host, indices_int32_size); + memory::Copy(ctx.GetPlace(), label_int32_device, platform::CPUPlace(), + label_int32_host, label_int32_size); int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, label_int32_device, num_samples, class_dim, correct_data, total_data, accuracy_data); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 505e322310caf..5a212bcacae50 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -44,8 +44,7 @@ class MultiplexGPUKernel : public framework::OpKernel { TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + platform::CUDAPlace place = ctx.GetPlace(); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( @@ -89,8 +88,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + platform::CUDAPlace place = ctx.GetPlace(); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h index c0f24a2034a15..1d0a009edeedc 100644 --- a/paddle/fluid/operators/multiplex_op.h +++ b/paddle/fluid/operators/multiplex_op.h @@ -42,8 +42,7 @@ class MultiplexCPUKernel : public framework::OpKernel { auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto index = ids->data(); - platform::CPUPlace place = - BOOST_GET_CONST(platform::CPUPlace, ctx.GetPlace()); + platform::CPUPlace place = ctx.GetPlace(); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; 
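// [Editorial note, not part of the original patch] The place assignments in
// these multiplex kernels keep concrete types on the left-hand side
// (platform::CUDAPlace above, platform::CPUPlace here), so they rely on the
// value returned by ctx.GetPlace() -- now a pten::Place -- converting to
// those classes. A rough picture of what is assumed to be available:
//   pten::Place p = ctx.GetPlace();
//   platform::CPUPlace cpu_place = p;    // assumed Place -> CPUPlace conversion
//   platform::CUDAPlace gpu_place = p;   // assumed conversion keeping the device id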
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( @@ -83,8 +82,7 @@ class MultiplexGradCPUKernel : public framework::OpKernel { auto rows = d_ins[idx]->dims()[0]; auto cols = d_ins[idx]->numel() / rows; auto* index = ids->data(); - platform::CPUPlace place = - BOOST_GET_CONST(platform::CPUPlace, ctx.GetPlace()); + platform::CPUPlace place = ctx.GetPlace(); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index f319ce159f6dd..1e4bf925cc2f8 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -68,8 +68,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto reduction_op_ = str_to_nccl_red_type(reduction); // device id - int gpu_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int gpu_id = ctx.GetPlace().GetDeviceId(); int idx = comm->GetCommId(gpu_id); VLOG(3) << "gpu : " << " invoke allreduce. send " << x->numel() << " recv " @@ -100,8 +99,7 @@ class NCCLReduceKernel : public framework::OpKernel { auto reduction_op_ = str_to_nccl_red_type(reduction); // device id - int gpu_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int gpu_id = ctx.GetPlace().GetDeviceId(); int idx = comm->GetCommId(gpu_id); T* recvbuffer = nullptr; if (root == gpu_id) { @@ -130,8 +128,7 @@ class NCCLBcastKernel : public framework::OpKernel { int root = ctx.Attr("root"); auto* comm = ctx.Input("Communicator"); // device id - int gpu_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int gpu_id = ctx.GetPlace().GetDeviceId(); int idx = comm->GetCommId(gpu_id); if (idx == root) { auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index 779a45daf7a78..322e84ae8b9c2 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -118,8 +118,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { in_data.emplace_back(in_vars[i]->data()); auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_in_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), platform::CPUPlace(), reinterpret_cast(in_data.data()), in_data.size() * sizeof(T *), dev_ctx.stream()); @@ -188,8 +187,7 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { } auto tmp_out_array = memory::Alloc(dev_ctx, out_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_out_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), platform::CPUPlace(), reinterpret_cast(out_data.data()), out_data.size() * sizeof(T *), dev_ctx.stream()); diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index c401a222c3aa2..63d140d6769b8 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -115,8 +115,8 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { if (!in_data.empty()) { auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_in_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), + 
platform::CPUPlace(), reinterpret_cast(in_data.data()), in_data.size() * sizeof(T *), dev_ctx.stream()); @@ -191,8 +191,8 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { auto tmp_out_array = memory::Alloc(dev_ctx, out_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_out_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), + platform::CPUPlace(), reinterpret_cast(out_data.data()), out_data.size() * sizeof(T *), dev_ctx.stream()); diff --git a/paddle/fluid/operators/poisson_op.cu b/paddle/fluid/operators/poisson_op.cu index 3f18eb994e145..ef2f6d4665554 100644 --- a/paddle/fluid/operators/poisson_op.cu +++ b/paddle/fluid/operators/poisson_op.cu @@ -61,8 +61,7 @@ class PoissonKernel const T* x_data = x->data(); T* out_data = out->mutable_data(ctx.GetPlace()); auto size = x->numel(); - int64_t device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int64_t device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); auto seed_offset = gen_cuda->IncrementOffset(20); diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index a21f565dae71d..71aaf08c5256a 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -246,7 +246,7 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel { int bytes = rois_batch_id_list.numel() * sizeof(int); auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); memory::Copy(gplace, roi_id_data, cplace, rois_batch_id_data, bytes, dev_ctx.stream()); @@ -322,7 +322,7 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { int bytes = rois_batch_id_list.numel() * sizeof(int); auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); memory::Copy(gplace, roi_id_data, cplace, rois_batch_id_data, bytes, dev_ctx.stream()); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index dc4bc36d34f22..3a361360e2ed7 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -97,8 +97,7 @@ void InitTensorsOnClient(framework::Scope* scope, int64_t rows_numel, float* temp_ptr = temp_vec.data(); memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, place), - reinterpret_cast(micro_id_ptr), platform::CPUPlace(), + place, reinterpret_cast(micro_id_ptr), platform::CPUPlace(), reinterpret_cast(temp_ptr), micro_id_var->numel() * framework::SizeOfType(micro_id_var->type()), stream); @@ -109,8 +108,7 @@ void InitTensorsOnClient(framework::Scope* scope, int64_t rows_numel, std::vector x_vec; for (int64_t i = 0; i < rows_numel; ++i) x_vec.push_back(1.0); float* x_vec_ptr = x_vec.data(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), - reinterpret_cast(x_ptr), platform::CPUPlace(), + memory::Copy(place, reinterpret_cast(x_ptr), platform::CPUPlace(), reinterpret_cast(x_vec_ptr), x_var->numel() * framework::SizeOfType(x_var->type()), stream); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index 
5a0d1a700417c..efdcc59a5c49e 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -203,8 +203,7 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel { "input(X) is %d and %d respectively.", rois_batch_size, batch_size)); std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), rois_num_data, sizeof(int) * rois_batch_size, 0); int rois_num_count = 0; for (int i = 0; i < rois_batch_size; ++i) { @@ -295,8 +294,7 @@ class GPUPSROIPoolGradOpKernel : public framework::OpKernel { auto* rois_num_t = ctx.Input("RoisNum"); rois_batch_size = rois_num_t->numel(); std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), rois_num_t->data(), sizeof(int) * rois_batch_size, 0); int start = 0; for (int n = 0; n < rois_batch_size; ++n) { diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index af5ebdc53126a..9eddb03828b5d 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -122,12 +122,9 @@ class QrGPUKernel : public framework::OpKernel { auto new_qr_data = new_qr.mutable_data(context.GetPlace()); auto new_qr_stride = m * m; for (int i = 0; i < batch_size; ++i) { - memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - (new_qr_data + i * new_qr_stride), - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - (qr_data + i * qr_stride), qr_stride * sizeof(math::Real), - dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), + dev_ctx.GetPlace(), (qr_data + i * qr_stride), + qr_stride * sizeof(math::Real), dev_ctx.stream()); } BatchedOrgqr( dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, tau_data, @@ -171,9 +168,8 @@ void BatchedGeqrf( // Do we need synchronized here? // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( @@ -205,9 +201,8 @@ void BatchedGeqrf( // Do we need synchronized here? // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( @@ -239,9 +234,8 @@ void BatchedOrgqr( // Do we need synchronized here? // check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( @@ -273,9 +267,8 @@ void BatchedOrgqr( // Do we need synchronized here? 
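// [Editorial sketch, not part of the original patch] The device-to-host
// status checks in these cusolver wrappers all take the same shape after the
// change: the device-side place is handed to memory::Copy as-is instead of
// being unwrapped with BOOST_GET_CONST first. Schematically:
//   int info_h = 0;
//   memory::Copy(platform::CPUPlace(), &info_h,   // dst: host
//                dev_ctx.GetPlace(), info_d,      // src: unified device place
//                sizeof(int), dev_ctx.stream());
//   PADDLE_ENFORCE_EQ(info_h, 0, ...);            // zero means success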
// check the error info int info_h; - memory::Copy(platform::CPUPlace(), &info_h, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - info_d, sizeof(int), dev_ctx.stream()); + memory::Copy(platform::CPUPlace(), &info_h, dev_ctx.GetPlace(), info_d, + sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( info_h, 0, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/range_op_xpu.cc b/paddle/fluid/operators/range_op_xpu.cc index 1d4de77978180..2f142a626c5f2 100644 --- a/paddle/fluid/operators/range_op_xpu.cc +++ b/paddle/fluid/operators/range_op_xpu.cc @@ -50,9 +50,8 @@ class XPURangeKernel : public framework::OpKernel { out_cpu_data_ptr[i] = value; value += step; } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), - static_cast(out_data), platform::CPUPlace(), - static_cast(out_cpu_data_ptr), + memory::Copy(context.GetPlace(), static_cast(out_data), + platform::CPUPlace(), static_cast(out_cpu_data_ptr), out->numel() * sizeof(T)); } }; diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 01f5b4c732712..a0eca3d9b09d9 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -26,7 +26,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 3c0c8ad1cafce..4aad78f1c49cf 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -41,7 +41,7 @@ BufferedReader::BufferedReader( VLOG(1) << "BufferedReader"; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_) && !pin_memory) { - int dev_idx = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + int dev_idx = place_.device; compute_stream_ = ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() .Get(place_))) @@ -56,7 +56,7 @@ BufferedReader::BufferedReader( #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(place_)) { - int dev_idx = BOOST_GET_CONST(platform::NPUPlace, place_).device; + int dev_idx = place_.device; compute_stream_ = ((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance() .Get(place_))) @@ -119,8 +119,7 @@ void BufferedReader::ReadAsync(size_t i) { // cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cuda lib into device, it will cost hundreds of MB of GPU memory. // If we don't set Device here, which will use CUDAPlace(0) default. 
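// [Editorial note, not part of the original patch] Device ids are now read
// directly from the unified place, via the .device member or GetDeviceId(),
// without first extracting a concrete CUDAPlace/NPUPlace. The reader setup
// below therefore reduces to roughly:
//   platform::SetDeviceId(place_.device);       // CUDA branch
//   platform::SetNPUDeviceId(place_.device);    // NPU branch
// The sketch assumes, based only on the usage in this patch, that pten::Place
// exposes the device index alongside its AllocationType.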
- platform::SetDeviceId( - BOOST_GET_CONST(platform::CUDAPlace, place_).device); + platform::SetDeviceId(place_.device); for (size_t i = 0; i < cpu.size(); ++i) { if (platform::is_cpu_place(cpu[i].place())) { cuda[i].Resize(cpu[i].dims()); @@ -130,8 +129,7 @@ void BufferedReader::ReadAsync(size_t i) { auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); - memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], - BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), + memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], cpu[i].place(), cpu[i].data(), size); cuda[i].set_lod(cpu[i].lod()); @@ -158,8 +156,7 @@ void BufferedReader::ReadAsync(size_t i) { // NOTE(zjl): cudaStreamWaitEvent() must be called after all // cuda[i].mutable_data() is called, since some ops release // cuda memory immediately without waiting cuda kernel ends - platform::SetDeviceId( - BOOST_GET_CONST(platform::CUDAPlace, place_).device); + platform::SetDeviceId(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_[i].get(), compute_stream_)); @@ -180,25 +177,21 @@ void BufferedReader::ReadAsync(size_t i) { auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); if (platform::is_cuda_pinned_place(cpu_place)) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, - BOOST_GET_CONST(platform::CUDAPinnedPlace, cpu_place), - cpu_ptr, size, stream_.get()); + memory::Copy(place_, gpu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); } else if ((platform::is_gpu_place(cpu_place))) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, - BOOST_GET_CONST(platform::CUDAPlace, cpu_place), - cpu_ptr, size, stream_.get()); + memory::Copy(place_, gpu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); } else { platform::CUDAPinnedPlace cuda_pinned_place; framework::LoDTensor cuda_pinned_tensor; cuda_pinned_tensor.Resize(cpu[i].dims()); auto cuda_pinned_ptr = cuda_pinned_tensor.mutable_data( cuda_pinned_place, cpu[i].type()); - memory::Copy(cuda_pinned_place, cuda_pinned_ptr, - BOOST_GET_CONST(platform::CPUPlace, cpu_place), - cpu_ptr, size); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, - cuda_pinned_place, cuda_pinned_ptr, size, - stream_.get()); + memory::Copy(cuda_pinned_place, cuda_pinned_ptr, cpu_place, cpu_ptr, + size); + memory::Copy(place_, gpu_ptr, cuda_pinned_place, cuda_pinned_ptr, + size, stream_.get()); platform::GpuStreamSync(stream_.get()); } @@ -231,8 +224,7 @@ void BufferedReader::ReadAsync(size_t i) { npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type())); } - platform::SetNPUDeviceId( - BOOST_GET_CONST(platform::NPUPlace, place_).device); + platform::SetNPUDeviceId(place_.device); platform::NPUEventRecord(events_[i].get(), compute_stream_); platform::NPUStreamWaitEvent(stream_.get(), events_[i].get()); @@ -244,13 +236,11 @@ void BufferedReader::ReadAsync(size_t i) { auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); if ((platform::is_npu_place(cpu_place))) { - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr, - BOOST_GET_CONST(platform::NPUPlace, cpu_place), cpu_ptr, - size, stream_.get()); + memory::Copy(place_, npu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); } else { - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr, - BOOST_GET_CONST(platform::CPUPlace, cpu_place), cpu_ptr, - size, stream_.get()); + memory::Copy(place_, npu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); platform::NPUStreamSync(stream_.get()); } 
npu[i].set_lod(cpu[i].lod()); diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 44db3f3a33563..d4a68260a6b98 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -47,12 +47,12 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { platform::Place place; if (place_str == "AUTO") { place = dev_place; - } else if (place_str == "CPUPLACE") { + } else if (place_str == "PLACE(CPU)") { place = platform::CPUPlace(); } else { place_str = place_str.substr(0, place_str.length() - 1); std::istringstream sin(place_str); - sin.seekg(std::string("CUDAPLACE(").size(), std::ios::beg); + sin.seekg(std::string("PLACE(GPU:").size(), std::ios::beg); size_t num; sin >> num; place = platform::CUDAPlace(static_cast(num)); @@ -79,7 +79,7 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { std::unordered_set enum_range; constexpr size_t kMaxCUDADevs = 128; for (size_t i = 0; i < kMaxCUDADevs; ++i) { - enum_range.insert(string::Sprintf("CUDAPLACE(%d)", i)); + enum_range.insert(string::Sprintf("PLACE(GPU:%d)", i)); } enum_range.insert("CPUPLACE"); enum_range.insert("AUTO"); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index 2d7cce68e8171..0a5d54e72c845 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -25,7 +25,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 10095bc955047..955cf8d4448c1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -25,7 +25,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index f288fce753802..fa3800dd3c9e4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -25,7 +25,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index f27cd6b125b32..50df75d9ad3fd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -23,7 +23,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 9e4cc8e213c61..562a5719d74d9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -25,7 +25,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 7f61794fbb11b..9782ce28da4af 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -72,7 +72,7 @@ class ReduceSumGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto dims = context.Attr>("dim"); - if (context.GetPlace().type() == typeid(platform::CPUPlace) && + if (context.GetPlace().GetType() == platform::CPUPlace().GetType() && dims.size() == 1) { int in_dtype = context.Attr("in_dtype"); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 47b8da70adbac..9e343517e3fbf 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -31,8 +31,6 @@ namespace imperative { class OpBase; } // namespace imperative namespace platform { -struct CPUPlace; -struct CUDAPlace; struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index de4847ddc4590..80a0ef10fa150 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -256,10 +256,8 @@ void weight_to_tensor(const platform::Place &place, gpuStream_t stream, const T *in_data = weight_list[i]->data(); auto in_size = weight_list[i]->numel(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()), - weight_data + weight_offset, - BOOST_GET_CONST(platform::CUDAPlace, weight_list[i]->place()), - in_data, in_size * sizeof(T), stream); + memory::Copy(weight->place(), weight_data + weight_offset, + weight_list[i]->place(), in_data, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -276,10 +274,8 @@ void weight_to_tensor_list(const platform::Place &place, gpuStream_t stream, T *weight_grad_data = (*weight_grad)[i]->mutable_data(place); const T *src = weight_data + weight_offset; - memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, (*weight_grad)[i]->place()), - weight_grad_data, BOOST_GET_CONST(platform::CUDAPlace, weight->place()), - src, in_size * sizeof(T), stream); + memory::Copy((*weight_grad)[i]->place(), weight_grad_data, weight->place(), + src, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -295,10 +291,8 @@ void weight_list_to_tensor(const platform::Place &place, gpuStream_t stream, for (size_t i = 0; i < tensor_list.size(); ++i) { const T *in_data = tensor_list[i].data(); auto in_size = tensor_list[i].numel(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight_whole->place()), - weight_data + weight_offset, - BOOST_GET_CONST(platform::CUDAPlace, tensor_list[i].place()), - in_data, in_size * sizeof(T), stream); + memory::Copy(weight_whole->place(), weight_data + weight_offset, + tensor_list[i].place(), in_data, in_size * sizeof(T), stream); weight_offset += in_size; } } @@ -430,8 +424,7 @@ class RNNCudnnKernel : public framework::OpKernel { bool is_test = ctx.Attr("is_test"); int seed = ctx.Attr("seed"); if (!is_test) { - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed == 0) { // If perform `manual_seed` in python and inner seed is not specified diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 3b25676fb0c36..520023229fe1b 
100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -271,7 +271,7 @@ class GPUROIAlignOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto gplace = ctx.GetPlace(); if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); @@ -365,7 +365,7 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto gplace = ctx.GetPlace(); if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index f35cf06e5f704..7764e52c2f6da 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -48,7 +48,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.template device_context(); - auto xplace = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()); + auto xplace = ctx.GetPlace(); int rois_batch_size = 0; int* cpu_lod = nullptr; if (ctx.HasInput("RoisNum")) { @@ -157,7 +157,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); auto& dev_ctx = ctx.template device_context(); - auto xplace = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()); + auto xplace = ctx.GetPlace(); int rois_batch_size = 0; int* cpu_lod = nullptr; diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 0a4a076c6caae..16a8e2bf586a7 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -156,7 +156,7 @@ class GPUROIPoolOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto gplace = ctx.GetPlace(); if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); @@ -244,7 +244,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto gplace = ctx.GetPlace(); if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 04e4dc62b039b..c130dbb35a0da 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -261,7 +261,7 @@ class RunProgramOpKernel : public framework::OpKernel { VLOG(2) << "The number of sub scopes after forward: " << out_scope_vec->front()->kids().size(); #ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) DontClearMKLDNNCache(ctx.GetPlace()); + if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace()); #endif } }; diff --git 
a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index 6c7a0a8886ef0..98311ff404b47 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -221,7 +221,7 @@ void GPUScatterNdAdd(const framework::ExecutionContext& context, // put output_dims int CUDA // gplace and cplace const auto& ctx = context.template device_context(); - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); auto cplace = platform::CPUPlace(); std::vector v_output_dims(output_dims_size); diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 4ca75bcf76e51..2154b08ae86fe 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -37,8 +37,7 @@ class GPUSeedKernel : public framework::OpKernel { out, static_cast(seed)); } else { auto *out_data = out->mutable_data(context.GetPlace()); - auto target_gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto target_gpu_place = context.GetPlace(); auto stream = context.cuda_device_context().stream(); memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, sizeof(int), stream); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h index 307bf4010f7ff..4f180a31ce518 100644 --- a/paddle/fluid/operators/segment_pool_op.h +++ b/paddle/fluid/operators/segment_pool_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/common/place.h" namespace paddle { namespace operators { @@ -48,7 +49,7 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { return; } - bool cpu_place = context.GetPlace().type() == typeid(platform::CPUPlace); + bool cpu_place = context.GetPlace().GetType() == pten::AllocationType::CPU; if (cpu_place) { auto dims = input->dims(); auto* segment_ids = segment->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index f63fa5be7f496..4c9faa1875df6 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -98,8 +98,7 @@ static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context, auto out_data = out->data(); auto x_data = x.data(); - const auto& gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + const auto& gpu_place = context.GetPlace(); int x_item_length = x.numel() / x.dims()[0]; int out_offset = 0; diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 41e6d2d40061e..ec3e04e71faf0 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -28,7 +28,6 @@ class OpBase; } // namespace imperative namespace platform { class CPUDeviceContext; -struct CPUPlace; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index ceba0dfddf0f5..96ac2c7a1bd08 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -142,8 +142,6 @@ class SplitOpKernel : public framework::OpKernel { } } - auto place = ctx.GetPlace(); - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { outs[j]->mutable_data(ctx.GetPlace()); diff --git 
a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc index b23c58f16925d..5e570572c35e1 100644 --- a/paddle/fluid/operators/split_op_npu.cc +++ b/paddle/fluid/operators/split_op_npu.cc @@ -45,7 +45,6 @@ class SplitNPUKernel : public framework::OpKernel { } std::vector outputs; - auto place = ctx.GetPlace(); for (size_t j = 0; j < outs.size(); ++j) { outs[j]->mutable_data(ctx.GetPlace()); outputs.push_back(*outs[j]); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 5b3f03445d352..2cebe0e320e7e 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -64,8 +64,7 @@ class StackGPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto tmp_x_data = memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_x_data->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), platform::CPUPlace(), reinterpret_cast(x_datas.data()), x_datas.size() * sizeof(T*), dev_ctx.stream()); @@ -169,8 +168,7 @@ class StackGradGPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto tmp_out_data = memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_out_data->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), platform::CPUPlace(), reinterpret_cast(outputs.data()), outputs.size() * sizeof(T*), dev_ctx.stream()); diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 159450aa178d1..c92d468f3462c 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -94,18 +94,18 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, for (int64_t i = 0; i < before; ++i) { if (platform::is_cpu_place(place)) { - auto& cpu_place = BOOST_GET_CONST(platform::CPUPlace, place); + auto& cpu_place = place; memory::Copy(cpu_place, dst + i * dst_after, cpu_place, src + i * src_after, sizeof(T) * size); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); + auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(ctx); memory::Copy(gpu_place, dst + i * dst_after, gpu_place, src + i * src_after, sizeof(T) * size, cuda_ctx.stream()); #elif defined(PADDLE_WITH_ASCEND_CL) - auto& npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + auto& npu_place = place; auto& npu_ctx = reinterpret_cast(ctx); memory::Copy(npu_place, dst + i * dst_after, npu_place, src + i * src_after, sizeof(T) * size, npu_ctx.stream()); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 6034cda50c32a..4288e9415aa86 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -196,8 +196,8 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { auto tmp_sr_in_out_array = memory::Alloc(dev_ctx, sr_in_out_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_sr_in_out_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_sr_in_out_array->ptr(), + platform::CPUPlace(), reinterpret_cast(sr_in_out_data.data()), sr_in_out_data.size() * sizeof(T *), dev_ctx.stream()); @@ -214,8 +214,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { if 
(!in_data.empty()) { auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - tmp_in_array->ptr(), platform::CPUPlace(), + memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), platform::CPUPlace(), reinterpret_cast(in_data.data()), in_data.size() * sizeof(T *), dev_ctx.stream()); diff --git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index 0a7ed093ad0b8..f17e92e47b731 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -108,8 +108,7 @@ void SvdGPUKernel::GesvdjBatched( info, gesvdj_params)); // check the error info int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, @@ -151,8 +150,7 @@ void SvdGPUKernel::GesvdjBatched( info, gesvdj_params)); // check the error info int error_info; - memory::Copy(platform::CPUPlace(), &error_info, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info, + memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, sizeof(int), dev_ctx.stream()); PADDLE_ENFORCE_EQ( error_info, 0, diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index a0cda54b31b4c..558f5f2a3128f 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -120,7 +120,7 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, : std::min(summarize_, print_tensor.numel()); const T* data = nullptr; framework::LoDTensor cpu_tensor; - if (is_cpu_place(print_tensor.place())) { + if (paddle::platform::is_cpu_place(print_tensor.place())) { data = print_tensor.data(); } else { platform::CPUPlace cpu_place; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 5ebf67587f3cb..9357eb4b2295a 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -388,8 +388,7 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( max_batch_size_, workspace_size_, precision_mode_, - calib_res->calib_.get(), - BOOST_GET_CONST(platform::CUDAPlace, dev_place).device)); + calib_res->calib_.get(), dev_place.device)); VLOG(3) << "start the calib trt engine thread"; PrepareTRTEngine(scope, calib_res->engine_.get()); })); @@ -567,8 +566,8 @@ class TensorRTEngineOp : public framework::OperatorBase { "than the number of bindings, but got binding " "index = %d, number of bindings = %d.", bind_index, num_bindings)); - buffers[bind_index] = static_cast(fluid_t->mutable_data( - BOOST_GET_CONST(platform::CUDAPlace, dev_place))); + buffers[bind_index] = + static_cast(fluid_t->mutable_data(dev_place)); output_index += 1; } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index aaed8e5b62584..5e530a5bb5248 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -100,8 +100,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - int device_id 
= - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index b2ff91a37451e..803b61fbe813f 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -48,9 +48,8 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { data_cpu[i] = truncated_normal(dist(*engine)); } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), data, - platform::CPUPlace(), reinterpret_cast(data_cpu.get()), - size * sizeof(T)); + memory::Copy(context.GetPlace(), data, platform::CPUPlace(), + reinterpret_cast(data_cpu.get()), size * sizeof(T)); } }; diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h index 2d968260f0a58..365dc9547a2d6 100644 --- a/paddle/fluid/operators/unbind_op.h +++ b/paddle/fluid/operators/unbind_op.h @@ -43,9 +43,6 @@ class UnbindOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); auto in_dims = in->dims(); - - auto place = ctx.GetPlace(); - axis = axis < 0 ? in_dims.size() + axis : axis; std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index bf82af865a1eb..a5231354eb47e 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -120,8 +120,7 @@ class GPUUniformRandomInplaceKernel : public framework::OpKernel { T diag_val = static_cast(ctx.Attr("diag_val")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + int device_id = ctx.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { auto seed_offset = gen_cuda->IncrementOffset(1); diff --git a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc index 24b1459a09510..fe43bb4ec60ca 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc @@ -59,9 +59,8 @@ class XPUUniformRandomInplaceKernel : public framework::OpKernel { data_cpu[pos] = diag_val; } } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), data, - platform::CPUPlace(), reinterpret_cast(data_cpu.get()), - size * sizeof(T)); + memory::Copy(ctx.GetPlace(), data, platform::CPUPlace(), + reinterpret_cast(data_cpu.get()), size * sizeof(T)); } }; @@ -77,8 +76,7 @@ class XPUUniformRandomInplaceGradKernel : public framework::OpKernel { for (int64_t i = 0; i < size; ++i) { data_cpu[i] = T(0); } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), data, - platform::CPUPlace(), + memory::Copy(ctx.GetPlace(), data, platform::CPUPlace(), reinterpret_cast(data_cpu.get()), size * sizeof(T)); } } diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 97288b2b1fa7c..440c9b786b69c 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -141,8 +141,7 @@ class 
GPUUniformRandomKernel : public framework::OpKernel { T diag_val = static_cast(context.Attr("diag_val")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { auto seed_offset = gen_cuda->IncrementOffset(1); diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc index d8b82ad5f863e..fed0accd8a14c 100644 --- a/paddle/fluid/operators/uniform_random_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_op_xpu.cc @@ -91,9 +91,8 @@ class XPUUniformRandomKernel : public framework::OpKernel { } } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), data, - platform::CPUPlace(), reinterpret_cast(data_cpu.get()), - size * sizeof(T)); + memory::Copy(ctx.GetPlace(), data, platform::CPUPlace(), + reinterpret_cast(data_cpu.get()), size * sizeof(T)); } }; diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu index feb8e83864e84..50b856bfe9841 100644 --- a/paddle/fluid/operators/where_index_op.cu +++ b/paddle/fluid/operators/where_index_op.cu @@ -128,14 +128,12 @@ class CUDAWhereIndexKernel : public framework::OpKernel { for (int i = rank - 2; i >= 0; i--) { h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; } - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_stride_array, platform::CPUPlace(), h_stride_array, - rank * sizeof(int64_t), dev_ctx.stream()); + memory::Copy(dev_ctx.GetPlace(), d_stride_array, platform::CPUPlace(), + h_stride_array, rank * sizeof(int64_t), dev_ctx.stream()); // get total ture number and set output size // the last element of cub::InclusiveSum is the total number - memory::Copy(platform::CPUPlace(), h_total_true_num, - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + memory::Copy(platform::CPUPlace(), h_total_true_num, dev_ctx.GetPlace(), d_true_num_array + numel - 1, sizeof(int64_t), dev_ctx.stream()); dev_ctx.Wait(); diff --git a/paddle/fluid/operators/where_index_op_xpu.cc b/paddle/fluid/operators/where_index_op_xpu.cc index 53ddefbbe0cab..d80a266846e95 100644 --- a/paddle/fluid/operators/where_index_op_xpu.cc +++ b/paddle/fluid/operators/where_index_op_xpu.cc @@ -44,8 +44,8 @@ class WhereIndexXPUKernel : public framework::OpKernel { ret, XPUAPIErrorMsg[ret])); memory::Copy(platform::CPUPlace(), static_cast(&true_num_cpu), - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), - static_cast(true_num), sizeof(int32_t)); + context.GetPlace(), static_cast(true_num), + sizeof(int32_t)); out->Resize( framework::make_ddim({static_cast(true_num_cpu), rank})); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 517b4a28a690f..9e0a0cb5f8d35 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -55,7 +55,7 @@ ELSE() cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() -cc_library(place SRCS place.cc DEPS enforce boost) +cc_library(place SRCS place.cc DEPS enforce boost pten_place) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(device) @@ -122,7 +122,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # memcpy depends on device_context, 
here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS}) cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 7d2ea57545d08..dd2dc9a40799e 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -32,9 +32,7 @@ class NCCLCommImpl : public NCCLComm { void set_rank(int rank) { rank_ = rank; } int rank() const override { return rank_; } - int device_id() const override { - return BOOST_GET_CONST(CUDAPlace, dev_ctx_->GetPlace()).device; - } + int device_id() const override { return dev_ctx_->GetPlace().device; } void set_comm(ncclComm_t comm) { comm_ = comm; } ncclComm_t comm() const override { return comm_; } @@ -246,9 +244,7 @@ class BKCLCommImpl : public BKCLComm { void set_rank(int rank) { rank_ = rank; } int rank() const override { return rank_; } - int device_id() const override { - return BOOST_GET_CONST(XPUPlace, dev_ctx_->GetPlace()).device; - } + int device_id() const override { return dev_ctx_->GetPlace().device; } void set_comm(BKCLContext_t comm) { comm_ = comm; } BKCLContext_t comm() const override { return comm_; } diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 2fdc462a693ec..62a07669259a4 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -115,7 +115,7 @@ class NCCLCommContext { // retrieve a communicator by the ring id and place NCCLComm* Get(int ring_id, Place place) const { - return Get(ring_id, BOOST_GET_CONST(CUDAPlace, place).device); + return Get(ring_id, place.device); } private: @@ -212,7 +212,7 @@ class HCCLCommContext { // retrieve a communicator by the ring id and place HCCLComm* Get(int ring_id, Place place) const { - return Get(ring_id, BOOST_GET_CONST(NPUPlace, place).device); + return Get(ring_id, place.device); } private: @@ -317,7 +317,7 @@ class BKCLCommContext { // retrieve a communicator by the ring id and place BKCLComm* Get(int ring_id, Place place) const { - return Get(ring_id, BOOST_GET_CONST(XPUPlace, place).device); + return Get(ring_id, place.device); } private: diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index f26116749077e..261916b2555be 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -89,9 +89,7 @@ struct NCCLContext { gpuStream_t stream() const { return ctx_->stream(); } ncclComm_t comm() const { return comm_; } - int device_id() const { - return BOOST_GET_CONST(platform::CUDAPlace, ctx_->GetPlace()).device; - } + int device_id() const { return ctx_->GetPlace().device; } }; struct NCCLContextMap { @@ -106,7 +104,7 @@ struct NCCLContextMap { "The NCCL place should not be empty.")); order_.reserve(places.size()); for (auto &p : places) { - int dev_id = BOOST_GET_CONST(CUDAPlace, p).device; + int dev_id = p.device; 
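// --- Illustrative sketch (not part of the patch) ---------------------------
// The hunks above can drop BOOST_GET_CONST because pten::Place is no longer a
// boost::variant: it is a single tagged struct, so `p.device` and
// `p.GetType()` are valid for every backend. Below is a minimal,
// self-contained model of that design; DemoAllocationType, DemoPlace and
// is_gpu are hypothetical stand-ins, not the real pten API.
#include <cstdint>
#include <iostream>

enum class DemoAllocationType : int8_t { UNDEFINED = 0, CPU, GPU, XPU, NPU };

struct DemoPlace {
  DemoAllocationType alloc_type{DemoAllocationType::UNDEFINED};
  int8_t device{0};  // shared field, valid for any backend -> no variant get

  DemoAllocationType GetType() const { return alloc_type; }
  int8_t GetDeviceId() const { return device; }
};

// Predicate in the style of platform::is_gpu_place after the migration:
// a plain enum comparison instead of boost::apply_visitor.
inline bool is_gpu(const DemoPlace& p) {
  return p.GetType() == DemoAllocationType::GPU;
}

int main() {
  DemoPlace gpu1{DemoAllocationType::GPU, 1};
  std::cout << std::boolalpha << is_gpu(gpu1) << " dev=" << int(gpu1.device)
            << "\n";  // prints: true dev=1
}
// ---------------------------------------------------------------------------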
order_.emplace_back(dev_id); contexts_.emplace(dev_id, NCCLContext(dev_id)); } @@ -155,12 +153,10 @@ struct NCCLContextMap { CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } CUDADeviceContext *DevCtx(platform::Place p) const { - return DevCtx(BOOST_GET_CONST(CUDAPlace, p).device); + return DevCtx(p.device); } - const NCCLContext &at(platform::Place p) const { - return this->at(BOOST_GET_CONST(CUDAPlace, p).device); - } + const NCCLContext &at(platform::Place p) const { return this->at(p.device); } const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } @@ -259,7 +255,7 @@ class NCCLCommunicator { for (int ring_id = 0; ring_id < nrings; ++ring_id) { for (size_t p = 0; p < places.size(); ++p) { int rank = trainer_id * places.size() + p; - int dev_id = BOOST_GET_CONST(CUDAPlace, places[p]).device; + int dev_id = places[p].device; auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); NCCLCommContext::Instance().AssignNCCLComm(ctx.comm_, nranks, rank, dev_id, ring_id); diff --git a/paddle/fluid/platform/device/mlu/device_context_allocator.h b/paddle/fluid/platform/device/mlu/device_context_allocator.h index 408016c0f0d99..2be960ef4ae41 100644 --- a/paddle/fluid/platform/device/mlu/device_context_allocator.h +++ b/paddle/fluid/platform/device/mlu/device_context_allocator.h @@ -128,8 +128,7 @@ class MLUDeviceContextAllocatorPool { } AllocationPtr Alloc(const platform::MLUDeviceContext &dev_ctx, size_t size) { - auto iter = allocators_.find( - BOOST_GET_CONST(platform::MLUPlace, dev_ctx.GetPlace())); + auto iter = allocators_.find(dev_ctx.GetPlace()); PADDLE_ENFORCE_NE( iter, allocators_.end(), platform::errors::NotFound("No allocator found for MLUPlace.")); diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index 69cea31446680..c2338fff02926 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -85,9 +85,7 @@ struct HCCLContext { aclrtStream stream() const { return ctx_->stream(); } HcclComm comm() const { return comm_; } - int device_id() const { - return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device; - } + int device_id() const { return ctx_->GetPlace().device; } }; struct HCCLContextMap { @@ -102,7 +100,7 @@ struct HCCLContextMap { "The HCCL place should not be empty.")); order_.reserve(places.size()); for (auto &p : places) { - int dev_id = BOOST_GET_CONST(NPUPlace, p).device; + int dev_id = p.device; order_.emplace_back(dev_id); contexts_.emplace(dev_id, HCCLContext(dev_id)); } @@ -151,13 +149,9 @@ struct HCCLContextMap { NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } - NPUDeviceContext *DevCtx(platform::Place p) const { - return DevCtx(BOOST_GET_CONST(NPUPlace, p).device); - } + NPUDeviceContext *DevCtx(platform::Place p) const { return DevCtx(p.device); } - const HCCLContext &at(platform::Place p) const { - return this->at(BOOST_GET_CONST(NPUPlace, p).device); - } + const HCCLContext &at(platform::Place p) const { return this->at(p.device); } const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } @@ -257,7 +251,7 @@ class HCCLCommunicator { for (int ring_id = 0; ring_id < nrings; ++ring_id) { for (size_t p = 0; p < places.size(); ++p) { int rank = trainer_id * places.size() + p; - int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device; + int dev_id = places[p].device; auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); 
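// --- Illustrative sketch (not part of the patch) ---------------------------
// A minimal model of the context-map lookups rewritten in the helpers above
// (NCCLContextMap::DevCtx / HCCLContextMap::at): with the unified place type
// the map key is simply `place.device`, so no per-backend BOOST_GET_CONST
// cast is needed before the lookup. DemoPlace, DemoCtx and CtxMap are
// hypothetical simplified types, not the Paddle classes.
#include <iostream>
#include <map>
#include <string>

struct DemoPlace { int device{0}; };
struct DemoCtx { std::string name; };

class CtxMap {
 public:
  explicit CtxMap(int n) {
    for (int i = 0; i < n; ++i)
      ctxs_.emplace(i, DemoCtx{"ctx" + std::to_string(i)});
  }
  // Before the migration this overload had to unwrap a CUDAPlace/NPUPlace/...
  // variant; now it forwards the shared `device` field directly.
  const DemoCtx& at(const DemoPlace& p) const { return at(p.device); }
  const DemoCtx& at(int dev_id) const { return ctxs_.at(dev_id); }

 private:
  std::map<int, DemoCtx> ctxs_;
};

int main() {
  CtxMap map(2);
  std::cout << map.at(DemoPlace{1}).name << "\n";  // prints: ctx1
}
// ---------------------------------------------------------------------------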
HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank, dev_id, ring_id); diff --git a/paddle/fluid/platform/device/npu/npu_collective_helper.cc b/paddle/fluid/platform/device/npu/npu_collective_helper.cc index 4d1f444411f71..cdec3519a23f3 100644 --- a/paddle/fluid/platform/device/npu/npu_collective_helper.cc +++ b/paddle/fluid/platform/device/npu/npu_collective_helper.cc @@ -31,9 +31,7 @@ class HCCLCommImpl : public HCCLComm { void set_rank(int rank) { rank_ = rank; } int rank() const override { return rank_; } - int device_id() const override { - return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device; - } + int device_id() const override { return dev_ctx_->GetPlace().device; } ~HCCLCommImpl() { PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_)); diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index c049da3b33566..39d2b9ffa9b1d 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -149,9 +149,8 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { npu_pinned_tensor.mutable_data({1}, npu_pinned_place); *npu_pinned_ptr = val; - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()), - tensor->data(), npu_pinned_place, npu_pinned_ptr, sizeof(T), - GetCurrentNPUStream()); + memory::Copy(tensor->place(), tensor->data(), npu_pinned_place, + npu_pinned_ptr, sizeof(T), GetCurrentNPUStream()); auto npu_pinned_allocator = static_cast( diff --git a/paddle/fluid/platform/device/npu/npu_stream.cc b/paddle/fluid/platform/device/npu/npu_stream.cc index e86b30f3244c0..0b15a0d937e82 100644 --- a/paddle/fluid/platform/device/npu/npu_stream.cc +++ b/paddle/fluid/platform/device/npu/npu_stream.cc @@ -24,7 +24,7 @@ bool NPUStream::Init(const Place& place) { platform::errors::InvalidArgument( "NPU stream must be created using npu place.")); place_ = place; - NPUDeviceGuard guard(BOOST_GET_CONST(NPUPlace, place_).device); + NPUDeviceGuard guard(place_.device); NPUStreamCreate(&stream_); callback_manager_.reset(new StreamCallbackManager(stream_)); VLOG(3) << "NPUStream Init stream: " << stream_; @@ -32,7 +32,7 @@ bool NPUStream::Init(const Place& place) { } void NPUStream::Destroy() { - NPUDeviceGuard guard(BOOST_GET_CONST(NPUPlace, place_).device); + NPUDeviceGuard guard(place_.device); Wait(); WaitCallback(); if (stream_) { diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h index d9ffbfe011f91..24fd8b5faa4e9 100644 --- a/paddle/fluid/platform/device/xpu/bkcl_helper.h +++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h @@ -58,9 +58,7 @@ struct BKCLContext { BKCLContext_t comm() const { return comm_; } - int device_id() const { - return BOOST_GET_CONST(platform::XPUPlace, ctx_->GetPlace()).device; - } + int device_id() const { return ctx_->GetPlace().device; } }; struct InitBKCLPara { @@ -104,7 +102,7 @@ struct BKCLContextMap { "The BKCL place should not be empty.")); order_.reserve(places_.size()); for (auto &p : places_) { - int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device; + int dev_id = p.device; order_.emplace_back(dev_id); contexts_.emplace(dev_id, BKCLContext(dev_id)); } @@ -165,13 +163,9 @@ struct BKCLContextMap { XPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } - XPUDeviceContext *DevCtx(platform::Place p) const { - return DevCtx(BOOST_GET_CONST(platform::XPUPlace, p).device); - } + XPUDeviceContext *DevCtx(platform::Place p) 
const { return DevCtx(p.device); } - const BKCLContext &at(platform::Place p) const { - return this->at(BOOST_GET_CONST(platform::XPUPlace, p).device); - } + const BKCLContext &at(platform::Place p) const { return this->at(p.device); } const BKCLContext &at(int dev_id) const { return contexts_.at(dev_id); } diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 018ba1bce163b..220bebb9e6b05 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -12,11 +12,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { -class XPUPlace; /***** Version Management *****/ //! Get the version of XPU Driver diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 7561830fc76c1..448559a9edfee 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -23,8 +23,7 @@ namespace platform { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { auto& ops = get_kl1_ops(); - auto v = - get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device); + auto v = get_xpu_version(type.place_.device); if (v == XPU2) { ops = get_kl2_ops(); } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index b2f444c30c248..effd67fa5c967 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -10,6 +10,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" +#include #include #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -140,7 +141,7 @@ inline void EmplaceDeviceContext( map_ptr->emplace(p, std::async(std::launch::deferred, [=] { // lazy evaluation. i.e., only create device context at // first `Get` - return PtrType(new DevCtx(BOOST_GET_CONST(PlaceType, p))); + return PtrType(new DevCtx(p)); })); } @@ -157,14 +158,19 @@ DeviceContextPool::DeviceContextPool( } for (auto& p : set) { if (platform::is_cpu_place(p)) { + platform::CPUPlace place; #ifdef PADDLE_WITH_MKLDNN - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext(&device_contexts_, + place); #else - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext(&device_contexts_, + place); #endif } else if (platform::is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - EmplaceDeviceContext(&device_contexts_, p); + platform::CUDAPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW( platform::errors::Unimplemented("CUDAPlace is not supported. Please " @@ -172,8 +178,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_cuda_pinned_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::CUDAPinnedPlace place; EmplaceDeviceContext( - &device_contexts_, p); + &device_contexts_, place); #else PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported. 
Please re-compile with WITH_GPU " @@ -181,7 +188,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_xpu_place(p)) { #ifdef PADDLE_WITH_XPU - EmplaceDeviceContext(&device_contexts_, p); + platform::XPUPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW( platform::errors::Unimplemented("XPUPlace is not supported. Please " @@ -189,7 +198,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_mlu_place(p)) { #ifdef PADDLE_WITH_MLU - EmplaceDeviceContext(&device_contexts_, p); + platform::MLUPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW( platform::errors::Unimplemented("MLUPlace is not supported. Please " @@ -197,7 +208,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_ipu_place(p)) { #ifdef PADDLE_WITH_IPU - EmplaceDeviceContext(&device_contexts_, p); + platform::IPUPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW( platform::errors::Unimplemented("IPUPlace is not supported. Please " @@ -205,7 +218,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_npu_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL - EmplaceDeviceContext(&device_contexts_, p); + platform::NPUPlace place(p.GetDeviceId()); + EmplaceDeviceContext(&device_contexts_, + place); #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported. Please " @@ -213,8 +228,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_npu_pinned_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL + platform::NPUPinnedPlace place; EmplaceDeviceContext( - &device_contexts_, p); + &device_contexts_, place); #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPinnedPlace is not supported. Please re-compile with " diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index bc842ef9c74de..0a6b3917fbc21 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -26,7 +26,7 @@ struct CUDADeviceEventWrapper { platform::errors::PreconditionNotMet( "Required device shall be CUDAPlace, but received %d. 
", place)); - device_id_ = BOOST_GET_CONST(platform::CUDAPlace, place).device; + device_id_ = place.device; PADDLE_ENFORCE_GT( device_id_, -1, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index ff11bfd62c138..73847ce24aa72 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -643,8 +643,7 @@ class DeviceTracerImpl : public DeviceTracer { event->set_place(proto::MemEvent::CPUPlace); } else if (platform::is_gpu_place(r.place)) { event->set_place(proto::MemEvent::CUDAPlace); - event->set_device_id( - BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId()); + event->set_device_id(r.place.GetDeviceId()); } else if (platform::is_cuda_pinned_place(r.place)) { event->set_place(proto::MemEvent::CUDAPinnedPlace); } else if (platform::is_npu_place(r.place)) { diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 6251a28823ac3..e73e3736f64b4 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -24,89 +24,62 @@ PADDLE_DEFINE_EXPORTED_bool( namespace paddle { namespace platform { -namespace detail { - -class PlacePrinter : public boost::static_visitor<> { - public: - explicit PlacePrinter(std::ostream &os) : os_(os) {} - void operator()(const CPUPlace &) { os_ << "CPUPlace"; } - void operator()(const CUDAPlace &p) { - os_ << "CUDAPlace(" << p.device << ")"; - } - void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } - void operator()(const MLUPlace &p) { os_ << "MLUPlace(" << p.device << ")"; } - void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; } - void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; } - void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; } - void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } - - private: - std::ostream &os_; -}; - -} // namespace detail - bool is_gpu_place(const Place &p) { - return boost::apply_visitor(IsCUDAPlace(), p); + return p.GetType() == pten::AllocationType::GPU; } bool is_xpu_place(const Place &p) { - return boost::apply_visitor(IsXPUPlace(), p); + return p.GetType() == pten::AllocationType::XPU; } bool is_mlu_place(const Place &p) { - return boost::apply_visitor(IsMLUPlace(), p); + return p.GetType() == pten::AllocationType::MLU; } bool is_npu_place(const Place &p) { - return boost::apply_visitor(IsNPUPlace(), p); + return p.GetType() == pten::AllocationType::NPU; } bool is_ipu_place(const Place &p) { - return boost::apply_visitor(IsIPUPlace(), p); + return p.GetType() == pten::AllocationType::IPU; } bool is_cpu_place(const Place &p) { - return boost::apply_visitor(IsCPUPlace(), p); + return p.GetType() == pten::AllocationType::CPU; } bool is_cuda_pinned_place(const Place &p) { - return boost::apply_visitor(IsCUDAPinnedPlace(), p); + return p.GetType() == pten::AllocationType::GPUPINNED; } bool is_npu_pinned_place(const Place &p) { - return boost::apply_visitor(IsNPUPinnedPlace(), p); + return p.GetType() == pten::AllocationType::NPUPINNED; } bool places_are_same_class(const Place &p1, const Place &p2) { - return p1.which() == p2.which(); + return p1.GetType() == p2.GetType(); } bool is_same_place(const Place &p1, const Place &p2) { if (places_are_same_class(p1, p2)) { - if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) { + if (is_cpu_place(p1) || is_cuda_pinned_place(p1) || + is_npu_pinned_place(p1)) { return true; } else if (is_xpu_place(p1)) { - return 
BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2); + return p1 == p2; } else if (is_mlu_place(p1)) { - return BOOST_GET_CONST(MLUPlace, p1) == BOOST_GET_CONST(MLUPlace, p2); + return p1 == p2; } else if (is_npu_place(p1)) { - return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2); + return p1 == p2; } else if (is_ipu_place(p1)) { - return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2); + return p1 == p2; } else { - return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2); + return p1 == p2; } } else { return false; } } -std::ostream &operator<<(std::ostream &os, const Place &p) { - detail::PlacePrinter printer(os); - boost::apply_visitor(printer, p); - return os; -} - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 886eb05813bd8..80bbeac251810 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -13,229 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include -#include -#include +// #include +// #include +// #include #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" +// #include "paddle/fluid/platform/variant.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/enforce_npu.h" #endif +#include "paddle/pten/common/place.h" namespace paddle { namespace platform { -struct CPUPlace { - // WORKAROUND: for some reason, omitting this constructor - // causes errors with boost 1.59 and OSX - CPUPlace() {} - - // needed for variant equality comparison - inline bool operator==(const CPUPlace &) const { return true; } - inline bool operator!=(const CPUPlace &) const { return false; } - inline bool operator<(const CPUPlace &) const { return false; } -}; - -struct CUDAPlace { - CUDAPlace() : CUDAPlace(0) {} - explicit CUDAPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const CUDAPlace &o) const { - return device == o.device; - } - inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } - inline bool operator<(const CUDAPlace &o) const { return device < o.device; } - - int device; -}; - -struct CUDAPinnedPlace { - CUDAPinnedPlace() {} - - // needed for variant equality comparison - inline bool operator==(const CUDAPinnedPlace &) const { return true; } - inline bool operator!=(const CUDAPinnedPlace &) const { return false; } - inline bool operator<(const CUDAPinnedPlace &) const { return false; } -}; - -// Place for Baidu Kunlun Accelerator -struct XPUPlace { - XPUPlace() : XPUPlace(0) {} - explicit XPUPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const XPUPlace &o) const { return device == o.device; } - inline bool operator!=(const XPUPlace &o) const { return !(*this == o); } - inline bool operator<(const XPUPlace &o) const { return device < o.device; } - - int device; -}; - -struct NPUPlace { - NPUPlace() : NPUPlace(0) {} - explicit NPUPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const NPUPlace &o) const { return device == o.device; } - inline bool operator!=(const NPUPlace &o) const { return !(*this == o); } - inline bool operator<(const NPUPlace &o) const { return device < 
o.device; } - - int device; -}; - -struct NPUPinnedPlace { - NPUPinnedPlace() {} - - inline bool operator==(const NPUPinnedPlace &) const { return true; } - inline bool operator!=(const NPUPinnedPlace &) const { return false; } - inline bool operator<(const NPUPinnedPlace &) const { return false; } -}; -struct IPUPlace { - IPUPlace() : IPUPlace(0) {} - explicit IPUPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const IPUPlace &o) const { return device == o.device; } - inline bool operator!=(const IPUPlace &o) const { return !(*this == o); } - inline bool operator<(const IPUPlace &o) const { return device < o.device; } - - int device; -}; - -struct MLUPlace { - MLUPlace() : MLUPlace(0) {} - explicit MLUPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const MLUPlace &o) const { return device == o.device; } - inline bool operator!=(const MLUPlace &o) const { return !(*this == o); } - inline bool operator<(const MLUPlace &o) const { return device < o.device; } - - int device; -}; - -struct IsCUDAPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return true; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; - -struct IsCPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return true; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; - -struct IsCUDAPinnedPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } -}; - -struct IsXPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return true; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; - -struct IsNPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool 
operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return true; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; - -struct IsNPUPinnedPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return true; } -}; - -struct IsMLUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } - bool operator()(const MLUPlace &) const { return true; } - bool operator()(const IPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } -}; -struct IsIPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &) const { return false; } - bool operator()(const NPUPlace &) const { return false; } - bool operator()(const IPUPlace &) const { return true; } - bool operator()(const MLUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDAPinnedPlace &) const { return false; } - bool operator()(const NPUPinnedPlace &) const { return false; } -}; - -class Place : public boost::variant { - private: - using PlaceBase = - boost::variant; - - public: - Place() = default; - Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT - Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT - Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT - Place(const MLUPlace &mlu_place) : PlaceBase(mlu_place) {} // NOLINT - Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {} // NOLINT - Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT - Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT - : PlaceBase(cuda_pinned_place) {} - Place(const NPUPinnedPlace &npu_pinned_place) // NOLINT - : PlaceBase(npu_pinned_place) {} - - bool operator<(const Place &place) const { - return PlaceBase::operator<(static_cast(place)); - } - bool operator==(const Place &place) const { - return PlaceBase::operator==(static_cast(place)); - } -}; +using Place = pten::Place; +using CPUPlace = pten::CPUPlace; +using CUDAPlace = pten::GPUPlace; +using CUDAPinnedPlace = pten::GPUPinnedPlace; +using NPUPlace = pten::NPUPlace; +using NPUPinnedPlace = pten::NPUPinnedPlace; +using XPUPlace = pten::XPUPlace; +using IPUPlace = pten::IPUPlace; +using MLUPlace = pten::MLUPlace; using PlaceList = std::vector; @@ -250,94 +50,84 @@ bool is_npu_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const 
Place &); -std::ostream &operator<<(std::ostream &, const Place &); - template -struct PlaceVisitorWrapper - : public boost::static_visitor { - const Visitor &visitor_; - explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {} - - typename Visitor::result_type operator()(const CPUPlace &cpu) const { - return visitor_(cpu); - } - - typename Visitor::result_type operator()(const XPUPlace &xpu) const { -#ifdef PADDLE_WITH_XPU - return visitor_(xpu); +typename Visitor::result_type VisitPlace(const Place &place, + const Visitor &visitor) { + switch (place.GetType()) { + case pten::AllocationType::GPU: { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::CUDAPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with XPU. Cannot visit xpu device")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit cuda_pinned")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()(const NPUPlace &npu) const { -#ifdef PADDLE_WITH_ASCEND - return visitor_(npu); + } + case pten::AllocationType::GPUPINNED: { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::CUDAPinnedPlace p; + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with NPU. Cannot visit npu device")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit cuda_pinned")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()( - const NPUPinnedPlace &npu_pinned) const { -#ifdef PADDLE_WITH_ASCEND_CL - return visitor_(npu_pinned); + } + case pten::AllocationType::XPU: { +#ifdef PADDLE_WITH_XPU + platform::XPUPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with NPU. Cannot visit npu_pinned")); - return typename Visitor::result_type(); + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Paddle is not compiled with XPU. Cannot visit xpu device")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()(const MLUPlace &mlu) const { -#ifdef PADDLE_WITH_MLU - return visitor_(mlu); + } + case pten::AllocationType::NPU: { +#ifdef PADDLE_WITH_ASCEND_CL + platform::NPUPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with MLU. Cannot visit mlu device")); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. Cannot visit npu_pinned")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()(const IPUPlace &ipu) const { -#ifdef PADDLE_WITH_IPU - return visitor_(ipu); + } + case pten::AllocationType::NPUPINNED: { +#ifdef PADDLE_WITH_ASCEND_CL + platform::NPUPinnedPlace p; + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with IPU. Cannot visit ipu device")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. 
Cannot visit npu_pinned")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()(const CUDAPlace &cuda) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return visitor_(cuda); + } + case pten::AllocationType::IPU: { +#ifdef PADDLE_WITH_IPU + platform::IPUPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda device")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with IPU. Cannot visit ipu device")); + return typename Visitor::result_type(); #endif - } - - typename Visitor::result_type operator()( - const CUDAPinnedPlace &cuda_pinned) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return visitor_(cuda_pinned); + } + case pten::AllocationType::MLU: { +#ifdef PADDLE_WITH_MLU + platform::MLUPlace p(place.GetDeviceId()); + return visitor(p); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda_pinned")); - return typename Visitor::result_type(); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with MLU. Cannot visit mlu device")); #endif + } + default: { + platform::CPUPlace p; + return visitor(p); + } } -}; - -template -typename Visitor::result_type VisitPlace(const Place &place, - const Visitor &visitor) { - return boost::apply_visitor(PlaceVisitorWrapper(visitor), place); } } // namespace platform diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc index ba19f14fb8f87..4fccb0eda70fd 100644 --- a/paddle/fluid/platform/place_test.cc +++ b/paddle/fluid/platform/place_test.cc @@ -47,21 +47,21 @@ TEST(Place, Print) { { std::stringstream ss; ss << paddle::platform::XPUPlace(1); - EXPECT_EQ("XPUPlace(1)", ss.str()); + EXPECT_EQ("Place(xpu:1)", ss.str()); } { std::stringstream ss; ss << paddle::platform::MLUPlace(1); - EXPECT_EQ("MLUPlace(1)", ss.str()); + EXPECT_EQ("Place(mlu:1)", ss.str()); } { std::stringstream ss; ss << paddle::platform::CUDAPlace(1); - EXPECT_EQ("CUDAPlace(1)", ss.str()); + EXPECT_EQ("Place(gpu:1)", ss.str()); } { std::stringstream ss; ss << paddle::platform::CPUPlace(); - EXPECT_EQ("CPUPlace", ss.str()); + EXPECT_EQ("Place(cpu)", ss.str()); } } diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 742d267b59543..5697bbee0bb92 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -27,7 +27,7 @@ bool CUDAStream::Init(const Place& place, const Priority& priority, platform::errors::InvalidArgument( "Cuda stream must be created using cuda place.")); place_ = place; - CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); + CUDADeviceGuard guard(place_.device); if (priority == Priority::kHigh) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( @@ -53,7 +53,7 @@ bool CUDAStream::Init(const Place& place, const Priority& priority, } void CUDAStream::Destroy() { - CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); + CUDADeviceGuard guard(place_.device); Wait(); WaitCallback(); if (stream_ && owned_stream_) { diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 102bc9f162b0f..3439f96984d99 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -108,25 +108,20 @@ void 
InitEagerTensorWithNumpyValue(EagerTensorObject* self, paddle::platform::Place place = impl_ptr->place(); paddle::framework::LoDTensor temp_tensor = paddle::framework::LoDTensor(); if (platform::is_cpu_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else if (platform::is_xpu_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else if (platform::is_gpu_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else if (platform::is_cuda_pinned_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::CUDAPinnedPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray( - &temp_tensor, array, BOOST_GET_CONST(platform::NPUPlace, place), - zero_copy); + SetTensorFromPyArray(&temp_tensor, array, place, + zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 4f22e83ac626f..3650b44ed0a85 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -170,24 +170,18 @@ static void InitVarBaseAndTensor( auto *tensor = self->MutableVar()->GetMutable(); VLOG(4) << "zero_copy: " << zero_copy; if (platform::is_cpu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_xpu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_gpu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_cuda_pinned_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::CUDAPinnedPlace, place), - zero_copy); + SetTensorFromPyArray(tensor, array, place, + zero_copy); } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::NPUPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_mlu_place(place)) { - SetTensorFromPyArray( - tensor, array, BOOST_GET_CONST(platform::MLUPlace, place), zero_copy); + SetTensorFromPyArray(tensor, array, place, zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3eabf255ccbac..63f1e817137d4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -372,7 +372,7 @@ static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { template static inline int PlaceIndex(const PlaceType &p) { - return static_cast(paddle::platform::Place(p).which()); + return static_cast(paddle::platform::Place(p).GetType()); } static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) { @@ -2050,26 +2050,11 @@ All 
parameter, weight, gradient are variables in Paddle. }) .def("is_mlu_place", [](platform::Place &self) { return platform::is_mlu_place(self); }) - .def("gpu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::CUDAPlace, self).device; - }) - .def("xpu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::XPUPlace, self).device; - }) - .def("npu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::NPUPlace, self).device; - }) - .def("ipu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::IPUPlace, self).device; - }) - .def("mlu_device_id", - [](platform::Place &self) { - return BOOST_GET_CONST(platform::MLUPlace, self).device; - }) + .def("gpu_device_id", [](platform::Place &self) { return self.device; }) + .def("xpu_device_id", [](platform::Place &self) { return self.device; }) + .def("npu_device_id", [](platform::Place &self) { return self.device; }) + .def("ipu_device_id", [](platform::Place &self) { return self.device; }) + .def("mlu_device_id", [](platform::Place &self) { return self.device; }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 1fe6686919453..5fe361b148c41 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -223,27 +223,27 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { } else if (platform::is_xpu_place(self.place())) { #ifdef PADDLE_WITH_XPU const T *a = self.data(); - auto p = BOOST_GET_CONST(platform::XPUPlace, self.place()); + auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T)); #endif } else if (platform::is_gpu_place(self.place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const T *a = self.data(); - auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); + auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); #endif } else if (platform::is_mlu_place(self.place())) { #ifdef PADDLE_WITH_MLU const T *a = self.data(); - auto p = BOOST_GET_CONST(platform::MLUPlace, self.place()); + auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); #endif } else if (platform::is_npu_place(self.place())) { #if defined(PADDLE_WITH_ASCEND_CL) const T *a = self.data(); - auto p = BOOST_GET_CONST(platform::NPUPlace, self.place()); + auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); #endif @@ -264,27 +264,27 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { self->mutable_data(self->place())[offset] = elem; } else if (platform::is_xpu_place(self->place())) { #ifdef PADDLE_WITH_XPU - auto p = BOOST_GET_CONST(platform::XPUPlace, self->place()); + auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T)); #endif } else if (platform::is_gpu_place(self->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place()); + auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); #endif } else if (platform::is_mlu_place(self->place())) { #ifdef PADDLE_WITH_MLU - auto p = BOOST_GET_CONST(platform::MLUPlace, self->place()); + auto p = 
self->place(); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); #endif } else if (platform::is_npu_place(self->place())) { #if defined(PADDLE_WITH_ASCEND_CL) - auto p = BOOST_GET_CONST(platform::NPUPlace, self->place()); + auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); @@ -318,11 +318,9 @@ void SetTensorFromPyArrayT( // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. platform::Place tmp_place = place; - platform::XPUDeviceGuard guard( - BOOST_GET_CONST(platform::XPUPlace, tmp_place).device); + platform::XPUDeviceGuard guard(tmp_place.device); auto dst = self->mutable_data(place); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, tmp_place), - static_cast(dst), platform::CPUPlace(), + memory::Copy(tmp_place, static_cast(dst), platform::CPUPlace(), static_cast(array.data()), array.nbytes()); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -347,8 +345,7 @@ void SetTensorFromPyArrayT( } else if (paddle::platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL platform::Place tmp_place = place; - platform::NPUDeviceGuard guard( - BOOST_GET_CONST(platform::NPUPlace, tmp_place).device); + platform::NPUDeviceGuard guard(tmp_place.device); auto dst = self->mutable_data(place); platform::NPUMemcpySync(dst, array.data(), array.nbytes(), ACL_MEMCPY_HOST_TO_DEVICE); @@ -363,8 +360,7 @@ void SetTensorFromPyArrayT( } else if (paddle::platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU platform::Place tmp_place = place; - platform::MLUDeviceGuard guard( - BOOST_GET_CONST(platform::MLUPlace, tmp_place).device); + platform::MLUDeviceGuard guard(tmp_place.device); auto dst = self->mutable_data(place); paddle::platform::MLUMemcpyH2DSync(dst, array.data(), array.nbytes()); #else @@ -377,9 +373,7 @@ void SetTensorFromPyArrayT( if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. 
- platform::Place tmp_place = place; - platform::CUDADeviceGuard guard( - BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device); + platform::CUDADeviceGuard guard(place.device); auto dst = self->mutable_data(place); #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(), @@ -460,7 +454,6 @@ void _sliceCompute(const framework::Tensor *in, framework::Tensor *out, const std::vector &axes, const std::vector &starts) { auto &eigen_place = *ctx.eigen_device(); - auto place = in->place(); auto out_dims = out->dims(); auto in_dims = in->dims(); @@ -551,26 +544,21 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, output->Resize(ddim); auto place = self.place(); if (platform::is_cpu_place(place)) { - output->mutable_data(BOOST_GET_CONST(platform::CPUPlace, place), - self.type()); + output->mutable_data(place, self.type()); } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU - output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place), - self.type()); + output->mutable_data(place, self.type()); #endif } else if (platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU - output->mutable_data(BOOST_GET_CONST(platform::MLUPlace, place), - self.type()); + output->mutable_data(place, self.type()); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cuda_pinned_place(place)) { - output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place), - self.type()); + output->mutable_data(place, self.type()); } else if ((platform::is_gpu_place(place))) { - output->mutable_data(BOOST_GET_CONST(platform::CUDAPlace, place), - self.type()); + output->mutable_data(place, self.type()); } #endif } @@ -789,7 +777,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "or double free would occur")); size_t copy_bytes = sizeof_dtype * numel; - auto p = BOOST_GET_CONST(platform::XPUPlace, tensor.place()); + auto p = tensor.place(); paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, copy_bytes); return py_arr; @@ -812,7 +800,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "or double free would occur")); size_t copy_bytes = sizeof_dtype * numel; - auto p = BOOST_GET_CONST(platform::CUDAPlace, tensor.place()); + auto p = tensor.place(); paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, copy_bytes, nullptr); return py_arr; @@ -835,7 +823,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "or double free would occur")); size_t copy_bytes = sizeof_dtype * numel; - auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place()); + auto p = tensor.place(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &ctx = *pool.Get(tensor.place()); paddle::memory::Copy( @@ -863,7 +851,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "or double free would occur")); size_t copy_bytes = sizeof_dtype * numel; - auto p = BOOST_GET_CONST(platform::MLUPlace, tensor.place()); + auto p = tensor.place(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &ctx = *pool.Get(tensor.place()); paddle::memory::Copy( diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index b22d2d65a439c..a6e2c4d103769 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -34,6 +34,7 @@ using gpuStream_t = hipStream_t; #include "paddle/pten/common/backend.h" #include 
"paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/common/place.h" namespace pten { class TensorBase; @@ -43,9 +44,7 @@ namespace paddle { namespace framework { class DDim; } -namespace platform { -class Place; -} + namespace experimental { class Tensor; @@ -229,7 +228,7 @@ class PADDLE_API Tensor final { * * @return paddle::platform::Place */ - paddle::platform::Place inner_place() const; + pten::Place inner_place() const; /** * @brief Determine whether the tensor device is CPU diff --git a/paddle/pten/common/place.cc b/paddle/pten/common/place.cc index 2d33bb508af44..e2cb934f0a1c5 100644 --- a/paddle/pten/common/place.cc +++ b/paddle/pten/common/place.cc @@ -23,20 +23,20 @@ namespace pten { const char *AllocationTypeStr(AllocationType type) { switch (type) { - case AllocationType::UNDEF: - return "undef"; + case AllocationType::UNDEFINED: + return "undefined"; case AllocationType::CPU: return "cpu"; case AllocationType::GPU: return "gpu"; case AllocationType::GPUPINNED: - return "gpu pinned"; + return "gpu_pinned"; case AllocationType::XPU: return "xpu"; case AllocationType::NPU: return "npu"; case AllocationType::NPUPINNED: - return "npu pinned"; + return "npu_pinned"; case AllocationType::IPU: return "ipu"; case AllocationType::MLU: diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h index 24d24305202cf..75f1f4de9984c 100644 --- a/paddle/pten/common/place.h +++ b/paddle/pten/common/place.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace pten { enum class AllocationType : int8_t { - UNDEF = 0, + UNDEFINED = 0, CPU = 1, GPU = 2, GPUPINNED = 3, @@ -30,12 +30,12 @@ enum class AllocationType : int8_t { MLU = 8, }; -const char *AllocationTypeStr(AllocationType type); +const char* AllocationTypeStr(AllocationType type); /// \brief The place is used to specify where the data is stored. class Place { public: - Place() : device(0), alloc_type_(AllocationType::UNDEF) {} + Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {} explicit Place(AllocationType type, int8_t id) : device(id), alloc_type_(type) {} @@ -53,60 +53,110 @@ class Place { std::string DebugString() const; + inline bool operator==(const Place& rhs) const { + if (alloc_type_ != rhs.GetType()) { + return false; + } + if (alloc_type_ == AllocationType::CPU || + alloc_type_ == AllocationType::GPUPINNED || + alloc_type_ == AllocationType::NPUPINNED) { + return true; + } + return device == rhs.GetDeviceId(); + } + inline bool operator!=(const Place& rhs) const { return !(*this == rhs); } + inline bool operator<(const Place& rhs) const { + if (alloc_type_ != rhs.GetType()) { + return static_cast(alloc_type_) < static_cast(rhs.GetType()); + } + return device < rhs.GetDeviceId(); + } + public: // TODO(wilber): Just because of backward compatibility, it needs to be // changed to private in the future. 
diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h
index 24d24305202cf..75f1f4de9984c 100644
--- a/paddle/pten/common/place.h
+++ b/paddle/pten/common/place.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace pten {

 enum class AllocationType : int8_t {
-  UNDEF = 0,
+  UNDEFINED = 0,
   CPU = 1,
   GPU = 2,
   GPUPINNED = 3,
@@ -30,12 +30,12 @@ enum class AllocationType : int8_t {
   MLU = 8,
 };

-const char *AllocationTypeStr(AllocationType type);
+const char* AllocationTypeStr(AllocationType type);

 /// \brief The place is used to specify where the data is stored.
 class Place {
  public:
-  Place() : device(0), alloc_type_(AllocationType::UNDEF) {}
+  Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {}

   explicit Place(AllocationType type, int8_t id)
       : device(id), alloc_type_(type) {}
@@ -53,60 +53,110 @@ class Place {

   std::string DebugString() const;

+  inline bool operator==(const Place& rhs) const {
+    if (alloc_type_ != rhs.GetType()) {
+      return false;
+    }
+    if (alloc_type_ == AllocationType::CPU ||
+        alloc_type_ == AllocationType::GPUPINNED ||
+        alloc_type_ == AllocationType::NPUPINNED) {
+      return true;
+    }
+    return device == rhs.GetDeviceId();
+  }
+  inline bool operator!=(const Place& rhs) const { return !(*this == rhs); }
+  inline bool operator<(const Place& rhs) const {
+    if (alloc_type_ != rhs.GetType()) {
+      return static_cast<int>(alloc_type_) < static_cast<int>(rhs.GetType());
+    }
+    return device < rhs.GetDeviceId();
+  }
+
  public:
   // TODO(wilber): Just because of backward compatibility, it needs to be
   // changed to private in the future.
-  int8_t device;
+  int8_t device{0};

  private:
-  AllocationType alloc_type_;
+  AllocationType alloc_type_{AllocationType::UNDEFINED};
 };

 class CPUPlace : public Place {
  public:
-  CPUPlace() : Place(AllocationType::CPU, 0) {}
+  CPUPlace() : Place(AllocationType::CPU) {}
+
+  CPUPlace(const CPUPlace&) = default;
+  CPUPlace(const Place& place) : Place(AllocationType::CPU) {}  // NOLINT
 };

 class GPUPlace : public Place {
  public:
   GPUPlace() : Place(AllocationType::GPU, 0) {}
   explicit GPUPlace(int device_id) : Place(AllocationType::GPU, device_id) {}
+
+  GPUPlace(const GPUPlace&) = default;
+  GPUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::GPU, place.GetDeviceId()) {}
 };

 class GPUPinnedPlace : public Place {
  public:
   GPUPinnedPlace() : Place(AllocationType::GPUPINNED) {}
+
+  GPUPinnedPlace(const GPUPinnedPlace&) = default;
+  GPUPinnedPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::GPUPINNED) {}
 };

 class XPUPlace : public Place {
  public:
   XPUPlace() : Place(AllocationType::XPU, 0) {}
   explicit XPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {}
+
+  XPUPlace(const XPUPlace&) = default;
+  XPUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::XPU, place.GetDeviceId()) {}
 };

 class NPUPlace : public Place {
  public:
   NPUPlace() : Place(AllocationType::NPU, 0) {}
-  explicit NPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {}
+  explicit NPUPlace(int device_id) : Place(AllocationType::NPU, device_id) {}
+
+  NPUPlace(const NPUPlace&) = default;
+  NPUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::NPU, place.GetDeviceId()) {}
 };

 class NPUPinnedPlace : public Place {
  public:
   NPUPinnedPlace() : Place(AllocationType::NPUPINNED) {}
+
+  NPUPinnedPlace(const NPUPinnedPlace&) = default;
+  NPUPinnedPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::NPUPINNED) {}
 };

 class IPUPlace : public Place {
  public:
-  IPUPlace() : Place(AllocationType::XPU, 0) {}
-  explicit IPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {}
+  IPUPlace() : Place(AllocationType::IPU, 0) {}
+  explicit IPUPlace(int device_id) : Place(AllocationType::IPU, device_id) {}
+
+  IPUPlace(const IPUPlace&) = default;
+  IPUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::IPU, place.GetDeviceId()) {}
 };

 class MLUPlace : public Place {
  public:
   MLUPlace() : Place(AllocationType::MLU, 0) {}
   explicit MLUPlace(int device_id) : Place(AllocationType::MLU, device_id) {}
+
+  MLUPlace(const MLUPlace&) = default;
+  MLUPlace(const Place& place)  // NOLINT
+      : Place(AllocationType::MLU, place.GetDeviceId()) {}
 };

-std::ostream &operator<<(std::ostream &, const Place &);
+std::ostream& operator<<(std::ostream&, const Place&);

 }  // namespace pten
diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc
index f3c4156fcddf0..28623b539d847 100644
--- a/paddle/pten/kernels/cpu/copy_kernel.cc
+++ b/paddle/pten/kernels/cpu/copy_kernel.cc
@@ -53,11 +53,7 @@ void Copy(const Context& dev_ctx,

   if (paddle::platform::is_cpu_place(src_place) &&
       paddle::platform::is_cpu_place(dst_place)) {
-    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place),
-                         dst_ptr,
-                         BOOST_GET_CONST(paddle::platform::CPUPlace, src_place),
-                         src_ptr,
-                         size);
+    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 }
diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu
index e03c538e38682..77a345d7a0f7c 100644
--- a/paddle/pten/kernels/funcs/transpose.cu
+++
b/paddle/pten/kernels/funcs/transpose.cu @@ -64,8 +64,7 @@ struct TransposeNormal { auto* out_ptr = out->mutable_data(); // copy in_stride, out_stride, axis to gpu device - const paddle::platform::CUDAPlace& cuda_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, dev_ctx.GetPlace()); + const paddle::platform::CUDAPlace& cuda_place = dev_ctx.GetPlace(); paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); size_t size = 3 * rank * sizeof(int64_t); auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu index 877a06ce33e5d..7eeef85f0f3e6 100644 --- a/paddle/pten/kernels/gpu/copy_kernel.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -58,33 +58,17 @@ void Copy(const Context& dev_ctx, if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy( - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT paddle::platform::is_cpu_place(dst_place)) { - paddle::memory::Copy( - BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy( - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::CPUPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_cpu_place(dst_place)) { - auto src_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place); - auto dst_cpu_place = BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), @@ -92,8 +76,7 @@ void Copy(const Context& dev_ctx, paddle::platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto ctx_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, paddle::platform::errors::Unavailable( @@ -110,9 +93,8 @@ void Copy(const Context& dev_ctx, dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { - auto src_cpu_place = BOOST_GET_CONST(paddle::platform::CPUPlace, src_place); - auto dst_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place); + auto src_cpu_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), @@ -120,8 +102,7 @@ void Copy(const Context& dev_ctx, paddle::platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto ctx_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place); + auto 
ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, paddle::platform::errors::Unavailable( @@ -138,10 +119,8 @@ void Copy(const Context& dev_ctx, dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_cuda_pinned_place(dst_place)) { - auto src_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place); - auto dst_cuda_pinned_place = - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_cuda_pinned_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), true, @@ -149,8 +128,7 @@ void Copy(const Context& dev_ctx, "Device context place mismatch. When copying Tensor " "data from GPU memory to CUDA Pinned memory, current " "device context place should be GPU.")); - auto ctx_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, paddle::platform::errors::PreconditionNotMet( @@ -168,10 +146,8 @@ void Copy(const Context& dev_ctx, dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { - auto src_cuda_pinned_place = - BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place); - auto dst_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place); + auto src_cuda_pinned_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), true, @@ -179,8 +155,7 @@ void Copy(const Context& dev_ctx, "Device context place mismatch. 
When copying Tensor " "data from CUDA Pinned memory to GPU memory, current " "device context place should be GPU.")); - auto ctx_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place); + auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, paddle::platform::errors::PreconditionNotMet( @@ -198,10 +173,8 @@ void Copy(const Context& dev_ctx, dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { - auto src_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place); - auto dst_gpu_place = - BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place); + auto src_gpu_place = src_place; + auto dst_gpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 5abc40c75d17f..a024495a9ff0f 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -1540,8 +1540,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, const GPUContext &ctx, DX_OP dx_op, DY_OP dy_op) { - const auto gplace = - BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx.GetPlace()); + const auto gplace = ctx.GetPlace(); auto cplace = paddle::platform::CPUPlace(); const T *x_data = x.data(); const T *y_data = y.data(); diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc index 190eb39e22ecd..f464a4926d3b5 100644 --- a/paddle/pten/kernels/xpu/copy_kernel.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -50,18 +50,10 @@ void Copy(const Context& dev_ctx, if (paddle::platform::is_xpu_place(src_place) && // NOLINT paddle::platform::is_cpu_place(dst_place)) { - paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::XPUPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_cpu_place(src_place) && paddle::platform::is_xpu_place(dst_place)) { - paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::XPUPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::CPUPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else if (paddle::platform::is_xpu_place(src_place) && paddle::platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -69,11 +61,7 @@ void Copy(const Context& dev_ctx, << dst_place; return; } - paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::XPUPlace, dst_place), - dst_ptr, - BOOST_GET_CONST(paddle::platform::XPUPlace, src_place), - src_ptr, - size); + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else { PADDLE_THROW(paddle::platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); diff --git a/paddle/pten/tests/common/test_place.cc b/paddle/pten/tests/common/test_place.cc index 0bbd8f1d42273..39a5cdef6b580 100644 --- a/paddle/pten/tests/common/test_place.cc +++ b/paddle/pten/tests/common/test_place.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/pten/common/place.h" +#include // NOLINT #include "gtest/gtest.h" namespace pten { @@ -21,7 +22,7 @@ namespace tests { TEST(PtenPlace, place) { pten::Place place; - EXPECT_EQ(place.GetType(), pten::AllocationType::UNDEF); + EXPECT_EQ(place.GetType(), pten::AllocationType::UNDEFINED); place.Reset(pten::AllocationType::GPU, 1); EXPECT_EQ(place.GetType(), pten::AllocationType::GPU); @@ -47,6 +48,34 @@ TEST(Place, gpu_place) { pten::GPUPinnedPlace place2; EXPECT_EQ(place2.GetType(), pten::AllocationType::GPUPINNED); std::cout << "gpu pinned place repr: " << place2 << std::endl; + + EXPECT_NE(place2, pten::CPUPlace()); +} + +TEST(Place, convert_place) { + pten::Place base_place(pten::AllocationType::CPU); + pten::CPUPlace cpu_place = base_place; + EXPECT_EQ(cpu_place.GetType(), base_place.GetType()); + base_place.Reset(pten::AllocationType::GPU, 2); + pten::GPUPlace gpu_place = base_place; + EXPECT_EQ(gpu_place.GetType(), base_place.GetType()); + EXPECT_EQ(gpu_place.GetDeviceId(), base_place.GetDeviceId()); + pten::Place place = gpu_place; + EXPECT_EQ(gpu_place.GetType(), place.GetType()); + EXPECT_EQ(gpu_place.GetDeviceId(), place.GetDeviceId()); + place = cpu_place; + EXPECT_EQ(cpu_place.GetType(), place.GetType()); + + std::map maps; + maps[pten::CPUPlace()] = 1; + maps[pten::GPUPlace(0)] = 2; + maps[pten::GPUPlace(1)] = 3; + maps[pten::GPUPlace(2)] = 4; + maps[pten::GPUPlace(3)] = 5; + maps[pten::GPUPinnedPlace()] = 6; + for (auto iter = maps.begin(); iter != maps.end(); ++iter) { + std::cout << iter->first << ":" << iter->second << std::endl; + } } } // namespace tests diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 9630462b4963a..e84c11e8601c1 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -689,7 +689,7 @@ def test_properties(self): tensor.persistable = False self.assertEqual(tensor.persistable, False) self.assertTrue(tensor.place.is_cpu_place()) - self.assertEqual(tensor._place_str, 'CPUPlace') + self.assertEqual(tensor._place_str, 'Place(cpu)') self.assertEqual(tensor.stop_gradient, True) tensor.stop_gradient = False self.assertEqual(tensor.stop_gradient, False) diff --git a/python/paddle/fluid/tests/unittests/test_memcpy_op.py b/python/paddle/fluid/tests/unittests/test_memcpy_op.py index d6efe4d471efd..623c43f5b75f3 100755 --- a/python/paddle/fluid/tests/unittests/test_memcpy_op.py +++ b/python/paddle/fluid/tests/unittests/test_memcpy_op.py @@ -202,7 +202,7 @@ class TestMemcpyApi(unittest.TestCase): def test_api(self): a = paddle.ones([1024, 1024]) b = paddle.tensor.creation._memcpy(a, paddle.CUDAPinnedPlace()) - self.assertEqual(b.place.__repr__(), "CUDAPinnedPlace") + self.assertEqual(b.place.__repr__(), "Place(gpu_pinned)") self.assertTrue(np.array_equal(a.numpy(), b.numpy())) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index c4c4edbbb9335..c74dd24b78bac 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -68,19 +68,19 @@ def _test_place(place): np.array_equal(x.grad.numpy(), np.array([2.4]).astype('float32'))) y = x.cpu() - self.assertEqual(y.place.__repr__(), "CPUPlace") + self.assertEqual(y.place.__repr__(), "Place(cpu)") if core.is_compiled_with_cuda(): y = x.pin_memory() - self.assertEqual(y.place.__repr__(), "CUDAPinnedPlace") 
+ self.assertEqual(y.place.__repr__(), "Place(gpu_pinned)") y = x.cuda() y = x.cuda(None) - self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") y = x.cuda(device_id=0) - self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") y = x.cuda(blocking=False) - self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") y = x.cuda(blocking=True) - self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(y.place.__repr__(), "Place(gpu:0)") with self.assertRaises(ValueError): y = x.cuda("test") @@ -271,17 +271,17 @@ def test_to_tensor_change_place(self): with paddle.fluid.dygraph.guard(core.CPUPlace()): a = paddle.to_tensor(a_np, place=paddle.CUDAPinnedPlace()) a = paddle.to_tensor(a) - self.assertEqual(a.place.__repr__(), "CPUPlace") + self.assertEqual(a.place.__repr__(), "Place(cpu)") with paddle.fluid.dygraph.guard(core.CUDAPlace(0)): a = paddle.to_tensor(a_np, place=paddle.CUDAPinnedPlace()) a = paddle.to_tensor(a) - self.assertEqual(a.place.__repr__(), "CUDAPlace(0)") + self.assertEqual(a.place.__repr__(), "Place(gpu:0)") with paddle.fluid.dygraph.guard(core.CUDAPlace(0)): a = paddle.to_tensor(a_np, place=paddle.CPUPlace()) a = paddle.to_tensor(a, place=paddle.CUDAPinnedPlace()) - self.assertEqual(a.place.__repr__(), "CUDAPinnedPlace") + self.assertEqual(a.place.__repr__(), "Place(gpu_pinned)") def test_to_tensor_with_lodtensor(self): if core.is_compiled_with_cuda(): @@ -297,7 +297,7 @@ def test_to_tensor_with_lodtensor(self): lod_tensor.set(a_np, core.CUDAPlace(0)) a = paddle.to_tensor(lod_tensor, place=core.CPUPlace()) self.assertTrue(np.array_equal(a_np, a.numpy())) - self.assertTrue(a.place.__repr__(), "CPUPlace") + self.assertTrue(a.place.__repr__(), "Place(cpu)") def test_to_variable(self): with fluid.dygraph.guard(): @@ -984,7 +984,7 @@ def test_tensor_str(self): paddle.set_printoptions(4, 100, 3) a_str = str(a) - expected = '''Tensor(shape=[10, 20], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[10, 20], dtype=float32, place=Place(cpu), stop_gradient=True, [[0.2727, 0.5489, 0.8655, ..., 0.2916, 0.8525, 0.9000], [0.3806, 0.8996, 0.0928, ..., 0.9535, 0.8378, 0.6409], [0.1484, 0.4038, 0.8294, ..., 0.0148, 0.6520, 0.4250], @@ -1001,7 +1001,7 @@ def test_tensor_str2(self): a = paddle.to_tensor([[1.5111111, 1.0], [0, 0]]) a_str = str(a) - expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, [[1.5111, 1. ], [0. , 0. ]])''' @@ -1013,7 +1013,7 @@ def test_tensor_str3(self): a = paddle.to_tensor([[-1.5111111, 1.0], [0, -0.5]]) a_str = str(a) - expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, [[-1.5111, 1. ], [ 0. 
, -0.5000]])''' @@ -1025,7 +1025,7 @@ def test_tensor_str_scaler(self): a = paddle.to_tensor(np.array(False)) a_str = str(a) - expected = '''Tensor(shape=[], dtype=bool, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True, False)''' self.assertEqual(a_str, expected) @@ -1037,7 +1037,7 @@ def test_tensor_str_shape_with_zero(self): y = paddle.fluid.layers.where(x == 0) a_str = str(y) - expected = '''Tensor(shape=[0, 2], dtype=int64, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[0, 2], dtype=int64, place=Place(cpu), stop_gradient=True, [])''' self.assertEqual(a_str, expected) @@ -1051,7 +1051,7 @@ def test_tensor_str_linewidth(self): precision=4, threshold=1000, edgeitems=3, linewidth=80) a_str = str(x) - expected = '''Tensor(shape=[128], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[128], dtype=float32, place=Place(cpu), stop_gradient=True, [0.3759, 0.0278, 0.2489, 0.3110, 0.9105, 0.7381, 0.1905, 0.4726, 0.2435, 0.9142, 0.3367, 0.7243, 0.7664, 0.9915, 0.2921, 0.1363, 0.8096, 0.2915, 0.9564, 0.9972, 0.2573, 0.2597, 0.3429, 0.2484, 0.9579, 0.7003, 0.4126, @@ -1078,7 +1078,7 @@ def test_tensor_str_linewidth2(self): paddle.set_printoptions(precision=4, linewidth=160, sci_mode=True) a_str = str(x) - expected = '''Tensor(shape=[128], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[128], dtype=float32, place=Place(cpu), stop_gradient=True, [3.7587e-01, 2.7798e-02, 2.4891e-01, 3.1097e-01, 9.1053e-01, 7.3811e-01, 1.9045e-01, 4.7258e-01, 2.4354e-01, 9.1415e-01, 3.3666e-01, 7.2428e-01, 7.6640e-01, 9.9146e-01, 2.9215e-01, 1.3625e-01, 8.0957e-01, 2.9153e-01, 9.5642e-01, 9.9718e-01, 2.5732e-01, 2.5973e-01, 3.4292e-01, 2.4841e-01, 9.5794e-01, 7.0029e-01, 4.1260e-01, 4.2737e-01, 7.3788e-03, 9.6863e-01, 9.9102e-01, 1.4416e-02, 6.5640e-01, 2.9318e-01, 7.1136e-01, 9.3008e-01,