diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index b856bbec4b0c4..3cf16266baf08 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -315,6 +315,7 @@ std::shared_ptr TransferDevice(const std::string& var_name, op_type = kMemcpyH2D; int dst_place_type = platform::is_gpu_place(dst_place) ? 0 : platform::is_npu_place(dst_place) ? 1 + : platform::is_ipu_place(dst_place) ? 3 : platform::is_xpu_place(dst_place) ? 2 : -1; attr_map = {{"dst_place_type", dst_place_type}}; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c321069537c89..3680f0aa900c6 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_context.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -475,8 +476,13 @@ void InterpreterCore::Convert( BuildSkipShareLoDInfo(); for (size_t i = 0; i < vec_instruction_.size(); ++i) { +#ifdef PADDLE_WITH_IPU + gc_event_.emplace_back(phi::CPUPlace(), 0); +#else gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(), platform::GenerateDeviceEventFlag()); + +#endif } bool inplaced = false; for (auto inst : vec_instruction_) { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index af3951f4538f1..31e27a07c665d 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -390,7 +390,7 @@ static bool IsCpuOp(const Instruction& instr) { 
// is supported heterogeneous place static bool IsSupportedHetePlace(const phi::Place& place) { return platform::is_gpu_place(place) || platform::is_npu_place(place) || - platform::is_xpu_place(place); + platform::is_xpu_place(place) || platform::is_ipu_place(place); } } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index 086dac8dac1fb..760a852baee68 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -204,8 +204,9 @@ bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr, const Instruction& next_instr) { if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true; - // xpu memcpy kerenl is synchronous. - if (platform::is_xpu_place(place_)) return true; + // xpu&ipu memcpy kernel is synchronous. + if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_)) + return true; // npu d2h kernel is asynchronous. if (platform::is_npu_place(place_)) { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index d38efbff3165c..53b77d538b3ed 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -408,6 +408,12 @@ struct OpKernelRegistrarFunctorEx, + ops::MemcpyD2HKernel, + paddle::platform::complex, + ops::MemcpyD2HKernel, + plat::float16, + ops::MemcpyD2HKernel, + int16_t, + ops::MemcpyD2HKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 98ed68cf84f87..ff7b786d04018 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -100,6 +100,7 @@ class MemcpyH2DOpProtoMaker : public framework::OpProtoAndCheckerMaker { "0. CUDAPinnedPlace/CPU <->CUDAPlace" "1. NPUPinnedPlace/CPU <-> NPUPlace" "2. CPU <->XPUPlace" "3. 
CPU <->IPUPlace" "Other place type is Unimplemented and will cause ERROR."); AddComment(R"DOC( MemcpyD2H Operator. @@ -233,3 +234,31 @@ REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d, int16_t, ops::MemcpyH2DKernel); #endif + +#ifdef PADDLE_WITH_IPU +REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d, + float, + ops::MemcpyH2DKernel, + double, + ops::MemcpyH2DKernel, + int8_t, + ops::MemcpyH2DKernel, + uint8_t, + ops::MemcpyH2DKernel, + int, + ops::MemcpyH2DKernel, + int64_t, + ops::MemcpyH2DKernel, + bool, + ops::MemcpyH2DKernel, + paddle::platform::bfloat16, + ops::MemcpyH2DKernel, + paddle::platform::complex, + ops::MemcpyH2DKernel, + paddle::platform::complex, + ops::MemcpyH2DKernel, + plat::float16, + ops::MemcpyH2DKernel, + int16_t, + ops::MemcpyH2DKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 3fcc4b89eefe8..8cd84f4b59e8c 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -50,7 +50,7 @@ class MemcpyH2DFunctor { lod_tensor.dtype(), phi::Stream(reinterpret_cast(stream))); - if (dst_place_type_ == 0 || dst_place_type_ == 1 || dst_place_type_ == 2) { + if (dst_place_type_ >= 0 && dst_place_type_ <= 3) { framework::TensorCopy( lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); } else { diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index 51df0fd4f40ad..a2d3fc1dc3818 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -64,7 +64,7 @@ class DeviceEvent { "Required type < %d, but received type = %d", MaxDeviceTypes, type_id_)); - // TODO(Aurelius84): only support CPU/CUDA, need consider XPU/NPU later + // TODO(Aurelius84): only support CPU/CUDA/XPU/NPU. 
PADDLE_ENFORCE_LT(type_id_, 4, platform::errors::Unavailable( diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index cf00075edcf86..c7bfd19e5a9d0 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1388,8 +1388,8 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, program = pruned_program def _can_use_interpreter_core(program, place): - if core.is_compiled_with_mlu() or core.is_compiled_with_ipu( - ) or isinstance(place, core.CustomPlace): + if core.is_compiled_with_mlu() or isinstance( + place, core.CustomPlace): return False compiled = isinstance(program, compiler.CompiledProgram)