diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc
index 1bc261f8df439..42aef068f7471 100644
--- a/paddle/fluid/framework/boxps_worker.cc
+++ b/paddle/fluid/framework/boxps_worker.cc
@@ -695,6 +695,9 @@ void BoxPSWorker::SyncParam(void) {
   TensorScaleValue(place_, param_sync_, &param_sync_, scale);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
 #elif defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_XPU)
+  // Other dense ops use the default stream, so wait for their computation to finish before calling bkcl_all_reduce.
+  xpu_wait(0);
+
   PADDLE_ENFORCE_EQ(
       bkcl_all_reduce(comm->comm(),
                       sendbuff,
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 718c77aaa6ff2..0ebb3989f54ae 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -343,6 +343,9 @@ class CAllReduceOpXPUKernel : public framework::OpKernel {
           "Invalid reduce type: %d", red_type));
     }
 
+    // Other dense ops use the default stream, so wait for their computation to finish before calling bkcl_all_reduce.
+    xpu_wait(0);
+
     PADDLE_ENFORCE_EQ(
         bkcl_all_reduce(comm->comm(),
                         sendbuff,
diff --git a/paddle/fluid/operators/collective/c_mixallgather_op.cc b/paddle/fluid/operators/collective/c_mixallgather_op.cc
index eac299bddffe4..d187ff5026866 100644
--- a/paddle/fluid/operators/collective/c_mixallgather_op.cc
+++ b/paddle/fluid/operators/collective/c_mixallgather_op.cc
@@ -417,6 +417,10 @@ class CMixAllGatherOpXPUKernel : public framework::OpKernel {
 #ifdef TRACE_PROFILE
     TRACE_SCOPE_START("bkcl_all_reduce", xpu_wait(stream));
 #endif
+
+    // Other dense ops use the default stream, so wait for their computation to finish before calling bkcl_all_reduce.
+    xpu_wait(0);
+
     PADDLE_ENFORCE_EQ(
         bkcl_all_reduce(comm->comm(),
                         recvbuff,