diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc
index 1bc261f8df439..42aef068f7471 100644
--- a/paddle/fluid/framework/boxps_worker.cc
+++ b/paddle/fluid/framework/boxps_worker.cc
@@ -695,6 +695,9 @@ void BoxPSWorker::SyncParam(void) {
   TensorScaleValue(place_, param_sync_, &param_sync_, scale);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
 #elif defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_XPU)
+  // Other dense ops use the default stream, so wait for their computation to finish before calling bkcl_all_reduce.
+  xpu_wait(0);
+
   PADDLE_ENFORCE_EQ(
       bkcl_all_reduce(comm->comm(),
                       sendbuff,
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 718c77aaa6ff2..0ebb3989f54ae 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -343,6 +343,9 @@ class CAllReduceOpXPUKernel : public framework::OpKernel {
           "Invalid reduce type: %d", red_type));
     }
 
+    // Other dense ops use the default stream, so wait for their computation to finish before calling bkcl_all_reduce.
+    xpu_wait(0);
+
     PADDLE_ENFORCE_EQ(
         bkcl_all_reduce(comm->comm(),
                         sendbuff,
diff --git a/paddle/fluid/operators/collective/c_mixallgather_op.cc b/paddle/fluid/operators/collective/c_mixallgather_op.cc
index eac299bddffe4..d187ff5026866 100644
--- a/paddle/fluid/operators/collective/c_mixallgather_op.cc
+++ b/paddle/fluid/operators/collective/c_mixallgather_op.cc
@@ -417,6 +417,10 @@ class CMixAllGatherOpXPUKernel : public framework::OpKernel {
 #ifdef TRACE_PROFILE
     TRACE_SCOPE_START("bkcl_all_reduce", xpu_wait(stream));
 #endif
+
+    // Other dense ops use the default stream, so wait for their computation to finish before calling bkcl_all_reduce.
+    xpu_wait(0);
+
     PADDLE_ENFORCE_EQ(
         bkcl_all_reduce(comm->comm(),
                         recvbuff,