From bee4736e2bb25df3cbeffda8823853477f6a4553 Mon Sep 17 00:00:00 2001 From: xiaoye <50870160+xiaoyewww@users.noreply.github.com> Date: Mon, 22 Jan 2024 11:24:19 +0800 Subject: [PATCH 01/34] =?UTF-8?q?=E3=80=90paddle=5Ftest=20No.34=E3=80=91re?= =?UTF-8?q?place=20cc=5Ftest=20with=20paddle=5Ftest=20(#60961)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(cmake): push paddle_test of backends * feat(ctest): move ut to test --- paddle/phi/backends/CMakeLists.txt | 7 ------- test/cpp/phi/backends/CMakeLists.txt | 3 +++ {paddle => test/cpp}/phi/backends/custom/capi_test.cc | 0 3 files changed, 3 insertions(+), 7 deletions(-) create mode 100644 test/cpp/phi/backends/CMakeLists.txt rename {paddle => test/cpp}/phi/backends/custom/capi_test.cc (100%) diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index ed47487553bee7..50da99217b153d 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -59,10 +59,3 @@ if(WITH_CUSTOM_DEVICE) endif() collect_srcs(backends_srcs SRCS ${BACKENDS_SRCS}) - -if(WITH_CUSTOM_DEVICE) - cc_test( - capi_test - SRCS custom/capi_test.cc - DEPS phi common) -endif() diff --git a/test/cpp/phi/backends/CMakeLists.txt b/test/cpp/phi/backends/CMakeLists.txt new file mode 100644 index 00000000000000..216d6c098f3351 --- /dev/null +++ b/test/cpp/phi/backends/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_CUSTOM_DEVICE) + paddle_test(capi_test SRCS custom/capi_test.cc DEPS phi common) +endif() diff --git a/paddle/phi/backends/custom/capi_test.cc b/test/cpp/phi/backends/custom/capi_test.cc similarity index 100% rename from paddle/phi/backends/custom/capi_test.cc rename to test/cpp/phi/backends/custom/capi_test.cc From 41c280eaa71b5998b59e7d3984f775c944f630ca Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Mon, 22 Jan 2024 13:01:00 +0800 Subject: [PATCH 02/34] [PIR] Open some ut for cf (#60966) * fix * fix * fix * fix * fix --- .../pir_adaptor/pir_adaptor_util.cc | 30 +++++++++++-------- .../pir_adaptor/pir_adaptor_util.h | 6 ++-- .../pir/transforms/pd_op_to_kernel_pass.cc | 18 ++++++++++- test/dygraph_to_static/test_break_continue.py | 9 ++---- test/dygraph_to_static/test_jit_setitem.py | 7 ++--- 5 files changed, 43 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc index 13110c5c4bd153..d7342b7773c185 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc @@ -87,11 +87,8 @@ void ValueExecutionInfo::Add(::pir::Value value, const std::string& var_name) { "The size of variable_list and var_name_2_id map should be equal")); } -void ValueExecutionInfo::Rename(pir::Value value, - const std::string& new_name, +void ValueExecutionInfo::Rename(const std::string& new_name, const std::string& orig_name) { - value_2_var_name_[value] = new_name; - for (auto kv : value_2_var_name_) { if (kv.second == orig_name) { value_2_var_name_[kv.first] = new_name; @@ -148,6 +145,11 @@ void ValueExecutionInfo::AddValue2VarName(::pir::Value value, value_2_var_name_.emplace(value, var_name); } +void ValueExecutionInfo::UpdateValue2VarName(::pir::Value value, + const std::string& var_name) { + value_2_var_name_[value] = var_name; +} + const std::unordered_map& ValueExecutionInfo::GetVar2VarName() const { return var_2_var_name_; 
@@ -542,7 +544,7 @@ void HandleForSpecialOp(pir::Operation* op, << param_name; } - value_exe_info->Rename(value, param_name, orig_name); + value_exe_info->Rename(param_name, orig_name); } else if (op->isa()) { VLOG(6) << "Handle for builtin.shadow_ouptut"; auto var_name = op->attributes() @@ -558,14 +560,18 @@ void HandleForSpecialOp(pir::Operation* op, return; } - if (value_exe_info->GetScope()->FindVar(var_name) != nullptr) { - const_cast(value_exe_info->GetScope())->EraseVars({var_name}); - VLOG(1) << "var " << var_name << " has been removed from scope"; + if (value_exe_info->HasVar(var_name)) { + value_exe_info->UpdateValue2VarName(value, var_name); + } else { + if (value_exe_info->GetScope()->FindVar(var_name) != nullptr) { + const_cast(value_exe_info->GetScope())->EraseVars({var_name}); + VLOG(1) << "var " << var_name << " has been removed from scope"; + } + const_cast(value_exe_info->GetScope()) + ->Rename(orig_name, var_name); + VLOG(8) << "var " << orig_name << " has been renamed to " << var_name; + value_exe_info->Rename(var_name, orig_name); } - const_cast(value_exe_info->GetScope())->Rename(orig_name, var_name); - VLOG(8) << "var " << orig_name << " has been renamed to " << var_name; - - value_exe_info->Rename(value, var_name, orig_name); } else if (op->isa()) { VLOG(6) << "Handle for builtin.parameter:"; auto param_name = op->attributes() diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h index 089800985c2fd1..15d39da3c63812 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h @@ -58,9 +58,7 @@ class ValueExecutionInfo { void Add(::pir::Value value, const std::string& var_name); - void Rename(pir::Value value, - const std::string& new_name, - const std::string& orig_name); + void Rename(const std::string& new_name, const std::string& orig_name); int GetIdByName(const std::string& name) const; @@ -72,6 +70,8 @@ class ValueExecutionInfo { void AddValue2VarName(::pir::Value value, const std::string& var_name); + void UpdateValue2VarName(::pir::Value value, const std::string& var_name); + const std::unordered_map& GetVar2VarName() const; diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 3c6391dcd06d3a..622f47a92725f6 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -187,6 +187,7 @@ static bool NeedFallBackCpu(const pir::Operation* op, } static bool NeedFallBackFromGPUDNN2GPU(pir::Operation* op, + const std::string& kernel_name, const phi::KernelKey kernel_key) { // NOTE(phlrain): keep the same kernel select strategy with // GetExepectKernelKey @@ -224,6 +225,21 @@ static bool NeedFallBackFromGPUDNN2GPU(pir::Operation* op, #endif return !use_cudnn; } + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (kernel_key.backend() == phi::Backend::GPUDNN) { + auto iter = phi::KernelFactory::Instance().kernels().find(kernel_name); + if (iter != phi::KernelFactory::Instance().kernels().end()) { + auto kernel_iter = iter->second.find({phi::Backend::GPUDNN, + phi::DataLayout::ALL_LAYOUT, + kernel_key.dtype()}); + if (kernel_iter == iter->second.end()) { + return true; + } + } + } +#endif + return false; } @@ -1069,7 +1085,7 @@ phi::KernelKey GetKernelKey( VLOG(8) << "kernel backend must be on CPU when need fallback"; } - if 
(NeedFallBackFromGPUDNN2GPU(op, res)) { + if (NeedFallBackFromGPUDNN2GPU(op, kernel_fn_str, res)) { res.set_backend(phi::Backend::GPU); VLOG(8) << "kernel backend must be on GPU when need fallback from GPUDNN " "to GPU"; diff --git a/test/dygraph_to_static/test_break_continue.py b/test/dygraph_to_static/test_break_continue.py index d2d8d2c10d9894..fdb909cb4c45a9 100644 --- a/test/dygraph_to_static/test_break_continue.py +++ b/test/dygraph_to_static/test_break_continue.py @@ -17,9 +17,6 @@ import numpy as np from dygraph_to_static_utils import ( Dy2StTestBase, - IrMode, - ToStaticMode, - disable_test_case, enable_to_static_guard, test_ast_only, test_legacy_and_pt, @@ -300,7 +297,7 @@ def init_dygraph_func(self): self.dygraph_func = test_continue_in_while # TODO(dev): Remove this after fix PT Rename issue - @disable_test_case((ToStaticMode.AST, IrMode.PT)) + @test_legacy_and_pt def test_transformed_static_result(self): self.init_dygraph_func() dygraph_res = self.run_dygraph_mode() @@ -318,7 +315,7 @@ def init_dygraph_func(self): self.dygraph_func = test_break_in_while # TODO(dev): Remove this after fix PT Rename issue - @disable_test_case((ToStaticMode.AST, IrMode.PT)) + @test_legacy_and_pt def test_transformed_static_result(self): self.init_dygraph_func() dygraph_res = self.run_dygraph_mode() @@ -360,7 +357,7 @@ def init_dygraph_func(self): self.dygraph_func = test_optim_break_in_while # TODO(dev): Remove this after fix PT Rename issue - @disable_test_case((ToStaticMode.AST, IrMode.PT)) + @test_legacy_and_pt def test_transformed_static_result(self): self.init_dygraph_func() dygraph_res = self.run_dygraph_mode() diff --git a/test/dygraph_to_static/test_jit_setitem.py b/test/dygraph_to_static/test_jit_setitem.py index 791cbdc6c1e653..9b915dfc71e84d 100644 --- a/test/dygraph_to_static/test_jit_setitem.py +++ b/test/dygraph_to_static/test_jit_setitem.py @@ -18,9 +18,7 @@ import numpy as np from dygraph_to_static_utils import ( Dy2StTestBase, - IrMode, - ToStaticMode, - disable_test_case, + test_legacy_and_pt, test_legacy_and_pt_and_pir, ) @@ -206,8 +204,7 @@ def run_dygraph(self, func): return (y,) # TODO: Open PIR test when while_loop dy2st fixed - @disable_test_case((ToStaticMode.AST, IrMode.PIR)) - @test_legacy_and_pt_and_pir + @test_legacy_and_pt def test_case(self): func = self.init_func() dy_res = self.run_dygraph(func) From 0ab867e24ee8c57cafeb8a23f73bf7738ba7d9fe Mon Sep 17 00:00:00 2001 From: HankYang <97599656+Hhankyangg@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:25:43 +0800 Subject: [PATCH 03/34] fix: fix docstring issue (#60962) * fix: fix docstring issue * update executor.py --- python/paddle/tensor/linalg.py | 40 ++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index ef5a9f04d3a0ef..321366276c6115 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -63,24 +63,40 @@ def transpose(x, perm, name=None): .. code-block:: text - x = [[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] - [[13 14 15 16] [17 18 19 20] [21 22 23 24]]] - shape(x) = [2,3,4] + # The following codes in this code block are pseudocode, designed to show the execution logic and results of the function. 
+ + x = to_tensor([[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] + [[13 14 15 16] [17 18 19 20] [21 22 23 24]]]) + shape(x): return [2,3,4] # Example 1 perm0 = [1,0,2] - y_perm0 = [[[ 1 2 3 4] [13 14 15 16]] - [[ 5 6 7 8] [17 18 19 20]] - [[ 9 10 11 12] [21 22 23 24]]] - shape(y_perm0) = [3,2,4] + y_perm0 = transpose(x, perm0) # Permute x by perm0 + + # dim:0 of y_perm0 is dim:1 of x + # dim:1 of y_perm0 is dim:0 of x + # dim:2 of y_perm0 is dim:2 of x + # The above two lines can also be understood as exchanging the zeroth and first dimensions of x + + y_perm0.data = [[[ 1 2 3 4] [13 14 15 16]] + [[ 5 6 7 8] [17 18 19 20]] + [[ 9 10 11 12] [21 22 23 24]]] + shape(y_perm0): return [3,2,4] # Example 2 perm1 = [2,1,0] - y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]] - [[ 2 14] [ 6 18] [10 22]] - [[ 3 15] [ 7 19] [11 23]] - [[ 4 16] [ 8 20] [12 24]]] - shape(y_perm1) = [4,3,2] + y_perm1 = transpose(x, perm1) # Permute x by perm1 + + # dim:0 of y_perm1 is dim:2 of x + # dim:1 of y_perm1 is dim:1 of x + # dim:2 of y_perm1 is dim:0 of x + # The above two lines can also be understood as exchanging the zeroth and second dimensions of x + + y_perm1.data = [[[ 1 13] [ 5 17] [ 9 21]] + [[ 2 14] [ 6 18] [10 22]] + [[ 3 15] [ 7 19] [11 23]] + [[ 4 16] [ 8 20] [12 24]]] + shape(y_perm1): return [4,3,2] Examples: From 44f93dcc36a26ebb881cd13da4395334379f6ecf Mon Sep 17 00:00:00 2001 From: ooo oo <106524776+ooooo-create@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:26:50 +0800 Subject: [PATCH 04/34] [Docs] Fix delimiter of argument in docstring (#60942) * fix docstring * update --- python/paddle/base/framework.py | 2 +- python/paddle/nn/functional/vision.py | 2 +- python/paddle/nn/quant/stub.py | 8 ++++---- python/paddle/quantization/factory.py | 2 +- python/paddle/quantization/ptq.py | 4 ++-- python/paddle/quantization/qat.py | 6 +++--- python/paddle/quantization/quantize.py | 8 ++++---- python/paddle/quantization/wrapper.py | 6 +++--- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 8fa7af0ef291ce..655f2e9b6a586b 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -6969,7 +6969,7 @@ def block(self, index): Get the :code:`index` :ref:`api_guide_Block_en` of this Program Args: - index (int) - The index of :ref:`api_guide_Block_en` to get + index (int): The index of :ref:`api_guide_Block_en` to get Returns: :ref:`api_guide_Block_en`: The :code:`index` block diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 3d7aa2f6fc8489..2c7327d947a7ce 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -36,7 +36,7 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): output feature map. Args: - theta (Tensor) - A tensor with shape [N, 2, 3] or [N, 3, 4]. It contains a batch of affine transform parameters. + theta (Tensor): A tensor with shape [N, 2, 3] or [N, 3, 4]. It contains a batch of affine transform parameters. The data type can be float32 or float64. out_shape (Tensor | list | tuple): Type can be a 1-D Tensor, list, or tuple. It is used to represent the shape of the output in an affine transformation, in the format ``[N, C, H, W]`` or ``[N, C, D, H, W]``. When the format is ``[N, C, H, W]``, it represents the batch size, number of channels, height and width. When the format is ``[N, C, D, H, W]``, it represents the batch size, number of channels, depth, height and width. 
diff --git a/python/paddle/nn/quant/stub.py b/python/paddle/nn/quant/stub.py index 314319caa707dc..96c08c534a4216 100644 --- a/python/paddle/nn/quant/stub.py +++ b/python/paddle/nn/quant/stub.py @@ -26,8 +26,8 @@ class Stub(Layer): stub will observe or quantize the inputs of the functional API. Args: - observer(QuanterFactory) - The configured information of the observer to be inserted. - It will use a global configuration to create the observers if the 'observer' is none. + observer(QuanterFactory): The configured information of the observer to be inserted. + It will use a global configuration to create the observers if the 'observer' is none. Examples: .. code-block:: python @@ -81,9 +81,9 @@ class QuanterStub(Layer): The user should not use this class directly. Args: - layer(paddle.nn.Layer) - The stub layer with an observer configure factory. If the observer + layer(paddle.nn.Layer): The stub layer with an observer configure factory. If the observer of the stub layer is none, it will use 'q_config' to create an observer instance. - q_config(QuantConfig) - The quantization configuration for the current stub layer. + q_config(QuantConfig): The quantization configuration for the current stub layer. """ def __init__(self, layer: Stub, q_config): diff --git a/python/paddle/quantization/factory.py b/python/paddle/quantization/factory.py index eb8916460975c8..c1635a03aaa712 100644 --- a/python/paddle/quantization/factory.py +++ b/python/paddle/quantization/factory.py @@ -78,7 +78,7 @@ def quanter(class_name): Annotation to declare a factory class for quanter. Args: - class_name (str) - The name of factory class to be declared. + class_name (str): The name of factory class to be declared. Examples: .. code-block:: python diff --git a/python/paddle/quantization/ptq.py b/python/paddle/quantization/ptq.py index cdd1b9403e0f7d..6a5562abb94076 100644 --- a/python/paddle/quantization/ptq.py +++ b/python/paddle/quantization/ptq.py @@ -47,8 +47,8 @@ def quantize(self, model: Layer, inplace=False): quantization parameters. Args: - model(Layer) - The model to be quantized. - inplace(bool) - Whether to modify the model in-place. + model(Layer): The model to be quantized. + inplace(bool): Whether to modify the model in-place. Return: The prepared model for post-training quantization. diff --git a/python/paddle/quantization/qat.py b/python/paddle/quantization/qat.py index b851e473b6003f..e762400c316f7f 100644 --- a/python/paddle/quantization/qat.py +++ b/python/paddle/quantization/qat.py @@ -24,7 +24,7 @@ class QAT(Quantization): r""" Tools used to prepare model for quantization-aware training. Args: - config(QuantConfig) - Quantization configuration + config(QuantConfig): Quantization configuration Examples: .. code-block:: python @@ -47,8 +47,8 @@ def quantize(self, model: Layer, inplace=False): And it will insert fake quanters into the model to simulate the quantization. Args: - model(Layer) - The model to be quantized. - inplace(bool) - Whether to modify the model in-place. + model(Layer): The model to be quantized. + inplace(bool): Whether to modify the model in-place. Return: The prepared model for quantization-aware training. 
diff --git a/python/paddle/quantization/quantize.py b/python/paddle/quantization/quantize.py index b7887ffc46e1c4..9bd0940d1c5b2b 100644 --- a/python/paddle/quantization/quantize.py +++ b/python/paddle/quantization/quantize.py @@ -29,7 +29,7 @@ class Quantization(metaclass=abc.ABCMeta): r""" Abstract class used to prepares a copy of the model for quantization calibration or quantization-aware training. Args: - config(QuantConfig) - Quantization configuration + config(QuantConfig): Quantization configuration """ def __init__(self, config: QuantConfig): @@ -44,9 +44,9 @@ def convert(self, model: Layer, inplace=False, remain_weight=False): r"""Convert the quantization model to ONNX style. And the converted model can be saved as inference model by calling paddle.jit.save. Args: - model(Layer) - The quantized model to be converted. - inplace(bool, optional) - Whether to modify the model in-place, default is False. - remain_weight(bool, optional) - Whether to remain weights in floats, default is False. + model(Layer): The quantized model to be converted. + inplace(bool, optional): Whether to modify the model in-place, default is False. + remain_weight(bool, optional): Whether to remain weights in floats, default is False. Return: The converted model diff --git a/python/paddle/quantization/wrapper.py b/python/paddle/quantization/wrapper.py index cef847a5a1b051..3c722ab95d544f 100644 --- a/python/paddle/quantization/wrapper.py +++ b/python/paddle/quantization/wrapper.py @@ -22,9 +22,9 @@ class ObserveWrapper(Layer): Put an observer layer and an observed layer into a wrapping layer. It is used to insert layers into the model for QAT or PTQ. Args: - observer(BaseQuanter) - Observer layer - observed(Layer) - Observed layer - observe_input(bool) - If it is true the observer layer will be called before observed layer. + observer(BaseQuanter): Observer layer + observed(Layer): Observed layer + observe_input(bool): If it is true the observer layer will be called before observed layer. If it is false the observed layer will be called before observer layer. Default: True. 
""" From 0d7bad29bcd968f736f87018dad72ff34d4b1b92 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Mon, 22 Jan 2024 15:04:23 +0800 Subject: [PATCH 05/34] [PIR] add python api for if (#60895) --- .../control_flow/if_instruction.cc | 32 ++++++++- .../pir/dialect/operator/ir/api_builder.h | 2 +- .../dialect/operator/ir/control_flow_op.cc | 32 +++++---- .../pir/transforms/pd_op_to_kernel_pass.cc | 21 ------ paddle/fluid/pybind/control_flow_api.cc | 7 ++ python/paddle/base/layer_helper.py | 3 + python/paddle/base/layer_helper_base.py | 3 + python/paddle/static/nn/common.py | 34 +++++++--- python/paddle/static/nn/control_flow.py | 65 +++++++++++++++++-- test/dygraph_to_static/test_loop.py | 3 - test/legacy_test/test_conditional_block.py | 7 +- 11 files changed, 153 insertions(+), 56 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index 624ce6221cd5e7..d7ad210102b94b 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -215,8 +215,36 @@ void IfInstruction::CopyBranchOutput(const std::vector& var_names, } void IfInstruction::Run() { - DeviceContext().Wait(); - if (cond_var_->Get().data()[0]) { + bool cond = true; + if (cond_var_->IsType()) { + auto& cond_tensor = cond_var_->Get(); + if (paddle::platform::is_cpu_place(cond_tensor.place())) { + cond = cond_tensor.data()[0]; + } else { + // when platform::is_gpu_place(cond.place()) or + // platform::is_xpu_place(cond.place()) is true +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + DeviceContext().Wait(); + phi::DenseTensor cpu_cond; + paddle::framework::TensorCopySync( + cond_tensor, platform::CPUPlace(), &cpu_cond); + cond = cpu_cond.data()[0]; +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "This version of PaddlePaddle does NOT support GPU/XPU but got " + "GPU/XPU tensor Cond in WhileOp. 
Please compile WITH_GPU or " + "WITH_XPU option.")); +#endif + } + } else if (cond_var_->IsType()) { + auto& cond_array = cond_var_->Get(); + cond = std::all_of( + cond_array.begin(), cond_array.end(), [](const Variable* t) { + return t->Get().numel() != 0; + }); + } + if (cond) { true_branch_inter_->Run({}, false); CopyBranchOutput(true_branch_outputs_, true_branch_inter_); } else { diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.h b/paddle/fluid/pir/dialect/operator/ir/api_builder.h index aa20ef34e17a64..70f048a0acf10f 100644 --- a/paddle/fluid/pir/dialect/operator/ir/api_builder.h +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.h @@ -43,7 +43,7 @@ class ApiBuilder { void SetParameter(const std::string& name, std::unique_ptr&& parameter); - std::shared_ptr GetBuilder() { return builder_; } + const std::shared_ptr& GetBuilder() const { return builder_; } const pir::InsertionPoint& GetCurrentInsertionPoint() const { return builder_->insertion_point(); diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc index 11f99a4a5f7576..0611cea789129f 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -218,8 +218,14 @@ void IfOp::VerifyRegion() { 1u, phi::errors::PreconditionNotMet("The size %d of true_region must be 1.", (*this)->region(0).size())); - if ((*this)->region(0).front().size() > 0) { - auto &true_last_op = (*this)->region(0).front().back(); + if ((*this)->num_results() != 0) { + auto &true_block = (*this)->region(0).front(); + PADDLE_ENFORCE_GT( + true_block.size(), + 0u, + phi::errors::PreconditionNotMet( + "The true block must have at least one op yield op.")); + auto &true_last_op = true_block.back(); PADDLE_ENFORCE_EQ(true, true_last_op.isa(), phi::errors::PreconditionNotMet( @@ -229,15 +235,19 @@ void IfOp::VerifyRegion() { phi::errors::PreconditionNotMet( "The size of last of true block op's input must be " "equal to IfOp's outputs num.")); - } - VLOG(4) << "Start Verifying false branch."; - PADDLE_ENFORCE_EQ( - (*this)->region(1).size(), - 1u, - phi::errors::PreconditionNotMet("The size %d of false_region must be 1.", - (*this)->region(0).size())); - if ((*this)->region(1).front().size() > 0) { - auto &false_last_op = (*this)->region(1).front().back(); + VLOG(4) << "Start Verifying false branch."; + PADDLE_ENFORCE_EQ((*this)->region(1).size(), + 1u, + phi::errors::PreconditionNotMet( + "The size %d of false_region must be 1.", + (*this)->region(0).size())); + auto &false_block = (*this)->region(1).front(); + PADDLE_ENFORCE_GT( + false_block.size(), + 0u, + phi::errors::PreconditionNotMet( + "The false block must have at least one op yield op.")); + auto &false_last_op = false_block.back(); PADDLE_ENFORCE_EQ(true, false_last_op.isa(), phi::errors::PreconditionNotMet( diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 622f47a92725f6..b77ab6208c9ecd 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -1130,27 +1130,6 @@ void HandleForIfOp( "[%d]'s input of [%s] op MUST in map pair", 0, op_item->name())); auto new_cond = map_value_pair->at(old_cond); - // NOTE(zhangbo): IfOp's input cond should be a cpu type. 
- AllocatedDenseTensorType new_cond_type = - new_cond.type().dyn_cast(); - if (new_cond_type) { - if (new_cond_type.place().GetType() == phi::AllocationType::GPU) { - auto out_type = AllocatedDenseTensorType::get( - ctx, phi::CPUPlace(), old_cond.type().dyn_cast()); - phi::KernelKey kernel_key( - phi::Backend::GPU, phi::DataLayout::ALL_LAYOUT, phi::DataType::BOOL); - new_cond = AddPlaceTransferOp(new_cond, - out_type, - new_cond_type.place(), - phi::CPUPlace(), - kernel_key, - block); - } - } else { - PADDLE_THROW( - phi::errors::Unimplemented("IfOp onlu support DenseTensorType")); - } - // Create IfOp and insert to kernel dialect program pir::Builder builder(ctx, block); auto old_ifop = op_item->dyn_cast(); diff --git a/paddle/fluid/pybind/control_flow_api.cc b/paddle/fluid/pybind/control_flow_api.cc index 7976497b2954a4..d085b6a4cfc509 100644 --- a/paddle/fluid/pybind/control_flow_api.cc +++ b/paddle/fluid/pybind/control_flow_api.cc @@ -44,6 +44,7 @@ using paddle::pybind::PyIfOp; using paddle::pybind::PyWhileOp; using pir::Block; using pir::Builder; +using pir::CombineOp; using pir::Operation; using pir::Program; using pir::Region; @@ -60,6 +61,11 @@ void BindIfOp(py::module* m) { return PyIfOp(ApiBuilder::Instance().GetBuilder()->Build( cond, std::vector{})); }); + m->def("build_if_op", [](const std::vector& cond) { + auto& builder = ApiBuilder::Instance().GetBuilder(); + auto new_cond = builder->Build(cond).out(); + return PyIfOp(builder->Build(new_cond, std::vector{})); + }); py::class_ if_op(*m, "IfOp", R"DOC( The PyIfOp is a encapsulation of IfOp. Compared with ifOp, it provides an additional 'update_output' interface. The 'update_output' interface will construct a new IfOp operation to replace its underlying IfOp. In the process, the original @@ -67,6 +73,7 @@ void BindIfOp(py::module* m) { )DOC"); if_op.def("true_block", &PyIfOp::true_block, return_value_policy::reference) .def("false_block", &PyIfOp::false_block, return_value_policy::reference) + .def("cond", &PyIfOp::cond) .def("update_output", &PyIfOp::UpdateOutput) .def("as_operation", &PyIfOp::operation, return_value_policy::reference) .def("results", [](PyIfOp& self) -> py::list { diff --git a/python/paddle/base/layer_helper.py b/python/paddle/base/layer_helper.py index 8f4b068d4e8978..add33e0a4edeed 100644 --- a/python/paddle/base/layer_helper.py +++ b/python/paddle/base/layer_helper.py @@ -22,6 +22,7 @@ Parameter, dtype_is_floating, in_dygraph_mode, + in_pir_mode, ) from .layer_helper_base import LayerHelperBase from .param_attr import ParamAttr @@ -132,6 +133,8 @@ def append_bias_op(self, input_var, dim_start=1, dim_end=None): b = self.create_parameter( attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True ) + if in_pir_mode(): + return input_var + b tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) self.append_op( type='elementwise_add', diff --git a/python/paddle/base/layer_helper_base.py b/python/paddle/base/layer_helper_base.py index 003aef14655bbc..197782813ad608 100644 --- a/python/paddle/base/layer_helper_base.py +++ b/python/paddle/base/layer_helper_base.py @@ -19,6 +19,7 @@ import paddle from . 
import core, unique_name +from .data_feeder import convert_dtype from .framework import ( Variable, _current_expected_place, @@ -359,6 +360,8 @@ def create_parameter( # set global dtype if not dtype: dtype = self.__dtype + if isinstance(dtype, core.DataType): + dtype = convert_dtype(dtype) if is_bias: suffix = 'b' default_initializer = ( diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 65f21a169cb63b..88cd6ab9e0b5a8 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -27,6 +27,7 @@ default_main_program, in_dygraph_mode, in_dynamic_or_pir_mode, + in_pir_mode, name_scope, program_guard, static_only, @@ -191,10 +192,17 @@ def fc_base( name=None, ): helper = LayerHelper("fc", **locals()) - check_type(input, 'input', (list, tuple, Variable), 'fc') + check_type( + input, 'input', (list, tuple, Variable, paddle.pir.Value), 'fc' + ) if isinstance(input, (list, tuple)): for i, input_x in enumerate(input): - check_type(input_x, 'input[' + str(i) + ']', Variable, 'fc') + check_type( + input_x, + 'input[' + str(i) + ']', + (Variable, paddle.pir.Value), + 'fc', + ) dtype = helper.input_dtype() check_dtype( dtype, 'input', ['float16', 'uint16', 'float32', 'float64'], 'fc' @@ -210,17 +218,25 @@ def fc_base( w = helper.create_parameter( attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False ) - tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="mul", - inputs={"X": input_var, "Y": w}, - outputs={"Out": tmp}, - attrs={"x_num_col_dims": num_flatten_dims, "y_num_col_dims": 1}, - ) + if in_pir_mode(): + tmp = paddle.matmul(input_var, w) + else: + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="mul", + inputs={"X": input_var, "Y": w}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": num_flatten_dims, + "y_num_col_dims": 1, + }, + ) mul_results.append(tmp) if len(mul_results) == 1: pre_bias = mul_results[0] + elif in_pir_mode(): + pre_bias = paddle.add_n(mul_results) else: pre_bias = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py index 1302a808cecc2b..fefb16a8379c4c 100644 --- a/python/paddle/static/nn/control_flow.py +++ b/python/paddle/static/nn/control_flow.py @@ -175,6 +175,51 @@ def __exit__(self, exc_type, exc_val, exc_tb): return super().__exit__(exc_type, exc_val, exc_tb) +class If: + ''' + **If** + + If is an operator that bind two blocks (true_block and false_block) to a specific condition, + According to the condition, the corresponding block will be executed. + + Args: + cond (Value): A value whose data type is bool controlling which block is executed. + + Examples: + .. code-block:: python + + >>> import paddle + >>> from paddle.static.nn.control_flow import ConditionalBlock + + >>> label = paddle.rand([1]) + >>> limit = paddle.ones([1]) * 0.5 + >>> cond = paddle.less_than(x=label, y=limit) + >>> if_op = If(cond) + >>> with if_op.true_block(): + ... pass + >>> with if_op.false_block(): + ... 
pass + ''' + + def __init__(self, cond): + if not isinstance(cond, list): + check_variable_and_dtype(cond, 'cond', ['bool'], 'static.nn.If') + if reduce(lambda a, b: a * b, cond.shape, 1) != 1: + raise TypeError( + "condition expected shape as [1], but given shape as {}.".format( + list(cond.shape) + ) + ) + self.if_op = build_if_op(cond) + self.cond_var = self.if_op.cond() + + def true_block(self): + return self.if_op.true_block() + + def false_block(self): + return self.if_op.false_block() + + class ConditionalBlock: ''' **ConditionalBlock** @@ -208,13 +253,23 @@ class ConditionalBlock: ''' def __init__(self, inputs, is_scalar_condition=False, name=None): - for each_input in inputs: - check_type(each_input, "input", Variable, "ConditionalBlock") self.inputs = inputs + if in_pir_mode(): + if is_scalar_condition and len(inputs) != 1: + raise TypeError( + "For ConditionalBlock Api, Only support one input while is_scalar_condition is True" + ) + return + else: + for each_input in inputs: + check_type(each_input, "input", Variable, "ConditionalBlock") + self.is_scalar_condition = is_scalar_condition self.helper = LayerHelper('conditional_block', name=name) def block(self): + if in_pir_mode(): + return If(self.inputs).true_block() return ConditionalBlockGuard(self) def complete(self): @@ -1244,9 +1299,9 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): return None true_output = None false_output = None + check_variable_and_dtype(pred, "pred", ['bool'], "base.layers.cond") + check_type(name, "name", (str, type(None)), "base.layers.cond") if in_pir_mode(): - check_variable_and_dtype(pred, "pred", ['bool'], "base.layers.cond") - check_type(name, "name", (str, type(None)), "base.layers.cond") if_op = build_if_op(pred) if true_fn is not None: if not callable(true_fn): @@ -1267,8 +1322,6 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): with if_op.false_block(): false_output = false_fn() else: - check_variable_and_dtype(pred, "pred", ['bool'], "base.layers.cond") - check_type(name, "name", (str, type(None)), "base.layers.cond") helper = LayerHelper('cond', **locals()) copy_to_parent_func = lambda var: copy_var_to_parent_block(var, helper) if true_fn is not None: diff --git a/test/dygraph_to_static/test_loop.py b/test/dygraph_to_static/test_loop.py index 8414c488aba23c..fb2600b8ac2dc0 100644 --- a/test/dygraph_to_static/test_loop.py +++ b/test/dygraph_to_static/test_loop.py @@ -19,7 +19,6 @@ import numpy as np from dygraph_to_static_utils import ( Dy2StTestBase, - test_legacy_and_pt_and_pir, ) import paddle @@ -252,7 +251,6 @@ def setUp(self): self.nested_for_loop_func = nested_for_loop_dyfunc - @test_legacy_and_pt_and_pir def test_loop_vars(self): for i in range(len(self.loop_funcs)): func = self.loop_funcs[i] @@ -268,7 +266,6 @@ def test_loop_vars(self): self.assertEqual(loop_var_names, self.loop_var_names[i]) self.assertEqual(create_var_names, self.create_var_names[i]) - @test_legacy_and_pt_and_pir def test_nested_loop_vars(self): func = self.nested_for_loop_func test_func = inspect.getsource(func) diff --git a/test/legacy_test/test_conditional_block.py b/test/legacy_test/test_conditional_block.py index 90a8200375c65a..b5f5df9205ae8a 100644 --- a/test/legacy_test/test_conditional_block.py +++ b/test/legacy_test/test_conditional_block.py @@ -31,7 +31,10 @@ def test_forward(self): data = paddle.static.data(name='X', shape=[-1, 1], dtype='float32') data.stop_gradient = False cond = ConditionalBlock(inputs=[data]) - out = 
paddle.tensor.create_tensor(dtype='float32') + out = paddle.tensor.fill_constant( + [10, 10], dtype='float32', value=0.0 + ) + out.stop_gradient = False with cond.block(): hidden = paddle.static.nn.fc(x=data, size=10) paddle.assign(hidden, out) @@ -43,7 +46,6 @@ def test_forward(self): x = np.random.random(size=(10, 1)).astype('float32') outs = exe.run(main_program, feed={'X': x}, fetch_list=[out])[0] - print(outs) loss = paddle.mean(out) append_backward(loss=loss) outs = exe.run( @@ -51,7 +53,6 @@ def test_forward(self): feed={'X': x}, fetch_list=[main_program.block(0).var(data.name + "@GRAD")], )[0] - print(outs) class TestConditionalBlockOpInferShape(unittest.TestCase): From 3dfe88e3ea37cee04a5e6ed642740ad31d21c825 Mon Sep 17 00:00:00 2001 From: Zhan Rongrui <46243324+zrr1999@users.noreply.github.com> Date: Mon, 22 Jan 2024 16:39:28 +0800 Subject: [PATCH 06/34] Add test_with_pir_api in error test (#60693) --- test/legacy_test/test_arange.py | 1 + test/legacy_test/test_assign_op.py | 1 + test/legacy_test/test_cast_op.py | 1 + test/legacy_test/test_compare_op.py | 1 + test/legacy_test/test_numel_op.py | 11 +-- test/legacy_test/test_reshape_op.py | 1 + test/legacy_test/test_scale_op.py | 1 + test/legacy_test/test_sum_op.py | 110 ++++++++++++--------- test/legacy_test/test_uniform_random_op.py | 1 + test/legacy_test/test_unsqueeze2_op.py | 1 + test/legacy_test/test_where_op.py | 1 + 11 files changed, 76 insertions(+), 54 deletions(-) diff --git a/test/legacy_test/test_arange.py b/test/legacy_test/test_arange.py index e901a060c32337..1413e9e626ce42 100644 --- a/test/legacy_test/test_arange.py +++ b/test/legacy_test/test_arange.py @@ -132,6 +132,7 @@ def init_config(self): class TestArangeOpError(unittest.TestCase): + @test_with_pir_api def test_static_errors(self): with program_guard(Program(), Program()): paddle.enable_static() diff --git a/test/legacy_test/test_assign_op.py b/test/legacy_test/test_assign_op.py index 3bff65836286af..b4ccbf56a72d44 100644 --- a/test/legacy_test/test_assign_op.py +++ b/test/legacy_test/test_assign_op.py @@ -170,6 +170,7 @@ def test_assign_tensor_array(self): class TestAssignOpError(unittest.TestCase): + @test_with_pir_api def test_errors(self): paddle.enable_static() with program_guard(Program(), Program()): diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py index 19369a05a057e2..fb1de332310b32 100644 --- a/test/legacy_test/test_cast_op.py +++ b/test/legacy_test/test_cast_op.py @@ -190,6 +190,7 @@ def test_grad(self): class TestCastOpError(unittest.TestCase): + @test_with_pir_api def test_errors(self): paddle.enable_static() with program_guard(Program(), Program()): diff --git a/test/legacy_test/test_compare_op.py b/test/legacy_test/test_compare_op.py index 79aa2736eeb0cb..f79e2e09e6d162 100755 --- a/test/legacy_test/test_compare_op.py +++ b/test/legacy_test/test_compare_op.py @@ -524,6 +524,7 @@ def test_check_output(self): class TestCompareOpError(unittest.TestCase): + @test_with_pir_api def test_int16_support(self): paddle.enable_static() with paddle.static.program_guard( diff --git a/test/legacy_test/test_numel_op.py b/test/legacy_test/test_numel_op.py index bc48c8a8424411..8bfbed64ef329c 100644 --- a/test/legacy_test/test_numel_op.py +++ b/test/legacy_test/test_numel_op.py @@ -188,6 +188,7 @@ def test_numel_imperative(self): np.testing.assert_array_equal(out_2.numpy().item(0), np.size(input_2)) paddle.enable_static() + @test_with_pir_api def test_error(self): main_program = paddle.static.Program() startup_program = 
paddle.static.Program() @@ -200,16 +201,6 @@ def test_x_type(): self.assertRaises(TypeError, test_x_type) - def test_pir_error(self): - with paddle.pir_utils.IrGuard(): - - def test_x_type(): - shape = [1, 4, 5] - input_1 = np.random.random(shape).astype("int32") - out_1 = paddle.numel(input_1) - - self.assertRaises(TypeError, test_x_type) - if __name__ == '__main__': paddle.enable_static() diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py index 99350c9eb043e4..903b00a246da9c 100755 --- a/test/legacy_test/test_reshape_op.py +++ b/test/legacy_test/test_reshape_op.py @@ -560,6 +560,7 @@ def _set_paddle_api(self): self.data = paddle.static.data self.reshape = paddle.reshape + @test_with_pir_api def _test_errors(self): paddle.enable_static() with program_guard(Program(), Program()): diff --git a/test/legacy_test/test_scale_op.py b/test/legacy_test/test_scale_op.py index ab89a4c2424afb..af29d4484fe90b 100644 --- a/test/legacy_test/test_scale_op.py +++ b/test/legacy_test/test_scale_op.py @@ -132,6 +132,7 @@ def test_scale_selected_rows_inplace(self): class TestScaleRaiseError(unittest.TestCase): + @test_with_pir_api def test_errors(self): paddle.enable_static() diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index 951df097ad2ad0..194da8bc9484b4 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -465,69 +465,91 @@ def test_add_n_and_add_and_grad(self): class TestRaiseSumError(unittest.TestCase): + @test_with_pir_api def test_errors(self): - def test_type(): - paddle.add_n([11, 22]) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): - self.assertRaises(TypeError, test_type) + def test_type(): + paddle.add_n([11, 22]) - def test_dtype(): - data1 = paddle.static.data(name="input1", shape=[10], dtype="int8") - data2 = paddle.static.data(name="input2", shape=[10], dtype="int8") - paddle.add_n([data1, data2]) + self.assertRaises(TypeError, test_type) - self.assertRaises(TypeError, test_dtype) + def test_dtype(): + data1 = paddle.static.data( + name="input1", shape=[10], dtype="int8" + ) + data2 = paddle.static.data( + name="input2", shape=[10], dtype="int8" + ) + paddle.add_n([data1, data2]) - def test_dtype1(): - data1 = paddle.static.data(name="input1", shape=[10], dtype="int8") - paddle.add_n(data1) + self.assertRaises(TypeError, test_dtype) + + def test_dtype1(): + data1 = paddle.static.data( + name="input1", shape=[10], dtype="int8" + ) + paddle.add_n(data1) - self.assertRaises(TypeError, test_dtype1) + self.assertRaises(TypeError, test_dtype1) class TestRaiseSumsError(unittest.TestCase): + @test_with_pir_api def test_errors(self): - def test_type(): - paddle.add_n([11, 22]) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + + def test_type(): + paddle.add_n([11, 22]) - self.assertRaises(TypeError, test_type) + self.assertRaises(TypeError, test_type) - def test_dtype(): - data1 = paddle.static.data(name="input1", shape=[10], dtype="int8") - data2 = paddle.static.data(name="input2", shape=[10], dtype="int8") - paddle.add_n([data1, data2]) + def test_dtype(): + data1 = paddle.static.data( + name="input1", shape=[10], dtype="int8" + ) + data2 = paddle.static.data( + name="input2", shape=[10], dtype="int8" + ) + paddle.add_n([data1, data2]) - self.assertRaises(TypeError, test_dtype) + self.assertRaises(TypeError, test_dtype) - def test_dtype1(): - data1 = paddle.static.data(name="input1", shape=[10], dtype="int8") - 
paddle.add_n(data1) + def test_dtype1(): + data1 = paddle.static.data( + name="input1", shape=[10], dtype="int8" + ) + paddle.add_n(data1) - self.assertRaises(TypeError, test_dtype1) + self.assertRaises(TypeError, test_dtype1) - def test_out_type(): - data1 = paddle.static.data( - name="input1", shape=[10], dtype="flaot32" - ) - data2 = paddle.static.data( - name="input2", shape=[10], dtype="float32" - ) - out = [10] - out = paddle.add_n([data1, data2]) + def test_out_type(): + data1 = paddle.static.data( + name="input1", shape=[10], dtype="flaot32" + ) + data2 = paddle.static.data( + name="input2", shape=[10], dtype="float32" + ) + out = [10] + out = paddle.add_n([data1, data2]) - self.assertRaises(TypeError, test_out_type) + self.assertRaises(TypeError, test_out_type) - def test_out_dtype(): - data1 = paddle.static.data( - name="input1", shape=[10], dtype="flaot32" - ) - data2 = paddle.static.data( - name="input2", shape=[10], dtype="float32" - ) - out = paddle.static.data(name="out", shape=[10], dtype="int8") - out = paddle.add_n([data1, data2]) + def test_out_dtype(): + data1 = paddle.static.data( + name="input1", shape=[10], dtype="flaot32" + ) + data2 = paddle.static.data( + name="input2", shape=[10], dtype="float32" + ) + out = paddle.static.data(name="out", shape=[10], dtype="int8") + out = paddle.add_n([data1, data2]) - self.assertRaises(TypeError, test_out_dtype) + self.assertRaises(TypeError, test_out_dtype) class TestSumOpError(unittest.TestCase): diff --git a/test/legacy_test/test_uniform_random_op.py b/test/legacy_test/test_uniform_random_op.py index 0a5174214919be..d64b716d71bd26 100644 --- a/test/legacy_test/test_uniform_random_op.py +++ b/test/legacy_test/test_uniform_random_op.py @@ -203,6 +203,7 @@ def init_dtype(self): class TestUniformRandomOpError(unittest.TestCase): + @test_with_pir_api def test_errors(self): paddle.enable_static() main_prog = Program() diff --git a/test/legacy_test/test_unsqueeze2_op.py b/test/legacy_test/test_unsqueeze2_op.py index e43dd8fee5d1f0..dd1e2b1809df7b 100755 --- a/test/legacy_test/test_unsqueeze2_op.py +++ b/test/legacy_test/test_unsqueeze2_op.py @@ -297,6 +297,7 @@ def test_api(self): np.testing.assert_array_equal(res_4, input.reshape([3, 2, 5, 1])) np.testing.assert_array_equal(res_5, input.reshape([3, 1, 1, 2, 5, 1])) + @test_with_pir_api def test_error(self): def test_axes_type(): x2 = paddle.static.data(name="x2", shape=[2, 25], dtype="int32") diff --git a/test/legacy_test/test_where_op.py b/test/legacy_test/test_where_op.py index b338d6df0e3789..6f64ff15f45b97 100644 --- a/test/legacy_test/test_where_op.py +++ b/test/legacy_test/test_where_op.py @@ -784,6 +784,7 @@ def test_where_condition(self): class TestWhereOpError(unittest.TestCase): + @test_with_pir_api def test_errors(self): with paddle.static.program_guard( paddle.static.Program(), paddle.static.Program() From 1345e24660583b3965f5607750d7c21e563bbffd Mon Sep 17 00:00:00 2001 From: 6clc Date: Mon, 22 Jan 2024 17:00:47 +0800 Subject: [PATCH 07/34] Adapt to dim expr (#60843) In the check for Int32, insert TryElevateInt32ToInt64(). This function checks the data type of ir and raises it to int64 if an expression contains an Int64. int64 is imported only with shape expressions passed in by dim_expr. 
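Concretely, TryElevateInt32ToInt64() scans a list of IR expressions and, if any
of them already carries an Int64 type, promotes the remaining Int32 expressions
in place; Int64 enters only through the symbolic shape expressions passed in via
dim_expr. Below is a minimal, self-contained sketch of that promotion rule (Ty
and Expr are simplified stand-ins for cinn::common::Type and cinn::ir::Expr,
for illustration only; the real implementation is in paddle/cinn/ir/ir_base.cc
in this patch):

    #include <cassert>
    #include <vector>

    enum class Ty { Int32, Int64, Unknown };  // stand-in for cinn::common::Type

    struct Expr {  // stand-in for cinn::ir::Expr
      Ty type = Ty::Int32;
      std::vector<Expr*> operands;
      void convert_int32_to_int64() {  // mirrors IrNode::convert_int32_to_int64
        assert(type == Ty::Int32 || type == Ty::Int64 || type == Ty::Unknown);
        type = Ty::Int64;
        for (Expr* op : operands) op->convert_int32_to_int64();  // recurse into operands
      }
    };

    // If any expression in the list is already Int64 (e.g. a symbolic dim),
    // promote every Int32 expression in place; otherwise leave everything int32.
    void TryElevateInt32ToInt64(const std::vector<Expr*>& exprs) {
      bool has_int64 = false;
      for (const Expr* e : exprs) has_int64 = has_int64 || (e->type == Ty::Int64);
      if (!has_int64) return;
      for (Expr* e : exprs)
        if (e->type == Ty::Int32) e->convert_int32_to_int64();
    }

    int main() {
      Expr idx;                        // int32 loop index
      Expr dim; dim.type = Ty::Int64;  // symbolic dim coming from dim_expr
      TryElevateInt32ToInt64({&idx, &dim});
      assert(idx.type == Ty::Int64);   // idx was elevated alongside the dim
    }

Callers such as IndiceToAbsOffset() apply this to a shape's expressions before
computing offsets, so a shape stays int32 unless a dim_expr-derived Int64
appears in it.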
---
 paddle/cinn/backends/codegen_cuda_host.cc     |  2 +-
 paddle/cinn/backends/codegen_cuda_util.cc     |  7 ++--
 paddle/cinn/backends/codegen_cuda_util.h      |  2 +-
 paddle/cinn/common/ir_util.cc                 |  6 ++-
 paddle/cinn/common/type.h                     |  6 +++
 .../cinn/hlir/framework/op_lowering_impl.cc   |  2 +-
 .../cinn/hlir/framework/op_lowering_util.cc   |  8 ++--
 .../hlir/framework/pir/op_lowering_impl.cc    |  4 +-
 paddle/cinn/ir/buffer.cc                      | 10 +++--
 paddle/cinn/ir/buffer.h                       |  2 +-
 paddle/cinn/ir/dim.cc                         |  8 +---
 paddle/cinn/ir/ir.cc                          | 20 +++++++---
 paddle/cinn/ir/ir_base.cc                     | 38 ++++++++++++++++++-
 paddle/cinn/ir/ir_base.h                      |  6 ++-
 paddle/cinn/ir/lowered_func.cc                |  6 ++-
 paddle/cinn/ir/schedule/impl/for_type.cc      |  4 +-
 paddle/cinn/ir/schedule/ir_schedule.cc        |  4 +-
 paddle/cinn/ir/tensor.cc                      | 26 ++++++++++---
 paddle/cinn/optim/resize_buffer.cc            |  5 ++-
 paddle/cinn/optim/transform_gpu_forloop.cc    |  2 +-
 paddle/cinn/optim/unroll_loops.cc             |  4 +-
 paddle/cinn/optim/vectorize_loops.cc          |  3 +-
 paddle/cinn/poly/dim.h                        |  4 ++
 paddle/cinn/poly/domain.cc                    | 29 +++++++++++++-
 paddle/cinn/runtime/cuda/cuda_intrinsics.cc   |  4 +-
 paddle/cinn/runtime/cuda/cuda_util.cc         |  4 +-
 paddle/cinn/runtime/cuda/cuda_util.h          |  4 +-
 .../instruction/cinn_jit_instruction.cc       | 10 ++---
 28 files changed, 172 insertions(+), 58 deletions(-)

diff --git a/paddle/cinn/backends/codegen_cuda_host.cc b/paddle/cinn/backends/codegen_cuda_host.cc
index 11e986bb9ace1b..71b11f228acd61 100644
--- a/paddle/cinn/backends/codegen_cuda_host.cc
+++ b/paddle/cinn/backends/codegen_cuda_host.cc
@@ -222,7 +222,7 @@ llvm::Value* CodeGenCUDA_Host::LowerHostFunc(const ir::_LoweredFunc_* func) {
 
 llvm::Value* CodeGenCUDA_Host::LowerParseArgsValueCall(
     const ir::Call* call_ir) {
-  auto ret_type = CinnTypeToLLVMType(Int(32), m_);
+  auto ret_type = CinnTypeToLLVMType(Int(64), m_);
   std::vector<llvm::Type*> args_type;
   CHECK_EQ(call_ir->read_args.size(), 2);
   CHECK(call_ir->read_args[0].is_var() &&
diff --git a/paddle/cinn/backends/codegen_cuda_util.cc b/paddle/cinn/backends/codegen_cuda_util.cc
index 660eee9160a6bf..1f9966b5b28810 100644
--- a/paddle/cinn/backends/codegen_cuda_util.cc
+++ b/paddle/cinn/backends/codegen_cuda_util.cc
@@ -114,15 +114,16 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessArgs(
   for (int i = 0; i < args.size(); ++i) {
     if (args[i].is_var()) {
       ir::Expr call_get_value_in_kernel_args =
-          ir::Call::Make(Int(32),
+          ir::Call::Make(Int(64),
                          runtime::intrinsic::get_value_in_cuda_kernel_args,
                          {kernel_args_, ir::Expr(i)},
                          {},
                          ir::CallType::Extern,
                          ir::FunctionRef(),
                          0);
-      ir::Expr stmt = ir::Let::Make(ir::Expr(args[i].var_arg()),
-                                    call_get_value_in_kernel_args);
+      ir::Expr let_symbol = ir::Expr(args[i].var_arg());
+      let_symbol->set_type(type_of<int64_t>());
+      ir::Expr stmt = ir::Let::Make(let_symbol, call_get_value_in_kernel_args);
       arg_defs_.push_back(stmt);
     }
   }
diff --git a/paddle/cinn/backends/codegen_cuda_util.h b/paddle/cinn/backends/codegen_cuda_util.h
index 52296bd2a8807b..01caff457a50ca 100644
--- a/paddle/cinn/backends/codegen_cuda_util.h
+++ b/paddle/cinn/backends/codegen_cuda_util.h
@@ -152,7 +152,7 @@ struct CollectBucketStrategyHostFunctionVisitor
         kernel_args_(KERNEL_ARGS, type_of<void*>()),
         kernel_args_num_(KERNEL_ARGS_NUM, type_of<int>()),
         kernel_stream_(KERNEL_STREAM, type_of<void*>()),
-        tensor_shape_args_(TENSOR_SHAPE_ARGS, type_of<int32_t**>()) {}
+        tensor_shape_args_(TENSOR_SHAPE_ARGS, type_of<int64_t**>()) {}
 
   std::tuple<ir::Module, ir::Module> operator()(Expr* expr) {
     ir::IRMutator<>::Visit(expr, expr);
diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc
index 774d7514e6fb23..d326e652a7be7f 100644
--- 
a/paddle/cinn/common/ir_util.cc +++ b/paddle/cinn/common/ir_util.cc @@ -143,8 +143,12 @@ Expr IndiceToAbsOffset(const std::vector &shape, VLOG(3) << "indices is : " << utils::Join(indices, ","); CHECK_LE(shape.size(), indices.size()); Expr res; + ir::TryElevateInt32ToInt64(shape); for (int i = 0; i < shape.size(); i++) { - CHECK_EQ(shape[i].type(), Int(32)); + CHECK(shape[i].type() == Int(64) || shape[i].type() == Int(32)) + << "The shape data type currently supports only int32 or int64, but " + "the current data type of shape[" + << i << "] is " << shape[i].type(); Expr indice_prod = indices[i]; optim::SimplifyCast(&indice_prod); for (int j = i + 1; j < shape.size(); j++) { diff --git a/paddle/cinn/common/type.h b/paddle/cinn/common/type.h index b11a320bbd5a19..420a31b5824c2d 100644 --- a/paddle/cinn/common/type.h +++ b/paddle/cinn/common/type.h @@ -263,6 +263,12 @@ inline Type type_of() { return x; } template <> +inline Type type_of() { + Type x = Int(64); + x.set_cpp_handle2(); + return x; +} +template <> inline Type type_of() { Type x = type_of(); x.set_cpp_handle(); diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.cc b/paddle/cinn/hlir/framework/op_lowering_impl.cc index 1b3a39850e2e4e..cef59686395111 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/op_lowering_impl.cc @@ -599,7 +599,7 @@ ir::Expr OpLowererImpl::DoGroupSchedule( auto master_loops = ir_sch.GetLoops(GetNodeData(master)->id()); std::vector splits; for (auto loop : master_loops) { - splits.push_back(loop.As()->extent.as_int32()); + splits.push_back(loop.As()->extent.as_int64()); } loops = ir_sch.GetLoops(GetNodeData(node)->id()); ir_sch.Split(loops[0], splits); diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc index 5a332324c7c89b..a7b988a735cdbd 100644 --- a/paddle/cinn/hlir/framework/op_lowering_util.cc +++ b/paddle/cinn/hlir/framework/op_lowering_util.cc @@ -1537,8 +1537,8 @@ void MergeReduceLoop( auto dst_loops = ir_sch.GetLoops(tensor_->name); auto src_loops = ir_sch.GetLoops(tensor__->name); int index = -1; - while (src_loops[index + 1].As()->extent.as_int32() == - dst_loops[index + 1].As()->extent.as_int32()) { + while (src_loops[index + 1].As()->extent.as_int64() == + dst_loops[index + 1].As()->extent.as_int64()) { ++index; if (src_loops.size() == index + 1 || dst_loops.size() == index + 1) { break; @@ -1661,8 +1661,8 @@ void LoopComputeAt( int index = std::min(node_loops.size(), master_loops.size()) - 1; do { // if loop range is not equal. 
- if (node_loops[index].As()->extent.as_int32() != - master_loops[index].As()->extent.as_int32()) { + if (node_loops[index].As()->extent.as_int64() != + master_loops[index].As()->extent.as_int64()) { continue; } MergeLoops( diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 59ee965a4b91a8..44f78f062874f0 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -494,7 +494,7 @@ std::vector OpLowererImpl::PostProcess( } int_args_set.insert(symbol_name); group_func_args->emplace_back( - ir::_Var_::Make(symbol_name, cinn::common::Int(32))); + ir::_Var_::Make(symbol_name, cinn::common::Int(64))); group->int_args_map[non_tensor_arg_idx++] = {tensor_arg_idx, tensor_arg_dim_idx}; VLOG(4) << "device kernel func's " << non_tensor_arg_idx << " is from " @@ -860,7 +860,7 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( int tensor_dim_size = tensor_dim.size(); auto tensor_shape = group_func_arg_tensors[tensor_arg_idx]->shape; - ir::Var tensor_shape_args(TENSOR_SHAPE_ARGS, type_of()); + ir::Var tensor_shape_args(TENSOR_SHAPE_ARGS, type_of()); for (int i = 0; i < tensor_shape.size(); i++) { ir::Expr call_set_infer_shape_value = ir::Call::Make(type_of(), diff --git a/paddle/cinn/ir/buffer.cc b/paddle/cinn/ir/buffer.cc index ada0d4487b7f02..350cde0189fdf8 100644 --- a/paddle/cinn/ir/buffer.cc +++ b/paddle/cinn/ir/buffer.cc @@ -103,11 +103,15 @@ Var _Buffer_::buffer_addr() const { return _Var_::Make(name, thetype); } -int _Buffer_::numel() const { - int res = 1; +int64_t _Buffer_::numel() const { + int64_t res = 1; for (auto &i : shape) { CHECK(i.is_constant()); - res *= i.as_int32(); + if (i->type() == Int(64)) { + res *= i.as_int64(); + } else { + res *= i.as_int32(); + } } return res; } diff --git a/paddle/cinn/ir/buffer.h b/paddle/cinn/ir/buffer.h index 7e80b6de9297fa..4b83a2bcd2e0f1 100755 --- a/paddle/cinn/ir/buffer.h +++ b/paddle/cinn/ir/buffer.h @@ -141,7 +141,7 @@ class _Buffer_ : public ExprNode<_Buffer_> { void Verify() const override; - int numel() const; + int64_t numel() const; static const IrNodeTy _node_type_ = IrNodeTy::_Buffer_; diff --git a/paddle/cinn/ir/dim.cc b/paddle/cinn/ir/dim.cc index 98ab3918720918..fe63fb31158a9c 100644 --- a/paddle/cinn/ir/dim.cc +++ b/paddle/cinn/ir/dim.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/cinn/ir/dim.h" +#include "paddle/cinn/common/dim_expr_converter.h" #include "paddle/cinn/ir/ir.h" namespace cinn { @@ -31,12 +32,7 @@ Dim _Dim_::Make(const std::string& name, const symbol::DimExpr& sym_dim) { auto* n = make_shared<_Dim_>(); n->name = name; n->sym_dim = sym_dim; - if (sym_dim.isa()) { - n->dim_expr = - Expr(Var(sym_dim.dyn_cast(), cinn::common::Int(32))); - } else { - n->dim_expr = Expr(static_cast(sym_dim.dyn_cast())); - } + n->dim_expr = common::DimExprConverter().ConvertToIrExpr(sym_dim); return Dim(n); } diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index b556dad00cb324..d57344e77d238e 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -58,6 +58,7 @@ Add::Add(Expr a, Expr b) : BinaryOpNode(a.type(), a, b) {} void BinaryNodeVerify(const Expr &a, const Expr &b, absl::string_view ir_name) { CHECK(a.defined()); CHECK(b.defined()); + TryElevateInt32ToInt64({a, b}); CHECK_EQ(a.type(), b.type()) << "The operands' types of the node [" << ir_name << "] don't match"; } @@ -72,9 +73,7 @@ Expr Sub::Make(Expr a, Expr b) { void Sub::Verify() const { BinaryNodeVerify(a(), b(), "Sub"); } Expr Mul::Make(Expr a, Expr b) { - CHECK(a.defined()); - CHECK(b.defined()); - CHECK_EQ(a.type(), b.type()) << "a=" << a << ", b=" << b; + BinaryNodeVerify(a, b, "Mul"); auto node = make_shared(a, b); return Expr(node); } @@ -203,6 +202,7 @@ void Let::Verify() const { CHECK(symbol.defined()); // The default value(contained in body) is not required. if (body.defined()) { + TryElevateInt32ToInt64({symbol, body}); CHECK_EQ(symbol.type(), body.type()); } } @@ -583,7 +583,11 @@ Var &Var::operator=(const _Var_ *x) { Expr Load::Make(Expr tensor, const std::vector &indices) { CHECK(tensor->type().valid()); CHECK(!indices.empty()); - for (auto &idx : indices) CHECK_EQ(idx.type().ElementOf(), Int(32)); + TryElevateInt32ToInt64(indices); + for (auto &idx : indices) { + CHECK(idx.type().ElementOf() == Int(64) || + idx.type().ElementOf() == Int(32)); + } auto node = make_shared(); node->tensor = tensor; node->indices = indices; @@ -695,8 +699,13 @@ Expr Sum::Make(const std::vector &vs) { if (vs.size() == 1) return vs.front(); auto *n = make_shared(); + TryElevateInt32ToInt64(vs); auto type = vs.front().type(); - for (auto &v : vs) CHECK_EQ(v.type(), type) << vs.front() << " " << v; + for (auto &v : vs) { + CHECK_EQ(v.type(), type) << "The operands' types of the node [" + << n->node_type() << "] don't match: " + << "(" << v << " vs " << vs.front() << ")"; + } n->operands() = vs; @@ -709,6 +718,7 @@ Expr Product::Make(const std::vector &vs) { CHECK_GE(vs.size(), 1); auto *n = make_shared(); + TryElevateInt32ToInt64(vs); auto type = vs.front().type(); for (auto &v : vs) CHECK_EQ(v.type(), type); diff --git a/paddle/cinn/ir/ir_base.cc b/paddle/cinn/ir/ir_base.cc index ed1980511d6863..b89342662eb7ca 100644 --- a/paddle/cinn/ir/ir_base.cc +++ b/paddle/cinn/ir/ir_base.cc @@ -119,7 +119,7 @@ int32_t Expr::as_int32() const { return As()->value; } int64_t Expr::as_int64() const { - CHECK(type().is_int(64)); + CHECK(type().is_int(64) || type().is_int(32)); return As()->value; } @@ -235,5 +235,41 @@ const Expr &IrNode::operand(int i) { return operands[i]; } +void IrNode::set_type(Type type) { type_ = type; } + +void IrNode::convert_int32_to_int64() { + CHECK(type_ == Int(64) || type_ == Int(32) || type_.is_unk()) + << "Current only support convert int32_t to int64_t, but get type is " + << type_; + type_ = Int(64); + for (Expr &operand : operands) { + operand->convert_int32_to_int64(); 
+ }
+}
+
+void TryElevateInt32ToInt64(const std::vector &expr_vec) {
+ Type type = expr_vec.front()->type();
+ for (const Expr &expr : expr_vec) {
+ if (expr->type() == Int(64)) {
+ type = Int(64);
+ break;
+ }
+ }
+
+ // No need to elevate to Int(64)
+ if (type != Int(64)) {
+ return;
+ }
+ for (const Expr &expr : expr_vec) {
+ CHECK(expr->type() == Int(64) || expr->type() == Int(32) ||
+ expr->type().is_unk())
+ << "Currently only support converting int32_t to int64_t, but got type "
+ << expr->type();
+ if (expr->type() == Int(32)) {
+ expr->convert_int32_to_int64();
+ }
+ }
+}
+
 } // namespace ir
 } // namespace cinn
diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h
index 0047100ebcfdfc..24a7c2271d1fd5 100644
--- a/paddle/cinn/ir/ir_base.h
+++ b/paddle/cinn/ir/ir_base.h
@@ -162,7 +162,9 @@ class IrNode : public cinn::common::Object {
 virtual IrNodeTy node_type() const { return IrNodeTy::kUnk; }
 virtual Type type() const { return type_; }
 
- void set_type(Type type) { type_ = type; }
+ void set_type(Type type);
+ //! Elevate int32 to int64 if needed
+ void convert_int32_to_int64();
 
 //! Get i-th operand
 const Expr& operand(int i);
@@ -502,6 +504,8 @@ Expr ExprNode::Copy() const {
 return Expr();
 }
 
+void TryElevateInt32ToInt64(const std::vector& expr_vec);
+
 } // namespace ir
 } // namespace cinn
 
diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc
index d252a5e44954f5..3537bfaf2fe4fd 100644
--- a/paddle/cinn/ir/lowered_func.cc
+++ b/paddle/cinn/ir/lowered_func.cc
@@ -333,7 +333,8 @@ void _LoweredFunc_::PrepareArgumentExprs() {
 
 // cast arg to cinn_pod_value_t*
 // something like `_args[0]`
- Expr load_expr = Load::Make(pod_value_ptr, {cinn::common::make_const(i)});
+ Expr load_expr = Load::Make(
+ pod_value_ptr, {cinn::common::make_const(static_cast(i))});
 CHECK_EQ(load_expr.type(), type_of());
 load_expr = ir::intrinsics::GetAddr::Make(load_expr);
 
@@ -404,6 +405,9 @@ void _LoweredFunc_::PrepareArgumentExprs() {
 } else if (arg.type() == type_of()) {
 pod_cast_expr =
 ir::intrinsics::PodValueToX::Make(load_expr, type_of());
+ } else if (arg.type() == type_of()) {
+ pod_cast_expr =
+ ir::intrinsics::PodValueToX::Make(load_expr, type_of());
 } else if (arg.type() == type_of()) {
 pod_cast_expr =
 ir::intrinsics::PodValueToX::Make(load_expr, type_of());
diff --git a/paddle/cinn/ir/schedule/impl/for_type.cc b/paddle/cinn/ir/schedule/impl/for_type.cc
index 6b045fcc2b3427..bab2b312bde12a 100644
--- a/paddle/cinn/ir/schedule/impl/for_type.cc
+++ b/paddle/cinn/ir/schedule/impl/for_type.cc
@@ -132,7 +132,7 @@ void DyScheduleImpl::Bind(const Expr& loop, const std::string& thread_axis) {
 const std::array kMaxBlockDims = cur_dev_info->GetMaxBlockDims();
 const std::array kMaxGridDims = cur_dev_info->GetMaxGridDims();
 auto check_offset = [&](const char& c) -> bool {
- auto extent = loop.As()->extent.as_int32();
+ auto extent = loop.As()->extent.as_int64();
 return extent <=
 (c == 'b' ? kMaxGridDims[offset] : kMaxBlockDims[offset]);
 };
 if (thread_axis[0] == 'b') {
@@ -210,7 +210,7 @@ void StScheduleImpl::Bind(const Expr& loop, const std::string& thread_axis) {
 const std::array kMaxBlockDims = cur_dev_info->GetMaxBlockDims();
 const std::array kMaxGridDims = cur_dev_info->GetMaxGridDims();
 auto check_offset = [&](const char& c) -> bool {
- auto extent = loop.As()->extent.as_int32();
+ auto extent = loop.As()->extent.as_int64();
 return extent <= (c == 'b' ?
kMaxGridDims[offset] : kMaxBlockDims[offset]); }; if (thread_axis[0] == 'b') { diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc index fb151051f0b67f..b4c44d062e47bf 100644 --- a/paddle/cinn/ir/schedule/ir_schedule.cc +++ b/paddle/cinn/ir/schedule/ir_schedule.cc @@ -384,7 +384,7 @@ std::vector IRSchedule::Split(const Expr& loop, const std::vector& factors) { if (IsDynamicShape()) return impl_->Split(loop, factors); std::vector decision = SamplePerfectTile( - loop, factors.size(), loop.As()->extent.as_int32(), factors); + loop, factors.size(), loop.As()->extent.as_int64(), factors); auto results = Split(loop, decision); return results; } @@ -407,7 +407,7 @@ std::vector IRSchedule::Split(const Expr& loop, std::vector int_factors; std::vector results; std::for_each(factors.begin(), factors.end(), [&int_factors](const Expr& e) { - if (e.is_constant()) int_factors.push_back(e.as_int32()); + if (e.is_constant()) int_factors.push_back(e.as_int64()); }); if (int_factors.size() == factors.size()) { results = impl_->Split(loop, int_factors); diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc index 261db949b997bc..f6897b81560dd4 100644 --- a/paddle/cinn/ir/tensor.cc +++ b/paddle/cinn/ir/tensor.cc @@ -227,12 +227,28 @@ isl::set _Tensor_::GenerateIslDomain() const { auto _axis_with_reduce = axis_with_reduce(); for (int i = 0; i < domain.size(); i++) { auto dim = domain[i]; - if (dim.is_constant()) { - dims.emplace_back(_axis_with_reduce[i]->name, 0, dim.as_int32() - 1); + if (dim.type() == type_of()) { + if (dim.is_constant()) { + dims.emplace_back(_axis_with_reduce[i]->name, + static_cast(0), + static_cast(dim.as_int64() - 1)); + } else { + dims.emplace_back( + _axis_with_reduce[i]->name, + Expr(static_cast(0)), + Sub::Make(dim, + cinn::common::make_const(static_cast(1)))); + } } else { - dims.emplace_back(_axis_with_reduce[i]->name, - Expr(0), - Sub::Make(dim, cinn::common::make_const(1))); + if (dim.is_constant()) { + dims.emplace_back(_axis_with_reduce[i]->name, + static_cast(0), + dim.as_int32() - 1); + } else { + dims.emplace_back(_axis_with_reduce[i]->name, + Expr(0), + Sub::Make(dim, cinn::common::make_const(1))); + } } } } diff --git a/paddle/cinn/optim/resize_buffer.cc b/paddle/cinn/optim/resize_buffer.cc index dda54c44c0aadf..f36eef0704946d 100644 --- a/paddle/cinn/optim/resize_buffer.cc +++ b/paddle/cinn/optim/resize_buffer.cc @@ -125,10 +125,11 @@ class AnalyzeLoopVarRange : public ir::IRMutator<> { for (int i = 0; i < indice_extent.size(); ++i) { if (stored_indice_extent[i].is_constant() && indice_extent[i].is_constant()) { - int stored_extent = stored_indice_extent[i].as_int32(); - int cur_extent = indice_extent[i].as_int32(); + int64_t stored_extent = stored_indice_extent[i].as_int64(); + int64_t cur_extent = indice_extent[i].as_int64(); if (cur_extent > stored_extent) { stored_indice_extent[i] = ir::Expr(cur_extent); + stored_indice_extent[i]->set_type(indice_extent[i].type()); } } // if there indice extent is not constant, which means dynamic shape diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 99232192354286..06da7f56c140ae 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -408,7 +408,7 @@ class ReplaceVarToZero : public ir::IRMutator<> { auto var_name = for_ir->loop_var->name; auto extent_i = for_ir->extent; - if (extent_i.is_constant() && extent_i.as_int32() == 1) + if (extent_i.is_constant() && 
extent_i.as_int64() == 1)
 loop_var_.insert(var_name);
 ir::IRMutator<>::Visit(op, expr);
 loop_var_.erase(var_name);
diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc
index 2bc7df11844779..9f2e8bf244e4ce 100644
--- a/paddle/cinn/optim/unroll_loops.cc
+++ b/paddle/cinn/optim/unroll_loops.cc
@@ -65,7 +65,7 @@ struct UnrollMutator : public ir::IRMutator {
 VLOG(5) << "loop to be unrolled should have a constant extent";
 return;
 }
- int extent = op->extent.as_int32();
+ int64_t extent = op->extent.as_int64();
 
 // predicate this for-loop can be unrolled by auto-unroll conditions
 bool unrollable =
@@ -109,7 +109,7 @@ struct UnrollMutator : public ir::IRMutator {
 int max_unroll_extent_ = 50;
 
 // the number of steps that have been unrolled or plain statement
- int flat_step_ = 0;
+ int64_t flat_step_ = 0;
 // the number of nested loops not to be unrolled
 int not_unrolled_depth_ = 0;
 };
diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc
index c807e3210824da..69bd39f8a4c927 100644
--- a/paddle/cinn/optim/vectorize_loops.cc
+++ b/paddle/cinn/optim/vectorize_loops.cc
@@ -731,7 +731,8 @@ struct VectorizeLoops_ : public IRMutator {
 if (forloop->extent.As()) {
 var_intervals.emplace(
 loopvar_name,
- cinn::common::CasInterval{0, forloop->extent.as_int32() - 1});
+ cinn::common::CasInterval{static_cast(0),
+ forloop->extent.as_int64() - 1});
 } else {
 var_intervals.emplace(
 loopvar_name,
diff --git a/paddle/cinn/poly/dim.h b/paddle/cinn/poly/dim.h
index 5ae7ee7a897d6f..15a3999cc7a3ae 100644
--- a/paddle/cinn/poly/dim.h
+++ b/paddle/cinn/poly/dim.h
@@ -52,6 +52,10 @@ struct Dim {
 Dim(std::string id, uint32_t lower_bound, uint32_t upper_bound)
 : id(std::move(id)), lower_bound(lower_bound), upper_bound(upper_bound) {}
 
+ //! Construct a dimension with int64_t range.
+ Dim(std::string id, int64_t lower_bound, int64_t upper_bound)
+ : id(std::move(id)), lower_bound(lower_bound), upper_bound(upper_bound) {}
+
 //! Construct a dimension with expression range.
 Dim(std::string id, ir::Expr lower_bound, ir::Expr upper_bound);
 
diff --git a/paddle/cinn/poly/domain.cc b/paddle/cinn/poly/domain.cc
index c6f4479bf8bba9..08b6da5ef04472 100644
--- a/paddle/cinn/poly/domain.cc
+++ b/paddle/cinn/poly/domain.cc
@@ -61,8 +61,35 @@ std::string Domain::__str__() const {
 }
 
 isl::set Domain::to_isl() const {
+ // TODO(6clc): will be removed in future
 VLOG(3) << "isl::set " << __str__();
- isl::set x(cinn::common::Context::isl_ctx(), __str__());
+ auto replace_substr = [](std::string& s,
+ std::string const& toReplace,
+ std::string const& replaceWith) {
+ std::string buf;
+ std::size_t pos = 0;
+ std::size_t prevPos = -1;
+
+ // Reserves rough estimate of final size of string.
+ buf.reserve(s.size()); + + while (true) { + prevPos = pos; + pos = s.find(toReplace, pos); + if (pos == std::string::npos) break; + buf.append(s, prevPos, pos - prevPos); + buf += replaceWith; + pos += toReplace.size(); + } + + buf.append(s, prevPos, s.size() - prevPos); + s.swap(buf); + }; + + std::string isl_string = __str__(); + replace_substr(isl_string, "(ll)", ""); + replace_substr(isl_string, "ll", ""); + isl::set x(cinn::common::Context::isl_ctx(), isl_string); return x; } diff --git a/paddle/cinn/runtime/cuda/cuda_intrinsics.cc b/paddle/cinn/runtime/cuda/cuda_intrinsics.cc index c4f335603963be..a990192a1d1e66 100644 --- a/paddle/cinn/runtime/cuda/cuda_intrinsics.cc +++ b/paddle/cinn/runtime/cuda/cuda_intrinsics.cc @@ -440,8 +440,8 @@ CINN_REGISTER_HELPER(cinn_cuda_host_api) { .SetRetType() .AddInputType() .AddInputType() - .AddInputType() - .AddInputType() + .AddInputType() + .AddInputType() .End(); using cinn::runtime::cuda::cinn_call_cuda_kernel; diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index 98ba1c52d7edc3..a33427df4fce12 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -78,7 +78,7 @@ class CublasHandle { cublasHandle_t cuhandle; }; -int32_t cinn_get_value_in_cuda_kernel_args(void *v_args, int idx) { +int64_t cinn_get_value_in_cuda_kernel_args(void *v_args, int idx) { cinn_pod_value_t *args = static_cast(v_args); return args[idx].operator int64_t(); } @@ -2748,7 +2748,7 @@ void cinn_gpu_cudnn_pool2d(const std::vector &attrs, cudnnDestroyPoolingDescriptor(pooling_desc); } -void infer_shape_set_value(int row, int col, int32_t value, int32_t **v) { +void infer_shape_set_value(int row, int col, int64_t value, int64_t **v) { v[row][col] = value; } void cinn_gpu_cudnn_softmax(const std::vector &attrs, diff --git a/paddle/cinn/runtime/cuda/cuda_util.h b/paddle/cinn/runtime/cuda/cuda_util.h index c7d9220e00688f..3e8a93ecce4a85 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.h +++ b/paddle/cinn/runtime/cuda/cuda_util.h @@ -95,8 +95,8 @@ void cinn_call_cuda_memcpy(void* v_args, size_t count, void* stream = nullptr); -int32_t cinn_get_value_in_cuda_kernel_args(void* v_args, int idx); -void infer_shape_set_value(int row, int col, int32_t value, int32_t** v); +int64_t cinn_get_value_in_cuda_kernel_args(void* v_args, int idx); +void infer_shape_set_value(int row, int col, int64_t value, int64_t** v); /** * Call a CUDA compiled kernel. diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index d8fd3db290b331..a88221bc23e8b7 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -30,7 +30,7 @@ namespace paddle { namespace framework { typedef void (*lower_func_ptr_g)(void*, int32_t, void*); -typedef void (*infer_shape_func_ptr_g)(void*, int32_t, int32_t**); +typedef void (*infer_shape_func_ptr_g)(void*, int32_t, int64_t**); class CinnJitInstruction::FnPtrImpl { using CINNKernelInfo = cinn::hlir::framework::pir::CINNKernelInfo; @@ -81,18 +81,18 @@ class CinnJitInstruction::FnPtrImpl { } // 3. 
Define an array of Pointers to hold the output tensor shape - int32_t* output_tensor_shapes[output_tensor_size]; + std::vector output_tensor_shapes(output_tensor_size); for (int i = 0; i < output_tensor_size; ++i) { - output_tensor_shapes[i] = reinterpret_cast( + output_tensor_shapes[i] = reinterpret_cast( malloc(kernel_args[input_tensor_size + i]->dims().size() * - sizeof(int32_t*))); + sizeof(int64_t*))); } // 4. Launch infer_shape_fn_ptr to infer shape of output tensor ((infer_shape_func_ptr_g)cinn_kernel_info_.infer_shape_fn_ptr)( static_cast(func_args_.data()), func_args_.size(), - output_tensor_shapes); + output_tensor_shapes.data()); // 5. Resize shape of output tensor for (int i = 0; i < output_tensor_size; ++i) { From 0d451d2ba7a89ceb7606172aa6611c9e36e2c53d Mon Sep 17 00:00:00 2001 From: Kunbo Ding Date: Mon, 22 Jan 2024 17:15:28 +0800 Subject: [PATCH 08/34] [AutoConfig] Adapt to eb-toolkit (#60858) * [AutoConfig] Adapt to eb-toolkit * fix code style * fix codestyle * adapt to eb-toolkit --- python/paddle/distributed/auto_tuner/prune.py | 14 ++++++++----- .../launch/controllers/controller.py | 2 ++ python/paddle/distributed/launch/main.py | 20 +++++++++++-------- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/python/paddle/distributed/auto_tuner/prune.py b/python/paddle/distributed/auto_tuner/prune.py index e77248f9b35a0c..14c1ca64c0c9c0 100644 --- a/python/paddle/distributed/auto_tuner/prune.py +++ b/python/paddle/distributed/auto_tuner/prune.py @@ -16,8 +16,6 @@ import os import subprocess -from paddle.distributed.launch.main import ctx - logger = logging.getLogger('auto_tuner') _PRUNE_FUNC = [] _PRUNE_HISTORY_FUNC = [] @@ -35,9 +33,15 @@ def log_pruned_info(cur_cfg, pruned_reason): cur_cfg["use_recompute"], cur_cfg["recompute_granularity"], ) - ctx.logger.info( - f"Strategy {pruned_strategy} has been pruned that {pruned_reason}" - ) + + try: + from paddle.distributed.launch.main import ctx + + ctx.logger.info( + f"Strategy {pruned_strategy} has been pruned that {pruned_reason}" + ) + except: + pass logger.info( f"Strategy {pruned_strategy} has been pruned that {pruned_reason}" ) diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 8b812776d0a969..76d21a40eab066 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -227,6 +227,8 @@ def _get_entrypoint(self): "-u", self.ctx.args.training_script, ] + elif self.ctx.args.training_script.endswith('.pyxes'): + entrypoint = [sys.executable, self.ctx.args.training_script] else: entrypoint = [self.ctx.args.training_script] diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index eb9a1064f1bebf..c5a9fe6c3aa602 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .context import Context
+from paddle.distributed.launch.context import Context
 
 ctx = None
 
@@ -303,9 +303,9 @@ def launch():
 import sys
 import time
 
- from ..auto_tuner.recorder import HistoryRecorder
- from ..auto_tuner.tuner import AutoTuner
- from ..auto_tuner.utils import (
+ from paddle.distributed.auto_tuner.recorder import HistoryRecorder
+ from paddle.distributed.auto_tuner.tuner import AutoTuner
+ from paddle.distributed.auto_tuner.utils import (
 add_overlap_performance,
 find_error_from_log,
 gen_new_args,
@@ -313,7 +313,7 @@ def launch():
 read_log,
 read_step_time_log,
 )
- from . import controllers
+ from paddle.distributed.launch import controllers
 
 start_time = time.time()
 # read user defined tuner config json
@@ -354,6 +354,8 @@ def launch():
 ]
 else:
 entrypoint = [sys.executable, "-u", ctx.args.training_script]
+ elif ctx.args.training_script.endswith('.pyxes'):
+ entrypoint = [sys.executable, ctx.args.training_script]
 else:
 entrypoint = [ctx.args.training_script]
 entrypoint.extend(ctx.args.training_script_args)
@@ -383,7 +385,7 @@ def launch():
 sorted_ips = []
 ip = None
 if nnodes > 1:
- from .utils.etcd_client import ETCDClient
+ from paddle.distributed.launch.utils.etcd_client import ETCDClient
 
 assert "etcd://" in ctx.args.master
 master_ip, port = ctx.args.master.strip("etcd://").split(':')
@@ -962,7 +964,9 @@ def launch():
 # if need accurate peak memory
 if os.environ.get("FLAGS_log_memory_stats", False):
 max_peak_memory = None
- from ..auto_tuner.utils import read_allocated_memory_log
+ from paddle.distributed.auto_tuner.utils import (
+ read_allocated_memory_log,
+ )
 
 for root, dirs, files in os.walk(ctx.args.log_dir):
 for file in files:
@@ -1258,7 +1262,7 @@ def launch():
 
 c.finalize(exit=True)
 else:
- from . import controllers
+ from paddle.distributed.launch import controllers
 
 # initialize the selected controller
 c = controllers.init(ctx)

From 483d377c0182098ed9030bb731c82f953a3722f3 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Mon, 22 Jan 2024 18:17:01 +0800
Subject: [PATCH 09/34]
 =?UTF-8?q?=E3=80=90pir=E3=80=91modify=20test=5Fcase?=
 =?UTF-8?q?.py=20=20(#60976)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* modify test_case

* Update test/legacy_test/test_dynamic_rnn_stop_gradient.py

* Update test/legacy_test/test_case.py

---
 python/paddle/autograd/ir_backward.py | 27 ++++--
 test/legacy_test/test_case.py | 96 ++++++++++---------
 .../test_dynamic_rnn_stop_gradient.py | 3 +-
 3 files changed, 71 insertions(+), 55 deletions(-)

diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py
index 84699f26f2e7f2..4d0ad1e6479d5e 100644
--- a/python/paddle/autograd/ir_backward.py
+++ b/python/paddle/autograd/ir_backward.py
@@ -1058,6 +1058,21 @@ def all_stop_gradient_true(block):
 return True
 
 
+def update_total_ops(block):
+ '''
+ when block is a sub_block, forward ops should include its parent block's ops
+ (nested sub-blocks are added on demand to avoid block copies)
+ '''
+ total_ops = []
+ if block.parent_block is not None:
+ if block.parent_block.parent_block:
+ total_ops += block.parent_block.parent_block.ops
+ total_ops += block.parent_block.ops
+ total_ops += block.ops
+
+ return total_ops
+
+
 def calc_gradient_helper(outputs, inputs, grad_outputs, no_grad_set):
 block = outputs[0].get_defining_op().get_parent_block()
 state = State(block)
@@ -1067,16 +1082,14 @@ def calc_gradient_helper(outputs, inputs, grad_outputs, no_grad_set):
 )
 return
state.value_to_valuegrad - total_ops = [] - if block.parent_block is not None: - total_ops += block.parent_block.ops - total_ops += block.ops + total_ops = update_total_ops(block) # update no_grad_set if some value stop_gradient=True update_no_grad_set_by_stopgradient(block, no_grad_set) - complete_outputs, backward_ops = prepare_grad_outputs( - grad_outputs, outputs, state - ) + with block: + complete_outputs, backward_ops = prepare_grad_outputs( + grad_outputs, outputs, state + ) inputs_set = ValueSet(inputs) stop_gradient_false_outputs = [] diff --git a/test/legacy_test/test_case.py b/test/legacy_test/test_case.py index 0f2c65865f6905..770044c265396e 100644 --- a/test/legacy_test/test_case.py +++ b/test/legacy_test/test_case.py @@ -415,7 +415,6 @@ def fn_3(): out_1 = paddle.static.nn.control_flow.case( pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3 ) - out_2 = paddle.static.nn.control_flow.case( pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)], default=fn_3 ) @@ -612,58 +611,61 @@ def type_error_default(): # when optimizer in case class TestMutiTask(unittest.TestCase): + @test_with_pir_api def test_optimizer_in_case(self): BATCH_SIZE = 1 INPUT_SIZE = 784 EPOCH_NUM = 2 - - x = paddle.static.data( - name='x', shape=[BATCH_SIZE, INPUT_SIZE], dtype='float32' - ) - y = paddle.static.data( - name='y', shape=[BATCH_SIZE, INPUT_SIZE], dtype='float32' - ) - - switch_id = paddle.static.data( - name='switch_id', shape=[1], dtype='int32' - ) - - one = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=1) - adam = paddle.optimizer.Adam(learning_rate=0.001) - adagrad = paddle.optimizer.Adagrad(learning_rate=0.001) - - def fn_1(): - sum = paddle.multiply(x, y) - loss = paddle.mean(sum, name="f_1_loss") - adam.minimize(loss) - - def fn_2(): - sum = paddle.multiply(x, y) - loss = paddle.mean(sum, name="f_2_loss") - adagrad.minimize(loss) - - paddle.static.nn.control_flow.case( - pred_fn_pairs=[(switch_id == one, fn_1)], default=fn_2 - ) - - exe = base.Executor(base.CPUPlace()) - exe.run(base.default_startup_program()) - - for epoch in range(EPOCH_NUM): - np.random.seed(epoch) - feed_image = np.random.random(size=[BATCH_SIZE, INPUT_SIZE]).astype( - 'float32' + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data( + name='x', shape=[BATCH_SIZE, INPUT_SIZE], dtype='float32' ) - main_program = base.default_main_program() - out = exe.run( - main_program, - feed={ - 'x': feed_image, - 'y': feed_image, - 'switch_id': np.array([epoch]).astype('int32'), - }, - fetch_list=[], + y = paddle.static.data( + name='y', shape=[BATCH_SIZE, INPUT_SIZE], dtype='float32' ) + x.stop_gradient = False + y.stop_gradient = False + switch_id = paddle.static.data( + name='switch_id', shape=[1], dtype='int32' + ) + + one = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=1) + adam = paddle.optimizer.Adam(learning_rate=0.001) + adagrad = paddle.optimizer.Adagrad(learning_rate=0.001) + + def fn_1(): + sum = paddle.multiply(x, y) + loss = paddle.mean(sum, name="f_1_loss") + adam.minimize(loss) + + def fn_2(): + sum = paddle.multiply(x, y) + loss = paddle.mean(sum, name="f_2_loss") + adagrad.minimize(loss) + + paddle.static.nn.control_flow.case( + pred_fn_pairs=[(switch_id == one, fn_1)], default=fn_2 + ) + + exe = base.Executor(base.CPUPlace()) + exe.run(startup_program) + + for epoch in range(EPOCH_NUM): + np.random.seed(epoch) + feed_image = np.random.random( + size=[BATCH_SIZE, 
INPUT_SIZE] + ).astype('float32') + out = exe.run( + main_program, + feed={ + 'x': feed_image, + 'y': feed_image, + 'switch_id': np.array([epoch]).astype('int32'), + }, + fetch_list=[], + ) if __name__ == '__main__': diff --git a/test/legacy_test/test_dynamic_rnn_stop_gradient.py b/test/legacy_test/test_dynamic_rnn_stop_gradient.py index 7d28931887d7c3..c6d85b864c8d7c 100644 --- a/test/legacy_test/test_dynamic_rnn_stop_gradient.py +++ b/test/legacy_test/test_dynamic_rnn_stop_gradient.py @@ -18,6 +18,7 @@ import paddle from paddle import base +from paddle.pir_utils import test_with_pir_api from paddle.tensor.manipulation import tensor_array_to_tensor paddle.enable_static() @@ -77,7 +78,7 @@ def setUp(self): self.batch_size = 2 self.beam_size = 2 - # @test_with_pir_api + @test_with_pir_api def run_main(self, place): with paddle.pir_utils.IrGuard(): main_program = paddle.static.Program() From ae9706f3f966389174e19cf8c232b93422f4651f Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 22 Jan 2024 18:45:02 +0800 Subject: [PATCH 10/34] refine onednn ctx inputs name, outputs name (#61011) --- .../instruction/onednn/onednn_instruction.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc index 659164b1ec77cb..eedda2350e425b 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc @@ -100,6 +100,7 @@ void TensorNameMap(pir::Operation* op, inputs_tensor_name_map, // NOLINT std::map>& outputs_tensor_name_map) { // NOLINT + static int unique_id = 0; const Scope* inner_scope = value_exec_info.GetScope(); VLOG(6) << "TensorNameMap in scope[" << inner_scope << "]"; @@ -132,14 +133,16 @@ void TensorNameMap(pir::Operation* op, auto type = ptr.type(); if (type.isa() || type.isa()) { - inputs_tensor_name_map[legacy_arg_name] = {in_var_name}; + inputs_tensor_name_map[legacy_arg_name] = {in_var_name + + std::to_string(unique_id++)}; } else if (type.isa()) { auto var = inner_scope->FindVar(in_var_name); auto var_ref = var->Get(); std::vector vec_tmp; vec_tmp.reserve(var_ref.size()); for (size_t k = 0; k < var_ref.size(); ++k) { - vec_tmp.push_back(value_exec_info.GetVarName(var_ref[k])); + vec_tmp.push_back(value_exec_info.GetVarName(var_ref[k]) + + std::to_string(unique_id++)); } inputs_tensor_name_map[legacy_arg_name] = vec_tmp; } else { @@ -168,14 +171,16 @@ void TensorNameMap(pir::Operation* op, auto type = ptr.type(); if (type.isa() || type.isa()) { - outputs_tensor_name_map[legacy_arg_name] = {out_var_name}; + outputs_tensor_name_map[legacy_arg_name] = {out_var_name + + std::to_string(unique_id++)}; } else if (type.isa()) { auto var = inner_scope->FindVar(out_var_name); auto var_ref = var->Get(); std::vector vec_tmp; vec_tmp.reserve(var_ref.size()); for (size_t k = 0; k < var_ref.size(); ++k) { - vec_tmp.push_back(value_exec_info.GetVarName(var_ref[k])); + vec_tmp.push_back(value_exec_info.GetVarName(var_ref[k]) + + std::to_string(unique_id++)); } outputs_tensor_name_map[legacy_arg_name] = vec_tmp; } else { From c47428cdd16f21990d14d118b9d360eb0c47428a Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Mon, 22 Jan 2024 19:47:34 +0800 Subject: [PATCH 11/34] [Dynamic Shape] Cinn backend rms norm (#60997) * [pir]Adding FusionOp and CinnFusionLoweringPass * fix * fix * ConvertStaticDimToDynamicPass * minor fix * 
minor fix * Bug Fix * address pr comments * Compile success * rms_norm smoking test for cinn-backend * refine infer_symbolic_shape_pass * Add unittest * Fix cmake property --------- Co-authored-by: zhangyuqin1998 <2368719370@qq.com> --- .../hlir/framework/pir/op_lowering_impl.cc | 42 ++++-- paddle/cinn/hlir/op/broadcast.cc | 2 + paddle/cinn/hlir/op/elementwise.cc | 126 ++++++++++++++++++ paddle/fluid/pybind/pir.cc | 7 +- .../jit/dy2static/pir_partial_program.py | 4 +- test/ir/pir/cinn/CMakeLists.txt | 13 +- test/ir/pir/cinn/test_rms_norm.py | 67 ++++++++++ 7 files changed, 244 insertions(+), 17 deletions(-) create mode 100644 test/ir/pir/cinn/test_rms_norm.py diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 44f78f062874f0..b796ccca95b241 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -550,15 +550,20 @@ std::vector OpLowererImpl::LowerOps( std::vector out_types; std::vector> out_shapes; CollectOutputInfo(op, &out_types, &out_shapes, group); + CHECK_EQ(out_types.size(), out_shapes.size()); VLOG(4) << "out_types.size(): " << out_types.size(); NodeAttr node_attrs = details::CollectAttrs(*op); - auto& strategy = + auto& strategy_map = Operator::GetAttrs("CINNStrategySymbolic"); - op_impl = OpStrategy::SelectImpl(strategy[cinn_op](node_attrs, - op_func_arg_tensors, - out_types, - out_shapes, - this->target_)); + StrategyFunctionSymbolic strategy = strategy_map[cinn_op]; + CHECK(static_cast(strategy)) + << " cinn_op_name: " << cinn_op_name + << "has no CINNStrategySymbolic registered."; + op_impl = OpStrategy::SelectImpl(strategy(node_attrs, + op_func_arg_tensors, + out_types, + out_shapes, + this->target_)); } else { std::vector out_types; std::vector> out_shapes; @@ -797,14 +802,25 @@ void OpLowererImpl::CollectOutputInfo( out_value.type().dyn_cast(); out_types->push_back(CompatibleInfo::ConvertIRType(type_info.dtype())); - if (!group->value_to_shape_or_data_exprs.empty()) { - auto sym_vec = group->GetShapeOrDataExprs(out_value).shape(); - std::vector sym_shape; - for (auto& sym : sym_vec) { - sym_shape.emplace_back(output_id, sym); + + auto ForEachDimExpr = [&](const auto& DoEach) { + if (!group->value_to_shape_or_data_exprs.empty()) { + auto sym_vec = group->GetShapeOrDataExprs(out_value).shape(); + std::vector sym_shape; + for (const auto& sym : sym_vec) { + DoEach(sym); + } + } else { + auto out_shape = ::common::vectorize(type_info.dims()); + for (int64_t dim : out_shape) { + DoEach(symbol::DimExpr{dim}); + } } - out_shapes->push_back(std::move(sym_shape)); - } + }; + std::vector sym_shape; + ForEachDimExpr( + [&](const auto& sym) { sym_shape.emplace_back(output_id, sym); }); + out_shapes->emplace_back(std::move(sym_shape)); } } diff --git a/paddle/cinn/hlir/op/broadcast.cc b/paddle/cinn/hlir/op/broadcast.cc index 1a1bbe81de914f..bf71267b2c6184 100644 --- a/paddle/cinn/hlir/op/broadcast.cc +++ b/paddle/cinn/hlir/op/broadcast.cc @@ -301,12 +301,14 @@ std::shared_ptr StrategyForBroadcastToSymbolic( const std::vector &out_type, const std::vector> &output_shapes, const Target &target) { + CHECK_EQ(output_shapes.size(), 1); std::vector out_shape(output_shapes[0].size()); std::transform(output_shapes[0].begin(), output_shapes[0].end(), out_shape.begin(), [](const ir::Dim &dim) { return dim->dim_expr; }); std::vector broadcast_axes; + CHECK_GT(attrs.attr_store.count("broadcast_axes"), 0); broadcast_axes = 
absl::get>(attrs.attr_store.at("broadcast_axes")); VLOG(3) << "broadcast out shape: " << utils::Join(out_shape, ", "); diff --git a/paddle/cinn/hlir/op/elementwise.cc b/paddle/cinn/hlir/op/elementwise.cc index 0a81e42700d401..b215e0dd859521 100644 --- a/paddle/cinn/hlir/op/elementwise.cc +++ b/paddle/cinn/hlir/op/elementwise.cc @@ -251,6 +251,75 @@ std::shared_ptr StrategyForScale( return strategy; } +std::shared_ptr StrategyForScaleSymbolic( + const framework::NodeAttr &attrs, + const std::vector &inputs, + const std::vector &out_type, + const std::vector> &output_shapes, + const Target &target) { + float scale = 1.f; + float bias = 0.f; + bool bias_after_scale = true; + for (auto &iter : attrs.attr_store) { + if (iter.first == "scale") { + scale = absl::get(iter.second); + } else if (iter.first == "bias") { + bias = absl::get(iter.second); + } else if (iter.first == "bias_after_scale") { + bias_after_scale = absl::get(iter.second); + } + } + framework::CINNCompute scale_compute( + [=](lang::Args args, lang::RetValue *ret) { + CHECK(!args.empty()) + << "The input arguments of scale compute is empty! Please check."; + CINNValuePack pack_args = args[0]; + CHECK(!pack_args.empty()) + << "The input tensors of scale compute is empty! Please check."; + Expr A_expr = pack_args[0]; + CHECK(A_expr.as_tensor()); + ir::Tensor A = A_expr.as_tensor_ref(); + ir::Tensor out; + CHECK_EQ(pack_args.size(), 2); + CHECK(pack_args[1].is_string()); + std::string tensor_name = pack_args[1].operator std::string(); + + // Paddle upscale float16 or bfloat16 compute to float32, + // we made CINN consistent with this behavior of Paddle + bool should_upscale_fp32 = A->type() == cinn::common::F16() || + A->type() == cinn::common::BF16(); + + out = Compute( + A->shape, + [=](const std::vector &indice) { + Expr cast_scale = should_upscale_fp32 + ? Expr(scale) + : ir::Cast::Make(A->type(), Expr(scale)); + Expr cast_bias = should_upscale_fp32 + ? Expr(bias) + : ir::Cast::Make(A->type(), Expr(bias)); + Expr cast_A_indice = + should_upscale_fp32 + ? ir::Cast::Make(cinn::common::F32(), A(indice)) + : A(indice); + Expr add_result = bias_after_scale + ? cast_scale * cast_A_indice + cast_bias + : cast_scale * (cast_A_indice + cast_bias); + return should_upscale_fp32 ? ir::Cast::Make(A->type(), add_result) + : add_result; + }, + tensor_name); + + auto stages = CreateStages({out}); + *ret = CINNValuePack{{CINNValue(Expr(out.get())), CINNValue(stages)}}; + }); + + auto strategy = std::make_shared(); + strategy->AddImpl(scale_compute, lang::PackedFunc(), "strategy.scale.x86", 1); + + return strategy; +} + Expr GetScalarExpr(const framework::NodeAttr::attr_t &attr) { Expr scalar; struct Visitor { @@ -450,6 +519,58 @@ std::shared_ptr StrategyForFillConstant( return strategy; } +std::shared_ptr StrategyForFillConstantSymbolic( + const framework::NodeAttr &attrs, + const std::vector &inputs, + const std::vector &out_type, + const std::vector> &output_shapes, + const Target &target) { + framework::CINNCompute fill_constant_compute( + [=](lang::Args args, lang::RetValue *ret) { + CHECK(!args.empty()) << "The input argument of fill_constant compute " + "is empty! 
Please check."; + bool force_cpu = false; + CHECK(attrs.attr_store.count("shape")); + auto shape = absl::get>(attrs.attr_store.at("shape")); + CHECK(attrs.attr_store.count("value")); + auto value = GetScalarExpr(attrs.attr_store.at("value")); + CHECK(attrs.attr_store.count("force_cpu")); + force_cpu = absl::get(attrs.attr_store.at("force_cpu")); + + if (force_cpu && target != cinn::common::DefaultHostTarget()) { + LOG(WARNING) << "The attribute \"force_cpu\" of \"fill_constant\" " + "not supported in CINN! The \"fill_constant\"'s " + "output tensor will placed on " + << target; + } + + CINNValuePack arg_pack = args[0]; + CHECK_EQ(arg_pack.size(), 1U); + CHECK(arg_pack[0].is_string()); + std::string tensor_name = arg_pack[0].operator std::string(); + CHECK(!shape.empty()) << "shape attr is empty!"; + auto shape_exprs = ToCinnExprs(shape); + auto out = lang::Compute( + shape_exprs, + [=](const std::vector &indice) { + return ir::Cast::Make(out_type[0], value); + }, + tensor_name); + CHECK(out.defined()) + << "can't create fill_constant with the given type " << out_type[0]; + auto stages = CreateStages({out}); + *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; + }); + + auto strategy = std::make_shared(); + strategy->AddImpl(fill_constant_compute, + lang::PackedFunc(), + "strategy.fill_constant.x86", + 1); + + return strategy; +} + std::vector InferShapeForFillConstant( const std::vector &inputs_shape, const framework::AttrMapType &attrs) { @@ -1178,6 +1299,8 @@ CINN_REGISTER_HELPER(elementwise_ops) { .set_num_outputs(1) .set_attr( "CINNStrategy", cinn::hlir::op::StrategyForScale) + .set_attr( + "CINNStrategySymbolic", cinn::hlir::op::StrategyForScaleSymbolic) .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForElementwise)) .set_attr("inferdtype", @@ -1226,6 +1349,9 @@ CINN_REGISTER_HELPER(elementwise_ops) { .set_num_outputs(1) .set_attr( "CINNStrategy", cinn::hlir::op::StrategyForFillConstant) + .set_attr( + "CINNStrategySymbolic", + cinn::hlir::op::StrategyForFillConstantSymbolic) .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForFillConstant)) .set_attr("inferdtype", diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 56ba857b3b775c..b2a687829498ca 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -1590,10 +1590,13 @@ void AddCinnPass(std::shared_ptr &pass_manager, // NOLINT } void InferSymbolicShapePass( - std::shared_ptr &pass_manager) { // NOLINT + std::shared_ptr &pass_manager, // NOLINT + Program &program) { // NOLINT pir::IrContext *ctx = pir::IrContext::Instance(); ctx->GetOrRegisterDialect(); - pass_manager->AddPass(pir::CreateShapeOptimizationPass()); + if (HasDynamicShape(program)) { + pass_manager->AddPass(pir::CreateShapeOptimizationPass()); + } } void BindIrPass(pybind11::module *m) { diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py index 9ed34732b59a8f..3d2bdb149a54d1 100644 --- a/python/paddle/jit/dy2static/pir_partial_program.py +++ b/python/paddle/jit/dy2static/pir_partial_program.py @@ -551,7 +551,9 @@ def _create_program(self, is_infer_mode=False): def pass_fn(forward_program, backward_program): pm = paddle.base.libpaddle.pir.PassManager() - paddle.base.libpaddle.pir.infer_symbolic_shape_pass(pm) + paddle.base.libpaddle.pir.infer_symbolic_shape_pass( + pm, forward_program + ) if self._build_strategy.build_cinn_pass: paddle.base.libpaddle.pir.add_cinn_pass(pm, forward_program) pm.run(forward_program) diff --git 
a/test/ir/pir/cinn/CMakeLists.txt b/test/ir/pir/cinn/CMakeLists.txt index fd774eb157095b..7c351fb3505b49 100644 --- a/test/ir/pir/cinn/CMakeLists.txt +++ b/test/ir/pir/cinn/CMakeLists.txt @@ -8,7 +8,7 @@ if(WITH_GPU) "test_*.py") string(REPLACE ".py" "" CINN_PIR_TEST "${CINN_PIR_TEST}") - list(REMOVE_ITEM CINN_PIR_TEST test_subgraph_checker) + list(REMOVE_ITEM CINN_PIR_TEST test_subgraph_checker test_rms_norm) foreach(cinn_pir_test_name ${CINN_PIR_TEST}) add_test( @@ -33,4 +33,15 @@ if(WITH_GPU) WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(test_subgraph_checker PROPERTIES LABELS "RUN_TYPE=CINN") + add_test( + NAME test_rms_norm + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} + FLAGS_cinn_convert_static_dim_to_dynamic=2048:S0 + FLAGS_cinn_bucket_compile=True FLAGS_enable_pir_api=1 ${PYTHON_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/test_rms_norm.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + set_tests_properties(test_rms_norm PROPERTIES LABELS "RUN_TYPE=CINN") + endif() diff --git a/test/ir/pir/cinn/test_rms_norm.py b/test/ir/pir/cinn/test_rms_norm.py new file mode 100644 index 00000000000000..02fb8e485943ff --- /dev/null +++ b/test/ir/pir/cinn/test_rms_norm.py @@ -0,0 +1,67 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest + +import numpy as np +from test_cinn_sub_graph import TestCinnSubGraphBase, apply_to_static + +import paddle +from paddle import nn + + +class LlamaRMSNorm(nn.Layer): + def __init__(self): + super().__init__() + self.hidden_size = 768 + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = 1e-6 + + def forward(self, hidden_states): + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = ( + paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + ) + return hidden_states * self.weight + + +class TestLlamaRMSNorm(TestCinnSubGraphBase): + def prepare_data(self): + self.shape = [1, 2048, 768] + self.hidden_states = paddle.randn(self.shape, dtype="float32") + self.hidden_states.stop_gradient = False + + def eval(self, use_cinn): + paddle.seed(2022) + net = LlamaRMSNorm() + # TODO(Aurelius84): Need to remove it after verify CINN + if use_cinn: + net = apply_to_static(net, use_cinn) + net.eval() + out = net(self.hidden_states) + return out + + def test_eval(self): + cinn_out = self.eval(use_cinn=True) + dy_out = self.eval(use_cinn=False) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() From 4db394f5a530e9f1a324ca272fcbe4c442e5a747 Mon Sep 17 00:00:00 2001 From: Chen Zhiyang <1792266893@qq.com> Date: Mon, 22 Jan 2024 23:23:29 +0800 Subject: [PATCH 12/34] [PIR]Fix array_write stop_gradient (#60970) * fix array_write stop_gradient * fix stop_gradient bug * set array_write sg to true --- paddle/fluid/pir/dialect/operator/ir/manual_op.cc | 9 ++++++++- test/legacy_test/test_assign_op.py | 2 -- test/legacy_test/test_dynamic_rnn_stop_gradient.py | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index 497282e48909d2..9e387d4ba85be5 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -2179,7 +2179,14 @@ void ArrayWrite_Op::Build(pir::Builder &builder, ArrayWrite_Op::InferMeta(argument_inputs, argument_attributes); argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); - ::pir::PassStopGradientsDefaultly(argument); + constexpr char kStopGradientAttrName[] = "stop_gradient"; + auto stop_gradient0 = + argument.inputs[0].attribute(kStopGradientAttrName); + auto stop_gradient1 = + argument.inputs[1].attribute(kStopGradientAttrName); + auto stop_gradient = stop_gradient0.data() && stop_gradient1.data(); + argument.inputs[0].set_attribute(kStopGradientAttrName, + builder.bool_attr(stop_gradient)); } void ArrayWrite_Op::VerifySig() { diff --git a/test/legacy_test/test_assign_op.py b/test/legacy_test/test_assign_op.py index b4ccbf56a72d44..5a2ff191a712f7 100644 --- a/test/legacy_test/test_assign_op.py +++ b/test/legacy_test/test_assign_op.py @@ -129,8 +129,6 @@ def test_assign_tensor_array(self): z = paddle.add(x=x, y=y) i = paddle.tensor.fill_constant(shape=[1], dtype='int64', value=0) init_array = paddle.tensor.array_write(x=z, i=i) - # TODO(xiaoguoguo626807): Remove this stop_gradient=False. 
- init_array.stop_gradient = False array = paddle.assign(init_array) sums = paddle.tensor.array_read(array=init_array, i=i) mean = paddle.mean(sums) diff --git a/test/legacy_test/test_dynamic_rnn_stop_gradient.py b/test/legacy_test/test_dynamic_rnn_stop_gradient.py index c6d85b864c8d7c..8ffaca6b456ecd 100644 --- a/test/legacy_test/test_dynamic_rnn_stop_gradient.py +++ b/test/legacy_test/test_dynamic_rnn_stop_gradient.py @@ -59,6 +59,7 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): paddle.tensor.array_write(score, i=step_idx, array=scores) length_cond = paddle.less_than(x=step_idx, y=max_len) paddle.assign(length_cond, cond) + scores.stop_gradient = True out = tensor_array_to_tensor(scores, axis=0, use_stack=True)[0] loss = paddle.mean(out) opt = paddle.optimizer.Adam(0.01) From f522f044ad6a431b7e9c7985a61cae2091687bef Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Tue, 23 Jan 2024 10:05:11 +0800 Subject: [PATCH 13/34] [PIR][DynamixShape] Bug fix (#61022) fix bugs in StackOpInferSymbolicShape --- .../dialect/operator/interface/infer_symbolic_shape.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape.cc index bbb1ec69267ae8..ba8c30be9f9e22 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape.cc @@ -260,17 +260,17 @@ bool StackOpInferSymbolicShape(pir::Operation *op, shape_analysis->GetShapeOrDataForValue(operand_source); std::vector out_dims; + std::vector out_dims_data; if (operand_shape_or_data.data().has_value()) { - out_dims = operand_shape_or_data.data().value(); + out_dims_data = operand_shape_or_data.data().value(); + out_dims.emplace_back( + static_cast(operand_shape_or_data.shape().size())); } // else : pir::VectorType x = // operand_source.type().dyn_cast(); // TODO(zhangbopd): else branch is not implemented yet. 
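  // Illustrative note, assuming a shape-carrying operand whose data is, say,
  // {S0, S1, S2}: the removed lines below put the operand's data values into
  // the constructor's shape slot and then handed the operand's shape to
  // SetData(), so the two components of the result ended up swapped. The
  // replacement fills the two-argument form
  // symbol::ShapeOrDataDimExprs{out_dims, out_dims_data} directly, keeping
  // the extent in out_dims and the symbolic values in out_dims_data.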
- symbol::ShapeOrDataDimExprs shape_data{out_dims}; - if (operand_shape_or_data.data().has_value()) { - shape_data.SetData(operand_shape_or_data.shape()); - } + symbol::ShapeOrDataDimExprs shape_data{out_dims, out_dims_data}; op->set_attribute( "symbolic_shape", From 3bad17b8ee21239a487ace48e2b17ff430b5b46d Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Tue, 23 Jan 2024 10:28:50 +0800 Subject: [PATCH 14/34] [CINN] fix reduction schedule (#60743) * [Dynamic Shape] Provide Reduce Demo * tmp reduce compute * [CINN] fix dynamic shape reduce bug * fix reduction schedule * fix compile error * fix symbolic SimpleComputeAt * fix if stmt in FactorizeReduction * fix sum op infer symbolic shape * [CINN] Add is_symbolic_constant in ir::Var * remove useless code * fix tmp buffer compare * [InferSymbolicShape] Provide SumOpInferSymbolicShape * Use const * Solve warning * Rerun ShapeOptimizationPass after PdOpToCinnOpPass * fix arrange storage bug * fix dynamic alloc local buffer * fix buffer free of shared memory * remove tmp code reduce_run.sh * fix unittests --------- Co-authored-by: jiahongyu Co-authored-by: 6clc --- paddle/cinn/backends/codegen_cuda_dev.cc | 73 ++++++++++++++-- paddle/cinn/backends/codegen_cuda_dev.h | 2 + paddle/cinn/common/dim_expr_converter.cc | 6 +- paddle/cinn/common/dim_expr_converter_test.cc | 25 +++++- paddle/cinn/ir/buffer.cc | 9 ++ paddle/cinn/ir/buffer.h | 2 + paddle/cinn/ir/dim.cc | 1 - .../tactic/arrange_storage_tactic.cc | 87 +++++++++++-------- .../tactic/optimize_reduction_tactic.cc | 22 +++-- .../ir/group_schedule/tactic/tile_tactic.cc | 5 +- paddle/cinn/ir/ir.cc | 4 +- paddle/cinn/ir/ir.h | 4 +- paddle/cinn/ir/schedule/factorize_reduction.h | 20 +++++ .../cinn/ir/schedule/impl/compute_location.cc | 17 +++- paddle/cinn/ir/utils/ir_copy.cc | 1 + paddle/cinn/lang/lower.cc | 21 +++-- paddle/cinn/pybind/ir/ir_api.cc | 10 ++- 17 files changed, 235 insertions(+), 74 deletions(-) diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index 4e7fa79d2d0b30..d9af9c5ac56cc5 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -114,6 +114,31 @@ std::vector CodeGenCUDA_Dev::GenerateBufferAliasExprs( return buffer_alias; } +std::vector FilterDeallocTempBuffers(const std::vector &frees) { + std::vector filtered; + for (const Expr &free : frees) { + const ir::Free *op = free.As(); + CHECK_NOTNULL(op); + bool has_symbolic_constant = false; + const ir::_Buffer_ *buffer = op->destination.As(); + for (Expr shape : buffer->shape) { + ir::ir_utils::CollectIRNodes(shape, [&](const Expr *x) { + if (x->as_var()) { + CHECK(x->as_var()->is_symbolic_constant) + << "var in buffer shape must be symbolic constant."; + has_symbolic_constant = true; + } + return false; + }); + } + if (has_symbolic_constant && + buffer->memory_type == ir::MemoryType::GPULocal) { + filtered.emplace_back(free); + } + } + return filtered; +} + void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) { // clear names valid within scope when enter a new function vectorized_tensor_names_.clear(); @@ -129,6 +154,8 @@ void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) { auto alloca_temp_buffers = op->PrepareAllocTempBufferExprs(); auto temp_buffer_alias = GenerateBufferAliasExprs(op, op->temp_bufs); auto alis_var_exprs = op->CudaAliasVarExprs(); + auto dealloc_temp_buffers = + FilterDeallocTempBuffers(op->PrepareDeallocTempBufferExprs()); #define APPEND_TO_NEW_BODY(field__) \ 
new_body.insert(std::end(new_body), std::begin(field__), std::end(field__)); @@ -137,6 +164,7 @@ void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) { APPEND_TO_NEW_BODY(alis_var_exprs) new_body.push_back(op->body); + APPEND_TO_NEW_BODY(dealloc_temp_buffers); Expr func_body = ir::Block::Make(new_body); @@ -148,6 +176,12 @@ void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) { IrPrinter::Visit(func_body); } +void CodeGenCUDA_Dev::Visit(const ir::Free *op) { + str_ += "delete [] "; + str_ += op->destination.As()->name; + str_ += ";\n"; +} + void CodeGenCUDA_Dev::Visit(const ir::_Var_ *op) { if (utils::Startswith(op->name, "threadIdx") || utils::Startswith(op->name, "blockIdx")) { @@ -258,6 +292,22 @@ void CodeGenCUDA_Dev::PrintIncludes() { str_ += GetSourceHeader(); } void CodeGenCUDA_Dev::PrintTempBufferCreation(const ir::Buffer &buffer) { CHECK_NE(buffer->type(), Void()); + // Calculate buffer size and determine if it contains a symbolic constant + Expr buffer_size(1); + for (int i = 0; i < buffer->shape.size(); i++) { + buffer_size = buffer_size * buffer->shape[i]; + } + optim::Simplify(&buffer_size); + bool has_symbolic_constant = false; + ir::ir_utils::CollectIRNodes(buffer_size, [&](const Expr *x) { + if (x->as_var()) { + CHECK(x->as_var()->is_symbolic_constant) + << "var in buffer size must be symbolic constant."; + has_symbolic_constant = true; + } + return false; + }); + // print func of static allocation auto print_gpu_memory = [&](const std::string &mark) { str_ += mark; str_ += GetTypeRepr(buffer->dtype); @@ -266,21 +316,32 @@ void CodeGenCUDA_Dev::PrintTempBufferCreation(const ir::Buffer &buffer) { str_ += " "; str_ += "[ "; - Expr buffer_size(1); - for (int i = 0; i < buffer->shape.size(); i++) { - buffer_size = buffer_size * buffer->shape[i]; - } - optim::Simplify(&buffer_size); IrPrinter::Visit(buffer_size); str_ += " ]"; }; + // print func of dynamic allocation + auto print_gpu_local_memory_dynamic_allocation = [&]() { + str_ += GetTypeRepr(buffer->dtype); + str_ += " *"; + str_ += buffer->name; + str_ += " = new "; + str_ += GetTypeRepr(buffer->dtype); + str_ += "[ "; + IrPrinter::Visit(buffer_size); + str_ += " ]"; + }; + // print switch (buffer->memory_type) { case ir::MemoryType::GPUShared: print_gpu_memory("__shared__ "); break; case ir::MemoryType::GPULocal: - print_gpu_memory(""); + if (has_symbolic_constant) { + print_gpu_local_memory_dynamic_allocation(); + } else { + print_gpu_memory(""); + } break; default: diff --git a/paddle/cinn/backends/codegen_cuda_dev.h b/paddle/cinn/backends/codegen_cuda_dev.h index cc32edcaa820e6..3f83318b3c65f0 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.h +++ b/paddle/cinn/backends/codegen_cuda_dev.h @@ -73,6 +73,7 @@ class CodeGenCUDA_Dev : public CodeGenC { protected: void Visit(const ir::_Var_* op) override; void Visit(const ir::_LoweredFunc_* op) override; + void Visit(const ir::Free* op) override; void Visit(const ir::Min* op) override; void Visit(const ir::Max* op) override; void Visit(const ir::Alloc* op) override; @@ -113,6 +114,7 @@ class CodeGenCUDA_Dev : public CodeGenC { // prefix std::unordered_set vectorized_tensor_names_; static const std::string source_header_; + std::vector dynamic_alloc_buffers_; }; } // namespace backends diff --git a/paddle/cinn/common/dim_expr_converter.cc b/paddle/cinn/common/dim_expr_converter.cc index e17b961689b295..c4e6ac64194ebe 100644 --- a/paddle/cinn/common/dim_expr_converter.cc +++ b/paddle/cinn/common/dim_expr_converter.cc @@ -28,7 +28,11 @@ struct DimExprToIrExprVisitor { 
ir::Expr operator()(const int64_t& dim) { return ir::Expr(dim); } ir::Expr operator()(const std::string& dim_expr) { - Var x = ir::_Var_::Make(dim_expr, Int(64)); + Var x = ir::_Var_::Make(ir::Expr(static_cast(0)), + ir::Expr(INT64_MAX), + dim_expr, + /* is_reduce = */ false, + /* is_symbolic_constant = */ true); return x; } diff --git a/paddle/cinn/common/dim_expr_converter_test.cc b/paddle/cinn/common/dim_expr_converter_test.cc index a2313e72977981..821f2e2830979d 100644 --- a/paddle/cinn/common/dim_expr_converter_test.cc +++ b/paddle/cinn/common/dim_expr_converter_test.cc @@ -31,7 +31,13 @@ TEST(Convert, AddExpr) { ir::Expr expr1 = ir::Add::Make(ir::Expr(std::int64_t(4)), ir::Expr(std::int64_t(5))); - ir::Expr dst_expr = ir::Add::Make(expr1, ir::_Var_::Make("sym_0", Int(64))); + ir::Expr dst_expr = + ir::Add::Make(expr1, + ir::_Var_::Make(ir::Expr(static_cast(0)), + ir::Expr(INT64_MAX), + "sym_0", + /* is_reduce = */ false, + /* is_symbolic_constant = */ true)); ASSERT_TRUE(MathEqual(src_expr, dst_expr)); } @@ -39,8 +45,13 @@ TEST(Convert, SubExpr) { DimExpr dim_expr = DimExpr(4) - DimExpr("sym_0"); ir::Expr src_expr = DimExprConverter().ConvertToIrExpr(dim_expr); - ir::Expr expr1 = ir::Sub::Make(ir::Expr(std::int64_t(0)), - ir::_Var_::Make("sym_0", Int(64))); + ir::Expr expr1 = + ir::Sub::Make(ir::Expr(std::int64_t(0)), + ir::_Var_::Make(ir::Expr(static_cast(0)), + ir::Expr(INT64_MAX), + "sym_0", + /* is_reduce = */ false, + /* is_symbolic_constant = */ true)); ir::Expr dst_expr = ir::Add::Make(ir::Expr(std::int64_t(4)), expr1); ASSERT_TRUE(MathEqual(src_expr, dst_expr)); } @@ -52,7 +63,13 @@ TEST(Convert, MulExpr) { ir::Expr expr1 = ir::Mul::Make(ir::Expr(std::int64_t(4)), ir::Expr(std::int64_t(5))); - ir::Expr dst_expr = ir::Mul::Make(expr1, ir::_Var_::Make("sym_0", Int(64))); + ir::Expr dst_expr = + ir::Mul::Make(expr1, + ir::_Var_::Make(ir::Expr(static_cast(0)), + ir::Expr(INT64_MAX), + "sym_0", + /* is_reduce = */ false, + /* is_symbolic_constant = */ true)); ASSERT_TRUE(MathEqual(src_expr, dst_expr)); } diff --git a/paddle/cinn/ir/buffer.cc b/paddle/cinn/ir/buffer.cc index 350cde0189fdf8..69bf16bea297e7 100644 --- a/paddle/cinn/ir/buffer.cc +++ b/paddle/cinn/ir/buffer.cc @@ -14,6 +14,7 @@ #include "paddle/cinn/ir/buffer.h" +#include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/common.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/ir_visitor.h" @@ -116,6 +117,14 @@ int64_t _Buffer_::numel() const { return res; } +ir::Expr _Buffer_::SymbolicNumel() const { + ir::Expr res{1}; + for (auto &i : shape) { + res = res * i; + } + return common::AutoSimplify(res); +} + void _Buffer_::Verify() const { CHECK(!shape.empty()); CHECK(!name.empty()); diff --git a/paddle/cinn/ir/buffer.h b/paddle/cinn/ir/buffer.h index 4b83a2bcd2e0f1..b5e162ae52bc64 100755 --- a/paddle/cinn/ir/buffer.h +++ b/paddle/cinn/ir/buffer.h @@ -143,6 +143,8 @@ class _Buffer_ : public ExprNode<_Buffer_> { int64_t numel() const; + ir::Expr SymbolicNumel() const; + static const IrNodeTy _node_type_ = IrNodeTy::_Buffer_; // Copy the meta infos to other. 
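To make the intent of the new SymbolicNumel() concrete, here is a minimal
self-contained sketch with a toy expression type (an illustration under
assumed semantics, not CINN's real ir::Expr or common::AutoSimplify): with a
symbolic dim the element count stays an expression such as 256 * S0, whereas
numel() needs every extent to be a compile-time constant.

    #include <iostream>
    #include <string>
    #include <vector>

    // Toy stand-in for ir::Expr: a folded constant factor plus an opaque symbol.
    struct ToyExpr {
      long long constant = 1;
      std::string symbol;  // empty means the expression is fully constant
    };

    ToyExpr operator*(const ToyExpr& a, const ToyExpr& b) {
      ToyExpr r;
      r.constant = a.constant * b.constant;  // constant part folds eagerly
      r.symbol = a.symbol.empty()   ? b.symbol
                 : b.symbol.empty() ? a.symbol
                                    : a.symbol + " * " + b.symbol;
      return r;
    }

    // Mirrors _Buffer_::SymbolicNumel(): multiply all shape extents, keeping
    // symbolic factors alive instead of demanding an int64_t up front.
    ToyExpr SymbolicNumel(const std::vector<ToyExpr>& shape) {
      ToyExpr res;  // starts as the constant 1
      for (const ToyExpr& d : shape) res = res * d;
      return res;
    }

    int main() {
      ToyExpr s0{1, "S0"};    // dynamic dimension S0
      ToyExpr c256{256, ""};  // static dimension 256
      ToyExpr numel = SymbolicNumel({s0, c256});
      std::cout << numel.constant << " * " << numel.symbol << "\n";  // 256 * S0
    }
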
diff --git a/paddle/cinn/ir/dim.cc b/paddle/cinn/ir/dim.cc index fe63fb31158a9c..16258c809f0d4c 100644 --- a/paddle/cinn/ir/dim.cc +++ b/paddle/cinn/ir/dim.cc @@ -33,7 +33,6 @@ Dim _Dim_::Make(const std::string& name, const symbol::DimExpr& sym_dim) { n->name = name; n->sym_dim = sym_dim; n->dim_expr = common::DimExprConverter().ConvertToIrExpr(sym_dim); - return Dim(n); } diff --git a/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc index cec04ba2c1e877..942a306a81b553 100644 --- a/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc @@ -139,6 +139,8 @@ IntSet Evaluate(Expr expr, Expr var_max = var_domain.at(var).Max(); optim::ReplaceVarWithExpr(©_for_lower_bound, var, var_min); optim::ReplaceVarWithExpr(©_for_upper_bound, var, var_max); + } else if (var->is_symbolic_constant) { + continue; } else { CHECK(var->lower_bound.defined()); CHECK(var->upper_bound.defined()); @@ -167,44 +169,56 @@ std::unordered_map GetFixedVar( const ir::For* for_node = var2for.second.As(); if (type == CudaAxisType::kCudaBlock && for_node->is_gpu_block_binded()) { if (for_node->bind_info().offset == 0) { - fix_var_map.insert({var2for.first, - ir::_Var_::Make(cuda_space.x.Min(), - cuda_space.x.Max(), - CudaIterVarName::kCudaBlockX, - var2for.first->is_reduce_axis)}); + fix_var_map.insert( + {var2for.first, + ir::_Var_::Make(cuda_space.x.Min(), + cuda_space.x.Max(), + CudaIterVarName::kCudaBlockX, + var2for.first->is_reduce_axis, + /* is_symbolic_constant = */ true)}); } else if (for_node->bind_info().offset == 1) { - fix_var_map.insert({var2for.first, - ir::_Var_::Make(cuda_space.y.Min(), - cuda_space.y.Max(), - CudaIterVarName::kCudaBlockY, - var2for.first->is_reduce_axis)}); + fix_var_map.insert( + {var2for.first, + ir::_Var_::Make(cuda_space.y.Min(), + cuda_space.y.Max(), + CudaIterVarName::kCudaBlockY, + var2for.first->is_reduce_axis, + /* is_symbolic_constant = */ true)}); } else if (for_node->bind_info().offset == 2) { - fix_var_map.insert({var2for.first, - ir::_Var_::Make(cuda_space.z.Min(), - cuda_space.z.Max(), - CudaIterVarName::kCudaBlockZ, - var2for.first->is_reduce_axis)}); + fix_var_map.insert( + {var2for.first, + ir::_Var_::Make(cuda_space.z.Min(), + cuda_space.z.Max(), + CudaIterVarName::kCudaBlockZ, + var2for.first->is_reduce_axis, + /* is_symbolic_constant = */ true)}); } } else if (type == CudaAxisType::kCudaThread && for_node->is_gpu_thread_binded()) { if (for_node->bind_info().offset == 0) { - fix_var_map.insert({var2for.first, - ir::_Var_::Make(cuda_space.x.Min(), - cuda_space.x.Max(), - CudaIterVarName::kCudaThreadX, - var2for.first->is_reduce_axis)}); + fix_var_map.insert( + {var2for.first, + ir::_Var_::Make(cuda_space.x.Min(), + cuda_space.x.Max(), + CudaIterVarName::kCudaThreadX, + var2for.first->is_reduce_axis, + /* is_symbolic_constant = */ true)}); } else if (for_node->bind_info().offset == 1) { - fix_var_map.insert({var2for.first, - ir::_Var_::Make(cuda_space.y.Min(), - cuda_space.y.Max(), - CudaIterVarName::kCudaThreadY, - var2for.first->is_reduce_axis)}); + fix_var_map.insert( + {var2for.first, + ir::_Var_::Make(cuda_space.y.Min(), + cuda_space.y.Max(), + CudaIterVarName::kCudaThreadY, + var2for.first->is_reduce_axis, + /* is_symbolic_constant = */ true)}); } else if (for_node->bind_info().offset == 2) { - fix_var_map.insert({var2for.first, - ir::_Var_::Make(cuda_space.z.Min(), - cuda_space.z.Max(), - 
CudaIterVarName::kCudaThreadZ, - var2for.first->is_reduce_axis)}); + fix_var_map.insert( + {var2for.first, + ir::_Var_::Make(cuda_space.z.Min(), + cuda_space.z.Max(), + CudaIterVarName::kCudaThreadZ, + var2for.first->is_reduce_axis, + /* is_symbolic_constant = */ true)}); } } } @@ -344,13 +358,14 @@ void ArrangeStorageTactic::Init(ScheduleContext* context) { void ArrangeStorageTactic::Apply(ir::IRSchedule* sch, const std::string& block_id) { ir::Expr store_block = sch->GetBlock(block_id); - ir::Expr root_block = sch->GetRootBlock(store_block); - ir::Expr store = *ir::ir_utils::CollectIRNodesWithoutTensor( - store_block, - [&](const ir::Expr* x) { return x->As(); }, - true) - .begin(); + ir::Tensor store_tensor = analyzer::GetStoreTensorOfSBlock(store_block); + // Skip if the store tensor has already been allocated to GPU shared or local + // memory. + if (store_tensor->buffer.defined() && store_tensor->buffer->is_on_gpu()) + return; + ir::Expr root_block = sch->GetRootBlock(store_block); + ir::Expr store = analyzer::GetStoreOfSBlock(store_block); VarToForMap var2for_map = analyzer::CollectVarToForMap({root_block}, sch->GetAllBlocks()); diff --git a/paddle/cinn/ir/group_schedule/tactic/optimize_reduction_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/optimize_reduction_tactic.cc index 07a58b17432fba..7765d0f2190995 100644 --- a/paddle/cinn/ir/group_schedule/tactic/optimize_reduction_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/optimize_reduction_tactic.cc @@ -66,10 +66,11 @@ bool CanApply(const std::string& block_name, ir::IRSchedule* sch) { if (body.As()->stmts.size() == 1) { if (body.As()->stmts[0].As() == nullptr && body.As()->stmts[0].As() == - nullptr) { + nullptr && + body.As()->stmts[0].As() == nullptr) { VLOG(6) << "the block: " << block_name << " has a block stmt that is not any of " - "schedule_block/for_loop, so can not apply " + "schedule_block/for_loop/if, so can not apply " "OptimizeReductionTactic"; return false; } @@ -82,10 +83,11 @@ bool CanApply(const std::string& block_name, ir::IRSchedule* sch) { } if (body.As()->stmts[1].As() == nullptr && body.As()->stmts[1].As() == - nullptr) { + nullptr && + body.As()->stmts[0].As() == nullptr) { VLOG(6) << "the block: " << block_name << " has a block stmt that is not any of " - "schedule_block/for_loop, so can not apply " + "schedule_block/for_loop/if, so can not apply " "OptimizeReductionTactic"; return false; } @@ -95,12 +97,14 @@ bool CanApply(const std::string& block_name, ir::IRSchedule* sch) { "OptimizeReductionTactic"; return false; } - } else if (body.As() || body.As()) { + } else if (body.As() || body.As() || + body.As()) { continue; } else { - VLOG(6) << "the block: " << block_name - << " has a loop body that is not any of schedule_block/for_loop, " - "so can not apply OptimizeReductionTactic"; + VLOG(6) + << "the block: " << block_name + << " has a loop body that is not any of schedule_block/for_loop/if, " + "so can not apply OptimizeReductionTactic"; return false; } } @@ -141,7 +145,7 @@ void OptimizeReductionTactic::Apply(ir::IRSchedule* sch, rb_loops = sch->GetLoops(block_id); rf_block = sch->GetBlock(rf_block_id); sch->Bind(rb_loops.back(), "threadIdx.x"); - sch->SetBuffer(rf_block, "shared"); + sch->SetBuffer(rf_block, "local"); } VLOG(6) << "Loop fusion and cross thread reduction: " << sch->GetModule().GetExprs()[0]; diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc index 07bc8225d0f671..6a34e8eb020cb4 100644 --- 
a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc @@ -42,7 +42,8 @@ void TileTactic::Init(ScheduleContext* context) { int sp_factor = GetFirstFactor(context_->bucket_info.sp_lower_bound); context_->iter_space_info.sp_space.emplace_back( ir::Expr(context_->bucket_info.sp_lower_bound / sp_factor), - IterativeSpaceInfo::AxisType::kCudaBlockX); + has_rb_iter ? IterativeSpaceInfo::AxisType::kCudaBlockY + : IterativeSpaceInfo::AxisType::kCudaBlockX); VLOG(6) << "sp_space: <" << std::get<0>(context_->iter_space_info.sp_space.back()) << ", AxisType[" @@ -51,7 +52,7 @@ void TileTactic::Init(ScheduleContext* context) { << "]>"; context_->iter_space_info.sp_space.emplace_back( ir::Expr(sp_factor), - has_rb_iter ? IterativeSpaceInfo::AxisType::kCudaThreadY + has_rb_iter ? IterativeSpaceInfo::AxisType::kCudaBlockX : IterativeSpaceInfo::AxisType::kCudaThreadX); VLOG(6) << "sp_space: <" << std::get<0>(context_->iter_space_info.sp_space.back()) diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index d57344e77d238e..8bc64fdef18bce 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -217,11 +217,13 @@ Expr _Var_::Make(const std::string &name, const Type &type) { Expr _Var_::Make(Expr lower_bound, Expr upper_bound, const std::string &name, - bool is_reduce_axis) { + bool is_reduce_axis, + bool is_symbolic_constant) { auto *n = make_shared<_Var_>(); n->lower_bound = lower_bound; n->upper_bound = upper_bound; n->is_reduce_axis = is_reduce_axis; + n->is_symbolic_constant = is_symbolic_constant; n->name = name; n->set_type(lower_bound.type()); return Expr(n); diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 3e9460e084a36f..d4bdf38894dfde 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -381,6 +381,7 @@ struct _Var_ : public ExprNode<_Var_> { std::string name; bool is_reduce_axis{false}; + bool is_symbolic_constant{false}; //! Lower bound and upper bound of a axis. 
// @{ Expr lower_bound; @@ -399,7 +400,8 @@ struct _Var_ : public ExprNode<_Var_> { static Expr Make(Expr lower_bound, Expr upper_bound, const std::string& name, - bool is_reduce); + bool is_reduce, + bool is_symbolic_constant = false); void Verify() const override; diff --git a/paddle/cinn/ir/schedule/factorize_reduction.h b/paddle/cinn/ir/schedule/factorize_reduction.h index 3438bce94f8d28..d6252bb0a4663e 100644 --- a/paddle/cinn/ir/schedule/factorize_reduction.h +++ b/paddle/cinn/ir/schedule/factorize_reduction.h @@ -131,6 +131,26 @@ class ReduceBlockCreater { body = Block::Make({new_init_block_realize_, body}); has_add_init_block = true; } + // Add If + if (original_loops_[i].As()->body.As()) { + const IfThenElse* original_if = + original_loops_[i].As()->body.As(); + body = IfThenElse::Make(original_if->condition, body); + } + if (original_loops_[i].As()->body.As() && + original_loops_[i].As()->body.As()->stmts.size() == 1 && + original_loops_[i] + .As() + ->body.As() + ->stmts[0] + .As()) { + const IfThenElse* original_if = original_loops_[i] + .As() + ->body.As() + ->stmts[0] + .As(); + body = IfThenElse::Make(original_if->condition, body); + } // Add loops Var loop_var = ir_utils::IRCopy(original_loops_[i].As()->loop_var); Expr min = ir_utils::IRCopy(original_loops_[i].As()->min); diff --git a/paddle/cinn/ir/schedule/impl/compute_location.cc b/paddle/cinn/ir/schedule/impl/compute_location.cc index d27eb23c1ea88d..cb5a6d3693b55c 100644 --- a/paddle/cinn/ir/schedule/impl/compute_location.cc +++ b/paddle/cinn/ir/schedule/impl/compute_location.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/cinn/common/integer_set.h" #include "paddle/cinn/common/macros.h" #include "paddle/cinn/ir/schedule/impl/ir_schedule.h" @@ -97,10 +98,15 @@ void DyScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { auto this_loop = loop; auto block_name = GetTensor(block)->name; auto this_block = block; - if (GetLoopExtent(loops[0]) == 1 && GetLoopExtent(block_loops[0]) != 1) { + if (loops[0].As()->extent.is_constant() && + GetLoopExtent(loops[0]) == 1 && + (!block_loops[0].As()->extent.is_constant() || + GetLoopExtent(block_loops[0]) != 1)) { this->Split(block_loops[0], {1, -1}); this_block = this->GetBlock(block_name); - } else if (GetLoopExtent(loops[0]) != 1 && + } else if ((!loops[0].As()->extent.is_constant() || + GetLoopExtent(loops[0]) != 1) && + block_loops[0].As()->extent.is_constant() && GetLoopExtent(block_loops[0]) == 1) { auto splited = this->Split(loops[0], {1, -1}); this_loop = splited[1]; @@ -114,10 +120,15 @@ void DyScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { std::vector replaced_var; std::vector substitute_expr; + common::cas_intervals_t var_intervals; + common::SymbolicExprAnalyzer analyzer{var_intervals}; for (int i = 0; i < loops.size(); ++i) { VLOG(3) << i << "-th loop is:\n " << loops[i]; VLOG(3) << i << "-th block_loop:\n" << block_loops[i]; - if (GetLoopExtent(loops[i]) != GetLoopExtent(block_loops[i])) { + std::optional prove_eq = analyzer.ProveEQ( + loops[i].As()->extent, block_loops[i].As()->extent); + CHECK(prove_eq.has_value() && prove_eq.value()); + if (!prove_eq.has_value() || prove_eq.value() == false) { os << "Extent of loop in Expr Param(loop) and extent of loop in Expr " "Param(block) should be equal correspondingly!\n"; throw IRScheduleErrorHandler(primitive, os.str(), module_expr_); diff --git a/paddle/cinn/ir/utils/ir_copy.cc 
b/paddle/cinn/ir/utils/ir_copy.cc index b444be218c39a5..687cf3e6f34cec 100644 --- a/paddle/cinn/ir/utils/ir_copy.cc +++ b/paddle/cinn/ir/utils/ir_copy.cc @@ -101,6 +101,7 @@ struct IRCopyVisitor : public ir::IRVisitorRequireReImpl { n->name = op->name; n->is_reduce_axis = op->is_reduce_axis; + n->is_symbolic_constant = op->is_symbolic_constant; n->set_type(op->type()); if (op->lower_bound.defined()) { diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc index 9d78a91eb5c0e5..b40ac1be4ea6b2 100644 --- a/paddle/cinn/lang/lower.cc +++ b/paddle/cinn/lang/lower.cc @@ -21,6 +21,7 @@ #include #include +#include "paddle/cinn/common/integer_set.h" #include "paddle/cinn/ir/buffer.h" #include "paddle/cinn/ir/ir_printer.h" #include "paddle/cinn/lang/lower_impl.h" @@ -86,6 +87,14 @@ std::vector GetArgs( return res; } +bool CanProveBufferNumelLT(const ir::Buffer& lhs, const ir::Buffer& rhs) { + common::cas_intervals_t var_intervals; + common::SymbolicExprAnalyzer analyzer(var_intervals); + std::optional<bool> prove_lt = + analyzer.ProveLT(lhs->SymbolicNumel(), rhs->SymbolicNumel()); + return prove_lt.value_or(false); +} + // Collect the temporary tensors from a computational graph. std::vector GetTempBuffers( const std::vector& tensor_args, Expr body) { @@ -113,8 +122,8 @@ std::vector GetTempBuffers( name_to_buffer[buffer_name] = e.as_tensor()->buffer; } else { // TODO(phlrain): why update - if (e.as_tensor()->buffer->numel() < - name_to_buffer[buffer_name]->numel()) { + if (CanProveBufferNumelLT(e.as_tensor()->buffer, + name_to_buffer[buffer_name])) { name_to_buffer[buffer_name] = e.as_tensor()->buffer; } } @@ -200,8 +209,8 @@ std::vector GetTempBuffers(const std::vector& tensor_args, if (!name_to_buffer.count(buffer_name)) { name_to_buffer[buffer_name] = e.as_tensor()->buffer; } else { - if (e.as_tensor()->buffer->numel() < - name_to_buffer[buffer_name]->numel()) { + if (CanProveBufferNumelLT(e.as_tensor()->buffer, + name_to_buffer[buffer_name])) { name_to_buffer[buffer_name] = e.as_tensor()->buffer; } } @@ -212,8 +221,8 @@ std::vector GetTempBuffers(const std::vector& tensor_args, if (x->as_tensor() && x->as_tensor()->buffer.defined()) { auto buffer_name = x->as_tensor()->buffer->name; if (name_to_buffer.count(buffer_name) && - x->as_tensor()->buffer->numel() < - name_to_buffer[buffer_name]->numel()) { + CanProveBufferNumelLT(x->as_tensor()->buffer, + name_to_buffer[buffer_name])) { name_to_buffer[buffer_name] = x->as_tensor()->buffer; } } diff --git a/paddle/cinn/pybind/ir/ir_api.cc b/paddle/cinn/pybind/ir/ir_api.cc index b2e625e741ba62..56dff498dd7101 100644 --- a/paddle/cinn/pybind/ir/ir_api.cc +++ b/paddle/cinn/pybind/ir/ir_api.cc @@ -378,10 +378,12 @@ void BindIrIr(py::module *m) { .def_static("make", py::overload_cast<const std::string &, const Type &>( &ir::_Var_::Make)) - .def_static( - "make", - py::overload_cast<Expr, Expr, const std::string &, bool>( - &ir::_Var_::Make)) + .def_static("make", + py::overload_cast<Expr, Expr, const std::string &, bool, bool>(&ir::_Var_::Make)) .def("copy", &ir::_Var_::Copy); // struct Select From 71111516375f3707dd9bed68802e89305789655a Mon Sep 17 00:00:00 2001 From: Liuyinfeng <30849840+gitliuyf@users.noreply.github.com> Date: Tue, 23 Jan 2024 10:38:34 +0800 Subject: [PATCH 15/34] [XPU] Delete squeeze_excitation_fuse_pass out_max (#61005) --- .../fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc index b9af8314c3a104..8009529854c9d5 100644 ---
a/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc @@ -473,11 +473,6 @@ int SqueezeExcitationFusePass::ApplyImpl(ir::Graph* graph, output_name = ew_mul_out->Name(); } fused_op_desc.SetOutput("out", {output_name}); - std::string max_output_name = output_name + "_max"; - VarDesc max_out_desc(max_output_name); - auto* max_output_node = graph->CreateVarNode(&max_out_desc); - - fused_op_desc.SetOutput("out_max", {max_output_name}); fused_op_desc.SetAttr("op_type", std::vector{4}); fused_op_desc.SetAttr("place_x", std::vector{0}); fused_op_desc.SetAttr("place_y", std::vector{9}); @@ -539,7 +534,6 @@ int SqueezeExcitationFusePass::ApplyImpl(ir::Graph* graph, } else { IR_NODE_LINK_TO(new_op_node, ew_mul_out); } - IR_NODE_LINK_TO(new_op_node, max_output_node); // delete useless node std::unordered_set delete_nodes = { pool2d, mul_1, mul_1_out, mul_2, mul_2_out, ew_mul}; From 166aced98db2a1319b604bd6589d47d08d5481ac Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 23 Jan 2024 10:47:19 +0800 Subject: [PATCH 16/34] [PIR] pir onednn add conv2d test, and support fused_conv2d (#60974) * pir onednn add conv2d test, and support fused_conv2d --- .../instruction/onednn/onednn_instruction.cc | 30 ++++++++++++++++++- .../instruction/onednn/onednn_instruction.h | 1 + .../fluid/pir/dialect/op_generator/op_gen.py | 9 ++++++ .../fluid/pir/dialect/operator/ir/onednn.yaml | 10 +++++++ .../dialect/operator/ir/ops_onednn_extra.yaml | 4 +++ paddle/phi/api/yaml/op_compat.yaml | 6 ++++ paddle/phi/api/yaml/ops.yaml | 3 ++ paddle/phi/infermeta/multiary.h | 2 +- paddle/phi/kernels/onednn/conv_function.h | 4 +++ test/legacy_test/CMakeLists.txt | 2 +- test/legacy_test/op_test.py | 1 + test/legacy_test/test_conv2d_op.py | 16 ++++++++-- test/legacy_test/test_elementwise_add_op.py | 1 - test/mkldnn/test_conv2d_bf16_mkldnn_op.py | 8 ++++- test/mkldnn/test_conv2d_int8_mkldnn_op.py | 6 +++- test/mkldnn/test_conv2d_mkldnn_op.py | 13 ++------ 16 files changed, 97 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc index eedda2350e425b..cd4d228661f445 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc @@ -314,11 +314,33 @@ OneDNNPhiKernelInstruction::OneDNNPhiKernelInstruction( .at("extra_args") .dyn_cast() .AsVector(); - std::vector extra_args; + auto& op_normalizer = paddle::translator::OpNameNormalizer::instance(); + std::string fluid_op_name = yaml_info_parser.GetOriginOpName(); + for (auto& attr : extra_args_attr) { auto attr_name = attr.dyn_cast().AsString(); extra_attr_[attr_name] = ConvertPirAttribute2RuntimeAttribute( op_attributes.at(attr_name), attr_name, yaml_info_parser); + auto legacy_attr_name = + op_normalizer.GetLegacyAttrName(fluid_op_name, attr_name); + if (legacy_attr_name != attr_name) { + extra_attr_[legacy_attr_name] = extra_attr_[attr_name]; + } + } + auto attr_name_list = yaml_info_parser.AttrParams(true); + for (auto& attr : attr_name_list) { + auto attr_name = attr; + if (!op_attributes.count(attr_name)) { + // In PIR, IntArray attr will be input, but not attr. 
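+ // Such attrs are bound through the kernel context inputs instead, so there is nothing to copy into ctx_attr_ here.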
+ continue; + } + ctx_attr_[attr_name] = ConvertPirAttribute2RuntimeAttribute( + op_attributes.at(attr_name), attr_name, yaml_info_parser); + auto legacy_attr_name = + op_normalizer.GetLegacyAttrName(fluid_op_name, attr_name); + if (legacy_attr_name != attr_name) { + ctx_attr_[legacy_attr_name] = ctx_attr_[attr_name]; + } } } TensorNameMap(op, *value_exec_info_, yaml_info_parser, inputs_, outputs_); @@ -336,6 +358,9 @@ void OneDNNPhiKernelInstruction::Run() { size_t(0), kernel_context_.InputsSize()); for (size_t i = 0; i < inputs.size(); ++i) { auto input = inputs[i]; + if (input == nullptr) { + continue; + } if (input->layout() != phi::DataLayout::ONEDNN) { phi::DataLayout from_layout = input->layout(); @@ -375,6 +400,9 @@ void OneDNNPhiKernelInstruction::Run() { for (auto& attr : extra_attr_) { one_dnn_ctx->SetDnnAttr(attr.first, attr.second); } + for (auto& attr : ctx_attr_) { + one_dnn_ctx->SetDnnAttr(attr.first, attr.second); + } one_dnn_ctx->SetInputsName(inputs_); one_dnn_ctx->SetOutputsName(outputs_); diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.h b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.h index 196ee18d0278b1..9cca848549f2b3 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.h @@ -71,6 +71,7 @@ class OneDNNPhiKernelInstruction : public InstructionBase { std::set data_format_tensors_{}; phi::DataLayout input_layout_{phi::DataLayout::kAnyLayout}; std::map extra_attr_{}; + std::map ctx_attr_{}; std::map> inputs_{}; std::map> outputs_{}; }; diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 161aa4ac9a1b5e..d0c1c438c195b7 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -1990,6 +1990,15 @@ def OpGenerator( if first_file: op["is_onednn_only"] = True onednn_only_op_list.append("\"" + op['name'] + "\"") + if op['name'] in ops_onednn_extra_map: + onednn_item = ops_onednn_extra_map[op['name']] + op["is_onednn_only"] = onednn_item["is_onednn_only"] + op["extra_args"] = onednn_item["extra_args"] + op["data_format_tensors"] = onednn_item[ + "data_format_tensors" + ] + op["dynamic_fallback"] = onednn_item["dynamic_fallback"] + op["attrs"] = op["attrs"] + onednn_item["attrs"] elif op['name'] in ops_onednn_extra_map: onednn_item = ops_onednn_extra_map[op['name']] op["is_onednn_only"] = onednn_item["is_onednn_only"] diff --git a/paddle/fluid/pir/dialect/operator/ir/onednn.yaml b/paddle/fluid/pir/dialect/operator/ir/onednn.yaml index fa33b498225250..2cc42f8c185f47 100644 --- a/paddle/fluid/pir/dialect/operator/ir/onednn.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/onednn.yaml @@ -8,6 +8,16 @@ func : dequantize data_type : input +- op : fused_conv2d + args : (Tensor input, Tensor filter, Tensor bias, Tensor residual_param, int[] strides={1, 1}, int[] paddings={0, 0}, str padding_algorithm="EXPLICIT", int[] dilations={1, 1}, int groups=1, str data_format="NCHW", str mkldnn_data_type="float32", str fuse_activation="", bool fuse_residual_connection=false, bool force_fp32_output=false) + output : Tensor(output) + infer_meta : + func : FusedConvInferMeta + kernel : + func : fused_conv2d + data_type : input + optional : bias, residual_param + - op : quantize args : (Tensor input, bool is_negative_input=false, float scale=1.0, float shift=0.0, str output_format="NHWC", bool 
bfloat16=false) output : Tensor(output) diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml index a8042f29343dce..e0a0f4f2798525 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml @@ -19,6 +19,10 @@ extra_args : bool is_test=false data_format_tensors : input, out_grad +- op : fused_conv2d + extra_args : float fuse_alpha = 0.0, float fuse_beta = 0.0, float scale_in=1.0, float scale_out=1.0, float scale_in_eltwise=1.0, float[] scale_weights={1.0f} + data_format_tensors : input + - op : lrn extra_args : bool is_test=false data_format_tensors : x diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 6e3313f7552df2..92c8c56ee456a5 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1307,6 +1307,12 @@ reserve_space: ReserveSpace - op : fused_conv2d + inputs : + {input : Input, filter : Filter, bias : Bias, residual_param : ResidualData} + outputs : + {output : Output} + attrs : + {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} extra : attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 235992da72b0cb..3b253dcad7ae65 100755 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -567,6 +567,7 @@ func : ConvInferMeta kernel : func : conv2d + data_type : input backward : conv2d_grad - op : conv3d @@ -576,6 +577,7 @@ func : Conv3DInferMeta kernel : func : conv3d + data_type : input backward : conv3d_grad - op : conv3d_transpose @@ -713,6 +715,7 @@ func : DepthwiseConvInferMeta kernel : func : depthwise_conv2d + data_type : input backward : depthwise_conv2d_grad - op : det diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 0c18d27836adc4..eecf01cd968cb6 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -827,7 +827,7 @@ void FusedConvInferMeta(const MetaTensor& input, bool fuse_residual_conn, bool force_fp32_output, MetaTensor* out, - MetaConfig config); + MetaConfig config = MetaConfig()); void MoeInferMeta(const MetaTensor& x, const MetaTensor& gate, diff --git a/paddle/phi/kernels/onednn/conv_function.h b/paddle/phi/kernels/onednn/conv_function.h index 7d7e74f691a025..cd4dd6725f76ab 100644 --- a/paddle/phi/kernels/onednn/conv_function.h +++ b/paddle/phi/kernels/onednn/conv_function.h @@ -60,6 +60,10 @@ static dnnl::memory::data_type GetDstType( NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE( \ NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::BFLOAT16, \ + ::phi::dtype::bfloat16, \ + __VA_ARGS__) \ default: \ PD_THROW("function " #NAME " is not implemented for data type `", \ __dtype__, \ diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 3ca147526bf8b2..705170af62f667 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1065,7 +1065,7 @@ set_tests_properties(test_sigmoid_cross_entropy_with_logits_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer_v2 
PROPERTIES TIMEOUT 150) set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_cond PROPERTIES TIMEOUT 120) +set_tests_properties(test_cond PROPERTIES TIMEOUT 240) set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) set_tests_properties(test_parallel_executor_seresnext_base_gpu PROPERTIES TIMEOUT 120) diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index 9e64b88dcfd1dd..320e0ada326da2 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -431,6 +431,7 @@ def setUpClass(cls): cls.check_prim = False cls.check_prim_pir = False cls._check_cinn = False + cls.check_pir_onednn = False np.random.seed(123) random.seed(124) diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py index 88bfd4cb5c02fc..e601e8c7bc6816 100644 --- a/test/legacy_test/test_conv2d_op.py +++ b/test/legacy_test/test_conv2d_op.py @@ -499,7 +499,10 @@ def test_check_output(self): place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_output_with_place( - place, atol=1e-5, check_dygraph=(not self.use_mkldnn) + place, + atol=1e-5, + check_dygraph=(not self.use_mkldnn), + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad(self): @@ -515,6 +518,7 @@ def test_check_grad(self): 'Output', max_relative_error=0.02, check_dygraph=(not self.use_mkldnn), + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad_no_filter(self): @@ -531,6 +535,7 @@ def test_check_grad_no_filter(self): max_relative_error=0.02, no_grad_set={'Filter'}, check_dygraph=(not self.use_mkldnn), + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad_no_input(self): @@ -546,6 +551,7 @@ def test_check_grad_no_input(self): 'Output', no_grad_set={'Input'}, check_dygraph=(not self.use_mkldnn), + check_pir_onednn=self.check_pir_onednn, ) def init_test_case(self): @@ -824,7 +830,10 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace() self.check_output_with_place( - place, atol=1e-5, check_dygraph=(not self.use_mkldnn) + place, + atol=1e-5, + check_dygraph=(not self.use_mkldnn), + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad(self): @@ -838,6 +847,7 @@ def test_check_grad(self): 'Output', max_relative_error=0.02, check_dygraph=(not self.use_mkldnn), + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad_no_filter(self): @@ -852,6 +862,7 @@ def test_check_grad_no_filter(self): max_relative_error=0.02, no_grad_set={'Filter'}, check_dygraph=(not self.use_mkldnn), + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad_no_input(self): @@ -865,6 +876,7 @@ def test_check_grad_no_input(self): 'Output', no_grad_set={'Input'}, check_dygraph=(not self.use_mkldnn), + check_pir_onednn=self.check_pir_onednn, ) def init_test_case(self): diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py index b3c4c20b889cb0..126d881d53e506 100644 --- a/test/legacy_test/test_elementwise_add_op.py +++ b/test/legacy_test/test_elementwise_add_op.py @@ -30,7 +30,6 @@ class TestElementwiseAddOp(OpTest): def init_kernel_type(self): self.use_mkldnn = False - self.check_pir_onednn = False def setUp(self): self.op_type = "elementwise_add" diff --git a/test/mkldnn/test_conv2d_bf16_mkldnn_op.py b/test/mkldnn/test_conv2d_bf16_mkldnn_op.py index a4526dcf0851aa..7a91ab4ccf1dfc 100644 --- 
a/test/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -50,6 +50,7 @@ def setUp(self): self.init_data_type() self.init_force_fp32_output() self.init_infer_or_train() + self.check_pir_onednn = True self.conv2d_param = { 'stride': self.stride, @@ -117,7 +118,9 @@ def setUp(self): self.init_additional_attrs() def test_check_output(self): - self.check_output_with_place(core.CPUPlace()) + self.check_output_with_place( + core.CPUPlace(), check_pir_onednn=self.check_pir_onednn + ) def test_check_grad(self): pass @@ -186,6 +189,7 @@ def test_check_grad(self): "Output", user_defined_grads=[dx, dweights], user_defined_grad_outputs=[convert_float_to_uint16(dout)], + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad_no_filter(self): @@ -202,6 +206,7 @@ def test_check_grad_no_filter(self): {'Filter'}, user_defined_grads=[dx], user_defined_grad_outputs=[convert_float_to_uint16(dout)], + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad_no_input(self): @@ -218,6 +223,7 @@ def test_check_grad_no_input(self): {'Input'}, user_defined_grads=[dweights], user_defined_grad_outputs=[convert_float_to_uint16(dout)], + check_pir_onednn=self.check_pir_onednn, ) diff --git a/test/mkldnn/test_conv2d_int8_mkldnn_op.py b/test/mkldnn/test_conv2d_int8_mkldnn_op.py index 981570cd5bb524..c72f70b07e2183 100644 --- a/test/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -48,6 +48,7 @@ def setUp(self): self.init_fuse_activation() self.init_fuse_residual() self.init_data_type() + self.check_pir_onednn = True conv2d_param = { 'stride': self.stride, @@ -184,7 +185,10 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode # the atol for integer tests should be 1 self.check_output_with_place( - core.CPUPlace(), atol=1, check_dygraph=False + core.CPUPlace(), + atol=1, + check_dygraph=False, + check_pir_onednn=self.check_pir_onednn, ) def test_check_grad(self): diff --git a/test/mkldnn/test_conv2d_mkldnn_op.py b/test/mkldnn/test_conv2d_mkldnn_op.py index 2d6cafdbc3734b..606c86ce62f4b7 100644 --- a/test/mkldnn/test_conv2d_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_mkldnn_op.py @@ -17,9 +17,6 @@ import numpy as np from op_test import OpTest, skip_check_grad_ci from test_conv2d_op import TestConv2DOp, TestConv2DOp_v2 -from utils import compare_legacy_with_pt - -from paddle.base import core def conv2d_bias_naive(out, bias): @@ -64,6 +61,7 @@ def setUp(self): self.input_residual_size = None TestConv2DOp.setUp(self) + self.check_pir_onednn = True output = self.outputs['Output'] @@ -144,6 +142,7 @@ def setUp(self): self.input_residual_size = None TestConv2DOp.setUp(self) + self.check_pir_onednn = True output = self.outputs['Output'] @@ -195,14 +194,6 @@ def setUp(self): self.outputs['Output'] = output - @compare_legacy_with_pt - def test_check_output(self): - place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace() - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output_with_place( - place, atol=1e-5, check_dygraph=(not self.use_mkldnn) - ) - @skip_check_grad_ci( reason="Fusion is for inference only, check_grad is not required." 
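One contract in the fused_conv2d patch above is easy to miss: PIR spells attributes in lower case (scale_in), while the legacy oneDNN kernels still look them up under the old fluid names (Scale_in, as the op_compat.yaml hunk records), so OneDNNPhiKernelInstruction registers each converted value under both spellings and forwards them all at Run() time. A condensed sketch of that behaviour, with a made-up value and local variables for illustration only:

    // Hypothetical fragment mirroring onednn_instruction.cc; extra_attr_ and
    // op_normalizer are the members used in the diff above.
    std::string fluid_op_name = "fused_conv2d";
    std::string attr_name = "scale_in";
    extra_attr_[attr_name] = 1.0f;  // converted from the PIR attribute
    auto legacy_name = op_normalizer.GetLegacyAttrName(fluid_op_name, attr_name);
    if (legacy_name != attr_name) {  // "Scale_in", per op_compat.yaml
      extra_attr_[legacy_name] = extra_attr_[attr_name];
    }
    // Run() then calls one_dnn_ctx->SetDnnAttr(name, value) for every entry,
    // so a kernel querying either spelling sees the same value.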
From cb5c1c7ea805f9d976b0b49927a4cca0970ed056 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 23 Jan 2024 11:07:22 +0800 Subject: [PATCH 17/34] [CINN] Add pass for broadcast tree to condition op in dynamic shape (#60828) * add pass for broadcast tree to condition op in dynamic shape * update group * fix bug * update broadcast tree * add some debug log * fix conflict * output_types of JitKernelOp come from compiled data. * polish debug code * fix bug of static shape * fix bug * polish code * update by review * update group * update pir.cc * fix bug --------- Co-authored-by: jiahy0825 --- paddle/cinn/common/broadcast_tree.cc | 7 +- paddle/cinn/common/broadcast_tree.h | 2 - .../divide_group_op_to_fusion_op_pass.cc | 99 +-- .../group_merge/lower_cinn_fusion_op_pass.cc | 622 +++++++++++++++++- .../group_merge/lower_cinn_fusion_op_pass.h | 2 + .../group_merge/op_with_group_merge_pass.cc | 2 - paddle/cinn/hlir/framework/pir/group.h | 27 + .../instruction/cinn_jit_instruction.cc | 16 +- paddle/fluid/pybind/pir.cc | 7 +- 9 files changed, 654 insertions(+), 130 deletions(-) diff --git a/paddle/cinn/common/broadcast_tree.cc b/paddle/cinn/common/broadcast_tree.cc index c20a78c2c9f84e..02edb349ee607f 100644 --- a/paddle/cinn/common/broadcast_tree.cc +++ b/paddle/cinn/common/broadcast_tree.cc @@ -137,7 +137,8 @@ std::optional> GetFirstCstrBroadcastable( } } if (lhs_symbol.has_value() && rhs_symbol.has_value()) { - CHECK(lhs_symbol != rhs_symbol); + CHECK(lhs_symbol != rhs_symbol) + << lhs_symbol.value() << " != " << rhs_symbol.value(); ret = symbol::Broadcastable{lhs_symbol.value(), rhs_symbol.value()}; return true; @@ -341,8 +342,4 @@ std::string ToTxtString(const BroadcastTree& tree) { tree.variant()); } -std::ostream& operator<<(std::ostream& os, const BroadcastTree& tree) { - os << ToTxtString(tree); -} - } // namespace cinn::common diff --git a/paddle/cinn/common/broadcast_tree.h b/paddle/cinn/common/broadcast_tree.h index b542180bf4728f..3b688934c597ab 100644 --- a/paddle/cinn/common/broadcast_tree.h +++ b/paddle/cinn/common/broadcast_tree.h @@ -33,6 +33,4 @@ BroadcastTree ConstructBroadcastTree(const BroadcastLeaf& leaves); std::string ToTxtString(const BroadcastTree&); -std::ostream& operator<<(std::ostream& os, const BroadcastTree& tree); - } // namespace cinn::common diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.cc index bfde4f92cd769e..0ed2b174a421a9 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.cc @@ -19,11 +19,13 @@ #include #include "paddle/cinn/adt/generate_map_expr.h" +#include "paddle/cinn/common/broadcast_tree.h" +#include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h" +#include "paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.h" #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" #include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h" #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.h" -#include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" #include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/cinn/runtime/flags.h" @@ -42,77 +44,6 @@ 
PD_DECLARE_bool(cinn_enable_map_expr); namespace { /* -using ShapeOrDataDimExprs4ValueT = - std::function; - -pir::Block::ConstIterator FindFirstExpandOp(pir::Block* block) { - for (auto iter = block->begin(); iter != block->end(); ++iter) { - if (iter->isa()) { - return iter; - } - } -} - -bool SameInputOutputShape( - paddle::dialect::ExpandOp expand_op, - const ShapeOrDataDimExprs4ValueT& ShapeOrDataDimExprs4Value) { - const auto& x = ShapeOrDataDimExprs4Value(expand_op.x()); - const auto& shape = ShapeOrDataDimExprs4Value(expand_op.shape()); - const auto& out = ShapeOrDataDimExprs4Value(expand_op.out()); - if (x.data().has_value()) return false; - if (!shape.data().has_value()) return false; - if (out.data().has_value()) return false; - CHECK(shape.data().value() == out.shape()); - return x.shape() == out.shape(); -} - -void ReplaceAllUsesWithInput(paddle::dialect::ExpandOp expand) { - pir::Value x = expand.x(); - expand.out().ReplaceAllUsesWith(x); -} - -void EraseExpandOp(pir::Block* block, pir::Block::ConstIterator expand_it) { - block->erase(expand_it); -} - -void EraseUpstreamGenerateShapeOp( - pir::Block* block, cinn::dialect::GenerateShapeOp generate_shape_op) { - for (auto iter = block->begin(); iter != block->end(); ++iter) { - if (iter->isa()) { - if (iter->dyn_cast() == - generate_shape_op) { - block->erase(iter); - } - } - } -} - -// Returns true if success -bool EraseOneExpand( - pir::Block* block, - const ShapeOrDataDimExprs4ValueT& ShapeOrDataDimExprs4Value) { - for (auto expand_it = block->begin(); expand_it != block->end(); - ++expand_it) { - if (!expand_it->isa()) continue; - auto expand = expand_it->dyn_cast(); - if (!SameInputOutputShape(expand, ShapeOrDataDimExprs4Value)) continue; - auto generate_shape_op = - expand.shape().defining_op(); - CHECK_NOTNULL(generate_shape_op); - ReplaceAllUsesWithInput(expand); - EraseExpandOp(block, expand_it); - EraseUpstreamGenerateShapeOp(block, generate_shape_op); - return true; - } - return false; -} - -void EraseExpands(pir::Block* block, - const ShapeOrDataDimExprs4ValueT& ShapeOrDataDimExprs4Value) { - while (EraseOneExpand(block, ShapeOrDataDimExprs4Value)) { - // Do nothing. 
- } -} std::vector GetBlockOutsideInput( const std::vector op_list) { @@ -230,30 +161,6 @@ std::vector GetOutputOpList( return vec_res; } -std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> -CreateGroupShapeOrDataExprs(const cinn::dialect::ir::GroupPtr& group, - pir::ShapeConstraintIRAnalysis* shape_analysis) { - std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> value2shape; - for (auto* op : group->ops) { - for (size_t i = 0; i < op->num_operands(); ++i) { - auto operand = op->operand_source(i); - if (shape_analysis->HasShapeOrDataForValue(operand)) { - value2shape.insert( - {operand, shape_analysis->GetShapeOrDataForValue(operand)}); - } - } - for (size_t i = 0; i < op->num_results(); ++i) { - auto result = op->result(i); - if (value2shape.find(result) == value2shape.end() && - shape_analysis->HasShapeOrDataForValue(result)) { - value2shape.insert( - {result, shape_analysis->GetShapeOrDataForValue(result)}); - } - } - } - return value2shape; -} - class GroupOpPattern : public pir::OpRewritePattern { public: explicit GroupOpPattern(::pir::IrContext* context) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/lower_cinn_fusion_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/lower_cinn_fusion_op_pass.cc index 4f0dda24934738..b043b7dc16ddf4 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/lower_cinn_fusion_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/lower_cinn_fusion_op_pass.cc @@ -19,6 +19,9 @@ #include #include "paddle/cinn/adt/generate_map_expr.h" +#include "paddle/cinn/common/broadcast_tree.h" +#include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h" +#include "paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.h" #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" #include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h" #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" @@ -29,6 +32,7 @@ #include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/cinn/runtime/flags.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/core/program.h" #include "paddle/pir/dialect/control_flow/ir/cf_op.h" @@ -45,6 +49,98 @@ namespace { using cinn::dialect::ir::Group; using cinn::hlir::framework::pir::CompatibleInfo; +using ShapeOrDataDimExprs4ValueT = + std::function; + +bool SameInputOutputShape( + paddle::dialect::ExpandOp expand_op, + const ShapeOrDataDimExprs4ValueT& ShapeOrDataDimExprs4Value) { + const auto& x = ShapeOrDataDimExprs4Value(expand_op.x()); + const auto& shape = ShapeOrDataDimExprs4Value(expand_op.shape()); + const auto& out = ShapeOrDataDimExprs4Value(expand_op.out()); + if (x.data().has_value()) return false; + if (!shape.data().has_value()) return false; + if (out.data().has_value()) return false; + CHECK(shape.data().value() == out.shape()); + return x.shape() == out.shape(); +} + +void ReplaceAllUsesWithInput(paddle::dialect::ExpandOp expand) { + pir::Value x = expand.x(); + expand.out().ReplaceAllUsesWith(x); +} + +// Returns true if success +bool EraseOneExpand( + pir::Block* block, + pir::PatternRewriter& rewriter, // NOLINT + const ShapeOrDataDimExprs4ValueT& ShapeOrDataDimExprs4Value) { + for (auto expand_it = block->begin(); expand_it != block->end(); + ++expand_it) { + if (!expand_it->isa()) continue; + auto expand = expand_it->dyn_cast(); + if 
(!SameInputOutputShape(expand, ShapeOrDataDimExprs4Value)) continue; + auto generate_shape_op = + expand.shape().defining_op(); + CHECK_NOTNULL(generate_shape_op); + ReplaceAllUsesWithInput(expand); + rewriter.EraseOp(expand); + rewriter.EraseOp(generate_shape_op); + return true; + } + return false; +} + +void EraseUneccessaryExpandsInBlock( + pir::Block* block, + pir::PatternRewriter& rewriter, // NOLINT + const ShapeOrDataDimExprs4ValueT& ShapeOrDataDimExprs4Value) { + while (EraseOneExpand(block, rewriter, ShapeOrDataDimExprs4Value)) { + // Do nothing. + } +} + +void ReplaceExpandWithBroadcast(pir::IrContext* ir_context, + pir::Block* block, + const cinn::dialect::ir::GroupPtr& group) { + std::vector op_list; + for (auto& op : *block) { + op_list.push_back(&op); + } + pir::Builder builder(ir_context, block); + for (auto* op : op_list) { + if (op && op->isa() && + op->operand_source(1) + .defining_op() + ->isa()) { + builder.SetInsertionPointAfter(op); + auto x_rank = op->operand_source(0) + .type() + .dyn_cast() + .GetRank(); + auto out_rank = + op->result(0).type().dyn_cast().GetRank(); + std::vector broadcast_axes(x_rank, 0); + size_t index_gap = out_rank - x_rank; + for (size_t i = 0; i < x_rank; ++i) { + broadcast_axes[i] = i + index_gap; + } + std::vector out_shape(out_rank, -1); + auto broadcast = builder.Build( + op->operand_source(0), broadcast_axes, out_shape); + auto broadcast_out = broadcast.result(0); + auto expand_out = op->result(0); + expand_out.ReplaceAllUsesWith(broadcast_out); + group->value_to_shape_or_data_exprs[broadcast_out] = + group->GetShapeOrDataExprs(expand_out); + CHECK(op->use_empty()); + auto generate_shape_op = op->operand_source(1).defining_op(); + op->Erase(); + generate_shape_op->Erase(); + } + } +} + std::vector GetBlockOutsideInput( const std::vector& op_list) { std::vector vec_res; @@ -68,24 +164,451 @@ std::vector GetBlockOutsideInput( return vec_res; } +std::tuple BroadcastableToCondValue( + const symbol::Broadcastable& broadcastable_condition, + pir::ShapeConstraintIRAnalysis& shape_analysis, // NOLINT + const std::vector& group_inputs, + pir::Builder& builder) { // NOLINT + const auto& lhs_expr = broadcastable_condition->lhs; + const auto& rhs_expr = broadcastable_condition->rhs; + auto ShapeOrDataDimExprs4Value = [&shape_analysis](pir::Value value) { + return shape_analysis.GetShapeOrDataForValue(value); + }; + + std::vector lhs_minial_inputs; + std::vector lhs_output_dim_expr_attrs; + cinn::dialect::GenerateShapeOp::SymbolBindings lhs_symbol_bindings; + bool success = + cinn::dialect::MakeGenerateShapeOpAttribute(builder.ir_context(), + ShapeOrDataDimExprs4Value, + {lhs_expr}, + group_inputs, + &lhs_minial_inputs, + &lhs_output_dim_expr_attrs, + &lhs_symbol_bindings); + CHECK(success); + std::vector rhs_minial_inputs; + std::vector rhs_output_dim_expr_attrs; + cinn::dialect::GenerateShapeOp::SymbolBindings rhs_symbol_bindings; + success = + cinn::dialect::MakeGenerateShapeOpAttribute(builder.ir_context(), + ShapeOrDataDimExprs4Value, + {rhs_expr}, + group_inputs, + &rhs_minial_inputs, + &rhs_output_dim_expr_attrs, + &rhs_symbol_bindings); + CHECK(success); + + auto lhs_value = + builder + .Build( + lhs_minial_inputs, lhs_output_dim_expr_attrs, lhs_symbol_bindings) + .out(); + auto rhs_value = + builder + .Build( + rhs_minial_inputs, rhs_output_dim_expr_attrs, rhs_symbol_bindings) + .out(); + + auto const_one = builder + .Build( + std::vector{1}, 1, phi::DataType::INT64) + .out(); + auto lhs_eq_rhs_cond = + builder.Build(lhs_value, 
rhs_value).out(); + auto lhs_eq_one_cond = + builder.Build(lhs_value, const_one).out(); + auto rhs_eq_one_cond = + builder.Build(rhs_value, const_one).out(); + return std::tuple( + lhs_eq_rhs_cond, lhs_eq_one_cond, rhs_eq_one_cond); +} + +cinn::dialect::ir::GroupPtr CloneGroup(const cinn::dialect::ir::GroupPtr& group, + pir::Block* block, + pir::IrMapping* ir_mapping) { + return group->Clone(block, *ir_mapping); +} + +void UpdateGroupShapeExprs( + const cinn::dialect::ir::GroupPtr& new_group, + const cinn::dialect::ir::GroupPtr& origin_group, + const pir::IrMapping& ir_mapping, + const cinn::common::BroadcastLeaf& value_dim_exprs_list, + const std::unordered_map& value_to_dim_expr_idx) { + for (const auto& [origin_val, new_val] : ir_mapping.GetMap()) { + const auto& shape_dim_expr = + value_dim_exprs_list->at(value_to_dim_expr_idx.at(origin_val)); + const auto& origin_shape_or_data = + origin_group->GetShapeOrDataExprs(origin_val); + if (origin_shape_or_data.data()) { + new_group->value_to_shape_or_data_exprs[new_val] = + symbol::ShapeOrDataDimExprs( + std::vector{shape_dim_expr.size()}, + shape_dim_expr); + } else { + new_group->value_to_shape_or_data_exprs[new_val] = + symbol::ShapeOrDataDimExprs(shape_dim_expr); + } + } +} + +void SetLeafBlockByGroupView( + const cinn::dialect::ir::GroupPtr& origin_group, + const cinn::common::BroadcastLeaf& value_dim_exprs_list, + const std::unordered_map& value_to_dim_expr_idx, + pir::Builder& builder, // NOLINT + pir::Block* block, + std::unordered_map* group_map) { + pir::IrMapping ir_mapping; + auto origin_group_inputs = GetBlockOutsideInput(origin_group->ops); + for (auto input : origin_group_inputs) { + ir_mapping.Add(input, input); + } + + auto new_group = CloneGroup(origin_group, block, &ir_mapping); + CHECK_EQ(origin_group->ops.size(), new_group->ops.size()); + UpdateGroupShapeExprs(new_group, + origin_group, + ir_mapping, + value_dim_exprs_list, + value_to_dim_expr_idx); + + // Insert YieldOp for outputs + std::vector outputs; + builder.SetInsertionPointToBlockEnd(block); + for (auto output : origin_group->GetGroupOutputValues()) { + outputs.push_back(ir_mapping.Lookup(output)); + } + builder.Build(outputs); + + group_map->insert({block, new_group}); +} + +std::vector GetOpOuputValues(const pir::Operation* op) { + std::vector outputs; + outputs.reserve(op->num_results()); + for (size_t i = 0; i < op->num_results(); ++i) { + outputs.push_back(op->result(i)); + } + return outputs; +} + +void InsertYieldOpForCondBlock(pir::Operation* cond_op, + pir::Builder& builder) { // NOLINT + if (cond_op) { + builder.SetInsertionPointAfter(cond_op); + builder.Build(GetOpOuputValues(cond_op)); + } +} + +// Visit broadcast_tree by dfs +pir::Operation* CreateConditionBlock( + const cinn::common::BroadcastTree& broadcast_tree, + const cinn::dialect::ir::GroupPtr& origin_group, + pir::ShapeConstraintIRAnalysis& shape_analysis, // NOLINT + const std::unordered_map& value_to_dim_expr_idx, + const std::vector& group_inputs, + const std::vector& output_types, + pir::Builder& builder, // NOLINT + pir::Block* block, + std::unordered_map* group_map) { + if (broadcast_tree.Has()) { + const auto& broadcast_leaf = + broadcast_tree.Get(); + SetLeafBlockByGroupView(origin_group, + broadcast_leaf, + value_to_dim_expr_idx, + builder, + block, + group_map); + return nullptr; + } else { + const auto& branch = + broadcast_tree + .Get>(); + const auto& [lhs_eq_rhs_cond, lhs_eq_one_cond, rhs_eq_one_cond] = + BroadcastableToCondValue( + branch.Get<0>(), shape_analysis, 
group_inputs, builder); + + // lhs == rhs + auto lhs_eq_rhs_cond_op = builder.Build( + lhs_eq_rhs_cond, std::vector{output_types}); + pir::Block& lhs_eq_rhs_block = lhs_eq_rhs_cond_op.true_block(); + builder.SetInsertionPointToBlockEnd(&lhs_eq_rhs_block); + auto* lhs_eq_rhs_block_op = CreateConditionBlock(branch.Get<1>(), + origin_group, + shape_analysis, + value_to_dim_expr_idx, + group_inputs, + output_types, + builder, + &lhs_eq_rhs_block, + group_map); + InsertYieldOpForCondBlock(lhs_eq_rhs_block_op, builder); + + pir::Block& lhs_not_eq_rhs_block = lhs_eq_rhs_cond_op.false_block(); + builder.SetInsertionPointToBlockEnd(&lhs_not_eq_rhs_block); + + // lhs != rhs && lhs == 1 + auto lhs_eq_one_cond_op = builder.Build( + lhs_eq_one_cond, std::vector{output_types}); + pir::Block& lhs_eq_one_block = lhs_eq_one_cond_op.true_block(); + builder.SetInsertionPointToBlockEnd(&lhs_eq_one_block); + auto* lhs_eq_one_block_op = CreateConditionBlock(branch.Get<2>(), + origin_group, + shape_analysis, + value_to_dim_expr_idx, + group_inputs, + output_types, + builder, + &lhs_eq_one_block, + group_map); + InsertYieldOpForCondBlock(lhs_eq_one_block_op, builder); + + // lhs != rhs && rhs == 1 + pir::Block& rhs_eq_one_block = lhs_eq_one_cond_op.false_block(); + builder.SetInsertionPointToBlockEnd(&rhs_eq_one_block); + auto* rhs_eq_one_block_op = CreateConditionBlock(branch.Get<3>(), + origin_group, + shape_analysis, + value_to_dim_expr_idx, + group_inputs, + output_types, + builder, + &rhs_eq_one_block, + group_map); + InsertYieldOpForCondBlock(rhs_eq_one_block_op, builder); + + builder.SetInsertionPointToBlockEnd(&lhs_not_eq_rhs_block); + builder.Build(GetOpOuputValues(lhs_eq_one_cond_op)); + + return lhs_eq_rhs_cond_op; + } +} + +std::unordered_map> +ComplieGroupAsOpAttribute( + const std::shared_ptr& pir_compiler, + const std::vector& group_list) { + auto fn_ptr_res = pir_compiler->BuildCUDAJITInfo(group_list); + + std::unordered_map> + result; + for (size_t i = 0; i < group_list.size(); ++i) { + std::unordered_map op_attrs{ + {cinn::dialect::JitKernelOp::kAttrName, + cinn::dialect::CINNKernelInfoAttribute::get(pir::IrContext::Instance(), + fn_ptr_res[i])}, + }; + result.insert({group_list[i], op_attrs}); + } + return result; +} + +void SimplyConditionBlock( + pir::PatternRewriter& rewriter, // NOLINT + std::unordered_map* group_map) { + VLOG(4) << "simply condition block"; + using DoEachMutBlockGroupT = + std::function; + const auto& ForEachMutBlockGroup = [&](const DoEachMutBlockGroupT& DoEach) { + for (auto& [block, group] : *group_map) { + DoEach(block, group); + std::vector group_new_ops; + group_new_ops.reserve(block->size()); + std::unordered_set group_ops_set; + for (auto& op : *block) { + if (!op.isa()) { + group_new_ops.push_back(&op); + group_ops_set.insert(&op); + } + } + group->ops = group_new_ops; + group->ops_set = group_ops_set; + } + }; + ForEachMutBlockGroup([&](auto* block, const auto& group) { + auto GetShapeOrDataForValue = + [&group](pir::Value value) -> const symbol::ShapeOrDataDimExprs& { + return group->GetShapeOrDataExprs(value); + }; + EraseUneccessaryExpandsInBlock(block, rewriter, GetShapeOrDataForValue); + }); + ForEachMutBlockGroup([&](auto* block, const auto& group) { + ReplaceExpandWithBroadcast(rewriter.ir_context(), block, group); + }); +} + +void CompileGroupToJitKernelOp( + const std::vector& group_inputs, + const std::vector& output_types, + const std::shared_ptr& pir_compiler, + pir::PatternRewriter& rewriter, // NOLINT + std::unordered_map* group_map) { + // prepare 
attribute for jit_kernel_op + std::vector group_list; + group_list.reserve(group_map->size()); + for (const auto& [_, group] : *group_map) { + group_list.push_back(group); + } + auto op_attr_map = ComplieGroupAsOpAttribute(pir_compiler, group_list); + VLOG(4) << "The size of group_map is : " << group_map->size(); + for (auto& [block, group] : *group_map) { + auto& yeild_op = block->back(); + CHECK(yeild_op.isa()) << "Last op of block should be yield"; + rewriter.set_insertion_point(&yeild_op); + auto jit_kernel_op = rewriter.Build( + group_inputs, op_attr_map.at(group), output_types); + auto group_output_values = group->GetGroupOutputValues(); + CHECK(jit_kernel_op.num_results() == group_output_values.size()); + for (size_t i = 0; i < jit_kernel_op.num_results(); ++i) { + rewriter.ReplaceAllUsesWith(group_output_values[i], + jit_kernel_op.result(i)); + } + + // Delete origin group ops + std::vector group_ops; + for (auto iter = block->rbegin(); iter != block->rend(); iter++) { + if (!iter->isa()) { + group_ops.push_back(&(*iter)); + } + } + for (auto* op : group_ops) { + if (op->use_empty()) { + op->Erase(); + } + } + } +} + +pir::Operation* ComplieBroadcastTreeToConditionBlock( + const cinn::common::BroadcastTree& broadcast_tree, + const cinn::dialect::ir::GroupPtr& group, + pir::ShapeConstraintIRAnalysis& shape_analysis, // NOLINT + const std::shared_ptr& pir_compiler, + const std::unordered_map& value_to_dim_expr_idx, + const std::vector& group_inputs, + const std::vector& output_types, + pir::PatternRewriter& rewriter) { // NOLINT + // 1. broadcast tree to condition op + VLOG(4) << "broadcast tree to condition op"; + std::unordered_map group_map; + pir::Operation* cond_op = CreateConditionBlock(broadcast_tree, + group, + shape_analysis, + value_to_dim_expr_idx, + group_inputs, + output_types, + rewriter, + rewriter.block(), + &group_map); + // 2. simply every condition block + auto* program = group->ops.front()->GetParentProgram(); + VLOG(6) << "Before simply condition block: " << *program; + + SimplyConditionBlock(rewriter, &group_map); + VLOG(6) << "After simply condition block: " << *program; + + // 3. complie condition block to jit_kernel_op + CompileGroupToJitKernelOp( + group_inputs, output_types, pir_compiler, rewriter, &group_map); + VLOG(6) << "complie condition block to jit_kernel_op: " << *program; + + return cond_op; +} + +pir::Operation* ProcessDyShapeGroup( + const cinn::dialect::ir::GroupPtr& group, + pir::ShapeConstraintIRAnalysis& shape_analysis, // NOLINT + const std::shared_ptr& pir_compiler, + pir::PatternRewriter& rewriter) { // NOLINT + std::unordered_set value_view; + group->WalkOps([&group, &value_view](pir::Operation* op) { + for (size_t i = 0; i < op->num_operands(); ++i) { + value_view.insert(op->operand_source(i)); + } + for (size_t i = 0; i < op->num_results(); ++i) { + value_view.insert(op->result(i)); + } + }); + + // construct broadcast tree + VLOG(4) << "construct broadcast tree"; + cinn::adt::List> all_value_dim_exprs; + std::unordered_map value_to_dim_expr_idx; + for (auto value : value_view) { + const auto& shape_dim_expr = group->GetShapeOrDataExprs(value); + const auto& data_shape = shape_dim_expr.data(); + if (data_shape) { + all_value_dim_exprs->push_back(*data_shape); + } else { + all_value_dim_exprs->push_back(shape_dim_expr.shape()); + } + value_to_dim_expr_idx[value] = all_value_dim_exprs->size() - 1; + } + VLOG(6) << "before constructed. 
broadcast-leaf: \n" + << ToTxtString(cinn::common::BroadcastTree(all_value_dim_exprs)); + cinn::common::BroadcastTree broadcast_tree = + cinn::common::ConstructBroadcastTree( + cinn::common::BroadcastLeaf(all_value_dim_exprs)); + VLOG(0) << "broadcast-tree: \n" << ToTxtString(broadcast_tree); + + auto group_inputs = GetBlockOutsideInput(group->ops); + shape_analysis.PrintShapeOrDatas(); + + // has multiple branch + if (broadcast_tree + .Has>()) { + std::vector output_types; + auto group_output_values = group->GetGroupOutputValues(); + for (size_t i = 0; i < group_output_values.size(); ++i) { + output_types.push_back(group_output_values[i].type()); + } + return ComplieBroadcastTreeToConditionBlock(broadcast_tree, + group, + shape_analysis, + pir_compiler, + value_to_dim_expr_idx, + group_inputs, + output_types, + rewriter); + } else { // no condition block + // complie group to jit_kernel_op + auto op_attr_map = ComplieGroupAsOpAttribute(pir_compiler, {group}); + std::vector output_types; + const auto& group_output_values = group->output_values; + for (size_t i = 0; i < group_output_values.size(); ++i) { + output_types.push_back(group_output_values[i].type()); + } + auto jit_kernel_op = rewriter.Build( + group_inputs, op_attr_map.at(group), output_types); + return jit_kernel_op; + } +} + std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> -CreateGroupShapeOrDataExprs(const cinn::dialect::ir::GroupPtr& group, - pir::ShapeConstraintIRAnalysis* shape_analysis) { +CreateGroupShapeOrDataExprs( + const cinn::dialect::ir::GroupPtr& group, + pir::ShapeConstraintIRAnalysis& shape_analysis) { // NOLINT std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> value2shape; for (auto* op : group->ops) { for (size_t i = 0; i < op->num_operands(); ++i) { auto operand = op->operand_source(i); - if (shape_analysis->HasShapeOrDataForValue(operand)) { + if (shape_analysis.HasShapeOrDataForValue(operand)) { value2shape.insert( - {operand, shape_analysis->GetShapeOrDataForValue(operand)}); + {operand, shape_analysis.GetShapeOrDataForValue(operand)}); } } for (size_t i = 0; i < op->num_results(); ++i) { auto result = op->result(i); if (value2shape.find(result) == value2shape.end() && - shape_analysis->HasShapeOrDataForValue(result)) { + shape_analysis.HasShapeOrDataForValue(result)) { value2shape.insert( - {result, shape_analysis->GetShapeOrDataForValue(result)}); + {result, shape_analysis.GetShapeOrDataForValue(result)}); } } } @@ -116,41 +639,59 @@ class FusionOpPattern : public pir::OpRewritePattern { auto& shape_analysis = pir::ShapeAnalysisManager::Instance().Get( fusion_op->GetParentProgram()); group->value_to_shape_or_data_exprs = - CreateGroupShapeOrDataExprs(group, &shape_analysis); + CreateGroupShapeOrDataExprs(group, shape_analysis); if (FLAGS_cinn_enable_map_expr) { cinn::adt::TryGenerateMapExprFromGroup(group); } // TODO(zhangyuqin1998): Replace pir::Group with a new structure - auto fn_ptr_res = ir_compiler->BuildCUDAJITInfo({group}); - std::unordered_map op_attrs{ - {cinn::dialect::JitKernelOp::kAttrName, - cinn::dialect::CINNKernelInfoAttribute::get(ctx, fn_ptr_res[0])}, - }; - - // Generate jit kernel op input and output - auto vec_ins = GetBlockOutsideInput(group->ops); + pir::Operation* complied_op = + ProcessGroup(group, shape_analysis, ir_compiler, rewriter); - std::vector vec_types; + // the output_values of group may be changed. 
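+ // Refresh value2id after compilation so each fusion_op result can be re-linked to the matching result of complied_op below.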
for (size_t i = 0; i < group->output_values.size(); ++i) { - vec_types.push_back(group->output_values[i].type()); value2id[group->output_values[i]] = i; } - auto jit_kernel_op = rewriter.Build( - vec_ins, op_attrs, vec_types); - auto yeild_op = fusion_op.GetOperators().back(); for (size_t i = 0; i < fusion_op.num_results(); ++i) { rewriter.ReplaceAllUsesWith( fusion_op.result(i), - jit_kernel_op.result(value2id[yeild_op->operand_source(i)])); + complied_op->result(value2id[yeild_op->operand_source(i)])); + if (shape_analysis.HasShapeOrDataForValue(fusion_op.result(i))) { + shape_analysis.SetShapeOrDataForValue( + complied_op->result(value2id[yeild_op->operand_source(i)]), + shape_analysis.GetShapeOrDataForValue(fusion_op.result(i))); + } else { + LOG(WARNING) << "No shape_data for " + << fusion_op.result(i).defining_op()->name() << "_result_" + << i; + } } rewriter.EraseOp(fusion_op); return true; } + protected: + virtual pir::Operation* ProcessGroup( + const cinn::dialect::ir::GroupPtr& group, + pir::ShapeConstraintIRAnalysis& shape_analysis, // NOLINT + const std::shared_ptr& pir_compiler, + pir::PatternRewriter& rewriter) const { // NOLINT + auto group_inputs = GetBlockOutsideInput(group->ops); + // complie group to jit_kernel_op + auto op_attr_map = ComplieGroupAsOpAttribute(pir_compiler, {group}); + std::vector output_types; + const auto& group_output_values = group->output_values; + for (size_t i = 0; i < group_output_values.size(); ++i) { + output_types.push_back(group_output_values[i].type()); + } + auto jit_kernel_op = rewriter.Build( + group_inputs, op_attr_map.at(group), output_types); + return jit_kernel_op; + } + private: std::shared_ptr RebuildGroup(cinn::dialect::FusionOp fusion_op) const { auto group = std::make_shared(); @@ -181,6 +722,20 @@ class FusionOpPattern : public pir::OpRewritePattern { } }; +class DyShapeFusionOpPattern : public FusionOpPattern { + public: + using FusionOpPattern::FusionOpPattern; + + protected: + virtual pir::Operation* ProcessGroup( + const cinn::dialect::ir::GroupPtr& group, + pir::ShapeConstraintIRAnalysis& shape_analysis, // NOLINT + const std::shared_ptr& pir_compiler, + pir::PatternRewriter& rewriter) const { // NOLINT + return ProcessDyShapeGroup(group, shape_analysis, pir_compiler, rewriter); + } +}; + class LowerCinnFusionOpPass : public pir::PatternRewritePass { public: LowerCinnFusionOpPass() @@ -202,6 +757,27 @@ class LowerCinnFusionOpPass : public pir::PatternRewritePass { } }; +class LowerCinnDyShapeFusionOpPass : public pir::PatternRewritePass { + public: + LowerCinnDyShapeFusionOpPass() + : pir::PatternRewritePass("lower_cinn_dynamic_shape_fusion_op", 1) {} + + pir::RewritePatternSet InitializePatterns(pir::IrContext* context) override { + context->GetOrRegisterDialect(); + context->GetOrRegisterDialect(); + context->GetOrRegisterDialect(); + + pir::RewritePatternSet ps(context); + ps.Add(context); + + return ps; + } + + bool CanApplyOn(pir::Operation* op) const override { + return op->isa() && op->num_regions() > 0; + } +}; + } // namespace namespace cinn { @@ -212,6 +788,10 @@ std::unique_ptr<::pir::Pass> CreateLowerCinnFusionOpPass() { return std::make_unique(); } +std::unique_ptr<::pir::Pass> CreateLowerCinnDyShapeFusionOpPass() { + return std::make_unique(); +} + } // namespace ir } // namespace dialect } // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/lower_cinn_fusion_op_pass.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/lower_cinn_fusion_op_pass.h index 
cb9075a24c6d9a..2df189179c4174 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/lower_cinn_fusion_op_pass.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/lower_cinn_fusion_op_pass.h @@ -21,6 +21,8 @@ namespace cinn { namespace dialect { namespace ir { std::unique_ptr<::pir::Pass> CreateLowerCinnFusionOpPass(); + +std::unique_ptr<::pir::Pass> CreateLowerCinnDyShapeFusionOpPass(); } // namespace ir } // namespace dialect } // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc index 54005eb22f25b3..e982a2c2e7a40f 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc @@ -664,8 +664,6 @@ GroupList OpFusionPassInternal( } VLOG(3) << "OpFusionPass Finish...!"; - VLOG(3) << "OpFusionPass Finish...!"; - return res; } diff --git a/paddle/cinn/hlir/framework/pir/group.h b/paddle/cinn/hlir/framework/pir/group.h index 8202ab6f172996..94cfe542990b0f 100644 --- a/paddle/cinn/hlir/framework/pir/group.h +++ b/paddle/cinn/hlir/framework/pir/group.h @@ -21,6 +21,7 @@ #include "paddle/cinn/hlir/framework/op.h" #include "paddle/cinn/hlir/framework/pir/utils.h" +#include "paddle/pir/core/builtin_type_interfaces.h" #include "paddle/pir/core/operation.h" #include "paddle/pir/core/value.h" #include "paddle/pir/dialect/shape/utils/shape_analysis.h" @@ -218,6 +219,32 @@ struct Group { return group_outputs; } + std::vector<::pir::Value> GetGroupOutputValues() const { + std::unordered_set<::pir::Operation*> group_ops_set; + for (auto* op : this->ops) { + group_ops_set.insert(op); + } + + std::vector<::pir::Value> output_values; + for (auto* op : this->ops) { + for (size_t i = 0; i < op->num_results(); ++i) { + auto result = op->result(i); + if (!result) { + continue; + } + for (auto use_iter = result.use_begin(); use_iter != result.use_end(); + ++use_iter) { + auto* use_op = use_iter->owner(); + if (group_ops_set.find(use_op) == group_ops_set.end()) { + output_values.push_back(result); + break; + } + } + } + } + return output_values; + } + std::string GetFuncName() { return "fn_" + group_id + unique_id; } std::shared_ptr mut_map_expr_ctx() { diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index a88221bc23e8b7..3708c255d59e47 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -55,6 +55,13 @@ class CinnJitInstruction::FnPtrImpl { int_arg_mp.second.dim_idx))); } + if (VLOG_IS_ON(4)) { + VLOG(4) << "Run func_args_ size: " << func_args_.size(); + for (const auto& args : func_args_) { + VLOG(4) << " args type_code: " << args.type_code(); + } + } + // 3. Launch host kernel ((lower_func_ptr_g)cinn_kernel_info_.fn_ptr)( static_cast(func_args_.data()), func_args_.size(), stream); @@ -73,8 +80,6 @@ class CinnJitInstruction::FnPtrImpl { // 2. 
Convert arg's data about shape of Tensor to cinn_pod_value_t for (const auto& int_arg_mp : cinn_kernel_info_.int_args_map) { - func_args_.emplace_back(kernel_args[int_arg_mp.second.arg_idx]->dims().at( - int_arg_mp.second.dim_idx)); func_args_.emplace_back(static_cast( kernel_args[int_arg_mp.second.arg_idx]->dims().at( int_arg_mp.second.dim_idx))); @@ -88,6 +93,13 @@ class CinnJitInstruction::FnPtrImpl { sizeof(int64_t*))); } + if (VLOG_IS_ON(4)) { + VLOG(4) << "InferShape func_args_ size: " << func_args_.size(); + for (const auto& args : func_args_) { + VLOG(4) << " args type_code: " << args.type_code(); + } + } + // 4. Launch infer_shape_fn_ptr to infer shape of output tensor ((infer_shape_func_ptr_g)cinn_kernel_info_.infer_shape_fn_ptr)( static_cast(func_args_.data()), diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index b2a687829498ca..eb58d6289267d9 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -1566,6 +1566,7 @@ void AddCinnPass(std::shared_ptr &pass_manager, // NOLINT has_dynamic_shape ? std::make_shared(ctx) : nullptr; + pass_manager->EnableIRPrinting(); pass_manager->AddPass(cinn::dialect::ir::CreatePdOpToCinnOpPass()); if (has_dynamic_shape) { pass_manager->AddPass(pir::CreateShapeOptimizationPass()); @@ -1579,9 +1580,11 @@ void AddCinnPass(std::shared_ptr &pass_manager, // NOLINT if (auto pass = cinn::dialect::ir::CreateConvertStaticDimToDynamicPass()) { pass_manager->AddPass(std::move(pass.value())); } + if (has_dynamic_shape) { + pass_manager->AddPass( + cinn::dialect::ir::CreateLowerCinnDyShapeFusionOpPass()); + } pass_manager->AddPass(cinn::dialect::ir::CreateLowerCinnFusionOpPass()); - VLOG(4) << "has_dynamic_shape :" << has_dynamic_shape - << ", shape_analysis: " << shape_analysis; #else PADDLE_THROW(platform::errors::Unimplemented( "Currently we only support CINN Pass for Pir under @to_static, please " From c68d6b748ccd2048b3b6e7cb6be0d9cac9c9ca9f Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 23 Jan 2024 11:13:56 +0800 Subject: [PATCH 18/34] Fix reproducability reproducibility (#61026) --- test/collective/fleet/hybrid_parallel_mp_layers.py | 2 +- test/collective/fleet/hybrid_parallel_mp_model.py | 2 +- .../fleet/hybrid_parallel_mp_model_with_sequence_parallel.py | 2 +- test/collective/fleet/hybrid_parallel_pp_amp.py | 2 +- test/collective/fleet/hybrid_parallel_pp_bf16.py | 2 +- test/collective/fleet/hybrid_parallel_pp_embedding.py | 2 +- test/collective/fleet/hybrid_parallel_pp_fp16.py | 2 +- test/collective/fleet/hybrid_parallel_pp_recompute.py | 2 +- test/collective/fleet/hybrid_parallel_pp_transformer.py | 2 +- .../fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py | 2 +- test/collective/fleet/hybrid_parallel_qat.py | 2 +- test/collective/fleet/hybrid_parallel_shared_weight.py | 2 +- test/collective/fleet/parallel_class_center_sample.py | 2 +- test/collective/fleet/parallel_margin_cross_entropy.py | 2 +- test/legacy_test/hybrid_parallel_perf_test.py | 2 +- test/legacy_test/hybrid_parallel_pp_alexnet.py | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/test/collective/fleet/hybrid_parallel_mp_layers.py b/test/collective/fleet/hybrid_parallel_mp_layers.py index 4a7b223c6879a4..fc16adb97a04b4 100644 --- a/test/collective/fleet/hybrid_parallel_mp_layers.py +++ b/test/collective/fleet/hybrid_parallel_mp_layers.py @@ -23,7 +23,7 @@ def set_random_seed(seed): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed) 
paddle.seed(seed) diff --git a/test/collective/fleet/hybrid_parallel_mp_model.py b/test/collective/fleet/hybrid_parallel_mp_model.py index 08ae8f51e47f13..0cd8cf24b54354 100644 --- a/test/collective/fleet/hybrid_parallel_mp_model.py +++ b/test/collective/fleet/hybrid_parallel_mp_model.py @@ -23,7 +23,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + rank_id) diff --git a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py index a4f11294f3815b..85cc311a16ce71 100644 --- a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py +++ b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py @@ -24,7 +24,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + rank_id) diff --git a/test/collective/fleet/hybrid_parallel_pp_amp.py b/test/collective/fleet/hybrid_parallel_pp_amp.py index f3fe88a9161cfc..c4dfbd1f73644e 100644 --- a/test/collective/fleet/hybrid_parallel_pp_amp.py +++ b/test/collective/fleet/hybrid_parallel_pp_amp.py @@ -24,7 +24,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/collective/fleet/hybrid_parallel_pp_bf16.py b/test/collective/fleet/hybrid_parallel_pp_bf16.py index f260cd88f2f20b..c4857f979e42d1 100644 --- a/test/collective/fleet/hybrid_parallel_pp_bf16.py +++ b/test/collective/fleet/hybrid_parallel_pp_bf16.py @@ -25,7 +25,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/collective/fleet/hybrid_parallel_pp_embedding.py b/test/collective/fleet/hybrid_parallel_pp_embedding.py index d485e77a799728..115868bcdc162a 100644 --- a/test/collective/fleet/hybrid_parallel_pp_embedding.py +++ b/test/collective/fleet/hybrid_parallel_pp_embedding.py @@ -26,7 +26,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/collective/fleet/hybrid_parallel_pp_fp16.py b/test/collective/fleet/hybrid_parallel_pp_fp16.py index c6c107a852a222..33c335c9d85ab5 100644 --- a/test/collective/fleet/hybrid_parallel_pp_fp16.py +++ b/test/collective/fleet/hybrid_parallel_pp_fp16.py @@ -28,7 +28,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/collective/fleet/hybrid_parallel_pp_recompute.py b/test/collective/fleet/hybrid_parallel_pp_recompute.py index 3d95375d1da8ac..a4a7311a154204 100644 --- a/test/collective/fleet/hybrid_parallel_pp_recompute.py +++ b/test/collective/fleet/hybrid_parallel_pp_recompute.py @@ -27,7 +27,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) 
np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer.py b/test/collective/fleet/hybrid_parallel_pp_transformer.py index 18986e3df34fbe..3e1d6c157ad538 100644 --- a/test/collective/fleet/hybrid_parallel_pp_transformer.py +++ b/test/collective/fleet/hybrid_parallel_pp_transformer.py @@ -27,7 +27,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py index 35a17f17acea24..d155d14a13730a 100644 --- a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py +++ b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py @@ -27,7 +27,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/collective/fleet/hybrid_parallel_qat.py b/test/collective/fleet/hybrid_parallel_qat.py index 484cfd168b5305..0feeca40771b8e 100644 --- a/test/collective/fleet/hybrid_parallel_qat.py +++ b/test/collective/fleet/hybrid_parallel_qat.py @@ -26,7 +26,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + rank_id) diff --git a/test/collective/fleet/hybrid_parallel_shared_weight.py b/test/collective/fleet/hybrid_parallel_shared_weight.py index f54b994768740a..2202d88e907233 100644 --- a/test/collective/fleet/hybrid_parallel_shared_weight.py +++ b/test/collective/fleet/hybrid_parallel_shared_weight.py @@ -34,7 +34,7 @@ def print_hook_fn(grad): def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/collective/fleet/parallel_class_center_sample.py b/test/collective/fleet/parallel_class_center_sample.py index 250ff00e5bcdc8..42f891d9feff67 100644 --- a/test/collective/fleet/parallel_class_center_sample.py +++ b/test/collective/fleet/parallel_class_center_sample.py @@ -23,7 +23,7 @@ def set_random_seed(seed): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed) paddle.seed(seed) diff --git a/test/collective/fleet/parallel_margin_cross_entropy.py b/test/collective/fleet/parallel_margin_cross_entropy.py index 6db97b2c87a701..68fd68ff6d72e1 100644 --- a/test/collective/fleet/parallel_margin_cross_entropy.py +++ b/test/collective/fleet/parallel_margin_cross_entropy.py @@ -23,7 +23,7 @@ def set_random_seed(seed): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed) paddle.seed(seed) diff --git a/test/legacy_test/hybrid_parallel_perf_test.py b/test/legacy_test/hybrid_parallel_perf_test.py index da0148b01b88cf..c0f58939149835 100644 --- a/test/legacy_test/hybrid_parallel_perf_test.py +++ b/test/legacy_test/hybrid_parallel_perf_test.py @@ -22,7 +22,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" 
random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) diff --git a/test/legacy_test/hybrid_parallel_pp_alexnet.py b/test/legacy_test/hybrid_parallel_pp_alexnet.py index dc461176da1679..b9d5a98a199556 100644 --- a/test/legacy_test/hybrid_parallel_pp_alexnet.py +++ b/test/legacy_test/hybrid_parallel_pp_alexnet.py @@ -31,7 +31,7 @@ def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" + """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed + dp_id) paddle.seed(seed + dp_id) From c585dcc5c390c77ca2c648950f81393d6d6e7394 Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 23 Jan 2024 11:15:06 +0800 Subject: [PATCH 19/34] Fix euqal equal (#61009) --- paddle/cinn/ir/schedule/schedule_desc.h | 4 +- paddle/cinn/ir/test/ir_compare_test.cc | 2 +- paddle/cinn/ir/test/schedule_desc_test.cc | 2 +- paddle/cinn/ir/utils/ir_compare.cc | 2 +- paddle/cinn/ir/utils/ir_compare.h | 4 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 4 +- paddle/fluid/pir/transforms/inplace_pass.cc | 10 ++--- .../pir/transforms/pd_op_to_kernel_pass.cc | 2 +- .../transforms/transform_general_functions.h | 2 +- paddle/phi/infermeta/binary.cc | 4 +- paddle/phi/infermeta/fusion.cc | 4 +- paddle/phi/infermeta/multiary.cc | 8 ++-- paddle/phi/infermeta/unary.cc | 4 +- paddle/phi/kernels/cpu/conv_util.h | 2 +- .../memory_efficient_attention_grad_kernel.cu | 40 +++++++++---------- 15 files changed, 47 insertions(+), 47 deletions(-) diff --git a/paddle/cinn/ir/schedule/schedule_desc.h b/paddle/cinn/ir/schedule/schedule_desc.h index eeaca70ca0e94e..4458bcb4ed1175 100644 --- a/paddle/cinn/ir/schedule/schedule_desc.h +++ b/paddle/cinn/ir/schedule/schedule_desc.h @@ -31,7 +31,7 @@ namespace ir { // records all transform/getting operations executed by a corresponding // ir::IRSchedule. A ScheduleDesc can be serialized to JSON format and saved to // file. For deserializing, it can be re-applied to a new IRSchedule that is -// initialzied by a semantics-euqal original ir::ModuleExpr, and then achieves +// initialzied by a semantics-equal original ir::ModuleExpr, and then achieves // the same result. class IRSchedule; // forward declartion to avoid cross-reference @@ -78,7 +78,7 @@ class ScheduleDesc { /** * \brief Replay this description to a new IRSchedule that is initialzied by a - * semantics-euqal original ModuleExpr. + * semantics-equal original ModuleExpr. * @param schedule The original IRSchedule to be replayed the description on. * @param without_post_schedule Determine whether to delete the post * schedules. 
diff --git a/paddle/cinn/ir/test/ir_compare_test.cc b/paddle/cinn/ir/test/ir_compare_test.cc index bb1c6eb46866cd..c705c537b3a28e 100644 --- a/paddle/cinn/ir/test/ir_compare_test.cc +++ b/paddle/cinn/ir/test/ir_compare_test.cc @@ -132,7 +132,7 @@ TEST(TestIrCompare, SingleFunction) { ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_2.front())); // compare with itself ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_1.front())); - // they are euqal if allowing suffix of name different + // they are equal if allowing suffix of name different ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_2.front(), true)); ASSERT_FALSE(IRCompare(funcs_1.front(), funcs_3.front())); diff --git a/paddle/cinn/ir/test/schedule_desc_test.cc b/paddle/cinn/ir/test/schedule_desc_test.cc index dcd8b90ef120dd..860142797be7b1 100644 --- a/paddle/cinn/ir/test/schedule_desc_test.cc +++ b/paddle/cinn/ir/test/schedule_desc_test.cc @@ -105,7 +105,7 @@ std::string SourceCodeGen(const ModuleExpr& module_expr, const std::vector& lowered_funcs, const Target& target) { auto exprs = module_expr.GetExprs(); - CHECK_EQ(exprs.size(), lowered_funcs.size()) << "size of func is not euqal"; + CHECK_EQ(exprs.size(), lowered_funcs.size()) << "size of func is not equal"; std::vector updated_funcs = ir::ir_utils::IRCopy(lowered_funcs); Module::Builder builder("test_module", target); diff --git a/paddle/cinn/ir/utils/ir_compare.cc b/paddle/cinn/ir/utils/ir_compare.cc index 0cd521c65542eb..b88d90f8817c9f 100644 --- a/paddle/cinn/ir/utils/ir_compare.cc +++ b/paddle/cinn/ir/utils/ir_compare.cc @@ -76,7 +76,7 @@ bool IrEqualVisitor::Compare(const std::string& lhs, const std::string& rhs) { } if (!equal) { - VLOG(5) << "Not euqal on name, lhs=" << lhs << ", rhs=" << rhs; + VLOG(5) << "Not equal on name, lhs=" << lhs << ", rhs=" << rhs; } return equal; diff --git a/paddle/cinn/ir/utils/ir_compare.h b/paddle/cinn/ir/utils/ir_compare.h index ef978a024b0f71..b59222530eab90 100644 --- a/paddle/cinn/ir/utils/ir_compare.h +++ b/paddle/cinn/ir/utils/ir_compare.h @@ -22,7 +22,7 @@ namespace cinn { namespace ir { namespace ir_utils { -// Determine whether two ir AST trees are euqal by comparing their struct and +// Determine whether two ir AST trees are equal by comparing their struct and // fields of each node through dfs visitor class IrEqualVisitor : public IRVisitorRequireReImpl { public: @@ -30,7 +30,7 @@ class IrEqualVisitor : public IRVisitorRequireReImpl { bool only_compare_structure = false) : allow_name_suffix_diff_(allow_name_suffix_diff), only_compare_structure_(only_compare_structure) {} - // Return true if they are euqal, otherwise false; + // Return true if they are equal, otherwise false; bool Compare(const Expr& lhs, const Expr& rhs); private: diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index a13f6372501c6f..b3e48e0d5b63b9 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -386,7 +386,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, for (size_t i = 0; i < keys_vec.size(); ++i) { if (!infer_mode_ || sage_mode_) { CHECK_EQ(keys_vec[i]->size(), ranks_vec[i]->size()) - << keys_vec[i]->size() << " should be euqal to " + << keys_vec[i]->size() << " should be equal to " << ranks_vec[i]->size(); for (size_t j = 0; j < keys_vec[i]->size(); ++j) { auto& key = (*keys_vec[i])[j]; @@ -398,7 +398,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, } } else { CHECK_EQ(ranks_vec[i]->size(), 0UL) - << 
ranks_vec[i]->size() << " should be euqal to 0"; + << ranks_vec[i]->size() << " should be equal to 0"; for (size_t j = 0; j < keys_vec[i]->size(); ++j) { auto& key = (*keys_vec[i])[j]; int shard_idx = key % thread_keys_thread_num_; diff --git a/paddle/fluid/pir/transforms/inplace_pass.cc b/paddle/fluid/pir/transforms/inplace_pass.cc index c862ec92457015..9b645535473fa9 100644 --- a/paddle/fluid/pir/transforms/inplace_pass.cc +++ b/paddle/fluid/pir/transforms/inplace_pass.cc @@ -91,7 +91,7 @@ bool CanDoInplace(const std::unordered_set& eager_dels, return true; } - auto is_numel_euqal = [](const TensorType& in, + auto is_numel_equal = [](const TensorType& in, const TensorType& out) -> bool { int64_t in_numel = 1; int64_t out_numel = 1; @@ -127,7 +127,7 @@ bool CanDoInplace(const std::unordered_set& eager_dels, }; // In this version, we don't consider the -1 in ddim, we just calculate the // result. - auto is_numel_euqal_loose_version = [](const TensorType& in, + auto is_numel_equal_loose_version = [](const TensorType& in, const TensorType& out) -> bool { auto calculate_numel = [](const phi::DDim& ddim) -> int64_t { int64_t numel = 1; @@ -144,10 +144,10 @@ bool CanDoInplace(const std::unordered_set& eager_dels, bool equal = false; bool relax = (RelaxShapeCheckOps.count(op_name) > 0); if (relax) { - equal = is_numel_euqal_loose_version(input_alloc_tensor_type, + equal = is_numel_equal_loose_version(input_alloc_tensor_type, output_alloc_tensor_type); } else { - equal = is_numel_euqal(input_alloc_tensor_type, output_alloc_tensor_type); + equal = is_numel_equal(input_alloc_tensor_type, output_alloc_tensor_type); } if (!equal) { @@ -159,7 +159,7 @@ bool CanDoInplace(const std::unordered_set& eager_dels, return false; } if (eager_dels.count(input) == 0) { - VLOG(9) << " -- input not in eager_deletion_valus, can't do inplace"; + VLOG(9) << " -- input not in eager_deletion_vars, can't do inplace"; return false; } return true; diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index b77ab6208c9ecd..65d8dd3af15177 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -190,7 +190,7 @@ static bool NeedFallBackFromGPUDNN2GPU(pir::Operation* op, const std::string& kernel_name, const phi::KernelKey kernel_key) { // NOTE(phlrain): keep the same kernel select strategy with - // GetExepectKernelKey + // GetExpectKernelKey if (op->isa() || op->isa() || op->isa() || op->isa()) { if (kernel_key.backend() == phi::Backend::GPUDNN && diff --git a/paddle/fluid/pir/transforms/transform_general_functions.h b/paddle/fluid/pir/transforms/transform_general_functions.h index 28314527c09086..82ca3ae79491f1 100644 --- a/paddle/fluid/pir/transforms/transform_general_functions.h +++ b/paddle/fluid/pir/transforms/transform_general_functions.h @@ -67,7 +67,7 @@ Operation* GetDefiningOpForInput(const Operation* op, uint32_t index); * @brief Get operations and the index of designative op operand (op result) that use the specific output of the operation. 
* - * @param const Operation* cosnt pointer to an operation + * @param const Operation* const pointer to an operation * @param uint32_t index of result of the operation * @return std::vector> diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 09184c24eb5dea..b94f28b203943b 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -561,7 +561,7 @@ void ConvInferMeta(const MetaTensor& input, strides.size() + 2U, phi::errors::InvalidArgument( "The difference of input's dimension and Attr(strides)'s " - "length must be euqal to 2 for Op(Conv). " + "length must be equal to 2 for Op(Conv). " "But received: input's dimension is %d, input's shape is [%s]; " "Attr(stride)'s length is %d, Attr(stride) is [%s]; " "difference of input's dimention and Attr(strides)'s length = %u.", @@ -732,7 +732,7 @@ void ConvTransposeInferMeta(const MetaTensor& x, 2U, errors::InvalidArgument( "The input's dimension size minus Attr(stride)'s size must " - "be euqal to 2 for Op(conv_transpose). But received: [%d], the " + "be equal to 2 for Op(conv_transpose). But received: [%d], the " "input's dimension size is [%d], the shape of input " "is [%s], the Attr(stride)'s size is [%d].", in_sub_stride_size, diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index ada895fc28d92a..2182640ddbb2f7 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -390,7 +390,7 @@ void Conv2dXPUInferMeta(const MetaTensor& x, strides.size() + 2U, phi::errors::InvalidArgument( "The difference of input's dimension and Attr(strides)'s " - "length must be euqal to 2 for Op(Conv_xpu). " + "length must be equal to 2 for Op(Conv_xpu). " "But received: input's dimension is %d, input's shape is [%s]; " "Attr(stride)'s length is %d, Attr(stride) is [%s]; " "difference of input's dimention and Attr(strides)'s length = %u.", @@ -2005,7 +2005,7 @@ void ConvTransposeXPUInferMeta(const MetaTensor& x, 2U, errors::InvalidArgument( "The input's dimension size minus Attr(stride)'s size must " - "be euqal to 2 for Op(conv_transpose). But received: [%d], the " + "be equal to 2 for Op(conv_transpose). But received: [%d], the " "input's dimension size is [%d], the shape of input " "is [%s], the Attr(stride)'s size is [%d].", in_sub_stride_size, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 36faa875fbe19b..4caf2cc8ac608e 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2836,7 +2836,7 @@ void LLMInt8LinearInferMeta(const MetaTensor& x, x_dims[x_dims.size() - 1], w_dims[1], errors::InvalidArgument( - "Input(X) dim[-1] and Input(Weight) dim[1] should be euqal." + "Input(X) dim[-1] and Input(Weight) dim[1] should be equal." "But received Input(X) dim[-1](%s) != Input(Weight) dim[1](%s)", x_dims[x_dims.size() - 1], w_dims[1])); @@ -2856,7 +2856,7 @@ void LLMInt8LinearInferMeta(const MetaTensor& x, weight_scale.dims()[0], w_dims[0], errors::InvalidArgument( - "Input(weight_scale) dim[0] and Input(Weight) dim[0] should be euqal." + "Input(weight_scale) dim[0] and Input(Weight) dim[0] should be equal." "But received Input(weight_scale) dim[0](%s) != Input(Weight) " "dim[0](%s)", weight_scale.dims()[0], @@ -4117,7 +4117,7 @@ void WeightOnlyLinearInferMeta(const MetaTensor& x, x_dims[x_dims.size() - 1], w_dims[1], errors::InvalidArgument( - "Input(X) dim[-1] and Input(Weight) dim[1] should be euqal." + "Input(X) dim[-1] and Input(Weight) dim[1] should be equal." 
"But received Input(X) dim[-1](%s) != Input(Weight) dim[1](%s)", x_dims[x_dims.size() - 1], w_dims[1])); @@ -4205,7 +4205,7 @@ void YoloLossInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( dim_x[2], dim_x[3], - phi::errors::InvalidArgument("Input(X) dim[3] and dim[4] should be euqal." + phi::errors::InvalidArgument("Input(X) dim[3] and dim[4] should be equal." "But received dim[3](%s) != dim[4](%s)", dim_x[2], dim_x[3])); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 333c9c393a8180..d208611ca315b3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -938,7 +938,7 @@ void DirichletInferMeta(const MetaTensor& alpha, MetaTensor* out) { 1, phi::errors::InvalidArgument( "ShapeError: The number of dimensions of 'Alpha' " - "must be greater than or euqal to 1. " + "must be greater than or equal to 1. " "But received Alpha's dimensions = %d,", alpha_dim.size())); out->set_dims(alpha_dim); @@ -3192,7 +3192,7 @@ void PoolInferMeta(const MetaTensor& x, 2U, errors::InvalidArgument( "the dimension of input minus the size of " - "Attr(kernel_size_) must be euqal to 2 in Op(pool). " + "Attr(kernel_size_) must be equal to 2 in Op(pool). " "But received: the dimension of input minus the size " "of Attr(kernel_size_) is %d, the " "input's dimension is %d, the shape of input " diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h index 1b83c6d7de59e3..765d0c9b4bdecf 100644 --- a/paddle/phi/kernels/cpu/conv_util.h +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -166,7 +166,7 @@ inline std::vector ComputeOutputShape( strides.size() + 2U, phi::errors::InvalidArgument( "The difference of input's dimension and Attr(strides)'s " - "length must be euqal to 2 for Op(Conv). " + "length must be equal to 2 for Op(Conv). " "But received: input's dimension is %d, input's shape is [%s]; " "Attr(stride)'s length is %d, Attr(stride) is [%s]; " "difference of input's dimension and Attr(strides)'s length = %u.", diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu index 2dd5334548f6a5..c72a1b69e7ef82 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu @@ -65,7 +65,7 @@ void MemoryEfficientAttentionGradKernel( output_grad.dims().size(), phi::errors::InvalidArgument( "The size of query's dimensions " - "should be euqal to output grad. But received query's " + "should be equal to output grad. But received query's " "dimensions = %d, output grad's dimensions = %d.", query.dims().size(), output_grad.dims().size())); @@ -73,7 +73,7 @@ void MemoryEfficientAttentionGradKernel( key.dims().size(), phi::errors::InvalidArgument( "The size of query's dimensions " - "should be euqal to key. But received query's " + "should be equal to key. But received query's " "dimensions = %d, key's dimensions = %d.", query.dims().size(), key.dims().size())); @@ -81,7 +81,7 @@ void MemoryEfficientAttentionGradKernel( value.dims().size(), phi::errors::InvalidArgument( "The size of query's dimensions " - "should be euqal to value. But received query's " + "should be equal to value. 
But received query's " "dimensions = %d, value's dimensions = %d.", query.dims().size(), key.dims().size())); @@ -99,7 +99,7 @@ void MemoryEfficientAttentionGradKernel( output_grad.dims()[0], phi::errors::InvalidArgument( "The batch size of query's dimensions " - "should be euqal to output grad. But received query's " + "should be equal to output grad. But received query's " "batch size = %d, output grad's batch size = %d.", query.dims()[0], output_grad.dims()[0])); @@ -107,7 +107,7 @@ void MemoryEfficientAttentionGradKernel( key.dims()[0], phi::errors::InvalidArgument( "The batch size of query's dimensions " - "should be euqal to key. But received query's " + "should be equal to key. But received query's " "batch size = %d, key's batch size = %d.", query.dims()[0], key.dims()[0])); @@ -115,7 +115,7 @@ void MemoryEfficientAttentionGradKernel( value.dims()[0], phi::errors::InvalidArgument( "The batch size of query's dimensions " - "should be euqal to value. But received query's " + "should be equal to value. But received query's " "batch size = %d, value's batch size = %d.", query.dims()[0], value.dims()[0])); @@ -126,7 +126,7 @@ void MemoryEfficientAttentionGradKernel( value.dims()[1], phi::errors::InvalidArgument( "The sequence length of key" - "should be euqal to value. But received key's sequence length = " + "should be equal to value. But received key's sequence length = " "%d, value's sequence length = %d.", key.dims()[1], value.dims()[1])); @@ -134,7 +134,7 @@ void MemoryEfficientAttentionGradKernel( output_grad.dims()[1], phi::errors::InvalidArgument( "The sequence length of query" - "should be euqal to output grad. But received " + "should be equal to output grad. But received " "query's sequence length = " "%d, output grad's sequence length = %d.", query.dims()[1], @@ -146,7 +146,7 @@ void MemoryEfficientAttentionGradKernel( key.dims()[2], phi::errors::InvalidArgument( "The head number of query" - "should be euqal to key. But received query's head number = " + "should be equal to key. But received query's head number = " "%d, key's head number = %d.", query.dims()[2], key.dims()[2])); @@ -155,7 +155,7 @@ void MemoryEfficientAttentionGradKernel( value.dims()[2], phi::errors::InvalidArgument( "The head number of query" - "should be euqal to value. But received query's head number = " + "should be equal to value. But received query's head number = " "%d, value's head number = %d.", query.dims()[2], value.dims()[2])); @@ -163,7 +163,7 @@ void MemoryEfficientAttentionGradKernel( output_grad.dims()[2], phi::errors::InvalidArgument( "The head number of query" - "should be euqal to output grad. But received " + "should be equal to output grad. But received " "query's head number = " "%d, output grad's head number = %d.", query.dims()[2], @@ -175,7 +175,7 @@ void MemoryEfficientAttentionGradKernel( key.dims()[3], phi::errors::InvalidArgument( "The head size of query" - "should be euqal to key. But received query's head size = " + "should be equal to key. But received query's head size = " "%d, key's head size = %d.", query.dims()[3], key.dims()[3])); @@ -184,7 +184,7 @@ void MemoryEfficientAttentionGradKernel( output_grad.dims()[3], phi::errors::InvalidArgument( "The head size of value" - "should be euqal to output grad. But received value's head size = " + "should be equal to output grad. 
But received value's head size = " "%d, output grad's head size = %d.", value.dims()[3], output_grad.dims()[3])); @@ -237,7 +237,7 @@ void MemoryEfficientAttentionGradKernel( cu_seqlens_q.get().dims()[0], cu_seqlens_k.get().dims()[0], phi::errors::InvalidArgument("The first dimension of cu_seqlens_q" - "should be euqal to cu_seqlens_q.")); + "should be equal to cu_seqlens_q.")); PADDLE_ENFORCE_EQ( q_dims[0], 1, @@ -361,7 +361,7 @@ void MemoryEfficientAttentionGradKernel( query.dims()[0], phi::errors::InvalidArgument( "The first dimension of delta" - "should be euqal to query. But received delta's first dimension = " + "should be equal to query. But received delta's first dimension = " "%d, query's first dimension = %d.", delta.dims()[0], query.dims()[0])); @@ -369,7 +369,7 @@ void MemoryEfficientAttentionGradKernel( query.dims()[2], phi::errors::InvalidArgument( "The second dimension of delta" - "should be euqal to third dimension query. But " + "should be equal to third dimension query. But " "received delta's second dimension = " "%d, query's third dimension = %d.", delta.dims()[1], @@ -378,7 +378,7 @@ void MemoryEfficientAttentionGradKernel( query.dims()[1], phi::errors::InvalidArgument( "The third dimension of delta" - "should be euqal to second dimension query. But " + "should be equal to second dimension query. But " "received delta's third dimension = " "%d, query's second dimension = %d.", delta.dims()[2], @@ -454,19 +454,19 @@ void MemoryEfficientAttentionGradKernel( DimStride(query_grad->dims(), 1), phi::errors::InvalidArgument( "The strideM of grad query" - "should be euqal to the first dimension size of " + "should be equal to the first dimension size of " "query grad's stride")); PADDLE_ENFORCE_EQ(k_dims[2] * k_dims[3], DimStride(key_grad->dims(), 1), phi::errors::InvalidArgument( "The strideM of grad key" - "should be euqal to the first dimension size of key " + "should be equal to the first dimension size of key " "grad's stride")); PADDLE_ENFORCE_EQ(v_dims[2] * v_dims[3], DimStride(value_grad->dims(), 1), phi::errors::InvalidArgument( "The strideM of grad value" - "should be euqal to the first dimension size of " + "should be equal to the first dimension size of " "value grad's stride")); PD_MEA_CHECK_OVERFLOW(p.q_strideB, DimStride(query.dims(), 0)); From da60eaed4f190dd3d2709413d463ed609825d207 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Tue, 23 Jan 2024 11:22:15 +0800 Subject: [PATCH 20/34] [Prim][PIR]simplify maybe reduce in prim vjp (#60987) * simplify maybe reduce in prim vjp * debug * fix code * polish all maybe reduce * fix bug * fix bug * fix expand vjp bug --- paddle/fluid/primitive/rule/vjp/details.h | 256 ++++++++-------------- paddle/fluid/primitive/utils/utils.h | 2 +- test/prim/pir_prim/test_vjp_prim.py | 8 +- 3 files changed, 97 insertions(+), 169 deletions(-) diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index 15e4617fa2fb23..6e4bdb58888eb1 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -70,17 +70,13 @@ void divide_grad(const Tensor& x, if (dy) { // dy = -(x/y^2) * dout auto dy_res = -(x / y.pow(2.0)) * out_grad; - if (x.dims() != y.dims()) { - // Maybe need reduce here - phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); - if (!reduce_dim.size()) { - set_output(dy_res, dy); - } else { - auto dy_reduce_res = - sum(dy_res, common::vectorize(reduce_dim), y.dtype(), false); - auto 
dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, dy); - } + if (out_grad.dims() != y.dims()) { + phi::DDim reduce_dim = + get_reduce_dims_from_out(out_grad.dims(), y.dims()); + auto dy_reduce_res = + sum(dy_res, common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); + set_output(dy_tmp, dy); } else { set_output(dy_res, dy); } @@ -89,18 +85,12 @@ void divide_grad(const Tensor& x, // dx = (1/y) * dout auto one_tensor = full(common::vectorize(y.dims()), 1.0, y.dtype()); auto dx_res = one_tensor / y * out_grad; - if (y.dims() != x.dims()) { - // Maybe need reduce here - auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); - if (!reduce_dim.size()) { - set_output(dx_res, dx); - } else { - auto dx_reduce_res = - sum(dx_res, common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, dx); - } - + if (out_grad.dims() != x.dims()) { + auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); + auto dx_reduce_res = + sum(dx_res, common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); + set_output(dx_tmp, dx); } else { set_output(dx_res, dx); } @@ -394,34 +384,25 @@ void add_grad(const Tensor& x, Tensor* dx, Tensor* dy) { if (dy) { - if (x.dims() != y.dims()) { - // Maybe need reduce here - phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); - if (!reduce_dim.size()) { - by_pass(out_grad, dy); - } else { - auto dy_reduce_res = - out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, dy); - } + if (out_grad.dims() != y.dims()) { + phi::DDim reduce_dim = + get_reduce_dims_from_out(out_grad.dims(), y.dims()); + auto dy_reduce_res = + out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); + set_output(dy_tmp, dy); } else { by_pass(out_grad, dy); } } if (dx) { - if (y.dims() != x.dims()) { - // Maybe need reduce here - auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); - if (!reduce_dim.size()) { - by_pass(out_grad, dx); - } else { - auto dx_reduce_res = - out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, dx); - } + if (out_grad.dims() != x.dims()) { + auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); + auto dx_reduce_res = + out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); + set_output(dx_tmp, dx); } else { by_pass(out_grad, dx); } @@ -437,33 +418,24 @@ void subtract_grad(const Tensor& x, Tensor* dy) { if (dy) { auto scale_out_grad = scale(out_grad, -1.0, 0.0, true); - if (x.dims() != y.dims()) { - // Maybe need reduce here - phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); - if (!reduce_dim.size()) { - by_pass(scale_out_grad, dy); - } else { - auto dy_reduce_res = - scale_out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, dy); - } + if (out_grad.dims() != y.dims()) { + phi::DDim reduce_dim = + get_reduce_dims_from_out(out_grad.dims(), y.dims()); + auto dy_reduce_res = + scale_out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = 
reshape(dy_reduce_res, common::vectorize(y.dims())); + set_output(dy_tmp, dy); } else { by_pass(scale_out_grad, dy); } } if (dx) { - if (y.dims() != x.dims()) { - // Maybe need reduce here - auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); - if (!reduce_dim.size()) { - by_pass(out_grad, dx); - } else { - auto dx_reduce_res = - out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, dx); - } + if (out_grad.dims() != x.dims()) { + auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); + auto dx_reduce_res = + out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); + set_output(dx_tmp, dx); } else { by_pass(out_grad, dx); } @@ -481,16 +453,12 @@ void multiply_grad(const Tensor& x, auto x_grad_unreduce = out_grad * y; if (x_grad_unreduce.dims() != x.dims()) { auto axes = get_reduce_dims_from_out(x_grad_unreduce.dims(), x.dims()); - if (!axes.size()) { - set_output(x_grad_unreduce, x_grad); - } else { - auto x_grad_reduced = x_grad_unreduce.sum( - common::vectorize(axes), x_grad_unreduce.dtype(), false); - if (x_grad_reduced.dims().size() != x.dims().size()) { - x_grad_reduced = reshape(x_grad_reduced, x.shape()); - } - set_output(x_grad_reduced, x_grad); + auto x_grad_reduced = x_grad_unreduce.sum( + common::vectorize(axes), x_grad_unreduce.dtype(), false); + if (x_grad_reduced.dims().size() != x.dims().size()) { + x_grad_reduced = reshape(x_grad_reduced, x.shape()); } + set_output(x_grad_reduced, x_grad); } else { set_output(x_grad_unreduce, x_grad); } @@ -499,16 +467,12 @@ void multiply_grad(const Tensor& x, auto y_grad_unreduce = out_grad * x; if (y_grad_unreduce.dims() != y.dims()) { auto axes = get_reduce_dims_from_out(y_grad_unreduce.dims(), y.dims()); - if (!axes.size()) { - set_output(y_grad_unreduce, y_grad); - } else { - auto y_grad_reduced = y_grad_unreduce.sum( - common::vectorize(axes), y_grad_unreduce.dtype(), false); - if (y_grad_reduced.dims().size() != y.dims().size()) { - y_grad_reduced = reshape(y_grad_reduced, y.shape()); - } - set_output(y_grad_reduced, y_grad); + auto y_grad_reduced = y_grad_unreduce.sum( + common::vectorize(axes), y_grad_unreduce.dtype(), false); + if (y_grad_reduced.dims().size() != y.dims().size()) { + y_grad_reduced = reshape(y_grad_reduced, y.shape()); } + set_output(y_grad_reduced, y_grad); } else { set_output(y_grad_unreduce, y_grad); } @@ -526,17 +490,13 @@ void elementwise_pow_grad(const Tensor& x, auto lnx = log(x); auto x_pow_y = elementwise_pow(x, y); auto dy_res = lnx * x_pow_y * out_grad; - if (x.dims() != y.dims()) { - // Maybe need reduce here - phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); - if (!reduce_dim.size()) { - set_output(dy_res, dy); - } else { - auto dy_reduce_res = - dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, dy); - } + if (out_grad.dims() != y.dims()) { + phi::DDim reduce_dim = + get_reduce_dims_from_out(out_grad.dims(), y.dims()); + auto dy_reduce_res = + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); + set_output(dy_tmp, dy); } else { set_output(dy_res, dy); } @@ -546,17 +506,12 @@ void elementwise_pow_grad(const Tensor& x, auto tmp_z = y - 1.0; auto x_pow_z = elementwise_pow(x, tmp_z); auto dx_res = y * x_pow_z * out_grad; - if 
(y.dims() != x.dims()) { - // Maybe need reduce here - auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); - if (!reduce_dim.size()) { - set_output(dx_res, dx); - } else { - auto dx_reduce_res = - dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, dx); - } + if (out_grad.dims() != x.dims()) { + auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); + auto dx_reduce_res = + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); + set_output(dx_tmp, dx); } else { set_output(dx_res, dx); @@ -721,18 +676,13 @@ void expand_grad(const Tensor& x, const IntArray& shape, Tensor* x_grad) { if (x_grad) { - auto out_dims = common::make_ddim(shape.GetData()); - if (out_dims != x.dims()) { - auto axes = get_reduce_dims(x.dims(), out_dims); - if (!axes.size()) { - by_pass(out_grad, x_grad); - } else { - auto reduced = out_grad.sum(common::vectorize(axes), x.dtype(), false); - if (reduced.dims().size() != x.dims().size()) { - reduced = reshape(reduced, x.shape()); - } - set_output(reduced, x_grad); + if (out_grad.dims() != x.dims()) { + auto axes = get_reduce_dims_from_out(out_grad.dims(), x.dims()); + auto reduced = out_grad.sum(common::vectorize(axes), x.dtype(), false); + if (reduced.dims().size() != x.dims().size()) { + reduced = reshape(reduced, x.shape()); } + set_output(reduced, x_grad); } else { by_pass(out_grad, x_grad); } @@ -831,17 +781,12 @@ void maximum_grad(const Tensor& x, if (x_grad) { auto x_tmp = cast(greater_than(x, y), out_grad.dtype()); auto dx_res = out_grad * x_tmp; - if (y.dims() != x.dims()) { - // Maybe need reduce here - auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); - if (!reduce_dim.size()) { - set_output(dx_res, x_grad); - } else { - auto dx_reduce_res = - dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, x_grad); - } + if (out_grad.dims() != x.dims()) { + auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); + auto dx_reduce_res = + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); + set_output(dx_tmp, x_grad); } else { set_output(dx_res, x_grad); } @@ -850,17 +795,13 @@ void maximum_grad(const Tensor& x, if (y_grad) { auto y_tmp = cast(less_equal(x, y), out_grad.dtype()); auto dy_res = out_grad * y_tmp; - if (x.dims() != y.dims()) { - // Maybe need reduce here - phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); - if (!reduce_dim.size()) { - set_output(dy_res, y_grad); - } else { - auto dy_reduce_res = - dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, y_grad); - } + if (out_grad.dims() != y.dims()) { + phi::DDim reduce_dim = + get_reduce_dims_from_out(out_grad.dims(), y.dims()); + auto dy_reduce_res = + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); + set_output(dy_tmp, y_grad); } else { set_output(dy_res, y_grad); } @@ -1315,17 +1256,12 @@ void minimum_grad(const Tensor& x, if (x_grad) { auto x_tmp = cast(less_than(x, y), out_grad.dtype()); auto dx_res = out_grad * x_tmp; - if (y.dims() != x.dims()) { - // Maybe need reduce here - auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); - if 
(!reduce_dim.size()) { - set_output(dx_res, x_grad); - } else { - auto dx_reduce_res = - dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, x_grad); - } + if (out_grad.dims() != x.dims()) { + auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); + auto dx_reduce_res = + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); + set_output(dx_tmp, x_grad); } else { set_output(dx_res, x_grad); } @@ -1334,17 +1270,13 @@ void minimum_grad(const Tensor& x, if (y_grad) { auto y_tmp = cast(greater_equal(x, y), out_grad.dtype()); auto dy_res = out_grad * y_tmp; - if (x.dims() != y.dims()) { - // Maybe need reduce here - phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); - if (!reduce_dim.size()) { - set_output(dy_res, y_grad); - } else { - auto dy_reduce_res = - dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, y_grad); - } + if (out_grad.dims() != y.dims()) { + phi::DDim reduce_dim = + get_reduce_dims_from_out(out_grad.dims(), y.dims()); + auto dy_reduce_res = + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); + set_output(dy_tmp, y_grad); } else { set_output(dy_res, y_grad); } diff --git a/paddle/fluid/primitive/utils/utils.h b/paddle/fluid/primitive/utils/utils.h index bfb1e956a72f0e..70fc4c7e3ee361 100644 --- a/paddle/fluid/primitive/utils/utils.h +++ b/paddle/fluid/primitive/utils/utils.h @@ -138,7 +138,7 @@ static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims, result.push_back(i); } for (int i = 0; i < in_dims.size(); ++i) { - if (in_dims[i] == 1) { + if (in_dims[i] == 1 && dout_dims[i] != 1) { result.push_back(i + bat); } else { PADDLE_ENFORCE_EQ( diff --git a/test/prim/pir_prim/test_vjp_prim.py b/test/prim/pir_prim/test_vjp_prim.py index fcf4d77212f0cb..ada7fc496b89bf 100644 --- a/test/prim/pir_prim/test_vjp_prim.py +++ b/test/prim/pir_prim/test_vjp_prim.py @@ -79,9 +79,9 @@ def test_divide_grad_prim_case1(self): stop_gradients, ) reshape_op2 = pir_program.global_block().ops[-1] - reshape_op1 = pir_program.global_block().ops[-8] + reshape_op1 = pir_program.global_block().ops[-4] self.assertEqual(len(grad_outs), 2) - self.assertEqual(len(pir_program.global_block().ops), 21) + self.assertEqual(len(pir_program.global_block().ops), 17) self.assertTrue(reshape_op2.result(0).is_same(grad_outs[0][0])) self.assertTrue(reshape_op1.result(0).is_same(grad_outs[1][0])) all_op_names = [ @@ -102,10 +102,6 @@ def test_divide_grad_prim_case1(self): "pd_op.full", "pd_op.divide", "pd_op.multiply", - "pd_op.full_int_array", - "pd_op.sum", - "pd_op.full_int_array", - "pd_op.reshape", ] for idx, op in enumerate(pir_program.global_block().ops): self.assertEqual(op.name(), all_op_names[idx]) From b8f9a0ad4b6f0c0ac19dbe13c904ce2d1a1acff7 Mon Sep 17 00:00:00 2001 From: ndren Date: Tue, 23 Jan 2024 03:45:22 +0000 Subject: [PATCH 21/34] Fix CVE-2024-0521 (#61032) This uses shlex for safe command parsing to fix arbitrary code injection --- python/paddle/utils/download.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index b9ca1f35976c63..e1673739f90f5e 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ 
-15,6 +15,7 @@ import hashlib import os import os.path as osp +import shlex import shutil import subprocess import sys @@ -204,7 +205,8 @@ def _wget_download(url: str, fullname: str): 'https', ), 'Only support https and http url' # using wget to download url - tmp_fullname = fullname + "_tmp" + tmp_fullname = shlex.quote(fullname + "_tmp") + url = shlex.quote(url) # –user-agent command = f'wget -O {tmp_fullname} -t {DOWNLOAD_RETRY_LIMIT} {url}' subprc = subprocess.Popen( From f0f0d84ea9d8c36f206046de33d36d5ed76d86c6 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Tue, 23 Jan 2024 13:19:55 +0800 Subject: [PATCH 22/34] [PIR] Add auto mixed precision pass in pir (#58738) --- .../transforms/auto_mixed_precision_pass.cc | 663 ++++++++++++++++++ .../transforms/auto_mixed_precision_pass.h | 26 + 2 files changed, 689 insertions(+) create mode 100644 paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc create mode 100644 paddle/fluid/pir/transforms/auto_mixed_precision_pass.h diff --git a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc new file mode 100644 index 00000000000000..8431517a4db244 --- /dev/null +++ b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc @@ -0,0 +1,663 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
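A note on the shlex change in _wget_download above: quoting turns attacker-controlled text into a single literal shell argument. A minimal, standalone sketch of the idea (the URL and filename here are made up for illustration and are not part of the patch):

    import shlex

    # A malicious "URL" that tries to smuggle in a second shell command.
    url = "https://example.com/model.bin; rm -rf ~"

    # Interpolated unquoted, the shell would see two commands.
    unsafe = f"wget -O model_tmp {url}"

    # shlex.quote single-quotes the value, so the shell passes it through as
    # one argument and "; rm -rf ~" is never executed.
    safe = f"wget -O model_tmp {shlex.quote(url)}"

    print(safe)  # wget -O model_tmp 'https://example.com/model.bin; rm -rf ~'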
+ +#include "paddle/fluid/pir/transforms/auto_mixed_precision_pass.h" +#include +#include +#include +#include + +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/pir/transforms/transform_general_functions.h" + +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/parameter.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pattern_rewrite/frozen_rewrite_pattern_set.h" +#include "paddle/pir/pattern_rewrite/pattern_match.h" +#include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" + +namespace { + +class AutoMixedPrecisionPass : public pir::Pass { + public: + AutoMixedPrecisionPass() + : pir::Pass("auto_mixed_precision_pass", 1), + place_(phi::CPUPlace{}), + precision_mode_(phi::DataType::FLOAT16) {} + + bool Initialize(pir::IrContext* context) override { + IR_ENFORCE(Has(pir::kPlaceAttr), + "Pass initialize failed." + "When using AutoMixedPrecisionPass, place attribute is required!" + "Use Set method to set the place attribute."); + IR_ENFORCE(Has("__mixed_precision_mode__"), + "Pass initialize failed." + "When using AutoMixedPrecisionPass, precison_mode attribute is " + "required!" 
+ "Use Set method to set the scope attribute."); + + place_ = Get(pir::kPlaceAttr); + precision_mode_ = Get("__mixed_precision_mode__"); + context_ = context; + enable_low_precision_io_ = false; + SetDefaultBlacklist(); + return true; + } + + void Run(pir::Operation* op) override { + for (size_t i = 0; i < op->num_regions(); ++i) { + auto& region = op->region(i); + for (auto& block : region) { + GetOpPrecision(&block); + UpdateOpPrecision(&block); + pir::Builder builder = pir::Builder(context_, &block); + ProcessBlock(&block, builder); + } + } + } + + bool CanApplyOn(pir::Operation* op) const override { + return op->num_regions() > 0 && place_ == paddle::PlaceType::kGPU && + (precision_mode_ == phi::DataType::FLOAT16 || + precision_mode_ == phi::DataType::BFLOAT16); + } + + private: + phi::Place place_; + phi::DataType precision_mode_; + bool enable_low_precision_io_; + pir::IrContext* context_; + + std::unordered_set black_list_; + std::unordered_set white_list_; + + std::unordered_set op_run_low_precision_; + std::unordered_set op_should_not_handle_; + std::unordered_map cached_cast_ops_; + + int insert_cast_op_num_ = 0; + + void SetDefaultBlacklist() { + black_list_.insert({ + paddle::dialect::ExpOp::name(), + paddle::dialect::SquareOp::name(), + paddle::dialect::LogOp::name(), + paddle::dialect::MeanOp::name(), + paddle::dialect::SumOp::name(), + paddle::dialect::SigmoidCrossEntropyWithLogitsOp::name(), + paddle::dialect::CrossEntropyWithSoftmax_Op::name(), + }); + } + + void ProcessBlock(pir::Block* block, pir::Builder& builder) { // NOLINT + for (auto& op_item : *block) { + auto op = &op_item; + if (op_should_not_handle_.count(op)) continue; + RewriteOp(op, builder); + } + } + + void GetOpPrecision(pir::Block* block) { + for (auto& op_item : *block) { + auto op = &op_item; + auto op_name = op->name(); + bool support_low_precision = true; + if (black_list_.count(op_name)) { + support_low_precision = false; + } else if (IsBuiltinOp(op)) { // other builtin ops + if (op->isa() || op->isa()) + support_low_precision = false; + } else if (op->isa() || + op->isa()) { + support_low_precision = enable_low_precision_io_; + } else if (OpHasFloatOpOperand(op) || + OpHasFloatResult(op)) { // pd op without float result, + auto op_type = op_name.substr(op_name.find(".") + 1); + auto backend = ConvertPlaceToBackend(place_); + support_low_precision = + OpSupportPrecision(op_type, backend, precision_mode_); + } else { // pd op without float result + support_low_precision = false; + op_should_not_handle_.insert(op); + } + if (support_low_precision) { + op_run_low_precision_.insert(op); + } + } + } + + bool CheckUseOpsScalaAttribute( + const std::vector>& use_ops) const { + for (auto [use_op, idx] : use_ops) { + if (use_op->isa()) { + if (CheckOutputIsScalarAttribute(use_op)) { + return true; + } + } else if (use_op->HasInterface()) { + auto [input_infos, _1, _2, _3, _4] = + use_op->dyn_cast() + .GetOpInfo(); + if (input_infos[idx].type_name.find("ScalarAttribute") != + std::string::npos) { + return true; + } + } + } + return false; + } + + bool CheckOutputIsScalarAttribute(pir::Operation* op) const { + for (uint32_t i = 0; i < op->num_results(); i++) { + auto use_ops = pir::GetUseOpsForOutput(op, i); + if (CheckUseOpsScalaAttribute(use_ops)) return true; + } + return false; + } + + void UpdateOpPrecision(pir::Block* block) { + bool precision_updated = false; + do { + precision_updated = false; + // handle full like op + for (auto& op_item : *block) { + auto op = &op_item; + if 
+        if (!OpRunLowPrecision(op)) continue;
+        if (op->isa<paddle::dialect::FullLikeOp>()) {
+          auto input_operation = GetDefiningOpForInput(op, 0);
+          if (!op_run_low_precision_.count(input_operation)) {
+            op_run_low_precision_.erase(op);
+            precision_updated = true;
+          }
+        }
+        if (!OpRunLowPrecision(op)) continue;
+        // if the datatype of a cast op's result is not float, the cast op
+        // should not be handled
+        if (op->isa<paddle::dialect::CastOp>()) {
+          auto result_dtype = paddle::dialect::TransToPhiDataType(
+              pir::GetDataTypeFromValue(op->result(0)));
+          if (!IsPhiDataTypeFloat(result_dtype)) {
+            op_run_low_precision_.erase(op);
+            op_should_not_handle_.insert(op);
+            precision_updated = true;
+          }
+        }
+        if (!OpRunLowPrecision(op)) continue;
+        // if a consumer's input is a ScalarAttribute, the producer should
+        // stay in high precision
+        if (CheckOutputIsScalarAttribute(op)) {
+          op_run_low_precision_.erase(op);
+          precision_updated = true;
+        }
+        if (!OpRunLowPrecision(op)) continue;
+        // if the producer's output is a float VectorType, the precision of
+        // the two ops should be the same
+        for (size_t idx = 0; idx < op->num_operands(); ++idx) {
+          if (!op->operand_source(idx)) continue;
+          auto operand = op->operand(idx);
+          if (operand.type() && operand.type().isa<pir::VectorType>()) {
+            // check that every element of the VectorType is float
+            auto vec_type = operand.type().dyn_cast<pir::VectorType>();
+            if (IsVectorTypeFloat(vec_type)) {
+              auto input_operation = GetDefiningOpForInput(op, idx);
+              if (!op_run_low_precision_.count(op) ||
+                  !op_run_low_precision_.count(input_operation)) {
+                op_run_low_precision_.erase(op);
+                op_run_low_precision_.erase(input_operation);
+                precision_updated = true;
+              }
+            }
+          }
+        }
+      }
+    } while (precision_updated);
+  }
+
+  void RewriteOp(pir::Operation* op,
+                 pir::Builder& builder) {  // NOLINT
+    if (IsBuiltinOp(op)) {
+      RewriteBuiltinOp(op, builder);
+      return;
+    } else {
+      RewritePdOp(op, builder);
+      return;
+    }
+  }
+
+  bool PhiKernelSupportPrecision(
+      const std::string& op_type,
+      phi::Backend backend,
+      phi::DataType data_type,
+      phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) const {
+    const auto& kernels = phi::KernelFactory::Instance().kernels();
+    if (kernels.count(op_type) == 0) {
+      return false;
+    }
+    phi::KernelKey kernel_key(backend, layout, data_type);
+    return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key);
+  }
+
+  phi::Backend ConvertPlaceToBackend(const phi::Place& place) const {
+    switch (place.GetType()) {
+      case phi::AllocationType::CPU:
+        return phi::Backend::CPU;
+      case phi::AllocationType::GPU:
+        return phi::Backend::GPU;
+      case phi::AllocationType::XPU:
+        return phi::Backend::XPU;
+      default:
+        return phi::Backend::UNDEFINED;
+    }
+    return phi::Backend::UNDEFINED;
+  }
+
+  bool KernelSupportPrecision(
+      const std::string& op_type,
+      phi::Backend backend,
+      phi::DataType precision,
+      phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) const {
+    auto& phi_op_type = op_type;
+
+    bool support =
+        PhiKernelSupportPrecision(phi_op_type, backend, precision, layout);
+    if (backend == phi::Backend::GPU) {
+      support |= PhiKernelSupportPrecision(
+          phi_op_type, phi::Backend::GPUDNN, precision, layout);
+    }
+
+    if (!support) {
+      const auto& all_kernels =
+          paddle::framework::OperatorWithKernel::AllOpKernels();
+      auto it = all_kernels.find(op_type);
+      if (it != all_kernels.end()) {
+        for (const auto& kern_pair : it->second) {
+          if (ConvertPlaceToBackend(kern_pair.first.place_) == backend &&
+              kern_pair.first.data_type_ ==
+                  paddle::framework::TransToProtoVarType(precision)) {
+            support = true;
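+            // [Editor's note, illustrative only] Precision support is probed
+            // in two registries: the phi KernelFactory first (plus GPUDNN on
+            // GPU), then this legacy fluid OperatorWithKernel registry as a
+            // fallback. A hedged usage sketch ("matmul" is just an example
+            // kernel name, not taken from this patch):
+            //
+            //   bool ok = KernelSupportPrecision(
+            //       "matmul", phi::Backend::GPU, phi::DataType::FLOAT16);
+            //   // true if either registry has a matching fp16 GPU kernel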
+            break;
+          }
+        }
+      }
+    }
+    return support;
+  }
+
+  phi::Kernel GetPhiKernelInPrecision(const std::string& kernel_fn_str,
+                                      phi::Backend backend,
+                                      phi::DataType precision) const {
+    if (backend == phi::Backend::GPU) {
+      if (PhiKernelSupportPrecision(
+              kernel_fn_str, phi::Backend::GPUDNN, precision)) {
+        phi::KernelKey kernel_key(
+            phi::Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, precision);
+        return phi::KernelFactory::Instance().SelectKernel(kernel_fn_str,
+                                                           kernel_key);
+      }
+      phi::KernelKey kernel_key(
+          phi::Backend::GPU, phi::DataLayout::ALL_LAYOUT, precision);
+      return phi::KernelFactory::Instance().SelectKernel(kernel_fn_str,
+                                                         kernel_key);
+    }
+    return phi::KernelFactory::Instance().SelectKernel(
+        kernel_fn_str,
+        phi::KernelKey(backend, phi::DataLayout::ALL_LAYOUT, precision));
+  }
+
+  bool IsBuiltinOp(pir::Operation* op) const {
+    return op->name().find("builtin") != std::string::npos;
+  }
+
+  bool OpSupportPrecision(const std::string& kernel_fn_str,
+                          phi::Backend backend,
+                          phi::DataType precision) const {
+    // if the op is in the white list, return true
+    if (white_list_.count(kernel_fn_str)) {
+      return true;
+    }
+
+    // if the op is in the black list, return false
+    if (black_list_.count(kernel_fn_str)) {
+      return false;
+    }
+
+    return KernelSupportPrecision(kernel_fn_str, backend, precision);
+  }
+
+  void SetResultDataType(pir::Value result,
+                         phi::DataType precision,
+                         pir::IrContext* context) const {
+    auto type = result.type();
+    if (type.isa<paddle::dialect::DenseTensorType>()) {
+      auto dense_type = type.dyn_cast<paddle::dialect::DenseTensorType>();
+      auto new_type = paddle::dialect::DenseTensorType::get(
+          context,
+          paddle::dialect::TransToIrDataType(precision, context),
+          dense_type.dims(),
+          dense_type.data_layout(),
+          dense_type.lod(),
+          dense_type.offset());
+      result.set_type(new_type);
+    } else if (type.isa<pir::VectorType>()) {
+      auto vec_type = type.dyn_cast<pir::VectorType>();
+      auto output_num = vec_type.size();
+      std::vector<pir::Type> results_type(output_num);
+      for (size_t idx = 0; idx < output_num; ++idx) {
+        auto dense_type =
+            vec_type[idx].dyn_cast<paddle::dialect::DenseTensorType>();
+        auto new_type = paddle::dialect::DenseTensorType::get(
+            context,
+            paddle::dialect::TransToIrDataType(precision, context),
+            dense_type.dims(),
+            dense_type.data_layout(),
+            dense_type.lod(),
+            dense_type.offset());
+        results_type[idx] = new_type;
+      }
+      auto new_vec_type = pir::VectorType::get(context, results_type);
+      result.set_type(new_vec_type);
+    } else {
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "result type is not DenseTensorType or VectorType"));
+    }
+  }
+
+  bool OpHasFloatOpOperand(pir::Operation* op) const {
+    for (size_t i = 0; i < op->num_operands(); i++) {
+      auto operand = op->operand_source(i);
+      if (!operand.type()) continue;
+      if (operand.type().isa<paddle::dialect::DenseTensorType>() &&
+          IsDenseTensorTypeFloat(
+              operand.type().dyn_cast<paddle::dialect::DenseTensorType>())) {
+        return true;
+      } else if (operand.type().isa<pir::VectorType>() &&
+                 IsVectorTypeFloat(
+                     operand.type().dyn_cast<pir::VectorType>())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool OpHasFloatResult(pir::Operation* op) const {
+    for (size_t i = 0; i < op->num_results(); i++) {
+      auto result = op->result(i);
+      if (!result.type()) continue;
+      if (result.type().isa<paddle::dialect::DenseTensorType>() &&
+          IsDenseTensorTypeFloat(
+              result.type().dyn_cast<paddle::dialect::DenseTensorType>())) {
+        return true;
+      } else if (result.type().isa<pir::VectorType>() &&
+                 IsVectorTypeFloat(
+                     result.type().dyn_cast<pir::VectorType>())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool IsPhiDataTypeFloat(const phi::DataType& dtype) const {
+    return dtype == phi::DataType::FLOAT32 ||
+           dtype == phi::DataType::FLOAT16 ||
+           dtype == phi::DataType::BFLOAT16;
+  }
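+  // [Editor's sketch, not part of the original patch] SetResultDataType
+  // above rebuilds a result's IR type with a new element dtype while keeping
+  // dims, layout, lod and offset. Conceptually a tensor<32x128xf32> value
+  // demoted to float16 becomes tensor<32x128xf16>; only the dtype changes:
+  //
+  //   auto t = value.type().dyn_cast<paddle::dialect::DenseTensorType>();
+  //   value.set_type(paddle::dialect::DenseTensorType::get(
+  //       ctx, pir::Float16Type::get(ctx),
+  //       t.dims(), t.data_layout(), t.lod(), t.offset()));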
+  bool IsDenseTensorTypeFloat(
+      paddle::dialect::DenseTensorType dense_type) const {
+    auto dtype = dense_type.dtype();
+    return IsPhiDataTypeFloat(paddle::dialect::TransToPhiDataType(dtype));
+  }
+
+  bool IsVectorTypeFloat(pir::VectorType vec_type) const {
+    size_t output_num = vec_type.size();
+    for (size_t j = 0; j < output_num; j++) {
+      auto dtype =
+          vec_type[j].dyn_cast<paddle::dialect::DenseTensorType>().dtype();
+      if (!IsPhiDataTypeFloat(paddle::dialect::TransToPhiDataType(dtype))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  phi::DataType GetPhiDataTypeFromOpOperand(
+      const pir::OpOperand& operand) const {
+    return GetPhiDataTypeFromValue(operand.source());
+  }
+
+  phi::DataType GetPhiDataTypeFromValue(const pir::Value& value) const {
+    auto dtype = pir::GetDataTypeFromValue(value);
+    return paddle::dialect::TransToPhiDataType(dtype);
+  }
+
+  bool IsOperandHasDenseTensorType(pir::OpOperand operand) const {
+    return operand.type() &&
+           operand.type().isa<paddle::dialect::DenseTensorType>();
+  }
+
+  void DoInsertCastOp(pir::Operation* op,
+                      pir::OpOperand operand,
+                      phi::DataType precision,
+                      pir::Builder& builder) {  // NOLINT
+    auto value = operand.source();
+    if (cached_cast_ops_.count(value)) {
+      operand.set_source(cached_cast_ops_[value]->result(0));
+      return;
+    }
+    builder.set_insertion_point(op);  // insert before op
+    paddle::dialect::CastOp cast_op =
+        builder.Build<paddle::dialect::CastOp>(value, precision);
+    operand.set_source(cast_op->result(0));
+    cached_cast_ops_[value] = cast_op;
+    insert_cast_op_num_++;
+  }
+
+  bool OpRunLowPrecision(pir::Operation* op) const {
+    return op_run_low_precision_.count(op);
+  }
+
+  void RewriteBuiltinOp(pir::Operation* op,
+                        pir::Builder& builder) {  // NOLINT
+    // Rewrite CombineOp
+    if (op->isa<pir::CombineOp>()) {
+      auto input_num = op->num_operands();
+      if (OpRunLowPrecision(op)) {
+        for (size_t i = 0; i < input_num; ++i) {
+          auto operand = op->operand(i);
+          auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand);
+          if (IsPhiDataTypeFloat(operand_phi_dtype) &&
+              operand_phi_dtype != precision_mode_) {
+            DoInsertCastOp(op, operand, precision_mode_, builder);
+          }
+        }
+        std::vector<pir::Type> inputs_type(input_num);
+        for (size_t idx = 0; idx < input_num; ++idx) {
+          inputs_type[idx] = op->operand(idx).type();
+        }
+        auto new_vec_type =
+            pir::VectorType::get(builder.ir_context(), inputs_type);
+        op->result(0).set_type(new_vec_type);
+      } else {
+        for (size_t i = 0; i < input_num; ++i) {
+          auto operand = op->operand(i);
+          auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand);
+          if (operand_phi_dtype == precision_mode_) {
+            DoInsertCastOp(op, operand, phi::DataType::FLOAT32, builder);
+          }
+        }
+      }
+    }
+
+    // Rewrite SliceOp
+    if (op->isa<pir::SliceOp>()) {
+      if (!OpRunLowPrecision(op)) return;
+      auto index =
+          op->attribute("index").dyn_cast<pir::Int32Attribute>().data();
+      auto input_type = op->operand(0).type().dyn_cast<pir::VectorType>();
+      auto new_type = input_type[index];
+      op->result(0).set_type(new_type);
+    }
+
+    // Rewrite SplitOp
+    if (op->isa<pir::SplitOp>()) {
+      if (!OpRunLowPrecision(op)) return;
+      auto input_type = op->operand(0).type().dyn_cast<pir::VectorType>();
+      int output_num = op->num_results();
+      for (int i = 0; i < output_num; ++i) {
+        op->result(i).set_type(input_type[i]);
+      }
+    }
+  }
+
+  void RewritePdOp(pir::Operation* op,
+                   pir::Builder& builder) {  // NOLINT
+    std::string op_type = op->name().substr(op->name().find(".") + 1);
+    phi::Backend backend = ConvertPlaceToBackend(place_);
+    // Rewrite FetchOp
+    if (op->isa<paddle::dialect::FetchOp>()) {
+      auto fetch_operand = op->operand(0);
+      auto fetch_operand_phi_dtype =
+          GetPhiDataTypeFromOpOperand(fetch_operand);
+      if (OpRunLowPrecision(op)) {
+        SetResultDataType(
+            op->result(0), precision_mode_, builder.ir_context());
+      }
+      if (!op->result(0).type().isa<paddle::dialect::DenseTensorType>())
+        return;
+      auto result_dtype
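+      // [Editor's note, illustrative] DoInsertCastOp above caches one cast
+      // per source value, keyed by the pir::Value being cast, so several
+      // low-precision consumers of the same fp32 value share a single cast:
+      //
+      //   before:  %w : f32 --> matmul_a, matmul_b   (two fp16 consumers)
+      //   after:   %w : f32 --> cast(f16) --> matmul_a, matmul_b
+      //
+      // A cache hit simply rewires the operand to the cached result(0).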
= paddle::dialect::TransToPhiDataType( + pir::GetDataTypeFromValue(op->result(0))); + if (fetch_operand_phi_dtype != result_dtype) { + DoInsertCastOp(op, fetch_operand, result_dtype, builder); + } + return; + } + // Rewrite FeedOp + if (op->isa() && OpRunLowPrecision(op)) { + SetResultDataType(op->result(0), precision_mode_, builder.ir_context()); + return; + } + + // Rewrite ShareDataOp + if (op->isa() && OpRunLowPrecision(op)) { + SetResultDataType(op->result(0), precision_mode_, builder.ir_context()); + return; + } + + // Other pd ops + if (OpRunLowPrecision(op)) { + auto phi_kernel = + GetPhiKernelInPrecision(op_type, backend, precision_mode_); + PADDLE_ENFORCE( + phi_kernel.IsValid(), + phi::errors::PreconditionNotMet( + "op [%s] kernel doesn't support precision [%s] on backend [%s]", + op->name(), + phi::DataTypeToString(precision_mode_).c_str(), + paddle::experimental::BackendToString(backend).c_str())); + + auto args_def = phi_kernel.args_def(); + auto input_defs = args_def.input_defs(); + auto output_defs = args_def.output_defs(); + + // if any of the op's input is not in low precision, insert cast op + for (size_t i = 0; i < input_defs.size(); i++) { + auto operand = op->operand(i); + auto in_phi_dtype = input_defs[i].dtype; + if (!IsOperandHasDenseTensorType(operand)) continue; + auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand); + if (IsPhiDataTypeFloat(operand_phi_dtype) && + operand_phi_dtype != in_phi_dtype) { + DoInsertCastOp(op, operand, in_phi_dtype, builder); + } + } + + // change result's dtype to low precision + if (op->HasAttribute("dtype")) { + auto phi_dtype = op->attribute("dtype") + .dyn_cast() + .data(); + if (IsPhiDataTypeFloat(phi_dtype)) { + pir::Attribute attr_dtype = paddle::dialect::DataTypeAttribute::get( + builder.ir_context(), precision_mode_); + op->set_attribute("dtype", attr_dtype); + } else if (phi_dtype == + phi::DataType::UNDEFINED) { // dtype is not set, means all + // ok + pir::Attribute attr_dtype = paddle::dialect::DataTypeAttribute::get( + builder.ir_context(), precision_mode_); + op->set_attribute("dtype", attr_dtype); + } else { + return; // don't modify output dtype + } + } + + PADDLE_ENFORCE_EQ( + op->num_results(), + output_defs.size(), + phi::errors::PreconditionNotMet( + "op [%s] kernel output args defs should equal op outputs", + op->name())); + + for (size_t i = 0; i < op->num_results(); i++) { + auto result = op->result(i); + if (!result.type()) continue; + phi::DataType out_phi_dtype = output_defs[i].dtype; + if (out_phi_dtype == phi::DataType::UNDEFINED) + out_phi_dtype = precision_mode_; + if (!IsPhiDataTypeFloat(out_phi_dtype)) + continue; // here handle op like "unequal", which has bool result + // type + SetResultDataType(result, out_phi_dtype, builder.ir_context()); + } + } else { + // current op doesn't support low precision + // if the op's input is in low precision, insert cast op + auto phi_dtype = phi::DataType::FLOAT32; + for (size_t i = 0; i < op->num_operands(); i++) { + auto operand = op->operand(i); + if (!IsOperandHasDenseTensorType(operand)) continue; + auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand); + if (IsPhiDataTypeFloat(operand_phi_dtype) && + operand_phi_dtype == precision_mode_) { + DoInsertCastOp(op, operand, phi_dtype, builder); + } + } + } + } +}; +} // namespace + +namespace pir { + +std::unique_ptr CreateAutoMixedPrecisionPass() { + return std::make_unique(); +} + +} // namespace pir diff --git a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.h 
b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.h new file mode 100644 index 00000000000000..5d28438c5d9690 --- /dev/null +++ b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.h @@ -0,0 +1,26 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/pir/core/dll_decl.h" + +namespace pir { + +class Pass; + +IR_API std::unique_ptr CreateAutoMixedPrecisionPass(); + +} // namespace pir From 8450e04281507cd618021c6d7dc314a729e35f75 Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:05:21 +0800 Subject: [PATCH 23/34] infer symbolic shape for sub_graph repeate_kv (#61035) Infer symbolic shape for sub_graph repeate_kv --- .../interface/infer_symbolic_shape.cc | 56 +++++++++++++++++ .../symbolic/test_cinn_sub_graph_symbolic.py | 60 +++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape.cc index ba8c30be9f9e22..892c6b3c3d2e7f 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape.cc @@ -508,6 +508,14 @@ bool FullOpInferSymbolicShape(pir::Operation *op, sym_shape.push_back(dim_expr); } + // DimExpr only keep shape info, which always be int type + int64_t value = attributes.at("value") + .dyn_cast() + .data() + .to(); + std::vector sym_data; + sym_data.emplace_back(value); + symbol::ShapeOrDataDimExprs shape_data{sym_shape}; op->set_attribute( @@ -601,6 +609,54 @@ bool Unsqueeze_OpInferSymbolicShape( bool TileOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + pir::Value operand_x = op->operand_source(0); + symbol::ShapeOrDataDimExprs x_shape_or_data = + shape_analysis->GetShapeOrDataForValue(operand_x); + pir::Value operand_repeat_times = op->operand_source(1); + symbol::ShapeOrDataDimExprs repeat_times_shape_or_data = + shape_analysis->GetShapeOrDataForValue(operand_repeat_times); + + std::vector x_dimexpr; + if (x_shape_or_data.data().has_value()) { + x_dimexpr = x_shape_or_data.data().value(); + } else { + x_dimexpr = x_shape_or_data.shape(); + } + + std::vector repeat_times_dimexpr; + if (repeat_times_shape_or_data.data().has_value()) { + repeat_times_dimexpr = repeat_times_shape_or_data.data().value(); + } else { + repeat_times_dimexpr = repeat_times_shape_or_data.shape(); + } + if (repeat_times_dimexpr.empty()) { + repeat_times_dimexpr = std::vector(x_dimexpr.size(), 1); + } + + auto out_rank = std::max(static_cast(x_dimexpr.size()), + repeat_times_dimexpr.size()); + std::vector out_shape(out_rank); + if (x_dimexpr.size() > repeat_times_dimexpr.size()) { + auto diff = x_dimexpr.size() - repeat_times_dimexpr.size(); + repeat_times_dimexpr.insert(repeat_times_dimexpr.begin(), diff, 1); + } else { + auto diff = 
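+  // [Editor's note, illustrative only] The two branches around this point
+  // left-pad the shorter of x_dimexpr / repeat_times_dimexpr with 1s so both
+  // reach the same rank; each output dim is then x_dim * repeat. With a
+  // symbolic batch dim S0 (loosely mirroring the repeat_kv test below):
+  //
+  //   x            : [S0, 300, 32, 1, 128]
+  //   repeat_times : [ 1,   1,  1, 4,   1]
+  //   out          : [S0, 300, 32, 4, 128]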
repeat_times_dimexpr.size() - x_dimexpr.size(); + x_dimexpr.insert(x_dimexpr.begin(), diff, 1); + } + + for (size_t i = 0; i < repeat_times_dimexpr.size(); ++i) { + out_shape[i] = x_dimexpr[i] * repeat_times_dimexpr[i]; + } + + symbol::ShapeOrDataDimExprs shape_data{out_shape}; + + op->set_attribute( + "symbolic_shape", + pir::shape::SymbolAttribute::get(pir::IrContext::Instance(), shape_data)); + + pir::OpResult res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); + return true; } diff --git a/test/ir/pir/cinn/symbolic/test_cinn_sub_graph_symbolic.py b/test/ir/pir/cinn/symbolic/test_cinn_sub_graph_symbolic.py index 7d84d55dab0672..738dd79eb840ad 100644 --- a/test/ir/pir/cinn/symbolic/test_cinn_sub_graph_symbolic.py +++ b/test/ir/pir/cinn/symbolic/test_cinn_sub_graph_symbolic.py @@ -232,5 +232,65 @@ def test_eval_symbolic(self): # np.testing.assert_allclose(cinn_out.numpy(), dy_out.numpy(), atol=1e-8) +def unsqueeze_composite(x, axis): + """define composite rule of op unsqueeze""" + """using reshape to implement unsqueeze op""" + x_shape = list(x.shape) + axis_list = list(axis) + for i in axis_list: + if i < 0: + i += len(x_shape) + 1 + x_shape = ( + x_shape[:i] + + [ + 1, + ] + + x_shape[i:] + ) + out = paddle.reshape(x, x_shape) + return out + + +class LlamaRepeatKV(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.n_rep = 4 + + def forward(self, hidden_states): + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + rst_unsqueeze = unsqueeze_composite(hidden_states, [-2]) + rst_tile = rst_unsqueeze.tile([1, 1, 1, self.n_rep, 1]) + out = rst_tile.reshape( + [batch, slen, num_key_value_heads * self.n_rep, head_dim] + ) + + return out + + +class TestCinnDyShapeRepeatKV(TestCinnDyShapeBase): + def prepare_data(self): + self.hidden_states_shape = [1, 300, 32, 128] + self.hidden_states = paddle.randn( + self.hidden_states_shape, dtype="float32" + ) + self.hidden_states.stop_gradient = False + + def eval_symbolic(self, use_cinn): + paddle.seed(2022) + net = LlamaRepeatKV() + input_spec = [ + InputSpec(shape=[None, None, 32, 128], dtype='float32'), + ] + net = apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.hidden_states) + return out + + def test_eval_symbolic(self): + # cinn_out = self.eval_symbolic(use_cinn=True) + dy_out = self.eval_symbolic(use_cinn=False) + # np.testing.assert_allclose(cinn_out.numpy(), dy_out.numpy(), atol=1e-8) + + if __name__ == '__main__': unittest.main() From 2d684d8bd360f636fcee29b056a64300ea981a55 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:23:20 +0800 Subject: [PATCH 24/34] [PR] remove BindOpResult (#61012) --- .../translator/program_translator.cc | 14 ++-- .../translator/program_translator.h | 3 +- .../fluid/pir/dialect/op_generator/op_gen.py | 2 +- .../dialect/op_generator/op_interface_gen.py | 8 +- .../pir/dialect/op_generator/python_c_gen.py | 6 +- .../pir/dialect/operator/interface/decomp.h | 8 +- .../pir/dialect/operator/interface/vjp.h | 8 +- .../dialect/operator/ir/control_flow_op.cc | 12 +-- .../pir/dialect/operator/ir/control_flow_op.h | 6 +- .../fluid/pir/dialect/operator/ir/manual_op.h | 16 ++-- .../dialect/operator/ir/manual_op_decomp.cc | 44 ++++------- .../pir/dialect/operator/ir/manual_op_vjp.cc | 42 +++++----- paddle/fluid/primitive/base/decomp_trans.cc | 33 ++++---- paddle/fluid/primitive/base/decomp_trans.h | 24 +++--- .../templates/decomp/generated_decomp.j2 | 21 +++-- 
paddle/fluid/pybind/control_flow_api.cc | 2 +- paddle/fluid/pybind/eager_utils.cc | 77 ++++++------------- paddle/fluid/pybind/eager_utils.h | 7 +- .../fluid/pybind/manual_static_op_function.h | 20 ++--- paddle/fluid/pybind/pir.cc | 74 +++++++----------- paddle/fluid/pybind/pybind.cc | 9 +-- .../test_set_static_op_arg_pre_cast_hook.py | 2 +- 22 files changed, 184 insertions(+), 254 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc index c1e84a865041ea..f4dcc560bcf695 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc @@ -751,7 +751,7 @@ const VariableDefiningInfo& ProgramTranslator::CreateUndefinedVariable( auto var_desc = block.FindVarRecursive(var_name); pir::Builder builder(ctx_, program_->block(), program_->block()->begin()); auto dtype = ::phi::TransToPhiDataType(var_desc->GetDataType()); - auto val = pir::OpResult(nullptr); + auto val = pir::Value(nullptr); if (var_desc->GetType() == paddle::framework::proto::VarType::LOD_TENSOR_ARRAY) { val = builder.Build(dtype).result(0); @@ -806,17 +806,15 @@ void ProgramTranslator::SetIsPersisableAttributeForAllValue( } } -std::unordered_map> -ProgramTranslator::VarDesc2OpResult() { - std::unordered_map> - var_desc_2_opresult; +std::unordered_map> +ProgramTranslator::VarDesc2Value() { + std::unordered_map> var_desc_2_value; for (const auto& [var_name, value_info_list] : param_map_) { for (const auto& value_info : value_info_list) { - var_desc_2_opresult[var_name].push_back( - value_info.value.dyn_cast()); + var_desc_2_value[var_name].push_back(value_info.value); } } - return var_desc_2_opresult; + return var_desc_2_value; } } // namespace translator diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.h b/paddle/fluid/ir_adaptor/translator/program_translator.h index f2a9bf3c5b3c59..09e09f5c5fc046 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.h +++ b/paddle/fluid/ir_adaptor/translator/program_translator.h @@ -89,8 +89,7 @@ class ProgramTranslator { void Translate(); - std::unordered_map> - VarDesc2OpResult(); + std::unordered_map> VarDesc2Value(); private: const ProgramDesc* legacy_program_; // not owned diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index d0c1c438c195b7..1ea887a7a1920e 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -1204,7 +1204,7 @@ def AutoCodeGen(op_info_items, all_op_info_items, namespaces, dialect_name): op_interfaces_tmp = op_interfaces exclusive_interface_str_tmp = exclusive_interface_str decomp_interface_str = "paddle::dialect::DecompInterface" - decomp_interface_declare_str = "\n static std::vector> Decomp(pir::Operation* op);" + decomp_interface_declare_str = "\n static std::vector> Decomp(pir::Operation* op);" # If op has inplace info, we will generate inplace op and non-inplace op. 
for op_name in op_info.op_phi_name: diff --git a/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py b/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py index 21fcc02b11634c..0a0cae38ec2e5b 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py @@ -77,18 +77,18 @@ {inputs_list}stop_gradients);""" OP_VJP_STOPGRADIENT_TEMPLATE = """ - std::vector> res(tensor_res.size()); + std::vector> res(tensor_res.size()); for (size_t i = 0; i < tensor_res.size(); ++i) { res[i].resize(tensor_res[i].size()); for (size_t j = 0; j < tensor_res[i].size(); ++j) { if(tensor_res[i][j].defined()){ - res[i][j] = std::static_pointer_cast(tensor_res[i][j].impl())->value().dyn_cast(); + res[i][j] = std::static_pointer_cast(tensor_res[i][j].impl())->value(); } } }""" OP_VJP_DEFINE_TEMPLATE = """ -std::vector> {op_class_name}::Vjp(pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, const std::vector>& out_grads, const std::vector>& stop_gradients){{ +std::vector> {op_class_name}::Vjp(pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, const std::vector>& out_grads, const std::vector>& stop_gradients){{ {check_param} VLOG(6) << "Prepare inputs of {op_grad_name}"; {backward_input_code} @@ -304,5 +304,5 @@ def gen_exclusive_interface_str(op_info, op_info_items): " static std::vector InferMeta( const std::vector& input_values, const pir::AttributeMap& attributes );" ) if op_info.op_phi_name[0] not in vjp_interface_black_list: - exclusive_interface_str += "\n static std::vector> Vjp(pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, const std::vector>& out_grads, const std::vector>& stop_gradients);" + exclusive_interface_str += "\n static std::vector> Vjp(pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, const std::vector>& out_grads, const std::vector>& stop_gradients);" return exclusive_interface_str diff --git a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py index d1284f0c9866dc..38619ec22e049e 100644 --- a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py @@ -144,16 +144,16 @@ {type} {name}; """ MUTABLE_ATTR_TEMPLATE = """ - if (PyObject_CheckIROpResult({name}_obj)){{ + if (PyObject_CheckIRValue({name}_obj)){{ {mutable_cast_attrs} }}else{{ {no_mutable_cast_attrs} }}""" MUTABLE_ATTR_LIST_TEMPLATE = """ - if (PyObject_CheckIROpResult({name}_obj)){{ + if (PyObject_CheckIRValue({name}_obj)){{ {mutable_cast_attrs} - }}else if (PyObject_CheckIRVectorOfOpResult({name}_obj)){{ + }}else if (PyObject_CheckIRVectorOfValue({name}_obj)){{ {mutable_vector_cast_attrs} }}else{{ {no_mutable_cast_attrs} diff --git a/paddle/fluid/pir/dialect/operator/interface/decomp.h b/paddle/fluid/pir/dialect/operator/interface/decomp.h index 10a6e51e7db3c6..75bbe3b8f02945 100644 --- a/paddle/fluid/pir/dialect/operator/interface/decomp.h +++ b/paddle/fluid/pir/dialect/operator/interface/decomp.h @@ -21,14 +21,14 @@ class DecompInterface : public pir::OpInterfaceBase { public: struct Concept { explicit Concept( - std::vector> (*decomp)(pir::Operation* op)) + std::vector> (*decomp)(pir::Operation* op)) : decomp_(decomp) {} - std::vector> (*decomp_)(pir::Operation* op); + std::vector> (*decomp_)(pir::Operation* op); }; template struct Model : public Concept { - static std::vector> Decomp(pir::Operation* op) { + static std::vector> 
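+  // [Editor's sketch, not part of the patch] DecompInterface (and the
+  // VjpInterface below) follow the Concept/Model type-erasure idiom: a plain
+  // function-pointer table (Concept) is filled in by a templated adaptor
+  // (Model<ConcreteOp>) that forwards to the op's static method. Stripped to
+  // its skeleton, with a hypothetical return type Ret:
+  //
+  //   struct Concept {
+  //     explicit Concept(Ret (*fn)(pir::Operation*)) : fn_(fn) {}
+  //     Ret (*fn_)(pir::Operation*);
+  //   };
+  //   template <typename ConcreteOp>
+  //   struct Model : public Concept {
+  //     static Ret Call(pir::Operation* op) { return ConcreteOp::Decomp(op); }
+  //     Model() : Concept(&Call) {}
+  //   };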
Decomp(pir::Operation* op) { return ConcreteOp::Decomp(op); } Model() : Concept(Decomp) {} @@ -38,7 +38,7 @@ class DecompInterface : public pir::OpInterfaceBase { DecompInterface(pir::Operation* op, Concept* impl) : pir::OpInterfaceBase(op), impl_(impl) {} - std::vector> Decomp(pir::Operation* op) { + std::vector> Decomp(pir::Operation* op) { return impl_->decomp_(op); } diff --git a/paddle/fluid/pir/dialect/operator/interface/vjp.h b/paddle/fluid/pir/dialect/operator/interface/vjp.h index 5246a2867665e4..4faa80bed54e42 100644 --- a/paddle/fluid/pir/dialect/operator/interface/vjp.h +++ b/paddle/fluid/pir/dialect/operator/interface/vjp.h @@ -20,14 +20,14 @@ namespace dialect { class VjpInterface : public pir::OpInterfaceBase { public: struct Concept { - explicit Concept(std::vector> (*vjp)( + explicit Concept(std::vector> (*vjp)( pir::Operation* op, const std::vector>& inputs, const std::vector>& outputs, const std::vector>& out_grads, const std::vector>& stop_gradients)) : vjp_(vjp) {} - std::vector> (*vjp_)( + std::vector> (*vjp_)( pir::Operation* op, const std::vector>& inputs, const std::vector>& outputs, @@ -37,7 +37,7 @@ class VjpInterface : public pir::OpInterfaceBase { template struct Model : public Concept { - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation* op, const std::vector>& inputs, const std::vector>& outputs, @@ -53,7 +53,7 @@ class VjpInterface : public pir::OpInterfaceBase { VjpInterface(pir::Operation* op, Concept* impl) : pir::OpInterfaceBase(op), impl_(impl) {} - std::vector> Vjp( + std::vector> Vjp( pir::Operation* op, const std::vector>& inputs, const std::vector>& outputs, diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc index 0611cea789129f..382d3a1b3123f6 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -260,7 +260,7 @@ void IfOp::VerifyRegion() { } } -std::vector> IfOp::Vjp( +std::vector> IfOp::Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, @@ -291,7 +291,7 @@ std::vector> IfOp::Vjp( auto if_grad = ApiBuilder::Instance().GetBuilder()->Build( cond_val, std::move(output_types)); - std::vector> res{inputs_.size()}; + std::vector> res{inputs_.size()}; for (size_t i = 0, j = 0; i < inputs_.size(); ++i) { res[i].resize(1); if (!stop_gradients[i][0]) { @@ -460,7 +460,7 @@ void WhileOp::VerifyRegion() { VLOG(4) << "Successful end verifying sub regions for: WhileOp."; } -std::vector> WhileOp::Vjp( +std::vector> WhileOp::Vjp( pir::Operation *op, const std::vector> &inputs, const std::vector> &outputs, @@ -525,13 +525,13 @@ std::vector> WhileOp::Vjp( } auto while_grad = builder.Build(cond_val, loop_vars); - std::vector> res(inputs.size()); + std::vector> res(inputs.size()); for (size_t i = 0, j = 0; i < inputs.size(); ++i) { res[i].push_back(stop_gradients[i][0] ? 
nullptr : while_grad.result(j++)); } return res; } -std::vector> TuplePushOpVjpInterfaceModel::Vjp( +std::vector> TuplePushOpVjpInterfaceModel::Vjp( pir::Operation *op, const std::vector> &inputs, const std::vector> &outputs, @@ -547,7 +547,7 @@ std::vector> TuplePushOpVjpInterfaceModel::Vjp( inputs.size())); auto pop_op = ApiBuilder::Instance().GetBuilder()->Build( TuplePushOp::dyn_cast(op).outlet()); - std::vector> res{inputs.size()}; + std::vector> res{inputs.size()}; res[0].resize(1); for (size_t i = 1u; i < inputs.size(); ++i) { res[i].resize(1); diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h index 90df8eb2ff8f99..9ec36397340d07 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h @@ -49,7 +49,7 @@ class IfOp : public pir::Op { void VerifySig(); void VerifyRegion(); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, @@ -85,7 +85,7 @@ class WhileOp : public pir::Op { void Print(pir::IrPrinter &printer); // NOLINT void VerifySig(); void VerifyRegion(); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, @@ -94,7 +94,7 @@ class WhileOp : public pir::Op { }; struct TuplePushOpVjpInterfaceModel : public VjpInterface::Concept { - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs, const std::vector> &outputs, diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.h b/paddle/fluid/pir/dialect/operator/ir/manual_op.h index 983080049fc4f6..1bc97166c09d9a 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.h @@ -57,13 +57,13 @@ class AddNOp : public pir::Op &input_values, const pir::AttributeMap &attributes); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, const std::vector> &out_grads, const std::vector> &stop_gradients); - static std::vector> Decomp(pir::Operation *op); + static std::vector> Decomp(pir::Operation *op); }; class AddN_Op : public pir::Op InferMeta( const std::vector &input_values, const pir::AttributeMap &attributes); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, @@ -345,7 +345,7 @@ class ArrayWrite_Op : public pir::Op InferMeta( const std::vector &input_values, const pir::AttributeMap &attributes); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, @@ -376,7 +376,7 @@ class ArrayToTensorOp : public pir::Op InferMeta( const std::vector &input_values, const pir::AttributeMap &attributes); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, @@ -569,7 +569,7 @@ class ExpandOp : public pir::Op InferMeta( const std::vector &input_values, const pir::AttributeMap &attributes); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, @@ -614,7 +614,7 @@ class IncrementOp static std::vector InferMeta( const std::vector &input_values, const pir::AttributeMap &attributes); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> 
&inputs_, const std::vector> &outputs, @@ -659,7 +659,7 @@ class Increment_Op static std::vector InferMeta( const std::vector &input_values, const pir::AttributeMap &attributes); - static std::vector> Vjp( + static std::vector> Vjp( pir::Operation *op, const std::vector> &inputs_, const std::vector> &outputs, diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc index cc8af56c2f4819..c78038f19a294d 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc @@ -29,8 +29,7 @@ namespace paddle { namespace dialect { using IntArray = paddle::experimental::IntArray; -std::vector> BatchNormOp::Decomp( - pir::Operation* op) { +std::vector> BatchNormOp::Decomp(pir::Operation* op) { VLOG(4) << "Decomp call batch_norm's decomp interface begin"; BatchNormOp op_obj = op->dyn_cast(); (void)op_obj; @@ -70,7 +69,7 @@ std::vector> BatchNormOp::Decomp( VLOG(6) << "Decomp call batch_norm's forward composite rule prepare"; auto org_res = op->results(); - std::vector> res(org_res.size()); + std::vector> res(org_res.size()); VLOG(6) << "Decomp call batch_norm's forward composite rule begin"; @@ -92,33 +91,27 @@ std::vector> BatchNormOp::Decomp( res[0].push_back(std::static_pointer_cast( std::get<0>(op_res).impl()) - ->value() - .dyn_cast()); + ->value()); res[1].push_back(std::static_pointer_cast( std::get<1>(op_res).impl()) - ->value() - .dyn_cast()); + ->value()); res[2].push_back(std::static_pointer_cast( std::get<2>(op_res).impl()) - ->value() - .dyn_cast()); + ->value()); res[3].push_back(std::static_pointer_cast( std::get<3>(op_res).impl()) - ->value() - .dyn_cast()); + ->value()); res[4].push_back(std::static_pointer_cast( std::get<4>(op_res).impl()) - ->value() - .dyn_cast()); - pir::OpResult reserve_space; + ->value()); + pir::Value reserve_space; res[5].push_back(reserve_space); VLOG(4) << "Decomp call batch_norm's decomp interface end"; return res; } -std::vector> BatchNorm_Op::Decomp( - pir::Operation* op) { +std::vector> BatchNorm_Op::Decomp(pir::Operation* op) { VLOG(4) << "Decomp call batch_norm_'s decomp interface begin"; BatchNorm_Op op_obj = op->dyn_cast(); (void)op_obj; @@ -158,7 +151,7 @@ std::vector> BatchNorm_Op::Decomp( VLOG(6) << "Decomp call batch_norm_'s forward composite rule prepare"; auto org_res = op->results(); - std::vector> res(org_res.size()); + std::vector> res(org_res.size()); VLOG(6) << "Decomp call batch_norm_'s forward composite rule begin"; @@ -180,25 +173,20 @@ std::vector> BatchNorm_Op::Decomp( res[0].push_back(std::static_pointer_cast( std::get<0>(op_res).impl()) - ->value() - .dyn_cast()); + ->value()); res[1].push_back(std::static_pointer_cast( std::get<1>(op_res).impl()) - ->value() - .dyn_cast()); + ->value()); res[2].push_back(std::static_pointer_cast( std::get<2>(op_res).impl()) - ->value() - .dyn_cast()); + ->value()); res[3].push_back(std::static_pointer_cast( std::get<3>(op_res).impl()) - ->value() - .dyn_cast()); + ->value()); res[4].push_back(std::static_pointer_cast( std::get<4>(op_res).impl()) - ->value() - .dyn_cast()); - pir::OpResult reserve_space; + ->value()); + pir::Value reserve_space; res[5].push_back(reserve_space); VLOG(4) << "Decomp call batch_norm_'s decomp interface end"; diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc index 68e60ed5ea0bd3..7104b7471f4bdd 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc 
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc @@ -28,7 +28,7 @@ namespace paddle { namespace dialect { using IntArray = paddle::experimental::IntArray; -std::vector> AddNOp::Vjp( +std::vector> AddNOp::Vjp( pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, @@ -65,22 +65,21 @@ std::vector> AddNOp::Vjp( VLOG(6) << "Vjp prepare stop gradient of add_n_grad"; - std::vector> res(tensor_res.size()); + std::vector> res(tensor_res.size()); for (size_t i = 0; i < tensor_res.size(); ++i) { res[i].resize(tensor_res[i].size()); for (size_t j = 0; j < tensor_res[i].size(); ++j) { if (tensor_res[i][j].defined()) { res[i][j] = std::static_pointer_cast( tensor_res[i][j].impl()) - ->value() - .dyn_cast(); + ->value(); } } } return res; } -std::vector> ExpandOp::Vjp( +std::vector> ExpandOp::Vjp( pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, @@ -113,22 +112,21 @@ std::vector> ExpandOp::Vjp( VLOG(6) << "Vjp prepare stop gradient of expand_grad"; - std::vector> res(tensor_res.size()); + std::vector> res(tensor_res.size()); for (size_t i = 0; i < tensor_res.size(); ++i) { res[i].resize(tensor_res[i].size()); for (size_t j = 0; j < tensor_res[i].size(); ++j) { if (tensor_res[i][j].defined()) { res[i][j] = std::static_pointer_cast( tensor_res[i][j].impl()) - ->value() - .dyn_cast(); + ->value(); } } } return res; } -std::vector> IncrementOp::Vjp( +std::vector> IncrementOp::Vjp( pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, @@ -151,14 +149,14 @@ std::vector> IncrementOp::Vjp( VLOG(6) << "Vjp prepare call increment's vjp inteface"; - pir::OpResult tensor_res = paddle::dialect::scale(out_grads[0][0]); + pir::Value tensor_res = paddle::dialect::scale(out_grads[0][0]); - std::vector> res{{tensor_res}}; + std::vector> res{{tensor_res}}; return res; } -std::vector> Increment_Op::Vjp( +std::vector> Increment_Op::Vjp( pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, @@ -185,11 +183,11 @@ std::vector> Increment_Op::Vjp( paddle::dialect::increment_(inputs_[0][0], -value); - std::vector> res; + std::vector> res; return res; } -std::vector> ArrayWrite_Op::Vjp( +std::vector> ArrayWrite_Op::Vjp( pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, @@ -216,11 +214,11 @@ std::vector> ArrayWrite_Op::Vjp( outputs.size())); VLOG(6) << "Vjp prepare call ArrayWrite_'s vjp inteface"; - pir::OpResult x_grad = + pir::Value x_grad = paddle::dialect::array_read(in_grads[0][0], inputs_[2][0]); - pir::OpResult zero = paddle::dialect::zeros_like(inputs_[1][0]); + pir::Value zero = paddle::dialect::zeros_like(inputs_[1][0]); paddle::dialect::array_write_(in_grads[0][0], zero, inputs_[2][0]); - std::vector> res(1); + std::vector> res(1); res[0].resize(1); if (!stop_gradients[0][0]) { res[0][0] = x_grad; @@ -228,7 +226,7 @@ std::vector> ArrayWrite_Op::Vjp( return res; } -std::vector> ArrayReadOp::Vjp( +std::vector> ArrayReadOp::Vjp( pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, @@ -264,11 +262,11 @@ std::vector> ArrayReadOp::Vjp( paddle::dialect::add(array_grad_i_origin, out_grads[0][0]); paddle::dialect::array_write_(out_grads[1][0], array_grad_i, inputs_[1][0]); - std::vector> res; + std::vector> res; return res; } -std::vector> ArrayToTensorOp::Vjp( +std::vector> ArrayToTensorOp::Vjp( pir::Operation* op, const std::vector>& inputs_, const std::vector>& outputs, @@ -301,10 +299,10 @@ std::vector> ArrayToTensorOp::Vjp( VLOG(6) << "Vjp prepare call 
ArrayToTensor's vjp inteface"; - pir::OpResult tensor_res = paddle::dialect::tensor_to_array( + pir::Value tensor_res = paddle::dialect::tensor_to_array( inputs_[0][0], out_grads[0][0], axis, use_stack); - std::vector> res(1); + std::vector> res(1); res[0].resize(1); if (!stop_gradients[0][0]) { res[0][0] = tensor_res; diff --git a/paddle/fluid/primitive/base/decomp_trans.cc b/paddle/fluid/primitive/base/decomp_trans.cc index c5d3cd104ef6e5..edfa2cea6dd771 100644 --- a/paddle/fluid/primitive/base/decomp_trans.cc +++ b/paddle/fluid/primitive/base/decomp_trans.cc @@ -101,7 +101,7 @@ bool DecompProgram::check_decomp_dynamic_shape(pir::Operation* op) { auto value = item.source(); // check if initialized in case of optional input. if (!paddle::dialect::IsEmptyValue(value)) { - pir::Operation* prev_op = value.dyn_cast().owner(); + pir::Operation* prev_op = value.defining_op(); if (prev_op->name() == "builtin.combine") { for (pir::OpOperand& sub_item : prev_op->operands()) { if (check_dynamic_shape(sub_item, *op)) { @@ -121,7 +121,7 @@ bool DecompProgram::check_decomp_dynamic_shape(pir::Operation* op) { void DecompProgram::check_decomp_outputs( const std::string& op_name, const std::vector& orig_outs, - const std::vector& decomp_outs) { + const std::vector& decomp_outs) { bool skip_invalid_op_check = decomp_op_contain_none.find(op_name) != decomp_op_contain_none.end(); for (size_t i = 0; i < orig_outs.size(); i++) { @@ -187,10 +187,10 @@ void DecompProgram::check_decomp_outputs( return; } -std::vector DecompProgram::format_decomp_res( +std::vector DecompProgram::format_decomp_res( const std::string& op_name, const std::vector& orig_outs, - const std::vector>& decomp_outs) { + const std::vector>& decomp_outs) { PADDLE_ENFORCE_EQ( orig_outs.size(), decomp_outs.size(), @@ -200,7 +200,7 @@ std::vector DecompProgram::format_decomp_res( op_name, orig_outs.size(), decomp_outs.size())); - std::vector new_decomp_outs(orig_outs.size()); + std::vector new_decomp_outs(orig_outs.size()); for (size_t i = 0; i < orig_outs.size(); i++) { if (orig_outs[i]) { PADDLE_ENFORCE_EQ( @@ -218,12 +218,12 @@ std::vector DecompProgram::format_decomp_res( return new_decomp_outs; } -std::vector DecompProgram::construct_dst_vars( +std::vector DecompProgram::construct_dst_vars( const std::string& op_name, const std::vector& orig_outs, - const std::vector& decomp_outs, - std::unordered_map orig_vars_dict) { - std::vector tar_vars(src_vars_.size()); + const std::vector& decomp_outs, + std::unordered_map orig_vars_dict) { + std::vector tar_vars(src_vars_.size()); PADDLE_ENFORCE_EQ( orig_outs.size(), decomp_outs.size(), @@ -241,7 +241,7 @@ std::vector DecompProgram::construct_dst_vars( return tar_vars; } -std::vector DecompProgram::get_dst_vars() { +std::vector DecompProgram::get_dst_vars() { if (!paddle::prim::PrimCommonUtils::IsFwdPrimEnabled()) { return src_vars_; } else { @@ -265,20 +265,19 @@ bool DecompProgram::enable_decomp_by_filter(const std::string& op_name) { return flag; } -std::vector> call_decomp_rule(pir::Operation* op) { +std::vector> call_decomp_rule(pir::Operation* op) { paddle::dialect::DecompInterface decomp_interface = op->dyn_cast(); PADDLE_ENFORCE(decomp_interface, phi::errors::InvalidArgument( "[Prim] The decomp function is not registered in %s op ", op->name())); - std::vector> decomp_res = - decomp_interface.Decomp(op); + std::vector> decomp_res = decomp_interface.Decomp(op); return decomp_res; } void DecompProgram::decomp_program() { - std::unordered_map orig_vars_dict; + std::unordered_map 
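+  // [Editor's sketch, not from the patch] decomp_program() records, for
+  // every program-level Value the caller tracks (src_vars_), its position,
+  // so that after an op is decomposed the replacement values can be spliced
+  // back into the same slots by construct_dst_vars():
+  //
+  //   // orig_vars_dict : pir::Value -> index into src_vars_
+  //   // for each decomposed output:  tar_vars[orig_vars_dict[old]] = new;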
orig_vars_dict; for (size_t i = 0; i < src_vars_.size(); i++) { orig_vars_dict[src_vars_[i]] = static_cast(i); } @@ -290,7 +289,7 @@ void DecompProgram::decomp_program() { if (!paddle::prim::PrimCommonUtils::IsFwdPrimEnabled()) { return; } - std::vector tar_vars(src_vars_.size()); + std::vector tar_vars(src_vars_.size()); pir::Block* block = program_->block(); std::vector ops_list; for (auto& op : *block) { @@ -309,9 +308,9 @@ void DecompProgram::decomp_program() { check_decomp_dynamic_shape(op); auto& builder = *(paddle::dialect::ApiBuilder::Instance().GetBuilder()); builder.set_insertion_point(op); - std::vector> decomp_res = call_decomp_rule(op); + std::vector> decomp_res = call_decomp_rule(op); std::vector orig_outs = op->results(); - std::vector standard_decomp_res = + std::vector standard_decomp_res = format_decomp_res(op->name(), orig_outs, decomp_res); check_decomp_outputs(op->name(), orig_outs, standard_decomp_res); tar_vars = construct_dst_vars( diff --git a/paddle/fluid/primitive/base/decomp_trans.h b/paddle/fluid/primitive/base/decomp_trans.h index 4f3a83d326b337..c69a758a0b8c65 100644 --- a/paddle/fluid/primitive/base/decomp_trans.h +++ b/paddle/fluid/primitive/base/decomp_trans.h @@ -29,7 +29,7 @@ class DecompProgram { explicit DecompProgram(pir::Program* program) : program_(program) {} DecompProgram(pir::Program* program, - const std::vector& src_vars, + const std::vector& src_vars, const std::set& blacklist, const std::set& whitelist) : program_(program), @@ -41,18 +41,18 @@ class DecompProgram { bool check_decomp_dynamic_shape(pir::Operation* op); void check_decomp_outputs(const std::string& op_name, const std::vector& orig_outs, - const std::vector& decomp_outs); - std::vector format_decomp_res( + const std::vector& decomp_outs); + std::vector format_decomp_res( const std::string& op_name, const std::vector& orig_outs, - const std::vector>& decomp_outs); - std::vector construct_dst_vars( + const std::vector>& decomp_outs); + std::vector construct_dst_vars( const std::string& op_name, const std::vector& orig_outs, - const std::vector& decomp_outs, - std::unordered_map orig_vars_dict); + const std::vector& decomp_outs, + std::unordered_map orig_vars_dict); bool enable_decomp_by_filter(const std::string& op_name); - void set_src_vars(const std::vector& src_vars) { + void set_src_vars(const std::vector& src_vars) { src_vars_ = src_vars; } void set_blacklist(const std::set& blacklist) { @@ -61,18 +61,18 @@ class DecompProgram { void set_whitelist(const std::set& whitelist) { whitelist_ = whitelist; } - std::vector get_dst_vars(); + std::vector get_dst_vars(); private: pir::Program* program_; - std::vector src_vars_; - std::vector dst_vars_; + std::vector src_vars_; + std::vector dst_vars_; std::set blacklist_; std::set whitelist_; }; bool has_decomp_rule(const pir::Operation& op); -std::vector> call_decomp_rule(pir::Operation* op); +std::vector> call_decomp_rule(pir::Operation* op); } // namespace paddle diff --git a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp.j2 b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp.j2 index 3efab61bf901a1..6c1d3e7fbe4356 100644 --- a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp.j2 +++ b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp.j2 @@ -20,7 +20,7 @@ using IntArray = paddle::experimental::IntArray; {% set output_names=[] %} {% set output_types=[] %} -std::vector> {{class_name}}::Decomp(pir::Operation* op) { +std::vector> {{class_name}}::Decomp(pir::Operation* op) { 
VLOG(4) << "Decomp call {{fwd_name}}'s decomp interface begin"; {{class_name}} op_obj = op->dyn_cast<{{class_name}}>(); @@ -47,7 +47,7 @@ std::vector> {{class_name}}::Decomp(pir::Operation* o paddle::optional> {{item.name}}; if (!IsEmptyValue(op_obj.{{item.name}}())){ pir::CombineOp combine_op_obj = - op_obj.{{item.name}}().dyn_cast().owner()->dyn_cast(); + op_obj.{{item.name}}().defining_op()->dyn_cast(); std::vector optional_{{item.name}}; for (size_t idx = 0; idx < combine_op_obj.inputs().size(); idx++) { optional_{{item.name}}.emplace_back( @@ -58,7 +58,7 @@ std::vector> {{class_name}}::Decomp(pir::Operation* o {% else %} pir::CombineOp combine_op_obj_{{item.name}} = - op_obj.{{item.name}}().dyn_cast().owner()->dyn_cast(); + op_obj.{{item.name}}().defining_op()->dyn_cast(); std::vector {{item.name}}; for (size_t idx = 0; idx < combine_op_obj_{{item.name}}.inputs().size(); idx++) { {{item.name}}.emplace_back( @@ -79,8 +79,7 @@ std::vector> {{class_name}}::Decomp(pir::Operation* o auto* {{item.name}}_define_op = std::static_pointer_cast({{item.name}}_.impl()) ->value() - .dyn_cast() - .owner(); + .defining_op(); if ({{item.name}}_define_op->name() != "pd_op.full") { PADDLE_THROW( platform::errors::Unimplemented("We don't support dynamic tensors " @@ -96,8 +95,7 @@ std::vector> {{class_name}}::Decomp(pir::Operation* o auto* {{item.name}}_define_op = std::static_pointer_cast({{item.name}}_.impl()) ->value() - .dyn_cast() - .owner(); + .defining_op(); if ({{item.name}}_define_op->name() != "pd_op.full_int_array") { PADDLE_THROW( platform::errors::Unimplemented("We don't support dynamic tensors " @@ -120,7 +118,7 @@ std::vector> {{class_name}}::Decomp(pir::Operation* o VLOG(6) << "Decomp call {{fwd_name}}'s forward composite rule prepare"; auto org_res = op->results(); - std::vector> res(org_res.size()); + std::vector> res(org_res.size()); VLOG(6) << "Decomp call {{fwd_name}}'s forward composite rule begin"; {% if outputs|length == 1 %} @@ -129,8 +127,7 @@ std::vector> {{class_name}}::Decomp(pir::Operation* o res[0].push_back( std::static_pointer_cast(op_res.impl()) - ->value() - .dyn_cast()); + ->value()); {% else %} {% for item in outputs %} {% do output_names.append(item.name) %} @@ -141,10 +138,10 @@ std::vector> {{class_name}}::Decomp(pir::Operation* o VLOG(6) << "Decomp call {{fwd_name}}'s forward composite rule end"; {% for k in range(outputs|length) %} {% if outputs[k].intermediate and fwd_name in decomp_ops_list_contain_unused_output %} - pir::OpResult {{outputs[k].name}}; + pir::Value {{outputs[k].name}}; res[{{k}}].push_back({{outputs[k].name}}); {% else %} - res[{{k}}].push_back(std::static_pointer_cast(std::get<{{k}}>(op_res).impl())->value().dyn_cast()); + res[{{k}}].push_back(std::static_pointer_cast(std::get<{{k}}>(op_res).impl())->value()); {% endif %} {% endfor %} {% endif %} diff --git a/paddle/fluid/pybind/control_flow_api.cc b/paddle/fluid/pybind/control_flow_api.cc index d085b6a4cfc509..ab1d268e9e3bc2 100644 --- a/paddle/fluid/pybind/control_flow_api.cc +++ b/paddle/fluid/pybind/control_flow_api.cc @@ -79,7 +79,7 @@ void BindIfOp(py::module* m) { .def("results", [](PyIfOp& self) -> py::list { py::list op_list; for (uint32_t i = 0; i < self->num_results(); i++) { - op_list.append(self.result(i)); + op_list.append(static_cast(self.result(i))); } return op_list; }); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 1e3a8c8cf4cb8a..e53ed10b9a4f79 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc 
@@ -58,7 +58,6 @@ extern PyTypeObject* p_tensor_type; extern PyTypeObject* p_string_tensor_type; extern PyTypeObject* g_framework_scope_pytype; -extern PyTypeObject* g_ir_opresult_pytype; extern PyTypeObject* g_ir_value_pytype; extern PyTypeObject* g_vartype_pytype; extern PyTypeObject* g_data_type_pytype; @@ -185,10 +184,6 @@ bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj) { bool PyObject_CheckStr(PyObject* obj) { return PyUnicode_Check(obj); } -bool PyObject_CheckIROpResult(PyObject* obj) { - return PyObject_TypeCheck(obj, g_ir_opresult_pytype); -} - bool PyObject_CheckIRValue(PyObject* obj) { return PyObject_TypeCheck(obj, g_ir_value_pytype); } @@ -228,40 +223,6 @@ bool PyObject_CheckIRVectorOfValue(PyObject* obj) { } } -bool PyObject_CheckIRVectorOfOpResult(PyObject* obj) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - // if obj is [], parse it as std::vector - if (len == 0) { - return false; - } - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (!PyObject_CheckIROpResult(item)) { - return false; - } - } - return true; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - if (len == 0) { - return false; - } - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (!PyObject_CheckIROpResult(item)) { - return false; - } - } - return true; - } else if (PyObject_TypeCheck(obj, g_ir_opresult_pytype)) { - return true; - } else { - return false; - } -} bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos) { if (obj == Py_None || obj == Py_False) { return false; // To be compatible with QA integration testing. Some @@ -1115,12 +1076,22 @@ PyObject* ToPyObject(const phi::DenseTensor* value) { return obj.ptr(); } -PyObject* ToPyObject(const pir::OpResult& value) { +PyObject* ToPyObject(const pir::Value& value) { auto obj = ::pybind11::cast(value); obj.inc_ref(); return obj.ptr(); } +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + PyObject* ToPyObject(const std::vector& value) { PyObject* result = PyList_New((Py_ssize_t)value.size()); @@ -1987,7 +1958,7 @@ PyObject* GetEmptyTensorsWithVarDesc(PyObject* self, PyObject* args) { return ToPyObject(result); } -paddle::Tensor CreateTensorFromOpResult(const pir::OpResult& op_result) { +paddle::Tensor CreateTensorFromValue(const pir::Value& op_result) { auto tensor = paddle::Tensor(); auto dims = phi::vectorize(GetValueDims(op_result)); @@ -2028,9 +1999,9 @@ paddle::Tensor CreateTensorFromOpResult(const pir::OpResult& op_result) { return tensor; } -PyObject* GetEmptyTensorsWithOpResult(PyObject* self, PyObject* args) { +PyObject* GetEmptyTensorsWithValue(PyObject* self, PyObject* args) { std::vector result; - std::unordered_map out_tensor_map; + std::unordered_map out_tensor_map; auto op_result_list = PyTuple_GetItem(args, 0); @@ -2038,9 +2009,9 @@ PyObject* GetEmptyTensorsWithOpResult(PyObject* self, PyObject* args) { Py_ssize_t len = PyList_Size(op_result_list); for (Py_ssize_t i = 0; i < len; i++) { auto op_result = - PyObjectCast(PyList_GetItem(op_result_list, i)); + PyObjectCast(PyList_GetItem(op_result_list, i)); if (out_tensor_map.find(op_result) == out_tensor_map.end()) { - paddle::Tensor tensor = CreateTensorFromOpResult(op_result); + paddle::Tensor tensor = 
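+      // [Editor's note, illustrative] Both the list and tuple branches here
+      // deduplicate through an unordered_map keyed by pir::Value, so the
+      // same value appearing twice on the Python side maps to one shared
+      // paddle::Tensor (MakeEmptyTensor is a hypothetical stand-in for
+      // CreateTensorFromValue):
+      //
+      //   std::unordered_map<pir::Value, paddle::Tensor> seen;
+      //   auto it = seen.find(v);
+      //   paddle::Tensor t =
+      //       (it == seen.end()) ? (seen[v] = MakeEmptyTensor(v)) : it->second;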
CreateTensorFromValue(op_result); out_tensor_map[op_result] = tensor; result.emplace_back(tensor); } else { @@ -2051,9 +2022,9 @@ PyObject* GetEmptyTensorsWithOpResult(PyObject* self, PyObject* args) { Py_ssize_t len = PyTuple_Size(op_result_list); for (Py_ssize_t i = 0; i < len; i++) { auto op_result = - PyObjectCast(PyTuple_GetItem(op_result_list, i)); + PyObjectCast(PyTuple_GetItem(op_result_list, i)); if (out_tensor_map.find(op_result) == out_tensor_map.end()) { - paddle::Tensor tensor = CreateTensorFromOpResult(op_result); + paddle::Tensor tensor = CreateTensorFromValue(op_result); out_tensor_map[op_result] = tensor; result.emplace_back(tensor); } else { @@ -2062,7 +2033,7 @@ PyObject* GetEmptyTensorsWithOpResult(PyObject* self, PyObject* args) { } } else if (op_result_list != Py_None) { PADDLE_THROW(platform::errors::InvalidArgument( - "Argument of GetTensorsWithOpResultInArgs must be list of OpResult, " + "Argument of GetTensorsWithValueInArgs must be list of Value, " "but got " "%s", (reinterpret_cast(op_result_list->ob_type))->tp_name)); @@ -2143,14 +2114,12 @@ pir::Value CastPyArg2Value(PyObject* obj, const std::string& op_type, size_t arg_pos) { obj = CastPyArg2ValuePreHook(obj); - if (PyObject_TypeCheck(obj, g_ir_opresult_pytype)) { - return ::pybind11::handle(obj).cast(); - } else if (PyObject_TypeCheck(obj, g_ir_value_pytype)) { + if (PyObject_TypeCheck(obj, g_ir_value_pytype)) { return ::pybind11::handle(obj).cast(); } else { PADDLE_THROW(platform::errors::InvalidType( "%s(): argument (position %d) must be " - "OpResult, but got %s", + "Value, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT @@ -2764,9 +2733,9 @@ static PyMethodDef EagerUtilMethods[] = { METH_VARARGS, "GetEmptyTensorsWithVarDesc"}, {"create_empty_tensors_with_op_results", - (PyCFunction)(void (*)(void))GetEmptyTensorsWithOpResult, + (PyCFunction)(void (*)(void))GetEmptyTensorsWithValue, METH_VARARGS, - "GetEmptyTensorsWithOpResult."}, + "GetEmptyTensorsWithValue."}, {"set_static_op_arg_pre_cast_hook", (PyCFunction)SetStaticOpArgPreCastHook, METH_O, diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index fb2b02d65e3922..b528217b788ff8 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -71,9 +71,7 @@ int TensorDtype2NumpyDtype(phi::DataType dtype); bool PyObject_CheckLongOrConvertToLong(PyObject** obj); bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj); bool PyObject_CheckStr(PyObject* obj); -bool PyObject_CheckIROpResult(PyObject* obj); bool PyObject_CheckIRValue(PyObject* obj); -bool PyObject_CheckIRVectorOfOpResult(PyObject* obj); bool PyObject_CheckIRVectorOfValue(PyObject* obj); bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos); int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos); @@ -157,7 +155,8 @@ PyObject* ToPyObject( PyObject* ToPyObject(const paddle::framework::Vocab& value); PyObject* ToPyObject(std::shared_ptr grad_node); -PyObject* ToPyObject(const pir::OpResult& value); +PyObject* ToPyObject(const pir::Value& value); +PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); class PyTensorHook : public egr::TensorHook { @@ -397,7 +396,7 @@ paddle::Tensor& UnSafeGetTensorFromPyObject(PyObject* obj); PyObject* GetEmptyTensorsWithVarDesc(PyObject* self, PyObject* args); -PyObject* GetEmptyTensorsWithOpResult(PyObject* self, PyObject* args); +PyObject* GetEmptyTensorsWithValue(PyObject* self, PyObject* args); // end of Slice related 
methods diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h index af733ecbce53f8..a8e354feb129dd 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -83,9 +83,9 @@ PyObject *static_api_full(PyObject *self, PyObject *args, PyObject *kwargs) { phi::DataType dtype = CastPyArg2DataTypeDirectly(dtype_obj, "full", 2); Place place = CastPyArg2Place(place_obj, "full", 3); - if (!PyObject_CheckIROpResult(shape_obj) && - !PyObject_CheckIRVectorOfOpResult(shape_obj) && - !PyObject_CheckIROpResult(value_obj)) { + if (!PyObject_CheckIRValue(shape_obj) && + !PyObject_CheckIRVectorOfValue(shape_obj) && + !PyObject_CheckIRValue(value_obj)) { std::vector shape = CastPyArg2Longs(shape_obj, "full", 0); float value = CastPyArg2Float(value_obj, "full", 1); auto static_api_out = paddle::dialect::full(shape, value, dtype, place); @@ -93,9 +93,9 @@ PyObject *static_api_full(PyObject *self, PyObject *args, PyObject *kwargs) { } else { pir::Value shape, value; - if (PyObject_CheckIROpResult(shape_obj)) { + if (PyObject_CheckIRValue(shape_obj)) { shape = CastPyArg2Value(shape_obj, "full", 0); - } else if (PyObject_CheckIRVectorOfOpResult(shape_obj)) { + } else if (PyObject_CheckIRVectorOfValue(shape_obj)) { std::vector shape_tmp = CastPyArg2VectorOfValue(shape_obj, "full", 0); shape = paddle::dialect::stack(shape_tmp, 0); @@ -105,7 +105,7 @@ PyObject *static_api_full(PyObject *self, PyObject *args, PyObject *kwargs) { shape_tmp, phi::DataType::INT64, phi::CPUPlace()); } - if (PyObject_CheckIROpResult(value_obj)) { + if (PyObject_CheckIRValue(value_obj)) { value = CastPyArg2Value(value_obj, "full", 1); } else { float value_tmp = CastPyArg2Float(value_obj, "full", 1); @@ -270,9 +270,9 @@ static PyObject *static_api_array_to_tensor(PyObject *self, // Get Value from args PyObject *x_obj = PyTuple_GET_ITEM(args, 0); pir::Value x; - if (PyObject_CheckIROpResult(x_obj)) { + if (PyObject_CheckIRValue(x_obj)) { x = CastPyArg2Value(x_obj, "array_to_tensor", 0); - } else if (PyObject_CheckIRVectorOfOpResult(x_obj)) { + } else if (PyObject_CheckIRVectorOfValue(x_obj)) { std::vector x_tmp = CastPyArg2VectorOfValue(x_obj, "array_to_tensor", 0); if (x_tmp.size() != 1) { @@ -334,9 +334,9 @@ static PyObject *static_api_slice_array_dense(PyObject *self, PyObject *starts_obj = PyTuple_GET_ITEM(args, 1); pir::Value starts; - if (PyObject_CheckIROpResult(starts_obj)) { + if (PyObject_CheckIRValue(starts_obj)) { starts = CastPyArg2Value(starts_obj, "slice_array_dense", 1); - } else if (PyObject_CheckIRVectorOfOpResult(starts_obj)) { + } else if (PyObject_CheckIRVectorOfValue(starts_obj)) { std::vector starts_tmp = CastPyArg2VectorOfValue(starts_obj, "slice_array_dense", 1); starts = paddle::dialect::stack(starts_tmp, /*axis*/ 0); diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index eb58d6289267d9..64328a8cb26c45 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -135,7 +135,6 @@ PHI_DECLARE_bool(print_ir); namespace paddle { namespace pybind { -PyTypeObject *g_ir_opresult_pytype = nullptr; PyTypeObject *g_ir_value_pytype = nullptr; void BindOpsAPI(pybind11::module *module); @@ -417,7 +416,7 @@ void BindBlock(py::module *m) { bool is_persistable = attrs[i].dyn_cast().data(); if (is_persistable) { - param_list.append(op.result(i)); + param_list.append(static_cast(op.result(i))); } } } @@ -449,10 +448,20 @@ void BindOperation(py::module *m) { .def("num_operands", 
&Operation::num_operands) .def("num_results", &Operation::num_results) .def("operand", &Operation::operand) - .def("result", &Operation::result) + .def("result", + [](Operation &self, uint32_t index) { + return static_cast(self.result(index)); + }) .def("operand_source", &Operation::operand_source) .def("operands", &Operation::operands) - .def("results", &Operation::results) + .def("results", + [](Operation &self) -> py::list { + py::list op_list; + for (uint32_t i = 0; i < self.num_results(); i++) { + op_list.append(static_cast(self.result(i))); + } + return op_list; + }) .def( "blocks", [](Operation &self) { return &self.blocks(); }, @@ -608,7 +617,7 @@ const phi::DDim &GetValueDims(Value value) { } } -pir::OpResult apply(Value self, py::object func) { +pir::Value apply(Value self, py::object func) { py::gil_scoped_acquire gil; auto stop_gradient = self.attribute(kAttrStopGradients); if (stop_gradient && !stop_gradient.data()) { @@ -632,12 +641,12 @@ pir::OpResult apply(Value self, py::object func) { "Apply function of Tensor raises an unknown exception.")); } if (res == Py_None) { - return self.dyn_cast(); + return self; } auto out = CastPyArg2Value(res, "", 0); Py_DECREF(py_func); Py_DECREF(res); - return out.dyn_cast(); + return out; } void BindValue(py::module *m) { @@ -754,12 +763,7 @@ void BindValue(py::module *m) { }) .def( "get_defining_op", - [](Value self) -> pir::Operation * { - if (auto op_result = self.dyn_cast()) { - return op_result.owner(); - } - return nullptr; - }, + [](Value self) -> pir::Operation * { return self.defining_op(); }, return_value_policy::reference) .def("numel", [](Value self) { return phi::product(GetValueDims(self)); }) .def("type", &Value::type) @@ -822,13 +826,10 @@ void BindOpOperand(py::module *m) { when build network. )DOC"); - op_operand - .def("source", - [](OpOperand &self) { return self.source().dyn_cast(); }) - .def("set_source", - [](OpOperand &self, const OpResult &result) { - self.set_source(result); - }) + op_operand.def("source", [](OpOperand &self) { return self.source(); }) + .def( + "set_source", + [](OpOperand &self, const Value &result) { self.set_source(result); }) .def("owner", &OpOperand::owner, return_value_policy::reference) .def("index", &OpOperand::index); } @@ -838,21 +839,6 @@ bool GetValueBoolAttr(Value value, const std::string &attr_name) { return !bool_attr || bool_attr.data(); } -void BindOpResult(py::module *m) { - py::class_ op_result(*m, "OpResult", R"DOC( - OpResult class represents the value(output) defined by a result of operation. - - Notes: - The constructor of OpResult should not be invoked directly. OpResult can be automatically constructed - when build network. - )DOC"); - g_ir_opresult_pytype = reinterpret_cast(op_result.ptr()); - op_result.def( - "__init__", - [](OpResult &self) { new (&self) OpResult(); }, - pybind11::return_value_policy::reference); -} - void BindType(py::module *m) { py::class_ ir_type(*m, "Type"); ir_type.def("__eq__", [](Type &self, Type &other) { return self == other; }) @@ -1023,18 +1009,17 @@ static auto GetNoNeedBufferValue(const ::pir::Block *whole_block, no_need_buffer_values.end()); } -using OpResultMap = - std::pair, std::vector>; -std::pair, OpResultMap> CloneProgram( +using ValueMap = std::pair, std::vector>; +std::pair, ValueMap> CloneProgram( const Program &program) { // Limitation of this function: // 1. don't support Parameters. 
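// IrMapping is populated by Program::Clone below: it records, for each Value
// in the original program, the corresponding Value in the clone; we then
// flatten that map into two parallel vectors to hand back to Python.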
pir::IrMapping mapper; auto cloned_program = program.Clone(mapper); - std::vector associated_array_key, associated_array_value; + std::vector associated_array_key, associated_array_value; for (auto &pair : mapper.GetMap()) { - associated_array_key.push_back(pair.first.dyn_cast()); - associated_array_value.push_back(pair.second.dyn_cast()); + associated_array_key.push_back(pair.first); + associated_array_value.push_back(pair.second); } return std::make_pair( cloned_program, @@ -1471,10 +1456,10 @@ void BindUtils(pybind11::module *m) { translator::ProgramTranslator program_translator(&legacy_program, program.get()); program_translator.Translate(); - return std::make_pair(program, program_translator.VarDesc2OpResult()); + return std::make_pair(program, program_translator.VarDesc2Value()); }, R"DOC( - Convert Fluid Program to New IR Program and get the mappings of VarDesc -> pir::OpResult. + Convert Fluid Program to New IR Program and get the mappings of VarDesc -> pir::Value. Args: @@ -1482,7 +1467,7 @@ void BindUtils(pybind11::module *m) { Returns: Program: The New IR Program - dict[str, pir::OpResult]: Mapping between VarDesc(by name) and pir::OpResult. + dict[str, pir::Value]: Mapping between VarDesc(by name) and pir::Value. Raises: PreconditionNotMet: If legacy_program has multi block will raise error. @@ -1659,7 +1644,6 @@ void BindPir(pybind11::module *module) { BindOperation(&ir_module); BindValue(&ir_module); BindOpOperand(&ir_module); - BindOpResult(&ir_module); BindType(&ir_module); BindAttribute(&ir_module); BindInsertionPoint(&ir_module); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 35b3b613d7bfc8..ffc2549d5e73ff 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -720,7 +720,7 @@ void BindVjp(pybind11::module *m) { vjp_interface, phi::errors::InvalidArgument( "The vjp function is not registered in %s op ", fwd_op.name())); - std::vector> vjp_res = vjp_interface.Vjp( + std::vector> vjp_res = vjp_interface.Vjp( &fwd_op, inputs, outputs, out_grads, stop_gradients); PADDLE_ENFORCE_EQ( stop_gradients.size(), @@ -787,14 +787,14 @@ void BindVjp(pybind11::module *m) { void BindDecomp(pybind11::module *m) { m->def("sinking_decomp", [](pir::Program *program, - std::vector &src_vars, + std::vector &src_vars, std::set &blacklist, std::set &whitelist) { VLOG(4) << "[Prim] Bind Decomp sinking_decomp begin."; py::list res; DecompProgram decomp_object(program, src_vars, blacklist, whitelist); decomp_object.decomp_program(); - std::vector tar_vars = decomp_object.get_dst_vars(); + std::vector tar_vars = decomp_object.get_dst_vars(); for (size_t i = 0; i < tar_vars.size(); ++i) { if (!tar_vars[i]) { res.append(nullptr); @@ -808,8 +808,7 @@ void BindDecomp(pybind11::module *m) { m->def("call_decomp", [](pir::Operation &fwd_op) { py::list res; - std::vector> decomp_res = - call_decomp_rule(&fwd_op); + std::vector> decomp_res = call_decomp_rule(&fwd_op); for (size_t i = 0; i < decomp_res.size(); ++i) { py::list sub_res; for (size_t j = 0; j < decomp_res[i].size(); ++j) { diff --git a/test/dygraph_to_static/test_set_static_op_arg_pre_cast_hook.py b/test/dygraph_to_static/test_set_static_op_arg_pre_cast_hook.py index d6051929adfc63..192f2134f63f64 100644 --- a/test/dygraph_to_static/test_set_static_op_arg_pre_cast_hook.py +++ b/test/dygraph_to_static/test_set_static_op_arg_pre_cast_hook.py @@ -36,7 +36,7 @@ def test_set_static_op_arg_pre_cast_hook(self): with static_guard(): with self.assertRaisesRegex( TypeError, - r"abs\(\): argument 
\(position 1\) must be OpResult, but got Tensor", + r"abs\(\): argument \(position 1\) must be Value, but got Tensor", ): paddle.abs(eager_tensor) From 4110bb1dbf794a81451e3fa2d3daa64fecfe7c55 Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 23 Jan 2024 14:25:45 +0800 Subject: [PATCH 25/34] Fix paralel parallel (#61034) --- paddle/phi/api/yaml/generator/dist_api_gen.py | 8 ++++---- paddle/phi/api/yaml/generator/dist_bw_api_gen.py | 4 ++-- .../distributed/auto_parallel/static/operators/common.py | 4 ++-- python/paddle/distributed/auto_tuner/prune.py | 4 ++-- python/paddle/distributed/auto_tuner/utils.py | 6 ++++-- python/paddle/distributed/fleet/base/topology.py | 2 +- test/legacy_test/test_directory_migration.py | 2 +- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/paddle/phi/api/yaml/generator/dist_api_gen.py b/paddle/phi/api/yaml/generator/dist_api_gen.py index b1a775d912f271..de5628d023998c 100644 --- a/paddle/phi/api/yaml/generator/dist_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_api_gen.py @@ -1730,8 +1730,8 @@ def generate_output_dist_attr_setting(self) -> str: def generate_return_code(self) -> str: return self.gene_return_code() - def generate_auto_paralel_branch(self) -> str: - # if no tensor input, do not genetate auto parallel branch + def generate_auto_parallel_branch(self) -> str: + # if no tensor input, do not generate auto parallel branch if len(self.inputs['names']) == 0: return "" @@ -1815,7 +1815,7 @@ def gene_base_api_code(self, inplace_flag=False): and not self.api.endswith("_double_grad") and not self.api.endswith("_triple_grad") ): - dist_branch_code += self.generate_auto_paralel_branch() + dist_branch_code += self.generate_auto_parallel_branch() kernel_dispatch_code += dist_branch_code for kernel_name in self.kernel['func']: kernel_dispatch_code += self.gene_dispatch_code( @@ -1837,7 +1837,7 @@ def gene_base_api_code(self, inplace_flag=False): and not self.api.endswith("_double_grad") and not self.api.endswith("_triple_grad") ): - dist_branch_code = self.generate_auto_paralel_branch() + dist_branch_code = self.generate_auto_parallel_branch() return API_IMPL_TEMPLATE.format( self.get_return_type(inplace_flag), api_func_name, diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index 3fd8d8a383f3ec..320d6d0193a25f 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -355,8 +355,8 @@ def generate_reshard_output_code(self): return reshard_output_code - def generate_auto_paralel_branch(self) -> str: - # if no tensor input, do not genetate auto parallel branch + def generate_auto_parallel_branch(self) -> str: + # if no tensor input, do not generate auto parallel branch if len(self.inputs['names']) == 0: return "" infer_spmd_code = self.generate_infer_spmd_code() diff --git a/python/paddle/distributed/auto_parallel/static/operators/common.py b/python/paddle/distributed/auto_parallel/static/operators/common.py index 88045d831435ed..75a45a510b0cad 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/common.py +++ b/python/paddle/distributed/auto_parallel/static/operators/common.py @@ -66,13 +66,13 @@ class ParallelMode: DataParallel = "auto_parallel/data_parallel" TensorParallel = "auto_parallel/tensor_parallel" - PipelineParalel = "auto_parallel/pipeline_paralel" + PipelineParallel = "auto_parallel/pipeline_parallel" MoEParallel = "auto_parallel/moe_parallel" class SyncMode: """ - the synchorization mode for 
communication or auxiliary operator + the synchronization mode for communication or auxiliary operator """ AmpFlagSync = "auto_parallel/amp_flag_synchorization" diff --git a/python/paddle/distributed/auto_tuner/prune.py b/python/paddle/distributed/auto_tuner/prune.py index 14c1ca64c0c9c0..5aa3340685e191 100644 --- a/python/paddle/distributed/auto_tuner/prune.py +++ b/python/paddle/distributed/auto_tuner/prune.py @@ -127,7 +127,7 @@ def prune_by_mp(tuner_cfg, cur_cfg, history_cfgs=[]): "num_attention_heads", None ) seq_length = tuner_cfg["model_cfg"].get("seq_length", None) - use_sequence_paralel = tuner_cfg.get("use_sequence_paralel", False) + use_sequence_parallel = tuner_cfg.get("use_sequence_parallel", False) if mp_degree is None: return False @@ -141,7 +141,7 @@ def prune_by_mp(tuner_cfg, cur_cfg, history_cfgs=[]): if num_attention_heads and num_attention_heads % mp_degree != 0: return True - if seq_length and seq_length % mp_degree != 0 and use_sequence_paralel: + if seq_length and seq_length % mp_degree != 0 and use_sequence_parallel: return True mp_degree_candidates = tuner_cfg.get("mp_degree", None) diff --git a/python/paddle/distributed/auto_tuner/utils.py b/python/paddle/distributed/auto_tuner/utils.py index 798b34a21f13f6..e264373bb6e88d 100644 --- a/python/paddle/distributed/auto_tuner/utils.py +++ b/python/paddle/distributed/auto_tuner/utils.py @@ -107,7 +107,9 @@ def dist_degree(mode, num_gpus, num_nodes, tuner_cfg=None): "num_attention_heads", None ) seq_length = tuner_cfg["model_cfg"].get("seq_length", None) - use_sequence_paralel = tuner_cfg.get("use_sequence_paralel", False) + use_sequence_parallel = tuner_cfg.get( + "use_sequence_parallel", False + ) if hidden_size and hidden_size % mp_degree != 0: prune_flag = True @@ -121,7 +123,7 @@ def dist_degree(mode, num_gpus, num_nodes, tuner_cfg=None): if ( seq_length and seq_length % mp_degree != 0 - and use_sequence_paralel + and use_sequence_parallel ): prune_flag = True diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index bced953eff1397..2aab44bd849480 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -291,7 +291,7 @@ def __init__(self, topology): _HYBRID_PARALLEL_GROUP = self def get_parallel_mode(self): - # there are five modes : DataParallel / TensorParallel / PipelineParallel / ShardingParallel / SepParalel + # there are five modes : DataParallel / TensorParallel / PipelineParallel / ShardingParallel / SepParallel # NOTE when sharding conjugates with other parallel, sharding should act like a optimizer and # adding its parallel logic within that parallelism # when use sharding alone, it should have its own parallelism for its parallel logic diff --git a/test/legacy_test/test_directory_migration.py b/test/legacy_test/test_directory_migration.py index 3230a0ecc666ff..8dc4e3106a0e59 100644 --- a/test/legacy_test/test_directory_migration.py +++ b/test/legacy_test/test_directory_migration.py @@ -137,7 +137,7 @@ def test_old_directory(self): 'paddle.imperative.load', 'paddle.imperative.ParallelEnv', 'paddle.imperative.prepare_context', - 'paddle.imperative.DataParalell', + 'paddle.imperative.DataParallel', 'paddle.imperative.jit', 'paddle.imperative.TracedLayer', 'paddle.imperative.declarative', From a60e25108b7cbcb2dffd0c6d733672dad72a6630 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 23 Jan 2024 14:29:58 +0800 Subject: [PATCH 26/34] [Prim][PIR] add index_select op forward prim (#61024) * 
add index_select prim * close complex test case --- .../op_generator/decomp_interface_gen_op_list.py | 2 ++ paddle/fluid/primitive/composite/composite.h | 10 ++++++++++ test/legacy_test/test_index_select_op.py | 10 +++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py index 364a2a8de7724e..a4487f405e8444 100644 --- a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py +++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py @@ -25,6 +25,7 @@ "dropout", "full_like", "gelu", + "index_select", "instance_norm", "layer_norm", "leaky_relu", @@ -48,6 +49,7 @@ "dropout", "full_like", "gelu", + "index_select", "instance_norm", "layer_norm", "leaky_relu", diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index e4600292054320..dd3168136d5b45 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -637,6 +637,16 @@ std::tuple instance_norm_decomp( return std::make_tuple(res, mean_out, variance_out); } +template +Tensor index_select_decomp(const Tensor& x, const Tensor& index, int axis) { + int axis_tmp = axis; + if (axis < 0) { + axis_tmp += x.dims().size(); + } + + return gather(x, index, axis_tmp); +} + } // namespace details } // namespace primitive diff --git a/test/legacy_test/test_index_select_op.py b/test/legacy_test/test_index_select_op.py index b4808150dc81fd..0eead5a4fb2777 100644 --- a/test/legacy_test/test_index_select_op.py +++ b/test/legacy_test/test_index_select_op.py @@ -67,9 +67,13 @@ def init_dtype_type(self): def test_check_output(self): if self.x_type == np.complex64 or self.x_type == np.complex128: - self.check_output(check_prim=False, check_pir=True) + self.check_output( + check_prim=False, check_pir=True, check_prim_pir=False + ) else: - self.check_output(check_prim=True, check_pir=True) + self.check_output( + check_prim=True, check_pir=True, check_prim_pir=True + ) def test_check_grad_normal(self): if self.x_type == np.complex64 or self.x_type == np.complex128: @@ -151,7 +155,7 @@ def init_dtype_type(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_pir=True) + self.check_output_with_place(place, check_pir=True, check_prim_pir=True) def test_check_grad_normal(self): place = core.CUDAPlace(0) From 38b9793e51f70ef8739111c2152ecafd9c270a36 Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 23 Jan 2024 14:41:27 +0800 Subject: [PATCH 27/34] Fix precison precision (#61033) * Fix * Fix --- .../ipu/popart_canonicalization/tensor_ops.cc | 2 +- paddle/phi/kernels/funcs/cufft_util.h | 8 ++++---- paddle/scripts/paddle_build.sh | 4 ++-- python/paddle/amp/auto_cast.py | 6 +++--- python/paddle/nn/layer/layers.py | 2 +- python/paddle/static/amp/fp16_utils.py | 14 +++++++------- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 9df51d5c42fc94..65a31917ec818c 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -213,7 +213,7 @@ Node *cast_handler(Graph *graph, Node *node) { node->inputs, node->outputs, static_cast(otype)); - // Cast op created in mixed-precison has no 
pipline attrs + // Cast op created in mixed-precision has no pipline attrs auto &prev_nodes = node->inputs.front()->inputs; if (!prev_nodes.empty()) { auto *prev_op = prev_nodes.front()->Op(); diff --git a/paddle/phi/kernels/funcs/cufft_util.h b/paddle/phi/kernels/funcs/cufft_util.h index 52dfb8733f8a5e..ee9539d41da17c 100644 --- a/paddle/phi/kernels/funcs/cufft_util.h +++ b/paddle/phi/kernels/funcs/cufft_util.h @@ -84,8 +84,8 @@ class FFTConfig { // sizes are full signal, including batch size and always two-sided FFTConfig(const std::vector& sizes, FFTTransformType fft_type, - DataType precison) - : fft_type_(fft_type), precision_(precison) { + DataType precision) + : fft_type_(fft_type), precision_(precision) { const auto batch_size = static_cast(sizes[0]); std::vector signal_sizes(sizes.cbegin() + 1, sizes.cend()); const int signal_ndim = sizes.size() - 1; @@ -93,11 +93,11 @@ class FFTConfig { cudaDataType itype, otype, exec_type; const bool complex_input = has_complex_input(fft_type); const bool complex_output = has_complex_output(fft_type); - if (precison == DataType::FLOAT32) { + if (precision == DataType::FLOAT32) { itype = complex_input ? CUDA_C_32F : CUDA_R_32F; otype = complex_output ? CUDA_C_32F : CUDA_R_32F; exec_type = CUDA_C_32F; - } else if (precison == DataType::FLOAT64) { + } else if (precision == DataType::FLOAT64) { itype = complex_input ? CUDA_C_64F : CUDA_R_64F; otype = complex_output ? CUDA_C_64F : CUDA_R_64F; exec_type = CUDA_C_64F; diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c7a8c76e064a4e..96cf5e5c9a143e 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1527,7 +1527,7 @@ set -x # set trt_convert ut to run 15% cases. export TEST_NUM_PERCENT_CASES=0.15 export FLAGS_trt_ibuilder_cache=1 - precison_cases="" + precision_cases="" bash $PADDLE_ROOT/tools/check_added_ut.sh if [ ${PRECISION_TEST:-OFF} == "ON" ]; then python $PADDLE_ROOT/tools/get_pr_ut.py @@ -2529,7 +2529,7 @@ set -x # set trt_convert ut to run 15% cases. export TEST_NUM_PERCENT_CASES=0.15 export FLAGS_trt_ibuilder_cache=1 - precison_cases="" + precision_cases="" bash $PADDLE_ROOT/tools/check_added_ut.sh #check change of pr_unittests and dev_unittests check_approvals_of_unittest 2 diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index dd2b2c68cff7aa..07ee8d76b7c331 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -138,7 +138,7 @@ def need_keep_fp32(layer, dtype): need_keep_fp32 = False # Highest prority. Because all the layers except BN will use bfloat16 params in bfoat16 training, # here we provide a option to keep fp32 param. 
- if not layer._cast_to_low_precison: + if not layer._cast_to_low_precision: need_keep_fp32 = True # The BN layers will keep fp32 elif isinstance( @@ -197,12 +197,12 @@ def set_excluded_layers(models, excluded_layers): for layer in excluded_layers_instances[idx].sublayers( include_self=True ): - layer._cast_to_low_precison = False + layer._cast_to_low_precision = False excluded_layers_types = tuple(excluded_layers_types) for idx in range(len(models)): for layer in models[idx].sublayers(include_self=True): if isinstance(layer, excluded_layers_types): - layer._cast_to_low_precison = False + layer._cast_to_low_precision = False @dygraph_only diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index 91a993ddc45637..fcbda20cff38c9 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -412,7 +412,7 @@ def __init__(self, name_scope=None, dtype="float32"): self._forward_post_hooks = collections.OrderedDict() # only used in AMP Training - self._cast_to_low_precison = True + self._cast_to_low_precision = True self._state_dict_hooks = collections.OrderedDict() # Records orignal functions after @to_static to support to rollback diff --git a/python/paddle/static/amp/fp16_utils.py b/python/paddle/static/amp/fp16_utils.py index 2add5781026d4e..03c216e4338414 100644 --- a/python/paddle/static/amp/fp16_utils.py +++ b/python/paddle/static/amp/fp16_utils.py @@ -422,7 +422,7 @@ def fp16_guard(): def set_var_dst_dtype( op, var_names, block, global_block, dtype, need_set_dtype ): - low_precison_var_names = set() + low_precision_var_names = set() for var_name in var_names: var = None try: @@ -439,7 +439,7 @@ def set_var_dst_dtype( continue if var.dtype in FLOAT_TYPES: - low_precison_var_names.add(var_name) + low_precision_var_names.add(var_name) if need_set_dtype: var.desc.set_dtype(dtype) @@ -449,7 +449,7 @@ def set_var_dst_dtype( ) ) - return low_precison_var_names + return low_precision_var_names def set_param_dtype(program, dtype, amp_lists, use_fp16_guard, level): @@ -581,8 +581,8 @@ def get_amp_dst_dtype( def process_op_input_and_outputs(op, block, global_block, dtype): - low_precison_var_names = set() - # Get the FP16 input because the low_precison_var_names is required for the parameter casting. + low_precision_var_names = set() + # Get the FP16 input because the low_precision_var_names is required for the parameter casting. # The dtype of the input is not set to fp16, because it is done in the step 3 of cast_model_to_fp16. for in_name in op.input_names: # for ipu, all inputs must be converted to fp16 @@ -596,7 +596,7 @@ def process_op_input_and_outputs(op, block, global_block, dtype): dtype, need_set_dtype=False, ) - low_precison_var_names = low_precison_var_names.union(in_vars) + low_precision_var_names = low_precision_var_names.union(in_vars) # Set the output to FP16 because its consumer OP needs to determine if the dtype needs # to be promoted. 
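# Note: unlike the input query above, which passes need_set_dtype=False and
# only collects names, the outputs below really get their dtype rewritten
# (need_set_dtype=True).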
for out_name in op.output_names: @@ -611,7 +611,7 @@ def process_op_input_and_outputs(op, block, global_block, dtype): dtype, need_set_dtype=True, ) - return low_precison_var_names + return low_precision_var_names def map_block(block, fn, parent_op=None): From 4db3c14e0029249e4bd7ddb11b9241ba875a3dd2 Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 23 Jan 2024 14:43:28 +0800 Subject: [PATCH 28/34] Fix optinal optional (#61020) * Fix * Fix --- paddle/cinn/auto_schedule/measure/measure.h | 2 +- paddle/cinn/frontend/decomposer_registry.h | 2 +- .../ir/schedule/impl/loop_transformation.cc | 8 +++--- paddle/cinn/ir/schedule/ir_schedule_util.cc | 2 +- .../fused_multi_transformer_decoder_pass.cc | 18 ++++++------- .../fused_multi_transformer_encoder_pass.cc | 24 ++++++++--------- .../ir/multihead_matmul_fuse_pass.cc | 12 ++++----- .../new_executor/feed_fetch_utils.cc | 2 +- paddle/fluid/framework/parallel_executor.cc | 8 +++--- .../tensorrt/convert/io_converter.cc | 4 +-- paddle/fluid/inference/tensorrt/op_teller.cc | 2 +- .../tensorrt/plugin/gather_nd_op_plugin.cu | 2 +- paddle/phi/core/generator.cc | 4 +-- paddle/phi/infermeta/fusion.cc | 4 +-- paddle/phi/kernels/cpu/send_uv_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/send_uv_kernel.cc | 2 +- paddle/phi/kernels/funcs/common_shape.h | 2 +- paddle/phi/kernels/fusion/onednn/fc_kernel.cc | 2 +- paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/send_uv_kernel.cu | 2 +- python/paddle/amp/auto_cast.py | 2 +- python/paddle/amp/debugging.py | 4 +-- python/paddle/autograd/autograd.py | 2 +- python/paddle/base/framework.py | 4 +-- python/paddle/base/multiprocess_utils.py | 6 ++--- python/paddle/base/param_attr.py | 12 ++++----- python/paddle/base/reader.py | 4 +-- .../auto_parallel/static/dist_context.py | 2 +- .../auto_parallel/static/engine.py | 2 +- python/paddle/distributed/auto_tuner/prune.py | 2 +- .../distributed/fleet/elastic/collective.py | 6 ++--- .../dygraph_sharding_optimizer.py | 22 ++++++++-------- .../meta_optimizers/sharding_optimizer.py | 2 +- .../parallel_layers/pp_layers.py | 2 +- .../passes/auto_parallel_sharding.py | 2 +- python/paddle/distribution/transform.py | 4 +-- python/paddle/fft.py | 26 +++++++++---------- python/paddle/framework/random.py | 12 ++++----- python/paddle/hapi/hub.py | 4 +-- python/paddle/hapi/model.py | 2 +- python/paddle/hapi/static_flops.py | 2 +- python/paddle/incubate/autograd/functional.py | 4 +-- python/paddle/incubate/autograd/primx.py | 2 +- python/paddle/incubate/framework/random.py | 16 ++++++------ .../paddle/jit/dy2static/partial_program.py | 2 +- .../executor/opcode_executor.py | 2 +- python/paddle/tensor/creation.py | 4 +-- test/legacy_test/test_prod_op.py | 4 +-- test/xpu/test_prod_op_xpu.py | 4 +-- 49 files changed, 134 insertions(+), 134 deletions(-) diff --git a/paddle/cinn/auto_schedule/measure/measure.h b/paddle/cinn/auto_schedule/measure/measure.h index 36140580ee1b51..87f0d54096b5e5 100644 --- a/paddle/cinn/auto_schedule/measure/measure.h +++ b/paddle/cinn/auto_schedule/measure/measure.h @@ -70,7 +70,7 @@ class ScheduleBuilder { }; // This interface defines how to run the built result. Like above -// ScheduleBuilder, a runner shoule be implemented with not bound to a specific +// ScheduleBuilder, a runner should be implemented with not bound to a specific // task. 
class ScheduleRunner { public: diff --git a/paddle/cinn/frontend/decomposer_registry.h b/paddle/cinn/frontend/decomposer_registry.h index 3dc142468a9e5e..a94708db631d53 100644 --- a/paddle/cinn/frontend/decomposer_registry.h +++ b/paddle/cinn/frontend/decomposer_registry.h @@ -46,7 +46,7 @@ class DecomposerContext { } if (new_var->type != ori_var->type) { LOG(FATAL) - << "The output type shoule be equal to the original. But received : " + << "The output type should be equal to the original. But received : " << new_var->id << ".type=" << new_var->type << " and the original var " << ori_var->id << ".type=" << ori_var->type; diff --git a/paddle/cinn/ir/schedule/impl/loop_transformation.cc b/paddle/cinn/ir/schedule/impl/loop_transformation.cc index 77781e193d22ba..43ef9a88cfff94 100644 --- a/paddle/cinn/ir/schedule/impl/loop_transformation.cc +++ b/paddle/cinn/ir/schedule/impl/loop_transformation.cc @@ -332,7 +332,7 @@ Expr DyScheduleImpl::Fuse(const std::string& block_name, for (int i = 0; i < loops_index.size(); ++i) { if (i > 0) { if (loops_index[i - 1] + 1 != loops_index[i]) { - os << "Loops index in Fuse shoule be continuous!\n"; + os << "Loops index in Fuse should be continuous!\n"; throw IRScheduleErrorHandler(primitive, os.str(), module_expr_); } } @@ -364,7 +364,7 @@ Expr DyScheduleImpl::Fuse(const Expr& block, for (int i = 0; i < loops_index.size(); ++i) { if (i > 0) { if (loops_index[i - 1] + 1 != loops_index[i]) { - os << "Loops index in Fuse shoule be continuous!\n"; + os << "Loops index in Fuse should be continuous!\n"; throw IRScheduleErrorHandler(primitive, os.str(), module_expr_); } } @@ -604,7 +604,7 @@ Expr StScheduleImpl::Fuse(const std::string& block_name, for (int i = 0; i < loops_index.size(); ++i) { if (i > 0) CHECK_EQ(loops_index[i - 1] + 1, loops_index[i]) - << "Loops index in Fuse shoule be continuous!"; + << "Loops index in Fuse should be continuous!"; } for (int i : loops_index) { CHECK_LT(i, (int)all_loops.size()) @@ -623,7 +623,7 @@ Expr StScheduleImpl::Fuse(const Expr& block, for (int i = 0; i < loops_index.size(); ++i) { if (i > 0) CHECK_EQ(loops_index[i - 1] + 1, loops_index[i]) - << "Loops index in Fuse shoule be continuous!"; + << "Loops index in Fuse should be continuous!"; } for (int i : loops_index) { CHECK_LT(i, (int)all_loops.size()) diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index ac9f609e0c20fc..daf17fddba678c 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -137,7 +137,7 @@ Expr GetNextForLoop(const Expr& for_loop) { << "The input of GetNextForLoop should be ir::For!"; Expr for_body = for_loop.As()->body; ir::Block* for_body_block = for_body.As(); - CHECK(for_body_block) << "The for_loop's body shoule be Block!"; + CHECK(for_body_block) << "The for_loop's body should be Block!"; // Only support for body block contains a sub for loop int next_idx = -1; diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc index e9a1c2cd8d390c..db281b64f92991 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc @@ -1718,13 +1718,13 @@ FusedMultiTransformerDecoderPass::FusedMultiTransformerDecoderPass() { .End(); AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() 
.End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("trans_x") @@ -2452,13 +2452,13 @@ FusedMultiTransformerDecoderFuseQKVPass:: .End(); AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("trans_x") @@ -3235,13 +3235,13 @@ MultiDevicesFusedMultiTransformerDecoderFuseQKVPass:: .End(); AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("trans_x") diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc index bfd071b0f42ce1..e8be50b71917c6 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc @@ -2447,13 +2447,13 @@ FusedMultiTransformerEncoderPass::FusedMultiTransformerEncoderPass() { .End(); AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("trans_x") @@ -3287,13 +3287,13 @@ FusedMultiTransformerEncoderFuseQKVPass:: .End(); AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("trans_x") @@ -4083,13 +4083,13 @@ MultiDevicesFusedMultiTransformerEncoderPass:: .End(); AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("trans_x") @@ -4973,13 +4973,13 @@ MultiDevicesFusedMultiTransformerEncoderFuseQKVPass:: .End(); AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") 
// the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("trans_x") diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 22802dbddd8efe..25f120c7866b50 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -709,13 +709,13 @@ void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { MultiHeadMatmulV2FusePass::MultiHeadMatmulV2FusePass() { AddOpCompat(OpCompat("mul")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("x_num_col_dims") @@ -1177,13 +1177,13 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { MultiHeadMatmulV3FusePass::MultiHeadMatmulV3FusePass() { AddOpCompat(OpCompat("mul")) - .AddInput("X") // the shape shoule be (B, S, N*H) + .AddInput("X") // the shape should be (B, S, N*H) .IsTensor() .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) + .AddInput("Y") // the shape should be (N*H, N*H) .IsTensor() .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) + .AddOutput("Out") // the shape should be (B, S, N*H) .IsTensor() .End() .AddAttr("x_num_col_dims") diff --git a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc index 5e65f5eab2120a..0a713b89727f61 100644 --- a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc +++ b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc @@ -130,7 +130,7 @@ void MergeFetchTensors(const FetchUnmergedList& fetch_list, PADDLE_ENFORCE_EQ( fetch_list.size(), micro_batch_num, - phi::errors::Unavailable("The fetch_list size (%lld) shoule be equal to " + phi::errors::Unavailable("The fetch_list size (%lld) should be equal to " "the micro_batch_num (%lld)", fetch_list.size(), micro_batch_num)); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e6c11df275b569..491fda2e9d59a3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -1506,7 +1506,7 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( PADDLE_ENFORCE_EQ(graphs.size(), device_count, platform::errors::PreconditionNotMet( - "graphs.size() shoule be %d, but received %d", + "graphs.size() should be %d, but received %d", device_count, graphs.size())); VLOG(3) << "use local async mode"; @@ -1541,7 +1541,7 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( PADDLE_ENFORCE_EQ(graphs.size(), device_count, platform::errors::PreconditionNotMet( - "graphs.size() shoule be %d, but received %d", + "graphs.size() should be %d, but received %d", device_count, graphs.size())); VLOG(3) << "use local async mode"; @@ -1608,12 +1608,12 @@ void ParallelExecutor::CreateVariableInfos( var_infos->size(), 0, platform::errors::PreconditionNotMet( - "var_infos->size() shoule be 0, but received %d", var_infos->size())); + "var_infos->size() should be 0, but received %d", var_infos->size())); PADDLE_ENFORCE_EQ( member_->is_persistable_.size(), 0, platform::errors::PreconditionNotMet( - "member_->is_persistable_.size() shoule be 0, but 
received %d", + "member_->is_persistable_.size() should be 0, but received %d", member_->is_persistable_.size())); for (auto &node : graph->Nodes()) { if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc index f7a4c4cf3cc165..832cd6fe2bc074 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -45,7 +45,7 @@ class DefaultIOConverter : public EngineIOConverter { size, max_size, platform::errors::InvalidArgument( - "The input Tensor in's memory_size shoule be less than or equal to " + "The input Tensor in's memory_size should be less than or equal to " "the input max_size. But in's memory_size = %u, max_size = %u.", size, max_size)); @@ -81,7 +81,7 @@ class DefaultIOConverter : public EngineIOConverter { size, max_size, platform::errors::InvalidArgument( - "The input Tensor out's memory_size shoule be less than or equal " + "The input Tensor out's memory_size should be less than or equal " "to the input max_size. " "But out's memory_size = %u, max_size = %u.", size, diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index d752850b43df6b..61cf80b7fe1098 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -377,7 +377,7 @@ struct SimpleOpTypeSetTeller : public Teller { const std::vector paddings = PADDLE_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() != 2) { - VLOG(3) << "The size of paddings shoule be 2, but got " + VLOG(3) << "The size of paddings should be 2, but got " << paddings.size(); return false; } diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu index 046a725b2bd1e3..30ad665437352f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu @@ -95,7 +95,7 @@ nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions( int32_t x_dims_size = x_dims.nbDims; int32_t index_dims_size = index_dims.nbDims; - // TODO(wilber): The result dims shoule be Index.shape[:-1] + + // TODO(wilber): The result dims should be Index.shape[:-1] + // X.shape[Index.shape[-1]:], but the trt DimsExprs is an expression we can't // get the actual value. So we only support one scenario: input_dims.size == // index_dims.size. 
diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index 4aa3568ea6ae9e..d9afa93c3ee7ca 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -47,7 +47,7 @@ const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { }); if (device_id < 0) { PADDLE_THROW( - phi::errors::InvalidArgument("xpu device id shoule be greater than 0")); + phi::errors::InvalidArgument("xpu device id should be greater than 0")); } std::call_once(xpu_device_flags[device_id], [device_id]() { @@ -78,7 +78,7 @@ const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { }); if (device_id < 0) { PADDLE_THROW(phi::errors::InvalidArgument( - "cuda device id shoule be greater than 0")); + "cuda device id should be greater than 0")); } std::call_once(cuda_device_flags[device_id], [device_id]() { diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 2182640ddbb2f7..342f4f4e4b4148 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -2630,7 +2630,7 @@ void FusedEmbeddingEltWiseLayerNormInferMeta( embs_dim.size(), 2, phi::errors::InvalidArgument( - "The Emb dim's size shoule be 2, but found %d.", embs_dim.size())); + "The Emb dim's size should be 2, but found %d.", embs_dim.size())); PADDLE_ENFORCE_EQ( embs_dim[1], dims_bias[0], @@ -3158,7 +3158,7 @@ void FusionGRUInferMeta(const MetaTensor& x, std::end(mkldnn_data_type_list), mkldnn_data_type) != std::end(mkldnn_data_type_list), true, - phi::errors::InvalidArgument("The mkldnn_data_type shoule be [float32, " + phi::errors::InvalidArgument("The mkldnn_data_type should be [float32, " "int8, bfloat16], but found %s.", mkldnn_data_type.c_str())); diff --git a/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc b/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc index 152cd948562311..4423f03ef31cc9 100644 --- a/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc @@ -172,7 +172,7 @@ void GraphSendUVGradOpKernelLaunchHelper(const Context& ctx, index_size, 0, errors::InvalidArgument("The first dimension of src_index or dst_index " - "shoule be greater than 0, but received %d.", + "should be greater than 0, but received %d.", index_size)); ctx.template Alloc(x_grad); diff --git a/paddle/phi/kernels/cpu/send_uv_kernel.cc b/paddle/phi/kernels/cpu/send_uv_kernel.cc index c5200182a1d08c..301611d13d7be5 100644 --- a/paddle/phi/kernels/cpu/send_uv_kernel.cc +++ b/paddle/phi/kernels/cpu/send_uv_kernel.cc @@ -62,7 +62,7 @@ void GraphSendUVOpKernelLaunchHelper(const Context& ctx, index_size, 0, errors::InvalidArgument("The first dimension of src_index or dst_index " - "shoule be greater than 0, but received %d.", + "should be greater than 0, but received %d.", index_size)); auto out_dims = out->dims(); diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index dea6e9f6ab3e0b..19f2fa1f2fac4c 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -177,7 +177,7 @@ static inline std::vector MatrixGetBroadcastBatchPortion( // Just For Matrix OP, for example: // x's dim = [5, 3, 2, M, M] ; y's dim = [3, 1, M, N] -// out shoule be [5, 3, 2, M, M] + [5, 3, 2, M, N], and [5, 3, 2] is +// out should be [5, 3, 2, M, M] + [5, 3, 2, M, N], and [5, 3, 2] is // batch_size of matrix static inline std::tuple, std::vector> MatrixGetBroadcastDims(const DenseTensor &x, const DenseTensor &y) { diff --git a/paddle/phi/kernels/fusion/onednn/fc_kernel.cc 
b/paddle/phi/kernels/fusion/onednn/fc_kernel.cc index 0d39677276ead8..c62cbddb28cb5c 100644 --- a/paddle/phi/kernels/fusion/onednn/fc_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fc_kernel.cc @@ -606,7 +606,7 @@ void FCKernel(const Context& dev_ctx, std::end(mkldnn_data_type_list), mkldnn_data_type) != std::end(mkldnn_data_type_list), true, - phi::errors::InvalidArgument("The mkldnn_data_type shoule be [float32, " + phi::errors::InvalidArgument("The mkldnn_data_type should be [float32, " "int8, bfloat16], but found %s.", mkldnn_data_type.c_str())); auto in_dims = input.dims(); diff --git a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu index d5126979b84f6c..3b618570b11d2a 100644 --- a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu @@ -225,7 +225,7 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, index_size, 0, errors::InvalidArgument("The first dimension of src_index or dst_index " - "shoule be greater than 0, but received %d.", + "should be greater than 0, but received %d.", index_size)); ctx.template Alloc(x_grad); diff --git a/paddle/phi/kernels/gpu/send_uv_kernel.cu b/paddle/phi/kernels/gpu/send_uv_kernel.cu index cbc9e42ff84ea4..94e1576dc371c2 100644 --- a/paddle/phi/kernels/gpu/send_uv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_kernel.cu @@ -75,7 +75,7 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, index_size, 0, errors::InvalidArgument("The first dimension of src_index or dst_index " - "shoule be greater than 0, but received %d.", + "should be greater than 0, but received %d.", index_size)); auto out_dims = out->dims(); diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 07ee8d76b7c331..2d8e1d22b16427 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -806,7 +806,7 @@ def decorate( level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': O1 represent mixed precision, the decorator will do nothing; O2 represent Pure float16/bfloat16, the decorator will cast all parameters of models to float16/bfloat16, except BatchNorm, InstanceNorm and LayerNorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. - master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. + master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None. The save_dtype will not change model parameters dtype, it just change the state_dict dtype. When save_dtype is None, the save dtype is same as model dtype. Default is None. master_grad(bool, optional): For level='O2', whether to use float32 weight gradients for calculations such as gradient clipping, weight decay, and weight updates. 
If master_grad is enabled, the weight diff --git a/python/paddle/amp/debugging.py b/python/paddle/amp/debugging.py index e57ee9a65d8dc4..0fd8fce8fe5f8f 100644 --- a/python/paddle/amp/debugging.py +++ b/python/paddle/amp/debugging.py @@ -443,7 +443,7 @@ def _print_operator_stats(op_count_dict): called = value.split(",") else: raise ValueError( - "Input {} is expected to be a list of str, but recieved {}.".format( + "Input {} is expected to be a list of str, but received {}.".format( value, type(value) ) ) @@ -585,7 +585,7 @@ def compare_accuracy( Args: dump_path(str): The path of the running log, such as the log for execution using the float32 data type. another_dump_path(str): the path of another running log ,such as the log for execution using the float16 data type. - output_filename(str): the excel file nmae of compare output. + output_filename(str): the excel file name of compare output. loss_scale(float, optional): the loss_scale during the training phase. Default is 1. dump_all_tensors(bool, optional): dump all tensor, It is currently not support. Default is False. diff --git a/python/paddle/autograd/autograd.py b/python/paddle/autograd/autograd.py index a114d3de650206..93e0a845908b19 100644 --- a/python/paddle/autograd/autograd.py +++ b/python/paddle/autograd/autograd.py @@ -404,7 +404,7 @@ def _multi_index(indexes, shape): Currently supporting following input format: * ([positive|negative|slice], ...), the right-most elements can be - omited. + omitted. The standard format after converted is slice tuple which contains N elements: * ([positive|slice], ..., [positive|slice]) diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 655f2e9b6a586b..cc5c231158d038 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -7427,7 +7427,7 @@ class Parameter(Variable, metaclass=ParameterMetaClass): be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. - need_clip (bool): Whether the parameter gradient need to be cliped + need_clip (bool): Whether the parameter gradient need to be clipped in optimizer. Default is True. """ @@ -7543,7 +7543,7 @@ class EagerParamBase(core.eager.Tensor): be applied on the EagerParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this EagerParamBase. - need_clip (bool): Whether the parameter gradient need to be cliped + need_clip (bool): Whether the parameter gradient need to be clipped in optimizer. Default is True. """ diff --git a/python/paddle/base/multiprocess_utils.py b/python/paddle/base/multiprocess_utils.py index 9b70cacd1c2cd8..1445e985773cd0 100644 --- a/python/paddle/base/multiprocess_utils.py +++ b/python/paddle/base/multiprocess_utils.py @@ -62,7 +62,7 @@ class CleanupFuncRegistrar: @classmethod def register(cls, function, signals=[]): - def _func_exectuor(): + def _func_executor(): if function not in cls._executed_func_set: try: function() @@ -74,11 +74,11 @@ def _func_register(function): raise TypeError("%s is not callable object." 
% (function)) # check function object whether hash-able if function not in cls._registered_func_set: - atexit.register(_func_exectuor) + atexit.register(_func_executor) cls._registered_func_set.add(function) def _signal_handler(signum=None, frame=None): - _func_exectuor() + _func_executor() if signum is not None: if signum == signal.SIGINT: raise KeyboardInterrupt diff --git a/python/paddle/base/param_attr.py b/python/paddle/base/param_attr.py index 33cbafa98f1702..86d4e0e47db6ef 100644 --- a/python/paddle/base/param_attr.py +++ b/python/paddle/base/param_attr.py @@ -24,7 +24,7 @@ class ParamAttr: Note: ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope. There are three clipping strategies: :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` . @@ -49,7 +49,7 @@ class ParamAttr: trainable (bool, optional): Whether this parameter is trainable. Default True. do_model_average (bool, optional): Whether this parameter should do model average when model average is enabled. Only used in ExponentialMovingAverage. Default True. - need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True. + need_clip (bool, optional): Whether the parameter gradient need to be clipped in optimizer. Default is True. Returns: ParamAttr Object. @@ -217,7 +217,7 @@ class WeightNormParamAttr(ParamAttr): Note: ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope. There are three clipping strategies: :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` . @@ -245,13 +245,13 @@ class WeightNormParamAttr(ParamAttr): regularizer (WeightDecayRegularizer, optional): Regularization strategy. There are two method: :ref:`api_paddle_regularizer_L1Decay` , :ref:`api_paddle_regularizer_L2Decay`. - If regularizer isralso set in ``optimizer`` + If regularizer is also set in ``optimizer`` (such as :ref:`api_paddle_optimizer_SGD` ), that regularizer setting in optimizer will be ignored. Default None, meaning there is no regularization. trainable(bool, optional): Whether this parameter is trainable. Default True. do_model_average(bool, optional): Whether this parameter should do model average. Default False. - need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True. + need_clip (bool, optional): Whether the parameter gradient need to be clipped in optimizer. Default is True. Examples: @@ -279,7 +279,7 @@ class WeightNormParamAttr(ParamAttr): # List to record the parameters reparameterized by weight normalization. # If these parameters are treated as Variable rather than Parameter, # it can be used to discriminate these parameters and help to serialize - # these paramters for inference. + # these parameters for inference. 
params_with_weight_norm = [] def __init__( diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py index 7fcccf8910fc46..e90378249da03f 100644 --- a/python/paddle/base/reader.py +++ b/python/paddle/base/reader.py @@ -616,7 +616,7 @@ def _start(self): # or just hang, the main process will hang waiting for data, so here need to deal # with SIGSEGV and SIGBUS of child process; 2. if the main process end before child # process, it shuts the all its daemonic children down with a SIGTERM (instead of - # joining them without a timeout), so here nedd to deal with SIGTERM. + # joining them without a timeout), so here need to deal with SIGTERM. core._set_process_pids(id(self), [self._process.pid]) _set_SIGCHLD_handler() @@ -1102,7 +1102,7 @@ def set_batch_generator(self, reader, places=None): else: if places is not None: logging.info( - 'places would be ommited when DataLoader is not iterable' + 'places would be omitted when DataLoader is not iterable' ) return self diff --git a/python/paddle/distributed/auto_parallel/static/dist_context.py b/python/paddle/distributed/auto_parallel/static/dist_context.py index ed4fc68160307d..eefc0d332957f1 100644 --- a/python/paddle/distributed/auto_parallel/static/dist_context.py +++ b/python/paddle/distributed/auto_parallel/static/dist_context.py @@ -81,7 +81,7 @@ def __init__( self._serial_optimizer = None self._serial_feed_vars = {} self._serial_fetch_vars = {} - self._lr_optimizer = None # record the optimzier holding lr_scheduler + self._lr_optimizer = None # record the optimizer holding lr_scheduler # Data members related to the program self._dist_tensors_for_program = {} diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 422c1a76c1d54d..c4242aa30e63e0 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -907,7 +907,7 @@ def _initialize(self, mode, init_parameters=True): self.program_helper.init( dist_main_program, self._place, dist_context ) - # The model's instance variables (not paramters), used in forward function, + # The model's instance variables (not parameters), used in forward function, # have been initialized when initialize model in dynamic mode. 
if self._model and len(self._model.buffers()) > 0: for buffer in self._model.buffers(): diff --git a/python/paddle/distributed/auto_tuner/prune.py b/python/paddle/distributed/auto_tuner/prune.py index 5aa3340685e191..84c983bfdb580f 100644 --- a/python/paddle/distributed/auto_tuner/prune.py +++ b/python/paddle/distributed/auto_tuner/prune.py @@ -509,7 +509,7 @@ def prune_by_memory_estimation(tuner_cfg, cur_cfg, history_cfgs=[]): if not os.path.exists(memory_estimation_tool): raise ValueError( - f"memory_estimation_tool shoule be a valid path, but got {memory_estimation_tool}" + f"memory_estimation_tool should be a valid path, but got {memory_estimation_tool}" ) if max_memory_usage is None: diff --git a/python/paddle/distributed/fleet/elastic/collective.py b/python/paddle/distributed/fleet/elastic/collective.py index 171ab773091c3c..c2d94568094099 100644 --- a/python/paddle/distributed/fleet/elastic/collective.py +++ b/python/paddle/distributed/fleet/elastic/collective.py @@ -31,7 +31,7 @@ def __init__(self, args): self.procs = [] def launch(self): - logger.info("collective lauchner launch ...") + logger.info("collective launcher launch ...") args = self.args self.tmp_dir = tempfile.mkdtemp() cluster, pod = paddle.distributed.fleet.launch.get_cluster_info(args) @@ -52,14 +52,14 @@ def launch(self): logger.info(f"launch proc_id:{proc.proc.pid} idx:{idx}") def stop(self): - logger.info("collective lauchner stop ...") + logger.info("collective launcher stop ...") if not self._terminate_procs(): logger.error("kill process failed") if os.path.exists(self.tmp_dir): shutil.rmtree(self.tmp_dir) def watch(self): - logger.debug("collective lauchner watch ...") + logger.debug("collective launcher watch ...") for p in self.procs: if p.log_fn and p.local_rank == 0: pull_worker_log(p) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index 605c08039d534e..fef3f878c2e972 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -56,10 +56,10 @@ class DygraphShardingOptimizer: """ # TODO (JZ-LIANG) - # TO support following featrues in future: + # TO support following features in future: # 1. fused update parameter sync # 2. parameters_groups - # 3. dynamic trainable params, which is the case bewteen pretraining and finetuning + # 3. dynamic trainable params, which is the case between pretraining and finetuning # 4. option to choose fuse comm (more GPU MEM need) or un-fuse comm def __init__(self, optimizer, hcg): @@ -73,9 +73,9 @@ def __init__(self, optimizer, hcg): optimizer._apply_optimize ): raise ValueError( - "the optimzier object should have _apply_optimize function" + "the optimizer object should have _apply_optimize function" ) - # the self._parameter_list holds the whole model paramters + # the self._parameter_list holds the whole model parameters self._parameter_list = optimizer._parameter_list self._origin_parameter_list = self._parameter_list self._inner_opt = optimizer @@ -161,12 +161,12 @@ def __init__(self, optimizer, hcg): '_apply_decay_param_fun', apply_decay_param_fun ) # Note: during the tensor fusion for parameters, the allocator will apply for - # some extra GPU memory for the fused big paramters. This extra GPU memory will + # some extra GPU memory for the fused big parameters. 
This extra GPU memory will # be useless at once the fusion has done. But the Paddle's allocator won't # release those memory, it will hold that part in the memory poll. So after # tensor fusion, the 'reserved' memory will increase but the 'allocate' memory # won't change. To avoid failure on some other applications (such as some nvtx - # operations), here we manulay let the allocator release the cached memory. + # operations), here we manually let the allocator release the cached memory. paddle.device.cuda.empty_cache() def clear_grad(self, set_to_zero=True): @@ -224,7 +224,7 @@ def _partition_parameters(self): """ # TODO(JZ-LIANG) support multiple partition methods # method1: greedy even but unorder - # method2: roughly even with oreder + # method2: roughly even with order mapping = {} for rank_ in range(self._sharding_world_size): @@ -478,10 +478,10 @@ class DygraphShardingOptimizerV2: """ # TODO (JZ-LIANG) - # TO support following featrues in future: + # TO support following features in future: # 1. fused update parameter sync # 2. parameters_groups - # 3. dynamic trainable params, which is the case bewteen pretraining and finetuning + # 3. dynamic trainable params, which is the case between pretraining and finetuning # 4. option to choose fuse comm (more GPU MEM need) or un-fuse comm # 5. do not shard small params @@ -500,7 +500,7 @@ def __init__(self, optimizer, hcg): optimizer._apply_optimize ): raise ValueError( - "the optimzier object should have _apply_optimize function" + "the optimizer object should have _apply_optimize function" ) self._inner_opt = optimizer @@ -560,7 +560,7 @@ def __init__(self, optimizer, hcg): # Determine the use of pipeline parallelism self._use_pipeline_parallel = strategy.hybrid_configs["pp_degree"] > 1 - # Ensure pipelie parallel and comm_overlap are not used together + # Ensure pipeline parallel and comm_overlap are not used together if self._use_pipeline_parallel: assert ( not self.comm_overlap diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 61b0a214eab154..1830773317462d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -702,7 +702,7 @@ def minimize_impl( # step6: (optional) sharding gradient merge self._sharding_gradient_merge() - # # check op dependecy + # # check op dependency # FIXME (JZ-LIANG) enable checking in future. 
# check_broadcast(main_block) # check_allreduce_sum(main_block, self._shard, self.sharding_ring_id, diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index 26b0c7a12ace70..8db44dd5e8750e 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -341,7 +341,7 @@ def __init__( ), "seg_method should be a str for interleave scheduler" assert seg_method.startswith( 'layer:' - ), "seg_method shoud be start with layer: for interleave scheduler" + ), "seg_method should be start with layer: for interleave scheduler" self._num_virtual_pipeline_stages = ( 1 diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index 486215a7bf8fa9..454a01d0a69fdd 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -1713,7 +1713,7 @@ def re_order_program(block, param_grads, dist_context): if len(use_order) == len(pname_to_pg_pairs): break - # reorder optimzier + # reorder optimizer last_op = block.ops[-1] pname_to_op = {} num_ops = len(block.ops) diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py index 9e850da2b2fbc2..cdb612aea2c0ff 100644 --- a/python/paddle/distribution/transform.py +++ b/python/paddle/distribution/transform.py @@ -595,9 +595,9 @@ def _domain(self): # Suppose the dimensions of input tensor is N, and chain [t0,...ti,...tm], # ti(in) denotes ti.domain.event_rank, ti(out) denotes ti.codomain.event_rank, # delta(ti) denotes (ti(out) - ti(in)). - # For transform ti, N shoud satisfy the constraint: + # For transform ti, N should satisfy the constraint: # N + delta(t0) + delta(t1)...delta(t(i-1)) >= ti(in) - # So, for all transform in chain, N shoud satisfy follow constraints: + # So, for all transform in chain, N should satisfy follow constraints: # t0: N >= t0(in) # t1: N >= t1(in) - delta(t0) # ... diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 5f83985c6d273b..d0cbbb28c81233 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -61,7 +61,7 @@ def _check_normalization(norm): def _check_fft_n(n): if not isinstance(n, int): raise ValueError( - f"Invalid FFT argument n({n}), it shoule be an integer." + f"Invalid FFT argument n({n}), it should be an integer." ) if n <= 0: raise ValueError(f"Invalid FFT argument n({n}), it should be positive.") @@ -71,7 +71,7 @@ def _check_fft_shape(x, s): ndim = x.ndim if not isinstance(s, Sequence): raise ValueError( - "Invaid FFT argument s({}), it should be a sequence of integers." + "Invalid FFT argument s({}), it should be a sequence of integers." ) if len(s) > ndim: @@ -87,7 +87,7 @@ def _check_fft_shape(x, s): def _check_fft_axis(x, axis): ndim = x.ndim if not isinstance(axis, int): - raise ValueError(f"Invalid FFT axis ({axis}), it shoule be an integer.") + raise ValueError(f"Invalid FFT axis ({axis}), it should be an integer.") if axis < -ndim or axis >= ndim: raise ValueError( f"Invalid FFT axis ({axis}), it should be in range [-{ndim}, {ndim})" @@ -177,7 +177,7 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". 
Default is "backward", meaning no normalization on the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the ``1/n`` factor on the forward transform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -240,7 +240,7 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the ``1/n`` factor on the forward transform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -300,7 +300,7 @@ def rfft(x, n=None, axis=-1, norm="backward", name=None): - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. + - "ortho": The factor of forward direction and backward direction are both ``1/sqrt(n)``. Where ``n`` is the multiplication of each element in ``s`` . name(str, optional): The default value is None. Normally there is no @@ -496,7 +496,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the ``1/n`` factor on the forward transform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -573,7 +573,7 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the ``1/n`` factor on the forward transform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -649,7 +649,7 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. 
+ - "ortho": The factor of forward direction and backward direction are both ``1/sqrt(n)``. Where ``n`` is the multiplication of each element in ``s`` . name(str, optional): The default value is None. Normally there is no @@ -724,7 +724,7 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. + - "ortho": The factor of forward direction and backward direction are both ``1/sqrt(n)``. Where ``n`` is the multiplication of each element in ``s`` . name (str, optional): The default value is None. Normally there is no need for user to set @@ -1003,7 +1003,7 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. + - "ortho": The factor of forward direction and backward direction are both ``1/sqrt(n)``. Where ``n`` is the multiplication of each element in ``s`` . name(str, optional): The default value is None. Normally there is no @@ -1060,7 +1060,7 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. + - "ortho": The factor of forward direction and backward direction are both ``1/sqrt(n)``. Where ``n`` is the multiplication of each element in ``s`` . name (str, optional): The default value is None. Normally there is no need for user to set @@ -1151,7 +1151,7 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x(Tensor): Input tensor. s(Sequence[int], optional): Shape of the real input to the inverse FFT. - axes(Sequance[int], optional): The axes over which to compute the + axes(Sequence[int], optional): The axes over which to compute the inverse fft. Default is the last two axes. norm(str, optional): {"backward", "ortho", "forward"}. Default is "backward". 
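
The fft docstrings above all describe the same ``norm`` contract. As a quick numerical cross-check of that contract, a minimal sketch (illustrative only, not part of this patch; variable names are ours):

import numpy as np
import paddle

x = paddle.rand([8], dtype="float32")
n = x.shape[0]

# The three conventions differ only by a constant scale factor:
# "backward" leaves the forward transform unscaled, "forward" applies
# 1/n to it, and "ortho" applies 1/sqrt(n) in both directions.
backward = paddle.fft.fft(x, norm="backward").numpy()
forward = paddle.fft.fft(x, norm="forward").numpy()
ortho = paddle.fft.fft(x, norm="ortho").numpy()

np.testing.assert_allclose(forward, backward / n, rtol=1e-5)
np.testing.assert_allclose(ortho, backward / np.sqrt(n), rtol=1e-5)
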
diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 9395de8566c2e9..b9b87663d9f0be 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -172,14 +172,14 @@ def set_rng_state(state_list, device=None): if isinstance(place, paddle.CUDAPlace): if not len(state_list) == core.get_cuda_device_count(): raise ValueError( - "Length of gpu state list shoule be equal to the gpu device count" + "Length of gpu state list should be equal to the gpu device count" ) for i in range(core.get_cuda_device_count()): core.default_cuda_generator(i).set_state(state_list[i]) elif isinstance(place, paddle.XPUPlace): if not len(state_list) == core.get_xpu_device_count(): raise ValueError( - "Length of xpu state list shoule be equal to the xpu device count" + "Length of xpu state list should be equal to the xpu device count" ) for i in range(core.get_xpu_device_count()): core.default_xpu_generator(i).set_state(state_list[i]) @@ -192,7 +192,7 @@ def set_rng_state(state_list, device=None): ) if not len(state_list) == dev_cnt: raise ValueError( - f"Length of custom device state list shoule be equal to the {place.get_dtype_type()} device count" + f"Length of custom device state list should be equal to the {place.get_dtype_type()} device count" ) for i in range(dev_cnt): core.default_custom_device_generator( @@ -200,7 +200,7 @@ def set_rng_state(state_list, device=None): ).set_state(state_list[i]) elif isinstance(place, core.CPUPlace): if not len(state_list) == 1: - raise ValueError("Length of cpu state list shoule be equal to 1") + raise ValueError("Length of cpu state list should be equal to 1") core.default_cpu_generator().set_state(state_list[0]) else: raise ValueError( @@ -230,7 +230,7 @@ def set_cuda_rng_state(state_list): if paddle.is_compiled_with_cuda(): if not len(state_list) == core.get_cuda_device_count(): raise ValueError( - "Length of cuda state list shoule be equal to the cuda device count" + "Length of cuda state list should be equal to the cuda device count" ) for i in range(core.get_cuda_device_count()): core.default_cuda_generator(i).set_state(state_list[i]) @@ -240,7 +240,7 @@ def _manual_program_seed(seed): """ Sets global seed for generating random numbers. - NOTE(zhiqiu): This is the original implemention of seed. Keeps it temporally + NOTE(zhiqiu): This is the original implementation of seed. Keeps it temporally since CUDA generator is not developed, so we need it in the unittest. 
Args: diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py index c39fa57ad56816..254e3a833f0db1 100644 --- a/python/paddle/hapi/hub.py +++ b/python/paddle/hapi/hub.py @@ -122,8 +122,8 @@ def _get_cache_or_reload(repo, force_reload, verbose=True, source='github'): shutil.move(fpath, cached_file) with zipfile.ZipFile(cached_file) as cached_zipfile: - extraced_repo_name = cached_zipfile.infolist()[0].filename - extracted_repo = os.path.join(hub_dir, extraced_repo_name) + extracted_repo_name = cached_zipfile.infolist()[0].filename + extracted_repo = os.path.join(hub_dir, extracted_repo_name) _remove_if_exists(extracted_repo) # Unzip the code and rename the base folder cached_zipfile.extractall(hub_dir) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index d8e408703db421..866901b840a31b 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -1359,7 +1359,7 @@ def save(self, path, training=True): """ This function saves parameters, optimizer information or model and - paramters only for inference to path. It depends on the parameter + parameters only for inference to path. It depends on the parameter `training`. If `training` is set to True, the parameters saved contain all diff --git a/python/paddle/hapi/static_flops.py b/python/paddle/hapi/static_flops.py index 5b67049fc4e081..6d9209ae76a4a9 100644 --- a/python/paddle/hapi/static_flops.py +++ b/python/paddle/hapi/static_flops.py @@ -36,7 +36,7 @@ def name(self): def shape(self): """ - Get the shape of the varibale. + Get the shape of the variable. """ return self._var.shape diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py index fbf054b05808a6..9490a10e1ec8d0 100644 --- a/python/paddle/incubate/autograd/functional.py +++ b/python/paddle/incubate/autograd/functional.py @@ -666,7 +666,7 @@ def _check_inputs(func, xs, v=None): if isinstance(xs, typing.Sequence) and not all( isinstance(x, framework.Variable) for x in xs ): - raise TypeError("All elements of 'xs' shoule be Tensor.") + raise TypeError("All elements of 'xs' should be Tensor.") if not isinstance(v, (framework.Variable, typing.Sequence, type(None))): raise TypeError( @@ -676,7 +676,7 @@ def _check_inputs(func, xs, v=None): if isinstance(v, typing.Sequence) and not all( isinstance(e, framework.Variable) for e in v ): - raise TypeError("All elements of 'xs' shoule be Tensor.") + raise TypeError("All elements of 'xs' should be Tensor.") def _check_v_shape(v, refs): diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 49c6876640a710..892010e28bac03 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -149,7 +149,7 @@ def add_rec(self, key_vars, value_vars): self.tab[id(key_vars)] = id(value_vars) else: assert len(key_vars) == len(value_vars), ( - f'len(key_vars) shoule be equal to len(value_vars), ' + f'len(key_vars) should be equal to len(value_vars), ' f'but len(key_vars)={len(key_vars)} and len(value_vars)={len(value_vars)}.' 
) for key_var, value_var in zip(key_vars, value_vars): diff --git a/python/paddle/incubate/framework/random.py b/python/paddle/incubate/framework/random.py index 32388dbd015466..49d458195c7673 100644 --- a/python/paddle/incubate/framework/random.py +++ b/python/paddle/incubate/framework/random.py @@ -117,14 +117,14 @@ def set_state(generator, state): if isinstance(place, core.CUDAPlace): if not len(state_list) == core.get_cuda_device_count(): raise ValueError( - "Length of gpu state list shoule be equal to the gpu device count" + "Length of gpu state list should be equal to the gpu device count" ) for i in range(core.get_cuda_device_count()): set_state(core.default_cuda_generator(i), state_list[i]) elif isinstance(place, core.XPUPlace): if not len(state_list) == core.get_xpu_device_count(): raise ValueError( - "Length of xpu state list shoule be equal to the xpu device count" + "Length of xpu state list should be equal to the xpu device count" ) for i in range(core.get_xpu_device_count()): set_state(core.default_xpu_generator(i), state_list[i]) @@ -137,7 +137,7 @@ def set_state(generator, state): ) if not len(state_list) == dev_cnt: raise ValueError( - f"Length of custom device state list shoule be equal to the {place.get_dtype_type()} device count" + f"Length of custom device state list should be equal to the {place.get_dtype_type()} device count" ) for i in range(dev_cnt): set_state( @@ -148,7 +148,7 @@ def set_state(generator, state): ) elif isinstance(place, core.CPUPlace): if not len(state_list) == 1: - raise ValueError("Length of cpu state list shoule be equal to 1") + raise ValueError("Length of cpu state list should be equal to 1") set_state(core.default_cpu_generator(), state_list[0]) else: raise ValueError( @@ -202,7 +202,7 @@ def register_rng_state_as_index(state_list=None, device=None): if isinstance(place, core.CUDAPlace): if not len(state_list) == core.get_cuda_device_count(): raise ValueError( - "Length of gpu state list shoule be equal to the gpu device count" + "Length of gpu state list should be equal to the gpu device count" ) for i in range(core.get_cuda_device_count()): new_state_index_list.append( @@ -213,7 +213,7 @@ def register_rng_state_as_index(state_list=None, device=None): elif isinstance(place, core.XPUPlace): if not len(state_list) == core.get_xpu_device_count(): raise ValueError( - "Length of xpu state list shoule be equal to the xpu device count" + "Length of xpu state list should be equal to the xpu device count" ) for i in range(core.get_xpu_device_count()): new_state_index_list.append( @@ -230,7 +230,7 @@ def register_rng_state_as_index(state_list=None, device=None): ) if not len(state_list) == dev_cnt: raise ValueError( - f"Length of custom device state list shoule be equal to the {place.get_dtype_type()} device count" + f"Length of custom device state list should be equal to the {place.get_dtype_type()} device count" ) for i in range(dev_cnt): new_state_index_list.append( @@ -240,7 +240,7 @@ def register_rng_state_as_index(state_list=None, device=None): ) elif isinstance(place, core.CPUPlace): if not len(state_list) == 1: - raise ValueError("Length of cpu state list shoule be equal to 1") + raise ValueError("Length of cpu state list should be equal to 1") new_state_index_list.append( core.default_cpu_generator().register_state_index(state_list[0]) ) diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 84719c3eee7928..808b1de5f08d07 100644 --- a/python/paddle/jit/dy2static/partial_program.py 
+++ b/python/paddle/jit/dy2static/partial_program.py @@ -842,7 +842,7 @@ def _get_forward_backward_program_form( self._outputs.var_ids ) backward_end_op_index = whole_program.desc.block(0).op_size() - # For Backward process in CINN, all param@GRAD shoule be skipped for GC, because + # For Backward process in CINN, all param@GRAD should be skipped for GC, because # they will be shared in scope and used by optimizer. backward_skip_vars = self._parse_skip_gc_vars( whole_program diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index 655eb7317dccc5..55177f0601b9fc 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -339,7 +339,7 @@ class EmptyCode: def validate_value(value): assert isinstance( value, VariableBase - ), f"value: {value}, type shoule be VariableBase(or derived), but get {type(value)}" + ), f"value: {value}, type should be VariableBase(or derived), but get {type(value)}" assert not isinstance(value.tracker, DanglingTracker) or isinstance( value, (NullVariable, CellVariable) ), f"dangling variable {value} should not be pushed into stack." diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index e121348bf5a41e..0e09f4aabb529c 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1240,9 +1240,9 @@ def full(shape, fill_value, dtype=None, name=None): If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. fill_value(bool|float|int|Tensor): The constant value used to initialize the Tensor to be created. - If ``fill_value`` is an Tensor, it shoule be an 0-D Tensor which represents a scalar. + If ``fill_value`` is an Tensor, it should be an 0-D Tensor which represents a scalar. dtype(np.dtype|str, optional): Data type of the output Tensor - which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data + which can be float16, float32, float64, int32, int64, if dtype is `None`, the data type of created Tensor is `float32`. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. diff --git a/test/legacy_test/test_prod_op.py b/test/legacy_test/test_prod_op.py index 7a69a840c393dd..d7f8b4050caf66 100644 --- a/test/legacy_test/test_prod_op.py +++ b/test/legacy_test/test_prod_op.py @@ -159,13 +159,13 @@ def test_error(self): bool_x = paddle.static.data( name='bool_x', shape=[2, 2, 4], dtype='bool' ) - # The argument x shoule be a Tensor + # The argument x should be a Tensor self.assertRaises(TypeError, paddle.prod, [1]) # The data type of x should be float32, float64, int32, int64 self.assertRaises(TypeError, paddle.prod, bool_x) - # The argument axis's type shoule be int ,list or tuple + # The argument axis's type should be int ,list or tuple self.assertRaises(TypeError, paddle.prod, x, 1.5) # The argument dtype of prod_op should be float32, float64, int32 or int64. 
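
The same argument checks are asserted again for XPU below. For reference, the conventions they pin down look like this in dygraph; a small illustrative sketch, not taken from either test file:

import paddle

x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])

# x must be a Tensor, and axis may be an int, a list, or a tuple.
print(paddle.prod(x))                              # 24., all elements
print(paddle.prod(x, axis=0))                      # [3., 8.]
print(paddle.prod(x, axis=[0, 1]))                 # 24.
print(paddle.prod(x, axis=1, keepdim=True).shape)  # [2, 1]
# Anything else (e.g. a float axis) raises TypeError, as the tests assert.
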
diff --git a/test/xpu/test_prod_op_xpu.py b/test/xpu/test_prod_op_xpu.py index eb720a5e0e73ed..160e1022209d65 100644 --- a/test/xpu/test_prod_op_xpu.py +++ b/test/xpu/test_prod_op_xpu.py @@ -148,13 +148,13 @@ def test_error(self): bool_x = paddle.static.data( name='bool_x', shape=[2, 2, 4], dtype='bool' ) - # The argument x shoule be a Tensor + # The argument x should be a Tensor self.assertRaises(TypeError, paddle.prod, [1]) # The data type of x should be float32, float64, int32, int64 self.assertRaises(TypeError, paddle.prod, bool_x) - # The argument axis's type shoule be int ,list or tuple + # The argument axis's type should be int ,list or tuple self.assertRaises(TypeError, paddle.prod, x, 1.5) # The argument dtype of prod_op should be float32, float64, int32 or int64. From f31985185dbb21a8587bac28da88788f09c50d78 Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Tue, 23 Jan 2024 14:55:29 +0800 Subject: [PATCH 29/34] [SOT] fix sot export bugs (#60707) --- .../executor/function_graph.py | 28 +-- .../executor/variables/callable.py | 2 +- .../paddle/jit/sot/symbolic/compile_cache.py | 4 +- python/paddle/jit/sot/symbolic/export.py | 172 +++++++++++------- .../paddle/jit/sot/symbolic/statement_ir.py | 4 +- test/sot/test_sot_export.py | 5 +- 6 files changed, 134 insertions(+), 81 deletions(-) diff --git a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py index 7af0e67ff7fb52..37c9f16bd08a34 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py +++ b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py @@ -108,16 +108,23 @@ def func(x): return map_variables(func, inputs) -def get_symbol_meta_map(inputs): - output = {} +def record_symbols(SIR, *args, **kwargs): + symbol_meta_map = {} + params = set() + non_params = set() - def func(x): - if isinstance(x, TensorVariable): - output[x.get_symbol()] = x.meta - return x + def fn(value): + if isinstance(value, TensorVariable): + symbol_meta_map[value.get_symbol()] = value.meta + if isinstance(value, ParameterVariable): + params.add(value.get_symbol()) + else: + non_params.add(value.get_symbol()) + return value - map_variables(func, inputs) - return output + map_variables(fn, [args, kwargs]) + SIR.set_symbol_meta_map(symbol_meta_map) + SIR.set_parameter_info(params, non_params) def get_params_and_non_param_symbol(*args, **kwargs): @@ -573,10 +580,7 @@ def symbolic_call(self, infer_meta_fn, compute_fn, func, *args, **kwargs): convert_to_symbol(kwargs), ) - self.sir_ctx.TOS.set_symbol_meta_map(get_symbol_meta_map(args)) - self.sir_ctx.TOS.set_symbol_meta_map(get_symbol_meta_map(kwargs)) - params, non_params = get_params_and_non_param_symbol(*args, **kwargs) - self.sir_ctx.TOS.set_parameter_info(params, non_params) + record_symbols(self.sir_ctx.TOS, *args, **kwargs) log(3, f" inputs : {inputs_symbols}", "\n") diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py b/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py index 6723ebf66db62c..6ba6740c770c91 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py @@ -769,7 +769,7 @@ def call_function(self, /, *args, **kwargs): assert self.check_no_weight_and_buffers( new_layer ), "You have created a layer in to_static function which may have Potential bugs. please create it in __init__/main function." 
- return PaddleLayerVariable( + return VariableFactory.from_value( new_layer, self.graph, CreateLayerTracker(self, args, kwargs) ) diff --git a/python/paddle/jit/sot/symbolic/compile_cache.py b/python/paddle/jit/sot/symbolic/compile_cache.py index f8ff14b3369076..93eb89a6a98f82 100644 --- a/python/paddle/jit/sot/symbolic/compile_cache.py +++ b/python/paddle/jit/sot/symbolic/compile_cache.py @@ -65,6 +65,7 @@ def __init__(self, compiled_fn, SIR, is_training: bool): self.concrete_program = None self.SIR = SIR # for debug self.is_training = is_training + self.exported = False def amp_cast_inputs(self, args, kwargs): """Prepare inputs for amp, cast float16 into float32 if needed.""" @@ -145,8 +146,9 @@ def __call__(self, *args, **kwargs): 4, lambda: print("[CompileCache] run sir forward success."), ) - if ENV_SOT_EXPORT.get() != "": + if ENV_SOT_EXPORT.get() != "" and not self.exported: export(self.SIR, ENV_SOT_EXPORT.get()) + self.exported = True return outputs diff --git a/python/paddle/jit/sot/symbolic/export.py b/python/paddle/jit/sot/symbolic/export.py index b65abf0d10584a..56e6d92a18b2e5 100644 --- a/python/paddle/jit/sot/symbolic/export.py +++ b/python/paddle/jit/sot/symbolic/export.py @@ -15,6 +15,7 @@ import os from itertools import chain +import paddle from paddle.utils import flatten from ..utils import ConstTypes, ExportError, NameGenerator @@ -43,14 +44,74 @@ def __str__(self): return "\n".join(self.get_lines()) +class NameGener: + def __init__(self, SIR): + self.SIR = SIR + self.name_map = {} + self.param_name_generator = NameGenerator("self.parameter_") + self.non_param_name_generator = NameGenerator("var_") + + def __call__(self, var): + return self.get_str(var) + + def get_str(self, var): + if isinstance(var, list): + return self.get_list_str(var) + elif isinstance(var, tuple): + return self.get_tuple_str(var) + elif isinstance(var, dict): + return self.get_dict_str(var) + elif isinstance(var, set): + return self.get_set_str(var) + else: + return self.get_obj_str(var) + + def get_list_str(self, list_): + return "[{}]".format(", ".join(self.get_str(var) for var in list_)) + + def get_tuple_str(self, tuple_): + return "({},)".format(", ".join(self.get_str(var) for var in tuple_)) + + def get_dict_str(self, dict_): + return "{{{},}}".format( + ", ".join( + f"{self.get_str(k)}: {self.get_str(v)}" + for k, v in dict_.items() + ) + ) + + def get_set_str(self, set_): + return "{{{},}}".format(", ".join(self.get_str(var) for var in set_)) + + def get_obj_str(self, var): + if isinstance(var, Symbol): + if var not in self.name_map: + self.register_symbol(var) + return self.name_map[var] + + elif isinstance(var, str): + return f"'{var}'" + else: + return str(var) + + def register_symbol(self, symbol): + if symbol in self.SIR.param_symbol: + name = self.param_name_generator.next() + else: + name = self.non_param_name_generator.next() + self.name_map[symbol] = name + + class PyFileGen: def __init__(self, SIR): self.SIR = SIR self.roots = [] - self.layer_name_map = {} - self.layer_name_generator = NameGenerator("_") - self.SIR_name = SIR.name.replace("_", "") + self.name_gener = NameGener(self.SIR) + + self.SIR_sig = "||".join( + f"{stmt.type}:{stmt.name}" for stmt in SIR.statements + ) def new_root(self, *args): stmt = PyStatement(*args) @@ -93,21 +154,22 @@ def check_exportable(self): def create_header(self): self.new_root( + f"# {self.SIR_sig}", "import paddle", "import unittest", "import numpy as np", ) def create_layer(self): - layer_class = self.new_root(f"class 
{self.SIR_name}(paddle.nn.Layer):") + layer_class = self.new_root("class LayerCase(paddle.nn.Layer):") init_fn = layer_class.add_sub("def __init__(self):") init_fn.add_sub("super().__init__()") for param in self.SIR.param_symbol: - meta = self.SIR.symbol_meta_map[param.name] + meta = self.SIR.symbol_meta_map[param] init_fn.add_sub( - f"self.{param.name} = self.create_parameter(", + f"{self.name_gener(param)} = self.create_parameter(", f" shape={meta.shape},", f" dtype={meta.dtype},", ")", @@ -116,20 +178,16 @@ def create_layer(self): for stmt in self.SIR.statements: if stmt.type == "layer": layer = stmt.layer() - if id(layer) not in self.layer_name_map: - layer_name = ( - layer.__class__.__name__ - + self.layer_name_generator.next() - ) - self.layer_name_map[id(layer)] = layer_name - init_fn.add_sub(self.init_sub_layer(layer, layer_name)) + init_fn.add_sub(self.init_sub_layer(layer)) forward_definition = ["def forward(", " self,"] for inp in self.SIR.inputs: if inp in self.SIR.non_param_symbol: - meta = self.SIR.symbol_meta_map[inp.name] - forward_definition.append(f" {inp.name}, # {str(meta)}") + meta = self.SIR.symbol_meta_map[inp] + forward_definition.append( + f" {self.name_gener(inp)}, # {str(meta)}" + ) forward_definition.append("):") forward_fn = layer_class.add_sub(*forward_definition) @@ -139,14 +197,12 @@ def create_layer(self): forward_fn.add_sub( "return {}".format( - ", ".join(self.true_name(out) for out in self.SIR.outputs) + ", ".join(self.name_gener(out) for out in self.SIR.outputs) ) ) def create_test(self): - test_class = self.new_root( - f"class Test{self.SIR_name}(unittest.TestCase):" - ) + test_class = self.new_root("class TestLayer(unittest.TestCase):") setup = test_class.add_sub("def setUp(self):") test_inputs = [ @@ -155,16 +211,29 @@ def create_test(self): for inp in self.SIR.inputs: if inp in self.SIR.non_param_symbol: meta = self.SIR.symbol_meta_map[inp.name] - test_inputs.append( - f" paddle.rand(shape={meta.shape}, dtype={meta.dtype})," - ) + shape_str = "[1]" if len(meta.shape) == 0 else str(meta.shape) + if meta.dtype in ( + paddle.int8, + paddle.int16, + paddle.int32, + paddle.int64, + ): + test_inputs.append( + f" paddle.randint(low=0, high=10, shape={shape_str}, dtype={meta.dtype})," + ) + else: + test_inputs.append( + f" paddle.rand(shape={shape_str}, dtype={meta.dtype})," + ) test_inputs.append(")") setup.add_sub(*test_inputs) + setup.add_sub("self.net = LayerCase()") train = test_class.add_sub( - "def train(self, net, to_static, with_cinn=False):" + "def train(self, net, to_static, with_prim=False, with_cinn=False):" ) train.add_sub( + "paddle.set_flags({'FLAGS_prim_all': with_prim})", "if to_static:", " if with_cinn:", " build_strategy = paddle.static.BuildStrategy()", @@ -176,24 +245,14 @@ def create_test(self): "return outs", ) - test_ast_static = test_class.add_sub("def test_ast_static(self):") - test_ast_static.add_sub( - "net = SIR0()", - "dy_out = self.train(net, to_static=False)", - "st_out = self.train(net, to_static=True, with_cinn=False)", - "for dy, st in zip(paddle.utils.flatten(dy_out), paddle.utils.flatten(st_out)):", - " np.testing.assert_allclose(dy.numpy(), st.numpy(), atol=1e-8)", - ) - test_ast_cinn_static = test_class.add_sub( - "def test_ast_cinn_static(self):" + "def test_ast_prim_cinn(self):" ) test_ast_cinn_static.add_sub( - "net = SIR0()", - "dy_out = self.train(net, to_static=False)", - "st_out = self.train(net, to_static=True, with_cinn=True)", - "for dy, st in zip(paddle.utils.flatten(dy_out), 
paddle.utils.flatten(st_out)):", - " np.testing.assert_allclose(dy.numpy(), st.numpy(), atol=1e-8)", + "st_out = self.train(self.net, to_static=True)", + "cinn_out = self.train(self.net, to_static=True, with_prim=True, with_cinn=True)", + "for st, cinn in zip(paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out)):", + " np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8)", ) def create_tail(self): @@ -202,15 +261,6 @@ def create_tail(self): " unittest.main()", ) - def true_name(self, var): - if isinstance(var, Symbol): - if var in self.SIR.param_symbol: - return "self." + var.name - else: - return var.name - else: - return str(var) - def init_sub_layer(self, layer, layer_name): # TODO @wuzhanfei need more effecient way to create a sub layer # now, we just close call_Layer behavior @@ -219,8 +269,8 @@ def init_sub_layer(self, layer, layer_name): def create_input_string(self, args, kwargs): return ", ".join( chain( - (self.true_name(arg) for arg in args), - (f"{k}={self.true_name(v)}" for k, v in kwargs.items()), + (self.name_gener(arg) for arg in args), + (f"{k}={self.name_gener(v)}" for k, v in kwargs.items()), ) ) @@ -234,7 +284,7 @@ def search(outputs, path, result): elif isinstance(outputs, dict): search_dict(outputs, path, result) elif isinstance(outputs, Symbol): - result.append(self.true_name(outputs) + " = " + "".join(path)) + result.append(self.name_gener(outputs) + " = " + "".join(path)) def search_sequnce(outputs, path, result): for idx, out in enumerate(outputs): @@ -260,7 +310,7 @@ def create_api_stmt(self, stmt): api = stmt.api api_str = api.__module__ + "." + api.__name__ if isinstance(stmt.outputs, Symbol): - return [f"{stmt.outputs.name} = {api_str}({input_str})"] + return [f"{self.name_gener(stmt.outputs)} = {api_str}({input_str})"] else: compute_code = f"out = {api_str}({input_str})" unpack_codes = self.create_unpack_output_string(stmt.outputs) @@ -269,32 +319,23 @@ def create_api_stmt(self, stmt): def create_method_stmt(self, stmt): args, kwargs = stmt.inputs input_str = self.create_input_string(args[1:], kwargs) - method_str = args[0].name + "." + stmt.method + method_str = self.name_gener(args[0]) + "." + stmt.method if isinstance(stmt.outputs, Symbol): - return [f"{stmt.outputs.name} = {method_str}({input_str})"] + return [ + f"{self.name_gener(stmt.outputs)} = {method_str}({input_str})" + ] else: compute_code = f"out = {method_str}({input_str})" unpack_codes = self.create_unpack_output_string(stmt.outputs) return [compute_code] + unpack_codes - def create_layer_stmt(self, stmt): - args, kwargs = stmt.inputs - input_str = self.create_input_string(args, kwargs) - layer_str = "self." 
+ self.layer_name_map[id(stmt.layer())] - if isinstance(stmt.outputs, Symbol): - return [f"{stmt.outputs.name} = {layer_str}({input_str})"] - else: - compute_code = f"out = {layer_str}({input_str})" - unpack_codes = self.create_unpack_output_string(stmt.outputs) - return [compute_code] + unpack_codes - def export(SIR, path): try: pygen = PyFileGen(SIR) string = pygen.gen_py_codes() except ExportError as e: - print("[SOT] Export SIR Failed:", e) + print(f"[SOT] Export {SIR.name} Failed:", e) return if not os.path.exists(path): @@ -302,3 +343,4 @@ def export(SIR, path): with open(os.path.join(path, f"{SIR.name}.py"), "w") as f: f.write(string) + print(f"[SOT] Export {SIR.name} Sucess with size {len(SIR.statements)}") diff --git a/python/paddle/jit/sot/symbolic/statement_ir.py b/python/paddle/jit/sot/symbolic/statement_ir.py index 01f59428ab3829..8ae4eb5be114ac 100644 --- a/python/paddle/jit/sot/symbolic/statement_ir.py +++ b/python/paddle/jit/sot/symbolic/statement_ir.py @@ -135,7 +135,7 @@ def __init__( stacks: list[str], ): super().__init__( - "api", "paddle." + api.__name__, inputs, outputs, stacks + "api", api.__module__ + "." + api.__name__, inputs, outputs, stacks ) self.api = api @@ -228,6 +228,8 @@ def __deepcopy__(self, memo=None): new_sir.outputs = list(self.outputs) new_sir.statements = list(self.statements) new_sir.symbol_meta_map = dict(self.symbol_meta_map.items()) + new_sir.param_symbol = set(self.param_symbol) + new_sir.non_param_symbol = set(self.non_param_symbol) return new_sir def set_parameter_info(self, params, non_params): diff --git a/test/sot/test_sot_export.py b/test/sot/test_sot_export.py index 2ad865ee53cec7..3269953dacd1e4 100644 --- a/test/sot/test_sot_export.py +++ b/test/sot/test_sot_export.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import unittest import paddle -from paddle.jit.sot.utils import with_export_guard +from paddle.jit.sot.utils import min_graph_size_guard, with_export_guard class Net(paddle.nn.Layer): @@ -35,11 +36,13 @@ def forward(self, x): class TestSotExport(unittest.TestCase): + @min_graph_size_guard(0) @with_export_guard("/tmp") def test_basic(self): net = Net() x = paddle.to_tensor([2, 3], dtype="float32", stop_gradient=True) y = paddle.jit.to_static(net)(x) + assert os.path.exists("/tmp/SIR_0.py") if __name__ == "__main__": From 47fca844056da1e1c4b662a9699ea540850c4bae Mon Sep 17 00:00:00 2001 From: JYChen Date: Tue, 23 Jan 2024 16:03:51 +0800 Subject: [PATCH 30/34] fast pass for bool setitem (#61021) * fast pass for bool setitem * fix 0-size value case --- paddle/fluid/pybind/eager_method.cc | 109 +++++++++++++++------------ python/paddle/base/variable_index.py | 39 +++++++--- 2 files changed, 88 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 516708790ffcfd..a6026a4038f378 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1725,60 +1725,71 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // Release gil and do tracing py::gil_scoped_release release; - - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "index_put"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "index_put"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "index_put"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "index_put"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - if (value_tensor.dims().size() > 1 && pos_of_new_dim != 0) { - value_tensor = transpose_ad_func(value_tensor, trans_dim); - } - - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor( - &mesh, self->tensor, transed_sub_tensor, value_tensor)) { - ConvertAllInputsToDistTensor( - mesh, self->tensor, transed_sub_tensor, value_tensor); - } - - transed_sub_tensor = - index_put__ad_func(transed_sub_tensor, transed_index, value_tensor); + if (value_tensor.dims().size() > 1 && pos_of_new_dim != 0) { + value_tensor = transpose_ad_func(value_tensor, trans_dim); + } - // TODO(zoooo0820) Remove following code after backward bug fixed. 
- if (out_is_view) { - paddle::Tensor transback_sub_tensor = - transpose_ad_func(transed_sub_tensor, trans_back_dim); + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor( + &mesh, self->tensor, transed_sub_tensor, value_tensor)) { + ConvertAllInputsToDistTensor( + mesh, self->tensor, transed_sub_tensor, value_tensor); + } - self->tensor = set_value_with_tensor__ad_func(self->tensor, - transback_sub_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - } - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + if (transed_index.size() == 1 && + transed_index[0].dtype() == phi::DataType::BOOL && + transed_index[0].shape().size() == self->tensor.shape().size()) { + if (value_tensor.shape() != self->tensor.shape()) { + value_tensor = expand_ad_func(value_tensor, self->tensor.shape()); + } + transed_sub_tensor = + where__ad_func(logical_not_ad_func(transed_index[0]), + transed_sub_tensor, + value_tensor); + } else { + transed_sub_tensor = + index_put__ad_func(transed_sub_tensor, transed_index, value_tensor); + } + // TODO(zoooo0820) Remove following code after backward bug fixed. + if (out_is_view) { + paddle::Tensor transback_sub_tensor = + transpose_ad_func(transed_sub_tensor, trans_back_dim); + + self->tensor = set_value_with_tensor__ad_func(self->tensor, + transback_sub_tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes); + } + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. + if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } } } } diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index 83b0ad108bd318..c1f6ea09900e25 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -113,7 +113,6 @@ def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags): attrs[attr_name] = attr -# the item is a tensor of bool def get_value_for_bool_tensor(var, item): if len(item.shape) > len(var.shape): raise IndexError( @@ -625,12 +624,31 @@ def _setitem_static(x, indices, values): values = values.astype(transed_sub_tensor.dtype) if paddle.in_dynamic_mode(): - # NOTE(zoooo0820): directly return result instead of another set_value, after backward bug fixed. 
- transed_sub_tensor = transed_sub_tensor.index_put_(
- adjusted_advanced_index, values
- )
- if not is_view:
- return transed_sub_tensor
+ if (
+ len(adjusted_advanced_index) == 1
+ and adjusted_advanced_index[0].dtype
+ in (paddle.bool, paddle.base.libpaddle.BOOL)
+ and len(
+ adjusted_advanced_index[0].shape
+ )
+ == len(transed_sub_tensor.shape)
+ ):
+ if values.shape != transed_sub_tensor.shape:
+ values = values.expand(transed_sub_tensor.shape)
+ transed_sub_tensor = paddle._C_ops.where_(
+ paddle.logical_not(adjusted_advanced_index[0]),
+ transed_sub_tensor,
+ values,
+ )
+ if not is_view:
+ return x
+ else:
+ # NOTE(zoooo0820): directly return result instead of another set_value, after backward bug fixed.
+ transed_sub_tensor = transed_sub_tensor.index_put_(
+ adjusted_advanced_index, values
+ )
+ if not is_view:
+ return x
 else:
 transed_sub_tensor = transed_sub_tensor.index_put(
 adjusted_advanced_index, values
@@ -845,10 +863,9 @@ def _getitem_static(x, indices):
 ) = deal_advanced_index(out, advanced_index, False, None)

 # TODO(zooooo0820): Replacing gather_nd to another advanded OP for handling of mixed indexes more efficiently
- if (
- len(adjusted_advanced_index) == 1
- and adjusted_advanced_index[0].dtype == paddle.bool
- ):
+ if len(adjusted_advanced_index) == 1 and adjusted_advanced_index[
+ 0
+ ].dtype in (paddle.bool, paddle.base.libpaddle.BOOL):
 # Note: now slice not support 0-size Tensor, so only one bool tensor can return empty 0-size.
 out = get_value_for_bool_tensor(
 transed_tensor, adjusted_advanced_index[0]
From 865b905311d5470af6277a7b83d913bb6c05fadf Mon Sep 17 00:00:00 2001
From: feifei-111 <2364819892@qq.com>
Date: Tue, 23 Jan 2024 16:10:07 +0800
Subject: [PATCH 31/34] [SOT] update logic about the judgement of "is None"
 (#60993)

---
 .../executor/variable_dispatch.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py b/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py
index 5b83e8bc1fe1f7..2a15c3836a941a 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py
@@ -782,33 +782,33 @@ def is_not_func(var: VariableBase, other: VariableBase):
 # is None
 Dispatcher.register(
 operator_is_none,
- ("TensorVariable",),
- lambda var: ConstantVariable(False, var.graph, DummyTracker([var])),
+ ("ConstantVariable",),
+ lambda var: BuiltinVariable(operator.is_, var.graph, DanglingTracker())(
+ var, ConstantVariable.wrap_literal(None, var.graph)
+ ),
 )

 # is not None
 Dispatcher.register(
 operator_is_not_none,
- ("TensorVariable",),
- lambda var: ConstantVariable(True, var.graph, DummyTracker([var])),
+ ("ConstantVariable",),
+ lambda var: BuiltinVariable(operator.is_not, var.graph, DanglingTracker())(
+ var, ConstantVariable.wrap_literal(None, var.graph)
+ ),
 )

 # is None
 Dispatcher.register(
 operator_is_none,
 ("VariableBase",),
- lambda var: BuiltinVariable(operator.is_, var.graph, DanglingTracker())(
- var, ConstantVariable.wrap_literal(None, var.graph)
- ),
+ lambda var: ConstantVariable(False, var.graph, DummyTracker([var])),
 )

 # is not None
 Dispatcher.register(
 operator_is_not_none,
 ("VariableBase",),
- lambda var: BuiltinVariable(operator.is_not, var.graph, DanglingTracker())(
- var, ConstantVariable.wrap_literal(None, var.graph)
- ),
+ lambda var: ConstantVariable(True, var.graph, DummyTracker([var])),
 )

From 500e21b749cced928d53d34ac0f40eaa60875e45
Mon Sep 17 00:00:00 2001 From: Tian <121000916+SylarTiaNII@users.noreply.github.com> Date: Tue, 23 Jan 2024 16:10:42 +0800 Subject: [PATCH 32/34] [CustomDevice] enable memory stat for custom device allocator (#61030) --- paddle/fluid/memory/allocation/allocator_facade.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 59ab4eaf154724..6401a82956492a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -1349,7 +1349,7 @@ class AllocatorFacadePrivate { const platform::Place& place = pair.first; if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place) || - platform::is_gpu_place(place)) { + platform::is_gpu_place(place) || platform::is_custom_place(place)) { pair.second = std::make_shared(pair.second); } } From bc4bc09aba1c2efcc7ac81b1c2277dad1494df0b Mon Sep 17 00:00:00 2001 From: xingmingyyj <135400902+xingmingyyj@users.noreply.github.com> Date: Tue, 23 Jan 2024 17:13:50 +0800 Subject: [PATCH 33/34] =?UTF-8?q?=E3=80=90PIR=20OpTest=20Fix=20No.15?= =?UTF-8?q?=E3=80=91=20fix=20=20test=20match=20matrix=20tensor=20op=20(#60?= =?UTF-8?q?277)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix_test_match_matrix_tensor_op * fix * fix * fix * fix * fix * fix --- .../fluid/operators/match_matrix_tensor_op.cc | 87 +++++++++++++++++++ .../pir/dialect/op_generator/ops_api_gen.py | 1 + paddle/fluid/pir/dialect/operator/ir/ops.yaml | 9 ++ .../pir/dialect/operator/ir/ops_backward.yaml | 9 ++ .../fluid/pir/dialect/operator/utils/utils.cc | 2 + paddle/phi/api/yaml/op_compat.yaml | 7 ++ paddle/phi/infermeta/backward.cc | 26 +++++- paddle/phi/infermeta/backward.h | 10 +++ paddle/phi/infermeta/ternary.cc | 73 ++++++++++++++++ paddle/phi/infermeta/ternary.h | 8 ++ test/white_list/pir_op_test_white_list | 1 + 11 files changed, 232 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index 7055f3ca95efe0..dd2dcd548180a4 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -251,6 +251,93 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { auto* tmp = ctx.Output("Tmp"); int dim_t = ctx.Attr("dim_t"); + + const auto& x_lod = x->lod(); + PADDLE_ENFORCE_EQ(x_lod.empty(), + false, + platform::errors::InvalidArgument( + "The Input(X) should hold LoD information, but " + "received Input(X).lod() is empty.")); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), + 2, + platform::errors::InvalidArgument( + "The dimensions of Input(X)'s LoD data should be " + "equal to 2, but received %d.", + x_lod_0.size())); + auto x_dims = x->dims(); + PADDLE_ENFORCE_EQ(x_dims[0], + static_cast(x_lod_0.back()), + platform::errors::InvalidArgument( + "The last element of Input(X)'s LoD data should be " + "equal to the first dimension of Input(X). 
" + "But received the last element of Input(X)'s LoD " + "data is %d, the first dimension of Input(X) is %d.", + x_lod_0.back(), + x_dims[0])); + const auto& y_lod = y->lod(); + PADDLE_ENFORCE_EQ(y_lod.empty(), + false, + platform::errors::InvalidArgument( + "The Input(Y) should hold LoD information, but " + "received Input(Y).lod() is empty.")); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), + 2, + platform::errors::InvalidArgument( + "The dimensions of Input(Y)'s LoD data should be " + "equal to 2, but received %d.", + y_lod_0.size())); + auto y_dims = y->dims(); + PADDLE_ENFORCE_EQ(y_dims[0], + static_cast(y_lod_0.back()), + platform::errors::InvalidArgument( + "The last element of Input(Y)'s LoD data should be " + "equal to the first dimension of Input(Y). " + "But received the last element of Input(Y)'s LoD " + "data is %d, the first dimension of Input(Y) is %d.", + y_lod_0.back(), + y_dims[0])); + + PADDLE_ENFORCE_EQ(x_lod_0.size(), + y_lod_0.size(), + platform::errors::InvalidArgument( + "The dimensions of Input(X)'s and Input(Y)'s LoD " + "data should be equal. " + "But received the dimensions of Input(X)'s LoD is " + "%d, the dimensions of Input(Y)'s LoD is %d.", + x_lod_0.size(), + y_lod_0.size())); + + int64_t out_dim_0 = 0; + int64_t tmp_dim_0 = -1; + for (size_t i = 1; i < x_lod_0.size(); i++) { + int64_t x_len = x_lod_0[i] - x_lod_0[i - 1]; + int64_t y_len = y_lod_0[i] - y_lod_0[i - 1]; + out_dim_0 += (x_len * y_len); + } + out_dim_0 *= dim_t; + + tmp_dim_0 = x_dims[0] * dim_t * x_dims[1]; + std::vector out_dims_vec{out_dim_0}; + out_dims_vec.push_back(1); + std::vector tmp_dims_vec{tmp_dim_0}; + tmp_dims_vec.push_back(1); + + auto& out_meta = out->meta(); + phi::DenseTensorMeta new_out_meta(out_meta.dtype, + common::make_ddim(out_dims_vec), + out_meta.layout, + out_meta.lod); + out->set_meta(new_out_meta); + + auto& tmp_meta = tmp->meta(); + phi::DenseTensorMeta new_tmp_meta(tmp_meta.dtype, + common::make_ddim(tmp_dims_vec), + tmp_meta.layout, + tmp_meta.lod); + tmp->set_meta(new_tmp_meta); + int64_t dim_in = x->dims()[1]; const auto& offset_l = x->lod()[0]; diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index d2a91bdc4eb6f8..bb3917100e8938 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -146,6 +146,7 @@ 'tdm_sampler', 'soft_relu', 'uniform_random_batch_size_like', + 'match_matrix_tensor', 'c_reduce_min', 'c_reduce_min_', 'push_sparse_v2', diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 19dd345eee2f78..e1c15219ebbd0c 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -1574,6 +1574,15 @@ data_type: param optional: master_param, master_param_out +- op: match_matrix_tensor + args: (Tensor x, Tensor y, Tensor w, int dim_t=1) + output: Tensor(out), Tensor(tmp) + infer_meta: + func: MatchMatrixTensorInferMeta + kernel: + func: match_matrix_tensor + backward: match_matrix_tensor_grad + - op: nce args: (Tensor input, Tensor label, Tensor weight, Tensor bias, Tensor sample_weight, Tensor custom_dist_probs, Tensor custom_dist_alias, Tensor custom_dist_alias_probs, int num_total_classes, int[] custom_neg_classes={}, int num_neg_samples=10, int sampler=0, int seed=0, bool is_sparse=false, bool remote_prefetch=false, bool is_test=false) output: Tensor(cost), Tensor(sample_logits), 
diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index d2a91bdc4eb6f8..bb3917100e8938 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -146,6 +146,7 @@
     'tdm_sampler',
     'soft_relu',
     'uniform_random_batch_size_like',
+    'match_matrix_tensor',
     'c_reduce_min',
     'c_reduce_min_',
     'push_sparse_v2',
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
index 19dd345eee2f78..e1c15219ebbd0c 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
@@ -1574,6 +1574,15 @@
     data_type: param
   optional: master_param, master_param_out

+- op: match_matrix_tensor
+  args: (Tensor x, Tensor y, Tensor w, int dim_t=1)
+  output: Tensor(out), Tensor(tmp)
+  infer_meta:
+    func: MatchMatrixTensorInferMeta
+  kernel:
+    func: match_matrix_tensor
+  backward: match_matrix_tensor_grad
+
 - op: nce
   args: (Tensor input, Tensor label, Tensor weight, Tensor bias, Tensor sample_weight, Tensor custom_dist_probs, Tensor custom_dist_alias, Tensor custom_dist_alias_probs, int num_total_classes, int[] custom_neg_classes={}, int num_neg_samples=10, int sampler=0, int seed=0, bool is_sparse=false, bool remote_prefetch=false, bool is_test=false)
   output: Tensor(cost), Tensor(sample_logits), Tensor(sample_labels)
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml
index 8d6ebe8203ded5..61d0d179c55474 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml
@@ -948,6 +948,15 @@
     func: fused_elemwise_add_activation_grad
   optional : x, intermediate_out

+- backward_op: match_matrix_tensor_grad
+  forward: match_matrix_tensor (Tensor x, Tensor y, Tensor w, int dim_t=1) -> Tensor(out), Tensor(tmp)
+  args: (Tensor x, Tensor y, Tensor w, Tensor tmp, Tensor out_grad, int dim_t=1)
+  output: Tensor(x_grad), Tensor(y_grad), Tensor(w_grad)
+  infer_meta:
+    func: MatchMatrixTensorGradInferMeta
+  kernel:
+    func: match_matrix_tensor_grad
+
 - backward_op: shuffle_batch_grad
   forward: shuffle_batch (Tensor x, Tensor seed, int startup_seed=0) -> Tensor(out), Tensor(shuffle_idx), Tensor(seed_out)
   args: (Tensor shuffle_idx, Tensor out_grad,int startup_seed=0)
diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc
index 6bb5ccbd10249b..0c4faf9e4f7989 100644
--- a/paddle/fluid/pir/dialect/operator/utils/utils.cc
+++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc
@@ -67,6 +67,8 @@ const std::unordered_set<std::string> LegacyOpList = {
     RowConvGradOp::name(),
     SoftReluOp::name(),
     SoftReluGradOp::name(),
+    MatchMatrixTensorOp::name(),
+    MatchMatrixTensorGradOp::name(),
     NceOp::name(),
     NceGradOp::name(),
     LrnOp::name(),
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 92c8c56ee456a5..1fe56b8080b0bc 100755
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -3578,6 +3578,13 @@
   attrs:
     pivot : pivots

+- op: match_matrix_tensor
+  backward: match_matrix_tensor_grad
+  inputs:
+    {x : X, y : Y, w : W}
+  outputs:
+    {out : Out, tmp : Tmp}
+
 - op: memcpy
   inputs:
     x: X
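The op_compat.yaml entry above exists because the legacy operator names its tensors X/Y/W and Out/Tmp while PIR uses lowercase names; code generators consult this table to translate between the two conventions. Conceptually the entry boils down to a name map, as in this toy equivalent (illustrative only, not generator output):

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  // Lookup table equivalent to the op_compat.yaml entry for this op:
  // PIR-era lowercase names on the left, legacy capitalized names on the
  // right (illustrative; the real generators build this from the yaml).
  const std::unordered_map<std::string, std::string> io_name_map = {
      {"x", "X"}, {"y", "Y"}, {"w", "W"}, {"out", "Out"}, {"tmp", "Tmp"}};
  std::cout << "pir name 'tmp' -> legacy name '" << io_name_map.at("tmp")
            << "'\n";
}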
diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index bf6db9bd8d5c0f..c06d400d999dd5 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -662,6 +662,31 @@ void MarginCrossEntropyGradInferMeta(const MetaTensor& logits,
   logits_grad->set_dtype(softmax.dtype());
 }

+void MatchMatrixTensorGradInferMeta(const MetaTensor& x,
+                                    const MetaTensor& y,
+                                    const MetaTensor& w,
+                                    const MetaTensor& tmp,
+                                    const MetaTensor& out_grad,
+                                    int dim_t,
+                                    MetaTensor* x_grad,
+                                    MetaTensor* y_grad,
+                                    MetaTensor* w_grad) {
+  if (x_grad != nullptr) {
+    x_grad->set_dims(x.dims());
+    x_grad->share_lod(x);
+    x_grad->set_dtype(x.dtype());
+  }
+  if (y_grad != nullptr) {
+    y_grad->set_dims(y.dims());
+    y_grad->share_lod(y);
+    y_grad->set_dtype(y.dtype());
+  }
+  if (w_grad != nullptr) {
+    w_grad->set_dims(w.dims());
+    w_grad->set_dtype(w.dtype());
+  }
+}
+
 void MaxPoolWithIndexGradInferMeta(const MetaTensor& x,
                                    const MetaTensor& mask,
                                    const MetaTensor& dout,
@@ -1352,5 +1377,4 @@ void SetValueGradInferMeta(const MetaTensor& out_grad,
     value_grad->share_lod(values);
   }
 }
-
 }  // namespace phi
diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h
index 3112fa8b9ddad4..f1458f3f4b8fd7 100644
--- a/paddle/phi/infermeta/backward.h
+++ b/paddle/phi/infermeta/backward.h
@@ -312,6 +312,16 @@ void MarginCrossEntropyGradInferMeta(const MetaTensor& logits,
                                      float scale,
                                      MetaTensor* logits_grad);

+void MatchMatrixTensorGradInferMeta(const MetaTensor& x,
+                                    const MetaTensor& y,
+                                    const MetaTensor& w,
+                                    const MetaTensor& tmp,
+                                    const MetaTensor& out_grad,
+                                    int dim_t,
+                                    MetaTensor* x_grad,
+                                    MetaTensor* y_grad,
+                                    MetaTensor* w_grad);
+
 void MaxPoolWithIndexGradInferMeta(const MetaTensor& x,
                                    const MetaTensor& mask,
                                    const MetaTensor& dout,
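MatchMatrixTensorGradInferMeta above follows the usual phi backward pattern: each requested gradient output simply mirrors the metadata of its matching forward input, and a null output pointer means that particular gradient is not needed. A self-contained toy version of the pattern, with Meta standing in for phi's MetaTensor:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for MetaTensor: just a shape and a dtype name.
struct Meta {
  std::vector<int64_t> dims;
  std::string dtype;
};

// Each requested gradient mirrors its forward input's metadata; a null
// pointer signals that the caller does not need that gradient.
void GradInferMeta(const Meta& x, const Meta& w, Meta* x_grad, Meta* w_grad) {
  if (x_grad != nullptr) {
    x_grad->dims = x.dims;
    x_grad->dtype = x.dtype;
  }
  if (w_grad != nullptr) {
    w_grad->dims = w.dims;
    w_grad->dtype = w.dtype;
  }
}

int main() {
  Meta x{{4, 8}, "float32"}, w{{8, 2, 8}, "float32"};
  Meta x_grad;
  GradInferMeta(x, w, &x_grad, /*w_grad=*/nullptr);  // skip w_grad
  std::cout << x_grad.dims[0] << "x" << x_grad.dims[1] << " "
            << x_grad.dtype << "\n";  // 4x8 float32
}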
diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc
index 2f53eb9ef71992..10807b7a3a87a9 100644
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -725,6 +725,79 @@ void LinspaceInferMeta(const MetaTensor& start,
   LinspaceRawInferMeta(start, stop, number, out);
 }

+void MatchMatrixTensorInferMeta(const MetaTensor& x,
+                                const MetaTensor& y,
+                                const MetaTensor& w,
+                                int dim_t,
+                                MetaTensor* out,
+                                MetaTensor* tmp,
+                                MetaConfig config) {
+  auto x_dims = x.dims();
+  PADDLE_ENFORCE_EQ(x_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The dimensions of Input(X) should be equal to 2, "
+                        "but received %d.",
+                        x_dims.size()));
+
+  auto y_dims = y.dims();
+  PADDLE_ENFORCE_EQ(y_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The dimensions of Input(Y) should be equal to 2, "
+                        "but received %d.",
+                        y_dims.size()));
+
+  auto w_dims = w.dims();
+  PADDLE_ENFORCE_EQ(w_dims.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The dimensions of Input(W) should be equal to 3, "
+                        "but received %d.",
+                        w_dims.size()));
+
+  PADDLE_ENFORCE_EQ(
+      w_dims[0],
+      x_dims[1],
+      phi::errors::InvalidArgument(
+          "The first dimension of Input(W) should be equal to the second "
+          "dimension of Input(X). But received the first dimension of Input(W) "
+          "is %d, the second dimension of Input(X) is %d.",
+          w_dims[0],
+          x_dims[1]));
+  PADDLE_ENFORCE_EQ(
+      w_dims[1],
+      dim_t,
+      phi::errors::InvalidArgument(
+          "The second dimension of Input(W) should be equal to 'dim_t', but "
+          "received the second dimension of Input(W) is %d, 'dim_t' is %d.",
+          w_dims[1],
+          dim_t));
+  PADDLE_ENFORCE_EQ(
+      w_dims[2],
+      y_dims[1],
+      phi::errors::InvalidArgument(
+          "The last dimension of Input(W) should be equal to "
+          "the second dimension of Input(Y). But received the last dimension "
+          "of Input(W) is %d, the second dimension of Input(Y) is %d.",
+          w_dims[2],
+          y_dims[1]));
+
+  int64_t out_dim_0 = -1;
+  int64_t tmp_dim_0 = -1;
+  if (!config.is_runtime) {
+    out->share_lod(x);
+    std::vector<int64_t> out_dims_vec{out_dim_0};
+    out_dims_vec.push_back(1);
+    std::vector<int64_t> tmp_dims_vec{tmp_dim_0};
+    tmp_dims_vec.push_back(1);
+    out->set_dims(common::make_ddim(out_dims_vec));
+    out->set_dtype(x.dtype());
+    tmp->set_dims(common::make_ddim(tmp_dims_vec));
+    tmp->set_dtype(x.dtype());
+  }
+}
+
 void MultiClassNMSInferMeta(const MetaTensor& bboxes,
                             const MetaTensor& scores,
                             const MetaTensor& rois_num,
diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h
index 36ed324ee1d6cd..7ffdc3d272069f 100644
--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -132,6 +132,14 @@ void LinspaceInferMeta(const MetaTensor& start,
                        DataType dtype,
                        MetaTensor* out);

+void MatchMatrixTensorInferMeta(const MetaTensor& x,
+                                const MetaTensor& y,
+                                const MetaTensor& w,
+                                int dim_t,
+                                MetaTensor* out,
+                                MetaTensor* tmp,
+                                MetaConfig config = MetaConfig());
+
 void MultiClassNMSInferMeta(const MetaTensor& bboxes,
                             const MetaTensor& scores,
                             const MetaTensor& rois_num,
diff --git a/test/white_list/pir_op_test_white_list b/test/white_list/pir_op_test_white_list
index fdaa016752a4e4..d2b35b54f49e61 100644
--- a/test/white_list/pir_op_test_white_list
+++ b/test/white_list/pir_op_test_white_list
@@ -165,6 +165,7 @@ test_lu_op
 test_lu_unpack_op
 test_margin_cross_entropy_op
 test_masked_select_op
+test_match_matrix_tensor_op
 test_matmul_v2_op
 test_matmul_v2_op_static_build
 test_matrix_nms_op
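Note that the forward infer-meta deliberately publishes -1 row counts when config.is_runtime is false: the true sizes depend on LoD contents that exist only at runtime, which is exactly why the kernel change earlier in this patch recomputes and overwrites the output meta. A minimal sketch of this two-phase convention, assuming the usual reading of -1 as "unknown until runtime":

#include <cstdint>
#include <iostream>
#include <vector>

// At compile time the row count is LoD-dependent and unknown, so it is
// published as -1; at runtime the kernel recomputes it from real offsets.
std::vector<int64_t> InferOutDims(bool is_runtime, int64_t runtime_rows) {
  if (!is_runtime) {
    return {-1, 1};  // placeholder shape, resolved later by the kernel
  }
  return {runtime_rows, 1};
}

int main() {
  auto static_dims = InferOutDims(/*is_runtime=*/false, 0);
  auto runtime_dims = InferOutDims(/*is_runtime=*/true, 18);
  std::cout << static_dims[0] << " -> " << runtime_dims[0] << "\n";  // -1 -> 18
}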
From 7e5e2137c9870f3f2b45294f3388b00975e4d769 Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Tue, 23 Jan 2024 17:39:48 +0800
Subject: [PATCH 34/34] add_script,test=document_fix (#61057)

---
 paddle/scripts/get_target_size.py | 51 +++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 paddle/scripts/get_target_size.py

diff --git a/paddle/scripts/get_target_size.py b/paddle/scripts/get_target_size.py
new file mode 100644
index 00000000000000..ec0ebd34d79553
--- /dev/null
+++ b/paddle/scripts/get_target_size.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+import sys
+
+
+class HandleTarget:
+    def __init__(self) -> None:
+        argv = sys.argv
+        len_ = len(argv)
+        fromPath = './' if len_ <= 1 else argv[1]
+        cmd = f"find {fromPath} -name '*.o' -not -path './third_party/*' | xargs du -sch > log"
+        subprocess.run(cmd, shell=True)
+
+    def calcuSize(self, item):
+        size = float(item[:-1])
+        res = size * 1024 if item.find('G') >= 0 else size
+        return size / 1024 if item.find('K') >= 0 else res
+
+    def getSize(self):
+        ctx = self.getDatas()
+        sum = 0
+        for item in ctx:
+            if item.find('total') >= 0:
+                item = item.split('\t')[0]
+                sum += self.calcuSize(item)
+
+        print(f"Total size is {sum} M (without third_party)")
+
+    def getDatas(self):
+        with open('log', 'r') as file:
+            ctx = file.read()
+            ctx = ctx.split('\n')
+        return ctx
+
+
+if __name__ == '__main__':
+    handler = HandleTarget()
+    handler.getSize()
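For reference, the new script takes an optional root directory, so a typical invocation would be python paddle/scripts/get_target_size.py build/. It shells out to find and du -sch over every .o file outside third_party, leaves the raw listing in a file named log in the current working directory, and prints the summed size in megabytes: calcuSize normalizes du's G and K suffixes to M before accumulating the per-batch total lines.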