PaddlePaddle · YuanRisheng · Jan 21, 2022 · Jan 17, 2022 · Jan 17, 2022 · Jan 18, 2022
diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake
@@ -103,38 +103,55 @@ function(kernel_library TARGET)
     list(LENGTH gpu_srcs gpu_srcs_len)
     list(LENGTH xpu_srcs xpu_srcs_len)
 
-    if (${common_srcs_len} GREATER 0)
-        # If the kernel has a device independent public implementation,
-        # we will use this implementation and will not adopt the implementation
-        # under specific devices
+    # Build Target according different src organization
+    if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
+        ${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0)
+        # If the common_srcs depends on specific device srcs, build target using this rule.
+        if (WITH_GPU)
+            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+                nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+            endif()
+        elseif (WITH_ROCM)
+            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+                hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+            endif()
+        else()
+            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+                cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+                cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+            endif()
+        endif()
+    elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
         if (WITH_GPU)
-            nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            endif()
         elseif (WITH_ROCM)
-            hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            endif()
         else()
-            cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+                cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            endif()
         endif()
     else()
-        # If the kernel has a header file declaration, but no corresponding
-        # implementation can be found, this is not allowed
-        if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
-            ${xpu_srcs_len} EQUAL 0)
-            message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
+        if (${common_srcs_len} EQUAL 0)
+             message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
         else()
+            # If the kernel has a device independent public implementation,
+            # we will use this implementation and will not adopt the implementation
+            # under specific devices
             if (WITH_GPU)
-                if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-                    nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                endif()
+                nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
             elseif (WITH_ROCM)
-                if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-                    hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                endif()
+                hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
             else()
-                if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
-                    cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                endif()
+                cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
             endif()
-        endif()
+         endif()
     endif()
 
     if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR

diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h
@@ -202,7 +202,7 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
       commonterm_for_range(commonterm_functor);
       commonterm_conj = helper.Transpose(commonterm_conj);
 
-      pten::AddKernel<T>(
+      pten::AddRawKernel<T>(
           static_cast<const typename paddle::framework::ConvertToPtenContext<
               DeviceContext>::TYPE &>(dev_ctx),
           commonterm, commonterm_conj, -1, &commonterm);

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -61,7 +61,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::AddKernel<T>(
+    pten::AddRawKernel<T>(
         static_cast<const typename framework::ConvertToPtenContext<
             DeviceContext>::TYPE &>(dev_ctx),
         *pt_x.get(), *pt_y.get(), axis, pt_z.get());

diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -51,7 +51,7 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::DivideKernel<T>(
+    pten::DivideRawKernel<T>(
         static_cast<const typename framework::ConvertToPtenContext<
             DeviceContext>::TYPE&>(dev_ctx),
         *pt_x.get(), *pt_y.get(), axis, pt_z.get());

diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -51,8 +51,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::MultiplyKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
-                              pt_z.get());
+      pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
+                                 pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "X's type[%s] is not supported by elementwise_op. X's type should be "

diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -124,7 +124,7 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::MultiplyKernel<T>(
+      pten::MultiplyRawKernel<T>(
           static_cast<const typename framework::ConvertToPtenContext<
               DeviceContext>::TYPE&>(dev_ctx),
           *pt_x.get(), *pt_y.get(), axis, pt_z.get());

diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -140,26 +140,42 @@ class ElementwiseOp : public framework::OperatorWithKernel {
 
   framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext &ctx) const override {
+    int axis = ctx.Attr<int>("axis");
     if (Type() == "elementwise_add") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("add", {"X", "Y"}, {"axis"}, {"Out"});
+        if (axis == -1) {
+          return framework::KernelSignature("add", {"X", "Y"}, {}, {"Out"});
+        }
+        return framework::KernelSignature("add_raw", {"X", "Y"}, {"axis"},
+                                          {"Out"});
       }
     }
     if (Type() == "elementwise_sub") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("subtract", {"X", "Y"}, {"axis"},
+        if (axis == -1) {
+          return framework::KernelSignature("subtract", {"X", "Y"}, {},
+                                            {"Out"});
+        }
+        return framework::KernelSignature("subtract_raw", {"X", "Y"}, {"axis"},
                                           {"Out"});
       }
     }
     if (Type() == "elementwise_div") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("divide", {"X", "Y"}, {"axis"},
+        if (axis == -1) {
+          return framework::KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
+        }
+        return framework::KernelSignature("divide_raw", {"X", "Y"}, {"axis"},
                                           {"Out"});
       }
     }
     if (Type() == "elementwise_mul") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("multiply", {"X", "Y"}, {"axis"},
+        if (axis == -1) {
+          return framework::KernelSignature("multiply", {"X", "Y"}, {},
+                                            {"Out"});
+        }
+        return framework::KernelSignature("multiply_raw", {"X", "Y"}, {"axis"},
                                           {"Out"});
       }
     }

diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
@@ -51,7 +51,7 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::SubtractKernel<T>(
+    pten::SubtractRawKernel<T>(
         static_cast<const typename framework::ConvertToPtenContext<
             DeviceContext>::TYPE&>(dev_ctx),
         *pt_x.get(), *pt_y.get(), axis, pt_z.get());

diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h
@@ -221,7 +221,7 @@ void Tensor_Add(const DeviceContext& dev_ctx, const framework::Tensor& src1,
   out->Resize(src1.dims());
   out->mutable_data<T>(dev_ctx.GetPlace());
 
-  pten::AddKernel<
+  pten::AddRawKernel<
       T, typename paddle::framework::ConvertToPtenContext<DeviceContext>::TYPE>(
       static_cast<const typename paddle::framework::ConvertToPtenContext<
           DeviceContext>::TYPE&>(dev_ctx),
@@ -234,7 +234,7 @@ void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1,
   out->Resize(src1.dims());
   out->mutable_data<T>(dev_ctx.GetPlace());
 
-  pten::SubtractKernel<
+  pten::SubtractRawKernel<
       T, typename paddle::framework::ConvertToPtenContext<DeviceContext>::TYPE>(
       static_cast<const typename paddle::framework::ConvertToPtenContext<
           DeviceContext>::TYPE&>(dev_ctx),

diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -551,17 +551,26 @@ class ReduceOp : public framework::OperatorWithKernel {
 
   framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext& ctx) const override {
+    bool reduce_all = ctx.Attr<bool>("reduce_all");
     if (Type() == "reduce_sum") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+        if (!reduce_all) {
+          return framework::KernelSignature(
+              "sum", {"X"}, {"dim", "keep_dim", "out_dtype"}, {"Out"});
+        }
         return framework::KernelSignature(
-            "sum", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"},
+            "sum_raw", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"},
             {"Out"});
       }
     }
     if (Type() == "reduce_mean") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+        if (!reduce_all) {
+          return framework::KernelSignature("mean", {"X"}, {"dim", "keep_dim"},
+                                            {"Out"});
+        }
         return framework::KernelSignature(
-            "mean", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+            "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
       }
     }
     // TODO(chentianyu03): support other cases after selected rows added

diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h
@@ -30,7 +30,6 @@ using DeviceContext = paddle::platform::DeviceContext;
 using add_kernel = void (*)(const DeviceContext&,
                             const DenseTensor&,
                             const DenseTensor&,
-                            int,
                             DenseTensor*);
 
 using cast_kernel = void (*)(const DeviceContext&,
@@ -41,7 +40,6 @@ using cast_kernel = void (*)(const DeviceContext&,
 using divide_kernel = void (*)(const DeviceContext&,
                                const DenseTensor&,
                                const DenseTensor&,
-                               int,
                                DenseTensor*);
 
 using dot_kernel = void (*)(const DeviceContext&,
@@ -77,13 +75,11 @@ using mean_kernel = void (*)(const DeviceContext&,
                              const DenseTensor&,
                              const std::vector<int64_t>&,
                              bool,
-                             bool,
                              DenseTensor*);
 
 using multiply_kernel = void (*)(const DeviceContext&,
                                  const DenseTensor&,
                                  const DenseTensor&,
-                                 int,
                                  DenseTensor*);
 
 using reshape_kernel = void (*)(const DeviceContext&,
@@ -102,14 +98,12 @@ using sum_kernel = void (*)(const DeviceContext&,
                             const DenseTensor&,
                             const std::vector<int64_t>&,
                             bool,
-                            bool,
                             DataType,
                             DenseTensor*);
 
 using subtract_kernel = void (*)(const DeviceContext&,
                                  const DenseTensor&,
                                  const DenseTensor&,
-                                 int,
                                  DenseTensor*);
 
 using conj_kernel = void (*)(const DeviceContext&,

diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h
@@ -20,10 +20,10 @@ namespace pten {
 // the key is kernel_name in fluid, the value is the kernel_name in pten
 // the key is sorted by key's alphabet
 const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
-    {"elementwise_add", "add"},
-    {"elementwise_div", "divide"},
-    {"elementwise_mul", "muliply"},
-    {"elementwise_sub", "subtract"},
+    {"elementwise_add", "add_raw"},
+    {"elementwise_div", "divide_raw"},
+    {"elementwise_mul", "muliply_raw"},
+    {"elementwise_sub", "subtract_raw"},
     {"fill_any_like", "full_like"},
     {"fill_constant", "full"},
     {"flatten_contiguous_range", "flatten"},
@@ -32,8 +32,8 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
     {"matmul_v2_grad", "matmul_grad"},
     {"matmul_v2_grad_grad", "matmul_double_grad"},
     {"matmul_v2_triple_grad", "matmul_triple_grad"},
-    {"reduce_mean", "mean"},
-    {"reduce_sum", "sum"},
+    {"reduce_mean", "mean_raw"},
+    {"reduce_sum", "sum_raw"},
     {"reshape2", "reshape"},
     {"reshape2_grad", "reshape_grad"},
     {"reshape2_grad_grad", "reshape_double_grad"},