From 6c4a2f6dd949ae37635d6908f1cd22b2b372a32f Mon Sep 17 00:00:00 2001
From: zhouwei25
Date: Thu, 21 Jul 2022 09:09:18 +0000
Subject: [PATCH 1/2] fix behavior of device_id=None in Tensor.cuda

---
 .../fluid/dygraph/varbase_patch_methods.py    | 19 ++++++++-----
 .../fluid/tests/unittests/test_var_base.py    | 28 +++++++++----------
 2 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 48497f4b9092f..d24d0c8577897 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -866,15 +866,20 @@ def cpu(self):
             return res
 
     @framework.dygraph_only
-    def cuda(self, device_id=0, blocking=True):
-        if device_id is None:
-            device_id = 0
-        if not isinstance(device_id, int):
-            raise ValueError("\'device_id\' must be a positive integer")
-        if self.place.is_gpu_place():
+    def cuda(self, device=None, blocking=True):
+        if device is None:
+            res_place = framework._current_expected_place()
+        elif isinstance(device, int):
+            res_place = core.CUDAPlace(device_id)
+        elif isinstance(device, core.CUDAPlace()):
+            res_place = device
+        else:
+            raise TypeError("device must be CUDAPlace device id or CUDAPlace")
+
+        if self.place._equals(res_place):
             return self
         else:
-            res = self._copy_to(core.CUDAPlace(device_id), True)
+            res = self._copy_to(res_place, True)
             res.stop_gradient = self.stop_gradient
             res.persistable = self.persistable
             return res
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index e66f310eb977d..227d05c37b60f 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -34,7 +34,7 @@ def setUp(self):
 
     def func_test_to_tensor(self):
 
-        def _test_place(place):
+        def check_with_place(place):
             with fluid.dygraph.guard():
                 paddle.set_default_dtype('float32')
                 # set_default_dtype should not take effect on int
@@ -76,17 +76,19 @@ def _test_place(place):
                 y = x.cpu()
                 self.assertEqual(y.place.__repr__(), "Place(cpu)")
                 if core.is_compiled_with_cuda():
+                    res_place = paddle.framework._current_expected_place()
                     y = x.pin_memory()
                     self.assertEqual(y.place.__repr__(), "Place(gpu_pinned)")
                     y = x.cuda()
+                    self.assertEqual(y.place.__repr__(), res_place.__repr__())
                     y = x.cuda(None)
-                    self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
+                    self.assertEqual(y.place.__repr__(), res_place.__repr__())
                     y = x.cuda(device_id=0)
                     self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
                     y = x.cuda(blocking=False)
-                    self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
+                    self.assertEqual(y.place.__repr__(), res_place.__repr__())
                     y = x.cuda(blocking=True)
-                    self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
+                    self.assertEqual(y.place.__repr__(), res_place.__repr__())
                     with self.assertRaises(ValueError):
                         y = x.cuda("test")
 
@@ -266,20 +268,18 @@ def _test_place(place):
                 with self.assertRaises(ValueError):
                     paddle.to_tensor([[1], [2, 3]], place=1)
 
-        _test_place(core.CPUPlace())
-        _test_place("cpu")
+        check_with_place(core.CPUPlace())
+        check_with_place("cpu")
         if core.is_compiled_with_cuda():
-            _test_place(core.CUDAPinnedPlace())
-            _test_place("gpu_pinned")
-            _test_place(core.CUDAPlace(0))
-            _test_place("gpu:0")
+            check_with_place(core.CUDAPinnedPlace())
+            check_with_place("gpu_pinned")
+            check_with_place(core.CUDAPlace(0))
+            check_with_place("gpu:0")
         if core.is_compiled_with_npu():
-            _test_place(core.NPUPlace(0))
-            _test_place("npu:0")
+            check_with_place(core.NPUPlace(0))
+            check_with_place("npu:0")
 
     def test_to_tensor(self):
-        with _test_eager_guard():
-            self.func_test_to_tensor()
         self.func_test_to_tensor()
 
     def func_test_to_tensor_not_change_input_stop_gradient(self):
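The test changes above capture the intent of the first patch: with no argument, Tensor.cuda() should follow framework._current_expected_place() instead of always copying to GPU 0. A minimal usage sketch of that behavior in dygraph mode, assuming a CUDA build of Paddle with GPU 0 selected (the follow-up patch below reworks the argument handling, so only the no-argument case is shown):

    import paddle

    paddle.set_device("gpu:0")          # the current expected place is now GPU 0
    x = paddle.to_tensor([1.0, 2.0], place=paddle.CPUPlace())

    y = x.cuda()                        # no device id: follow the expected place
    print(y.place)                      # Place(gpu:0)

    y = x.cuda(None)                    # passing None behaves the same way
    print(y.place)                      # Place(gpu:0)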
From 3f67446a1968cbdd00eef8fb8b064c99246d994a Mon Sep 17 00:00:00 2001
From: zhouwei25
Date: Mon, 25 Jul 2022 09:10:23 +0000
Subject: [PATCH 2/2] fix CI

---
 .../sparse/gpu/fused_attention_grad_kernel.cu         |  2 +-
 .../phi/kernels/sparse/gpu/fused_attention_kernel.cu  |  2 +-
 python/paddle/fluid/dygraph/varbase_patch_methods.py  | 12 ++++++------
 python/paddle/fluid/tests/unittests/test_var_base.py  | 11 ++++++-----
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu
index 4d31ad96cdd3b..70203836d4412 100644
--- a/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu
@@ -75,7 +75,7 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
 #if CUDA_VERSION >= 11070
   /* Step1: Forward: softmax{CSR} * value{Dense} -> out{Dense}, reuse */
   SparseCsrTensor dsoftmax;
-  CsrDenseMatmulGradKernel(
+  MatmulCsrDenseGradKernel(
       dev_ctx, softmax, value, dout, &dsoftmax, dvalue);
 
   /* Step2: Calculate grad of sdd_result, manualy not reuse */
diff --git a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu
index 46412d57f16c7..b1e30f3b654a4 100644
--- a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu
@@ -263,7 +263,7 @@ void FusedAttentionCsrKernel(
 
   /* Step3: DSD Matmul, reuse */
   softmax->set_dims(phi::make_ddim({q_dim[0], q_dim[1], q_dim[2], q_dim[2]}));
-  CsrDenseMatmulKernel(dev_ctx, *softmax, value, out);
+  MatmulCsrDenseKernel(dev_ctx, *softmax, value, out);
 #else
   PADDLE_THROW(
       phi::errors::Unimplemented("forward of 'sparse.nn.functional.attention' "
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index d24d0c8577897..5b0aba7a9dabb 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -866,15 +866,15 @@ def cpu(self):
             return res
 
     @framework.dygraph_only
-    def cuda(self, device=None, blocking=True):
-        if device is None:
+    def cuda(self, device_id=None, blocking=True):
+        if device_id is None:
             res_place = framework._current_expected_place()
-        elif isinstance(device, int):
+            if not isinstance(res_place, core.CUDAPlace):
+                res_place = core.CUDAPlace(0)
+        elif isinstance(device_id, int):
             res_place = core.CUDAPlace(device_id)
-        elif isinstance(device, core.CUDAPlace()):
-            res_place = device
         else:
-            raise TypeError("device must be CUDAPlace device id or CUDAPlace")
+            raise ValueError("device_id must be int|None")
 
         if self.place._equals(res_place):
             return self
         else:
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index 1b32d05c08487..c16238486df94 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -76,19 +76,18 @@ def check_with_place(place):
                 y = x.cpu()
                 self.assertEqual(y.place.__repr__(), "Place(cpu)")
                 if core.is_compiled_with_cuda():
-                    res_place = paddle.framework._current_expected_place()
                     y = x.pin_memory()
                     self.assertEqual(y.place.__repr__(), "Place(gpu_pinned)")
                     y = x.cuda()
-                    self.assertEqual(y.place.__repr__(), res_place.__repr__())
+                    self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
                     y = x.cuda(None)
-                    self.assertEqual(y.place.__repr__(), res_place.__repr__())
+                    self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
                     y = x.cuda(device_id=0)
                     self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
                     y = x.cuda(blocking=False)
-                    self.assertEqual(y.place.__repr__(), res_place.__repr__())
+                    self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
                     y = x.cuda(blocking=True)
-                    self.assertEqual(y.place.__repr__(), res_place.__repr__())
+                    self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
                     with self.assertRaises(ValueError):
                         y = x.cuda("test")
 
@@ -280,6 +279,8 @@ def check_with_place(place):
             check_with_place("npu:0")
 
     def test_to_tensor(self):
+        with _test_eager_guard():
+            self.func_test_to_tensor()
         self.func_test_to_tensor()
 
     def func_test_to_tensor_not_change_input_stop_gradient(self):
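Taken together, the two patches settle on the following dispatch for cuda(): device_id=None resolves to the current expected place, falling back to GPU 0 when that place is not a CUDAPlace; an int selects that GPU; anything else raises ValueError. A standalone sketch of the same logic, assuming a CUDA build of Paddle (resolve_cuda_place is a hypothetical helper name used only for illustration, not an API introduced by the patch):

    from paddle.fluid import core, framework

    def resolve_cuda_place(device_id=None):
        # Mirrors the dispatch in the final version of cuda() above.
        if device_id is None:
            place = framework._current_expected_place()
            # Fall back to GPU 0 when the expected place is not a CUDA place (e.g. CPU).
            if not isinstance(place, core.CUDAPlace):
                place = core.CUDAPlace(0)
            return place
        if isinstance(device_id, int):
            return core.CUDAPlace(device_id)
        raise ValueError("device_id must be int|None")

Because the second patch restores the device_id parameter name, keyword calls such as x.cuda(device_id=0) keep working, which is what the updated test_var_base.py asserts.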