[New features] Support batch_jacobian and batch_hessian (PaddlePaddle#37547)

* native commit for triple grad of sigmoid

* Updated unittests files

* init functional jacobian api

* Updated triple_test func

* Updated gradient_checker & test_script

* finish test with dtype float32

* add float64 test case

* polish code

* use atol=1e-5 with dtype float64

* fix for ci

* set timeout for test_jacobian

* fix dygraph grad to support high differential

* polish API docstring

* Updated gradient checker and some related files

* fix double grad strip error for high differential

* fix double grad strip error for high differential

* Add Sigmoid triple grad tests

* fix dygraph double grad dtype error when calling for high differential scenario

* Updated triple grad tests func

* Use np.random to initialize ddx

* Updated triple_grad_check func

* add todo for gradient checker and refine some comments

* remove additional code

* add test for warning in backward.py

* format python code

* support multi input in triple gradient checker

* Add matmul triple grad kernel

* Updated comments of TODO

* Supported some special tests

* Change code-format to follow CI std

* Updated gradient_checker.py

* Fix conflicts

* Removed unnecessary printing log

* Change code style to follow CI std

* support batch in jacobian and hessian

* add batch jacobian and batch hessian

* Add batch_jacobian test, draft version

* [New features] Add elementwise_mul triple grad kernel (PaddlePaddle#37152)

* Add elementwise_mul triple grad kernel

* Removed InplaceInferer and polished code

* Add numerical_batch_jacobian, numerical_batch_hessian and tests

* Support batch_jacobian and batch_numerical

* Use pre-commit to check code format

* Update doc, polish code, add unit test

* Reset the TIMEOUT properties of test_jacobian to pass CI

Co-authored-by: levi131 <limaolin01@baidu.com>
Co-authored-by: Jiabin Yang <360788950@qq.com>
3 people authored and Zjq9409 committed Dec 10, 2021
1 parent ef68ba5 commit 784c754
Showing 6 changed files with 643 additions and 4 deletions.
3 changes: 2 additions & 1 deletion python/paddle/autograd/__init__.py
@@ -18,6 +18,7 @@
from .py_layer import PyLayer, PyLayerContext # noqa: F401
from ..framework import set_grad_enabled # noqa: F401
from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401
- from .functional import vjp, jvp, jacobian, hessian, vhp # noqa: F401
+ from .functional import jacobian, hessian, batch_jacobian, batch_hessian # noqa: F401
+ from .functional import vjp, jvp, vhp # noqa: F401

__all__ = ['backward', 'PyLayer', 'PyLayerContext']
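The newly exported ``batch_jacobian`` and ``batch_hessian`` treat the first input dimension as a batch axis, while the existing ``jacobian`` flattens every dimension. A minimal orientation sketch, based only on the docstrings added below and not part of the committed diff:

import paddle

x = paddle.ones(shape=(4, 2), dtype='float64')    # batch_size = 4, num = 2
x.stop_gradient = False

def func(x):
    return x * x

full = paddle.autograd.jacobian(func, x)           # all dims flattened: shape [8, 8]
batched = paddle.autograd.batch_jacobian(func, x)  # batch axis kept: shape [2, 8] = [num, batch_size * num]
print(full.shape, batched.shape)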
291 changes: 291 additions & 0 deletions python/paddle/autograd/functional.py
@@ -385,6 +385,297 @@ def func(x, y):
return jacobian


@framework.dygraph_only
def batch_jacobian(func, inputs, create_graph=False, allow_unused=False):
'''
.. note::
    **This API is ONLY available in imperative mode.**
This function computes the batch Jacobian matrix of `func` with respect to `inputs`.
Note that the first dimension of the inputs is the batch size.
Parameters:
    func (function): a Python function that takes a Tensor or a Tensor
        list/tuple as inputs (the first dimension is the batch size) and
        returns a Tensor or a Tensor tuple.
    inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
        Tensor list/tuple of the function ``func``. Note that
        the first dimension of the inputs is the batch size.
    create_graph (bool, optional): whether to create the gradient graphs
        of the computing process. When it is True, higher order derivatives
        can be computed; when it is False, the gradient graphs of the
        computing process are discarded. Defaults to ``False``.
    allow_unused (bool, optional): whether to raise an error or return None if
        some Tensors of `inputs` are unreachable in the graph. An error is
        raised if allow_unused=False, and None is returned as their gradients
        if allow_unused=True. Defaults to ``False``.
Returns:
    Jacobian (Tensor or nested tuple of Tensors): if the function ``func``
        takes a Tensor as inputs and returns a Tensor as outputs, the Jacobian
        will be a single Tensor containing the Jacobian matrix for the
        linearized inputs and outputs. If one of the inputs and outputs is
        a Tensor and the other is a Tensor list/tuple, the Jacobian will
        be a tuple of Tensors. If both inputs and outputs are Tensor
        list/tuples, the Jacobian will be a tuple of tuples of Tensors.
        Note that the first dimension of the inputs is the batch size.
        For example, if the input shape and output shape of the function ``func``
        are [batch_size, num] and [batch_size, num] respectively, the Jacobian
        will be a Tensor with a shape of [num, batch_size * num], where
        ``Jacobian[i][j]`` holds the partial derivative of the ``i``th output
        column with respect to the ``j``th element of the batch-flattened input
        and has the same dtype and device as the corresponding input.
        Other situations can be deduced by analogy.
Examples 1:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x):
    return paddle.matmul(paddle.matmul(x, weight), y)
x.stop_gradient = False
batch_jacobian = paddle.autograd.batch_jacobian(func, x)
print(batch_jacobian)
# Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[4., 4., 4., 4., 4., 4., 4., 4.],
# [4., 4., 4., 4., 4., 4., 4., 4.]])
Examples 2:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x):
    return paddle.matmul(paddle.matmul(x, weight), y), x * x
x.stop_gradient = False
batch_jacobian = paddle.autograd.batch_jacobian(func, x)
print(batch_jacobian)
# (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[4., 4., 4., 4., 4., 4., 4., 4.],
# [4., 4., 4., 4., 4., 4., 4., 4.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]]))
Examples 3:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x, y):
    return x * y
x.stop_gradient = False
y.stop_gradient = False
batch_jacobian = paddle.autograd.batch_jacobian(func, [x, y])
print(batch_jacobian)
# (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[1., 0., 1., 0., 1., 0., 1., 0.],
# [0., 1., 0., 1., 0., 1., 0., 1.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[1., 0., 1., 0., 1., 0., 1., 0.],
# [0., 1., 0., 1., 0., 1., 0., 1.]]))
'''
    inputs = _tensors(inputs, "inputs")
    outputs = _tensors(func(*inputs), "outputs")
    batch_size = inputs[0].shape[0]
    for input in inputs:
        assert input.shape[
            0] == batch_size, "The first dimension of each input must equal the batch size!"
    for output in outputs:
        assert output.shape[
            0] == batch_size, "The first dimension of each output must equal the batch size!"
    fin_size = len(inputs)
    fout_size = len(outputs)
    flat_outputs = tuple(
        reshape(
            output, shape=[batch_size, -1]) for output in outputs)
    jacobian = tuple()
    for i, flat_output in enumerate(flat_outputs):
        jac_i = list([] for _ in range(fin_size))
        for k in range(flat_output.shape[1]):
            row_k = grad(
                flat_output[:, k],
                inputs,
                create_graph=create_graph,
                retain_graph=True,
                allow_unused=allow_unused)
            for j in range(fin_size):
                jac_i[j].append(
                    reshape(
                        row_k[j], shape=[-1])
                    if isinstance(row_k[j], paddle.Tensor) else None)
        jacobian += (tuple(
            _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), )
    if fin_size == 1 and fout_size == 1:
        return jacobian[0][0]
    elif fin_size == 1 and fout_size != 1:
        return tuple(jacobian[i][0] for i in range(fout_size))
    elif fin_size != 1 and fout_size == 1:
        return jacobian[0]
    else:
        return jacobian


@framework.dygraph_only
def batch_hessian(func, inputs, create_graph=False, allow_unused=False):
'''
.. note::
    **This API is ONLY available in imperative mode.**
This function computes the batch Hessian matrix of `func` with respect to `inputs`.
Note that the first dimension of the inputs is the batch size.
Parameters:
    func (function): a Python function that takes a Tensor or a Tensor
        list/tuple as inputs (the first dimension is the batch size) and
        returns a Tensor with shape [batch_size, 1].
    inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
        Tensor list/tuple of the function ``func``. Note that
        the first dimension of the inputs is the batch size.
    create_graph (bool, optional): whether to create the gradient graphs
        of the computing process. When it is True, higher order derivatives
        can be computed; when it is False, the gradient graphs of the
        computing process are discarded. Defaults to ``False``.
    allow_unused (bool, optional): whether to raise an error or return None if
        some Tensors of `inputs` are unreachable in the graph. An error is
        raised if allow_unused=False, and None is returned as their gradients
        if allow_unused=True. Defaults to ``False``.
Returns:
    Hessian (Tensor or a tuple of tuples of Tensors): if the function ``func``
        takes a Tensor as ``inputs``, the Hessian will be a single Tensor containing
        the Hessian matrix for the linearized ``inputs`` Tensor. If the function
        ``func`` takes a Tensor list/tuple as ``inputs``, the Hessian will
        be a tuple of tuples of Tensors. Note that the first dimension of the inputs
        is the batch size, and that the computation first obtains the first order
        derivatives and then differentiates them with respect to the batched inputs.
        For example, if the input shape and output shape of the function ``func``
        are [batch_size, num] and [batch_size, 1] respectively, the batched Hessian
        will be a Tensor with a shape of [num, batch_size * num].
        Why does the result have this shape? ``batch_hessian`` creates an inner
        function (a wrapper around ``paddle.grad``) that computes the sum of the
        gradients of `outputs` with respect to each of the `inputs`; this first order
        derivative has shape [batch_size, num]. ``batch_jacobian`` is then called to
        compute the Jacobian between this first order derivative and the original
        inputs. The final result ``Hessian[i][j]`` will contain the Jacobian matrix
        of the ``i``th column of that first order derivative and the ``j``th input,
        and will have the same dtype and device as the corresponding input.
        Other situations can be deduced by analogy.
Examples 1:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x):
    return paddle.matmul(x * x, weight)[:, 0:1]
x.stop_gradient = False
batch_hessian = paddle.autograd.batch_hessian(func, x)
print(batch_hessian)
# Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]])
Examples 2:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x, y):
    return paddle.matmul(x * x * y * y, weight)[:, 0:1]
x.stop_gradient = False
y.stop_gradient = False
batch_hessian = paddle.autograd.batch_hessian(func, [x, y])
print(batch_hessian)
# ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]]),
# Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[4., 0., 4., 0., 4., 0., 4., 0.],
# [0., 4., 0., 4., 0., 4., 0., 4.]])),
# (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[4., 0., 4., 0., 4., 0., 4., 0.],
# [0., 4., 0., 4., 0., 4., 0., 4.]]),
# Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]])))
Examples 3:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x, y):
    return paddle.matmul(x * x, weight)[:, 0:1]
x.stop_gradient = False
y.stop_gradient = False
batch_hessian = paddle.autograd.batch_hessian(func, [x, y], allow_unused=True)
print(batch_hessian)
# ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]]), None), (None, None))
'''
    inputs = _tensors(inputs, "inputs")
    outputs = func(*inputs)
    batch_size = inputs[0].shape[0]
    for input in inputs:
        assert input.shape[
            0] == batch_size, "The first dimension of each input must equal the batch size!"
    assert isinstance(outputs, paddle.Tensor) and outputs.shape == [
        batch_size, 1
    ], "The function to compute the batched Hessian matrix should return a Tensor of shape [batch_size, 1]"

    def jac_func(*ins):
        grad_inputs = grad(
            outputs,
            ins,
            create_graph=True,
            retain_graph=True,
            allow_unused=allow_unused)
        return tuple(
            _replace_none_with_zero_tensor(grad_inputs[i], inputs[i])
            for i in range(len(inputs)))

    return batch_jacobian(
        jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused)


@framework.dygraph_only
def hessian(func, inputs, create_graph=False, allow_unused=False):
'''
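A note on the layout returned by ``batch_jacobian``: row ``k`` of the result stacks the gradient of output column ``k``, flattened over the batch. For a function that acts sample-wise, that layout can be folded back into per-sample Jacobians. The sketch below is editorial, not part of the commit; the reshape/transpose steps and the name ``per_sample`` are illustrative assumptions based on the docstring's shape description:

import paddle

# Toy setup mirroring Example 1 of the batch_jacobian docstring.
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
x.stop_gradient = False

def func(x):
    return paddle.matmul(paddle.matmul(x, weight), y)

bj = paddle.autograd.batch_jacobian(func, x)   # shape [num_out, batch_size * num_in] == [2, 8]

batch_size, num_in = x.shape                   # 4, 2
num_out = bj.shape[0]                          # 2
# Fold the flattened axis back into (batch, num_in) and move the batch axis first,
# giving one [num_out, num_in] Jacobian per sample (valid when func acts per sample).
per_sample = paddle.transpose(
    paddle.reshape(bj, [num_out, batch_size, num_in]), perm=[1, 0, 2])
print(per_sample.shape)  # [4, 2, 2]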
@@ -6,6 +6,6 @@ foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS})
endforeach(TEST_OP)

- set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20)
+ set_tests_properties(test_jacobian PROPERTIES TIMEOUT 50)
set_tests_properties(test_hessian PROPERTIES TIMEOUT 50)
set_tests_properties(test_vhp PROPERTIES TIMEOUT 50)
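As a sanity check on ``batch_hessian`` (again an editorial sketch, not part of the commit): for the ``func`` of Example 1 in its docstring, each per-sample Hessian is twice the identity, which is what each [num, num] block of the batched result contains. Running the same computation with a single-sample batch makes that visible directly; ``scalar_func`` and ``x_row`` are illustrative names:

import paddle

weight = paddle.ones(shape=(2, 4), dtype='float64')

def scalar_func(x_row):
    # same computation as Example 1 of batch_hessian, for a single sample
    return paddle.matmul(x_row * x_row, weight)[:, 0:1]

x_row = paddle.ones(shape=(1, 2), dtype='float64')   # batch_size == 1
x_row.stop_gradient = False
print(paddle.autograd.batch_hessian(scalar_func, x_row))
# expected values: [[2., 0.], [0., 2.]], one 2x2 Hessian block for the single sample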
