[New features] Support batch_jacobian and batch_hessian (PaddlePaddle#37547)

* native commit for triple grad of sigmoid

* Updated unittests files

* init functional jacobian api

* Updated triple_test func

* Updated gradient_checker & test_script

* finish test with dtype float32

* add float64 test case

* polish code

* use atol=1e-5 with dtype float64

* fix for ci

* set timeout for test_jacobian

* fix dygraph grad to support high differential

* polish API docstring

* Updated gradient checker and some related files

* fix double grad strip error for high differential

* fix double grad strip error for high differential

* Add Sigmoid triple grad tests

* fix dygraph double grad dtype error when calling for high differential scenario

* Updated triple grad tests func

* Use np.random to initialize ddx

* Updated triple_grad_check func

* add todo for gradient checker and refine some comments

* remove additional code

* add test for warning in backward.py

* format python code

* support multi input in triple gradient checker

* Add matmul triple grad kernel

* Updated comments of TODO

* Supported some special tests

* Change code-format to follow CI std

* Updated gradient_checker.py

* Fix conflicts

* Removed unnecessary printing log

* Change code style to follow CI std

* support batch in jacobian and hessian

* add batch jacobian and batch hessian

* Add batch_jacobian test, draft version

* [New features] Add elementwise_mul triple grad kernel (PaddlePaddle#37152)

* Add elementwise_mul triple grad kernel

* Removed InplaceInferer and polished code

* Add numerical_batch_jacobian, numerical_batch_hessian and tests

* Support batch_jacobian and batch_numerical

* Use pre-commit to check code format

* Update doc, polish code, add unit test

* Reset the TIMEOUT properties of test_jacobian to pass CI

Co-authored-by: levi131 <limaolin01@baidu.com>
Co-authored-by: Jiabin Yang <360788950@qq.com>
3 people authored and Zjq9409 committed Dec 10, 2021
1 parent ef68ba5 commit 784c754
Showing 6 changed files with 643 additions and 4 deletions.
3 changes: 2 additions & 1 deletion python/paddle/autograd/__init__.py
@@ -18,6 +18,7 @@
from .py_layer import PyLayer, PyLayerContext # noqa: F401
from ..framework import set_grad_enabled # noqa: F401
from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401
- from .functional import vjp, jvp, jacobian, hessian, vhp # noqa: F401
+ from .functional import jacobian, hessian, batch_jacobian, batch_hessian # noqa: F401
+ from .functional import vjp, jvp, vhp # noqa: F401

__all__ = ['backward', 'PyLayer', 'PyLayerContext']
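The newly exported ``batch_jacobian`` and ``batch_hessian`` treat the first input dimension as a batch axis, while the existing ``jacobian`` flattens every dimension. A minimal orientation sketch, based only on the docstrings added below and not part of the committed diff:

import paddle

x = paddle.ones(shape=(4, 2), dtype='float64')    # batch_size = 4, num = 2
x.stop_gradient = False

def func(x):
    return x * x

full = paddle.autograd.jacobian(func, x)           # all dims flattened: shape [8, 8]
batched = paddle.autograd.batch_jacobian(func, x)  # batch axis kept: shape [2, 8] = [num, batch_size * num]
print(full.shape, batched.shape)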
291 changes: 291 additions & 0 deletions python/paddle/autograd/functional.py
@@ -385,6 +385,297 @@ def func(x, y):
return jacobian


@framework.dygraph_only
def batch_jacobian(func, inputs, create_graph=False, allow_unused=False):
'''
.. note::
    **This API is ONLY available in imperative mode.**
This function computes the batch Jacobian matrix of `func` with respect to `inputs`.
Note that the first dimension of the inputs is the batch size.
Parameters:
    func (function): a Python function that takes a Tensor or a Tensor
        list/tuple as inputs (the first dimension is the batch size) and
        returns a Tensor or a Tensor tuple.
    inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
        Tensor list/tuple of the function ``func``. Note that
        the first dimension of the inputs is the batch size.
    create_graph (bool, optional): whether to create the gradient graphs
        of the computing process. When it is True, higher order derivatives
        can be computed; when it is False, the gradient graphs of the
        computing process are discarded. Defaults to ``False``.
    allow_unused (bool, optional): whether to raise an error or return None if
        some Tensors of `inputs` are unreachable in the graph. An error is
        raised if allow_unused=False, and None is returned as their gradients
        if allow_unused=True. Defaults to ``False``.
Returns:
    Jacobian (Tensor or nested tuple of Tensors): if the function ``func``
        takes a Tensor as inputs and returns a Tensor as outputs, the Jacobian
        will be a single Tensor containing the Jacobian matrix for the
        linearized inputs and outputs. If one of the inputs and outputs is
        a Tensor and the other is a Tensor list/tuple, the Jacobian will
        be a tuple of Tensors. If both inputs and outputs are Tensor
        list/tuples, the Jacobian will be a tuple of tuples of Tensors.
        Note that the first dimension of the inputs is the batch size.
        For example, if the input shape and output shape of the function ``func``
        are [batch_size, num] and [batch_size, num] respectively, the Jacobian
        will be a Tensor with a shape of [num, batch_size * num], where
        ``Jacobian[i][j]`` holds the partial derivative of the ``i``th output
        column with respect to the ``j``th element of the batch-flattened input
        and has the same dtype and device as the corresponding input.
        Other situations can be deduced by analogy.
Examples 1:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x):
    return paddle.matmul(paddle.matmul(x, weight), y)
x.stop_gradient = False
batch_jacobian = paddle.autograd.batch_jacobian(func, x)
print(batch_jacobian)
# Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[4., 4., 4., 4., 4., 4., 4., 4.],
# [4., 4., 4., 4., 4., 4., 4., 4.]])
Examples 2:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x):
    return paddle.matmul(paddle.matmul(x, weight), y), x * x
x.stop_gradient = False
batch_jacobian = paddle.autograd.batch_jacobian(func, x)
print(batch_jacobian)
# (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[4., 4., 4., 4., 4., 4., 4., 4.],
# [4., 4., 4., 4., 4., 4., 4., 4.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]]))
Examples 3:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x, y):
    return x * y
x.stop_gradient = False
y.stop_gradient = False
batch_jacobian = paddle.autograd.batch_jacobian(func, [x, y])
print(batch_jacobian)
# (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[1., 0., 1., 0., 1., 0., 1., 0.],
# [0., 1., 0., 1., 0., 1., 0., 1.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[1., 0., 1., 0., 1., 0., 1., 0.],
# [0., 1., 0., 1., 0., 1., 0., 1.]]))
'''
    inputs = _tensors(inputs, "inputs")
    outputs = _tensors(func(*inputs), "outputs")
    batch_size = inputs[0].shape[0]
    for input in inputs:
        assert input.shape[
            0] == batch_size, "The first dimension of each input must equal the batch size!"
    for output in outputs:
        assert output.shape[
            0] == batch_size, "The first dimension of each output must equal the batch size!"
    fin_size = len(inputs)
    fout_size = len(outputs)
    flat_outputs = tuple(
        reshape(
            output, shape=[batch_size, -1]) for output in outputs)
    jacobian = tuple()
    for i, flat_output in enumerate(flat_outputs):
        jac_i = list([] for _ in range(fin_size))
        for k in range(flat_output.shape[1]):
            row_k = grad(
                flat_output[:, k],
                inputs,
                create_graph=create_graph,
                retain_graph=True,
                allow_unused=allow_unused)
            for j in range(fin_size):
                jac_i[j].append(
                    reshape(
                        row_k[j], shape=[-1])
                    if isinstance(row_k[j], paddle.Tensor) else None)
        jacobian += (tuple(
            _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), )
    if fin_size == 1 and fout_size == 1:
        return jacobian[0][0]
    elif fin_size == 1 and fout_size != 1:
        return tuple(jacobian[i][0] for i in range(fout_size))
    elif fin_size != 1 and fout_size == 1:
        return jacobian[0]
    else:
        return jacobian


@framework.dygraph_only
def batch_hessian(func, inputs, create_graph=False, allow_unused=False):
'''
.. note::
    **This API is ONLY available in imperative mode.**
This function computes the batch Hessian matrix of `func` with respect to `inputs`.
Note that the first dimension of the inputs is the batch size.
Parameters:
    func (function): a Python function that takes a Tensor or a Tensor
        list/tuple as inputs (the first dimension is the batch size) and
        returns a Tensor with shape [batch_size, 1].
    inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
        Tensor list/tuple of the function ``func``. Note that
        the first dimension of the inputs is the batch size.
    create_graph (bool, optional): whether to create the gradient graphs
        of the computing process. When it is True, higher order derivatives
        can be computed; when it is False, the gradient graphs of the
        computing process are discarded. Defaults to ``False``.
    allow_unused (bool, optional): whether to raise an error or return None if
        some Tensors of `inputs` are unreachable in the graph. An error is
        raised if allow_unused=False, and None is returned as their gradients
        if allow_unused=True. Defaults to ``False``.
Returns:
    Hessian (Tensor or a tuple of tuples of Tensors): if the function ``func``
        takes a Tensor as ``inputs``, the Hessian will be a single Tensor containing
        the Hessian matrix for the linearized ``inputs`` Tensor. If the function
        ``func`` takes a Tensor list/tuple as ``inputs``, the Hessian will
        be a tuple of tuples of Tensors. Note that the first dimension of the inputs
        is the batch size, and that the computation first obtains the first order
        derivatives and then differentiates them with respect to the batched inputs.
        For example, if the input shape and output shape of the function ``func``
        are [batch_size, num] and [batch_size, 1] respectively, the batched Hessian
        will be a Tensor with a shape of [num, batch_size * num].
        Why does the result have this shape? ``batch_hessian`` creates an inner
        function (a wrapper around ``paddle.grad``) that computes the sum of the
        gradients of `outputs` with respect to each of the `inputs`; this first order
        derivative has shape [batch_size, num]. ``batch_jacobian`` is then called to
        compute the Jacobian between this first order derivative and the original
        inputs. The final result ``Hessian[i][j]`` will contain the Jacobian matrix
        of the ``i``th column of that first order derivative and the ``j``th input,
        and will have the same dtype and device as the corresponding input.
        Other situations can be deduced by analogy.
Examples 1:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x):
    return paddle.matmul(x * x, weight)[:, 0:1]
x.stop_gradient = False
batch_hessian = paddle.autograd.batch_hessian(func, x)
print(batch_hessian)
# Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]])
Examples 2:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x, y):
    return paddle.matmul(x * x * y * y, weight)[:, 0:1]
x.stop_gradient = False
y.stop_gradient = False
batch_hessian = paddle.autograd.batch_hessian(func, [x, y])
print(batch_hessian)
# ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]]),
# Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[4., 0., 4., 0., 4., 0., 4., 0.],
# [0., 4., 0., 4., 0., 4., 0., 4.]])),
# (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[4., 0., 4., 0., 4., 0., 4., 0.],
# [0., 4., 0., 4., 0., 4., 0., 4.]]),
# Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]])))
Examples 3:
.. code-block:: python
import paddle
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
def func(x, y):
    return paddle.matmul(x * x, weight)[:, 0:1]
x.stop_gradient = False
y.stop_gradient = False
batch_hessian = paddle.autograd.batch_hessian(func, [x, y], allow_unused=True)
print(batch_hessian)
# ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[2., 0., 2., 0., 2., 0., 2., 0.],
# [0., 2., 0., 2., 0., 2., 0., 2.]]), None), (None, None))
'''
    inputs = _tensors(inputs, "inputs")
    outputs = func(*inputs)
    batch_size = inputs[0].shape[0]
    for input in inputs:
        assert input.shape[
            0] == batch_size, "The first dimension of each input must equal the batch size!"
    assert isinstance(outputs, paddle.Tensor) and outputs.shape == [
        batch_size, 1
    ], "The function to compute the batched Hessian matrix should return a Tensor of shape [batch_size, 1]"

    def jac_func(*ins):
        grad_inputs = grad(
            outputs,
            ins,
            create_graph=True,
            retain_graph=True,
            allow_unused=allow_unused)
        return tuple(
            _replace_none_with_zero_tensor(grad_inputs[i], inputs[i])
            for i in range(len(inputs)))

    return batch_jacobian(
        jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused)


@framework.dygraph_only
def hessian(func, inputs, create_graph=False, allow_unused=False):
'''
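A note on the layout returned by ``batch_jacobian``: row ``k`` of the result stacks the gradient of output column ``k``, flattened over the batch. For a function that acts sample-wise, that layout can be folded back into per-sample Jacobians. The sketch below is editorial, not part of the commit; the reshape/transpose steps and the name ``per_sample`` are illustrative assumptions based on the docstring's shape description:

import paddle

# Toy setup mirroring Example 1 of the batch_jacobian docstring.
x = paddle.ones(shape=(4, 2), dtype='float64')
weight = paddle.ones(shape=(2, 4), dtype='float64')
y = paddle.ones(shape=(4, 2), dtype='float64')
x.stop_gradient = False

def func(x):
    return paddle.matmul(paddle.matmul(x, weight), y)

bj = paddle.autograd.batch_jacobian(func, x)   # shape [num_out, batch_size * num_in] == [2, 8]

batch_size, num_in = x.shape                   # 4, 2
num_out = bj.shape[0]                          # 2
# Fold the flattened axis back into (batch, num_in) and move the batch axis first,
# giving one [num_out, num_in] Jacobian per sample (valid when func acts per sample).
per_sample = paddle.transpose(
    paddle.reshape(bj, [num_out, batch_size, num_in]), perm=[1, 0, 2])
print(per_sample.shape)  # [4, 2, 2]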
@@ -6,6 +6,6 @@ foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS})
endforeach(TEST_OP)

- set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20)
+ set_tests_properties(test_jacobian PROPERTIES TIMEOUT 50)
set_tests_properties(test_hessian PROPERTIES TIMEOUT 50)
set_tests_properties(test_vhp PROPERTIES TIMEOUT 50)
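As a sanity check on ``batch_hessian`` (again an editorial sketch, not part of the commit): for the ``func`` of Example 1 in its docstring, each per-sample Hessian is twice the identity, which is what each [num, num] block of the batched result contains. Running the same computation with a single-sample batch makes that visible directly; ``scalar_func`` and ``x_row`` are illustrative names:

import paddle

weight = paddle.ones(shape=(2, 4), dtype='float64')

def scalar_func(x_row):
    # same computation as Example 1 of batch_hessian, for a single sample
    return paddle.matmul(x_row * x_row, weight)[:, 0:1]

x_row = paddle.ones(shape=(1, 2), dtype='float64')   # batch_size == 1
x_row.stop_gradient = False
print(paddle.autograd.batch_hessian(scalar_func, x_row))
# expected values: [[2., 0.], [0., 2.]], one 2x2 Hessian block for the single sample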
