Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMP]split minimize and add unscale_ for GradScaler #35825

Merged
merged 11 commits into from
Sep 22, 2021
99 changes: 92 additions & 7 deletions python/paddle/amp/grad_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,28 @@
# limitations under the License.

from paddle.fluid.dygraph.amp import AmpScaler
from paddle.fluid.dygraph.amp import OptimizerState
from collections import defaultdict

__all__ = []


def _refresh_optimizer_state():
return {"state": OptimizerState.INIT}


class GradScaler(AmpScaler):
"""
GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
It controls the scaling of loss, helps avoiding numerical overflow.
The object of this class has two methods `scale()`, `minimize()`.
The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters.

`scale()` is used to multiply the loss by a scale ratio.
`minimize()` is similar as `optimizer.minimize()`, performs parameters updating.
`unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio)
`minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`.
`step()` is similar as `optimizer.step()`, which performs parameters updating.
`update` is used to update the loss_scaling.


Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
dynamic graph mode.
Expand Down Expand Up @@ -115,7 +125,7 @@ def minimize(self, optimizer, *args, **kwargs):
This function is similar as `optimizer.minimize()`, which performs parameters updating.

If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters.
Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

Finally, the loss scaling ratio is updated.

Expand Down Expand Up @@ -151,16 +161,18 @@ def step(self, optimizer):
This function is similar as `optimizer.step()`, which performs parameters updating.

If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters.
Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

Args:
optimizer(Optimizer): The optimizer used to update parameters.

Examples:

.. code-block:: python

# required: gpu
import paddle

model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
Expand All @@ -170,24 +182,97 @@ def step(self, optimizer):
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.step(optimizer)
scaler.step(optimizer) # update parameters
scaler.update() # update the loss scaling ratio
optimizer.clear_grad()
"""
if not self._enable:
return optimizer.step()

optimizer_state = self._optimizer_states[id(optimizer)]
if optimizer_state["state"] is OptimizerState.STEPPED:
raise RuntimeError(
"step() has already been called since the last update().")

# unscale the grad
self._unscale(optimizer)
if optimizer_state["state"] is OptimizerState.INIT:
self._unscale(optimizer)

if self._found_inf:
self._cache_founf_inf = True
else:
optimizer.step()
self._cache_founf_inf = False

optimizer_state["state"] = OptimizerState.STEPPED

if not self._use_dynamic_loss_scaling:
self._optimizer_states = defaultdict(_refresh_optimizer_state)

def update(self):
"""
Updates the loss_scaling.

Examples:

.. code-block:: python

# required: gpu
import paddle

model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.step(optimizer) # update parameters
scaler.update() # update the loss scaling ratio
optimizer.clear_grad()
"""
if not self._enable:
return
if self._use_dynamic_loss_scaling:
# uopdate the scale
self._update()
self._optimizer_states = defaultdict(_refresh_optimizer_state)
return

def unscale_(self, optimizer):
"""
Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).
If this instance of :class:`GradScaler` is not enabled, output are returned unmodified.

Args:
optimizer(Optimizer): The optimizer used to update parameters.

Returns:
The unscaled parameters or original parameters.

Examples:

.. code-block:: python

# required: gpu
import paddle

model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.unscale_(optimizer) # unscale the parameter
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()
"""
return super(GradScaler, self)._unscale(optimizer)

def is_enable(self):
"""
Expand Down
114 changes: 77 additions & 37 deletions python/paddle/fluid/dygraph/amp/loss_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,20 @@
import warnings
import numpy as np
from paddle import _C_ops
from collections import defaultdict
from enum import Enum

__all__ = ['AmpScaler']
__all__ = ['AmpScaler', 'OptimizerState']


class OptimizerState(Enum):
INIT = 0
UNSCALED = 1
STEPPED = 2


def _refresh_optimizer_state():
return {"state": OptimizerState.INIT}


class AmpScaler(object):
Expand All @@ -31,10 +43,11 @@ class AmpScaler(object):

AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative
mode. It controls the scaling of loss, helps avoiding numerical overflow.
The object of this class has two methods `scale()`, `minimize()`.
The object of this class has seventeen methods `scale()`, `unscale_()`, `minimize()` and `get`/`set` api of parameters.

`scale()` is used to multiply the loss by a scale ratio.
`minimize()` is similar as `Optimizer.minimize()`, performs parameters updating.
`unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio)
`minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling.

Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in
imperative mode.
Expand Down Expand Up @@ -117,6 +130,7 @@ def __init__(self,
self._scale = to_variable(
np.array([self._init_loss_scaling]).astype(np.float32))
self._cache_founf_inf = None
self._optimizer_states = defaultdict(_refresh_optimizer_state)

def scale(self, var):
"""
Expand All @@ -129,24 +143,25 @@ def scale(self, var):
The scaled variable or original variable.

Examples:

.. code-block:: python

import numpy as np
import paddle.fluid as fluid

data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
model = fluid.dygraph.Conv2D(3, 2, 3)
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard():
conv = model(data)
loss = fluid.layers.reduce_mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
import numpy as np
import paddle.fluid as fluid

data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
model = fluid.dygraph.Conv2D(3, 2, 3)
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard():
conv = model(data)
loss = fluid.layers.reduce_mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
"""
check_type(var, "var", core.VarBase, 'AmpScaler.scale()')

Expand All @@ -160,7 +175,7 @@ def minimize(self, optimizer, *args, **kwargs):
This function is similar as `Optimizer.minimize()`, which performs parameters updating.

If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters.
Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

Finally, the loss scaling ratio is updated.

Expand All @@ -170,30 +185,34 @@ def minimize(self, optimizer, *args, **kwargs):
kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`.

Examples:

.. code-block:: python

import numpy as np
import paddle.fluid as fluid

data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
model = fluid.dygraph.Conv2D(3, 2, 3)
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard():
conv = model(data)
loss = fluid.layers.reduce_mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
import numpy as np
import paddle.fluid as fluid

data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
model = fluid.dygraph.Conv2D(3, 2, 3)
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard():
conv = model(data)
loss = fluid.layers.reduce_mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
"""
if not self._enable:
return optimizer.minimize(*args, **kwargs)

optimizer_state = self._optimizer_states[id(optimizer)]

# unscale the grad
self._unscale(optimizer)
if optimizer_state["state"] is OptimizerState.INIT:
self._unscale(optimizer)

optimize_ops, params_grads = (None, None)

Expand All @@ -207,12 +226,31 @@ def minimize(self, optimizer, *args, **kwargs):
# uopdate the scale
self._update()

self._optimizer_states = defaultdict(_refresh_optimizer_state)

return optimize_ops, params_grads

def _unscale(self, optimizer):
"""
Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).
If this instance of :class:`GradScaler` is not enabled, output are returned unmodified.
Args:
optimizer(Optimizer): The optimizer used to update parameters.
Returns:
The unscaled parameters or original parameters.
"""
if not self._enable:
return

optimizer_state = self._optimizer_states[id(optimizer)]

if optimizer_state["state"] is OptimizerState.UNSCALED:
raise RuntimeError(
"unscale_() has already been called on this optimizer since the last update()."
)
elif optimizer_state["state"] is OptimizerState.STEPPED:
raise RuntimeError("unscale_() is being called after step().")

if getattr(optimizer, '_param_groups', None) and isinstance(
optimizer._param_groups[0], dict):
param_grads = []
Expand Down Expand Up @@ -256,6 +294,8 @@ def _unscale(self, optimizer):
temp_found_inf_fp32)
self._found_inf = temp_found_inf_fp16 or temp_found_inf_fp32

optimizer_state["state"] = OptimizerState.UNSCALED

def _update(self):
"""
Updates the loss_scaling.
Expand Down
Loading