[cherry pick] Refine param conversion logic in layer.to #38068

Merged (3 commits) on Dec 13, 2021
7 changes: 6 additions & 1 deletion paddle/fluid/pybind/pybind.cc
@@ -527,8 +527,13 @@ PYBIND11_MODULE(core_noavx, m) {

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("cudnn_version", &platform::CudnnVersion);
m.def("gpu_memory_available", []() {
size_t available = 0;
size_t total = 0;
paddle::platform::GpuMemoryUsage(&available, &total);
return available;
});
#endif

#ifdef PADDLE_WITH_NCCL
m.def("nccl_version", &GetNCCLVersion);
#endif
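
For orientation, the new binding can be exercised from Python. A minimal sketch, assuming a CUDA/HIP build where the compiled module is reachable as paddle.fluid.core (the same alias the layers.py change below uses):

import paddle
from paddle.fluid import core

# The binding is only registered when Paddle is built with CUDA or HIP.
if paddle.is_compiled_with_cuda():
    # Available GPU memory in bytes, as reported by platform::GpuMemoryUsage.
    print("available GPU memory:", core.gpu_memory_available())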
46 changes: 3 additions & 43 deletions python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -119,10 +119,7 @@ def _in_amp_guard():


@dygraph_only
def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
    if not enable_pure_fp16:
        return models, optimizers

def pure_fp16_initialize(models):
    for idx in range(len(models)):
        for layer in models[idx].sublayers(include_self=True):
            layer._casted_by_pure_fp16 = True
@@ -132,43 +129,7 @@ def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
                            paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
                continue
            layer.to(dtype='float16')

    for idx_opt in range(len(optimizers)):
        # update _param_groups
        if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
                optimizers[idx_opt]._param_groups[0], dict):
            for param_group in optimizers[idx_opt]._param_groups:
                for i, param in enumerate(param_group['params']):
                    for idx_model in range(len(models)):
                        for layer in models[idx_model].sublayers(
                                include_self=True):
                            if id(param) in layer._parameters_transform_map:
                                param_group['params'][
                                    i] = layer._parameters_transform_map[id(
                                        param)][0]
            for param_group in optimizers[idx_opt]._parameter_list:
                params = param_group['params']
                for i, param in enumerate(params):
                    for idx_model in range(len(models)):
                        for layer in models[idx_model].sublayers(
                                include_self=True):
                            if id(param) in layer._parameters_transform_map:
                                params[i] = layer._parameters_transform_map[id(
                                    param)][0]
        # update _parameter_list
        else:
            for i, param in enumerate(optimizers[idx_opt]._parameter_list):
                for idx_model in range(len(models)):
                    for layer in models[idx_model].sublayers(include_self=True):
                        if id(param) in layer._parameters_transform_map:
                            optimizers[idx_opt]._parameter_list[
                                i] = layer._parameters_transform_map[id(param)][
                                    0]
                            if hasattr(optimizers[idx_opt], '_param_groups'):
                                optimizers[idx_opt]._param_groups[
                                    i] = layer._parameters_transform_map[id(
                                        param)][0]
    return models, optimizers
    return models


def check_models(models):
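
With the optimizer-patching loops removed, the initializer only walks the sublayers and casts them in place. A rough usage sketch, assuming a float16-capable install; the Sequential/Linear/BatchNorm model below is illustrative only and not part of the patch:

import paddle
from paddle.fluid.dygraph.amp.auto_cast import pure_fp16_initialize

model = paddle.nn.Sequential(
    paddle.nn.Linear(16, 16),
    paddle.nn.BatchNorm(16))

# New signature: only the models list is passed; optimizers no longer need
# their parameter references rewritten, because layer.to now converts
# parameters in place (see the layers.py diff below).
[model] = pure_fp16_initialize(models=[model])

# The Linear weights are expected to be float16, while BatchNorm (and
# LayerNorm) parameters are skipped and stay float32.
print(model[0].weight.dtype, model[1].weight.dtype)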
@@ -397,8 +358,7 @@ def amp_decorate(models,
"optimizers must be either a single optimizer or a list of optimizers."
)

models, optimizers = pure_fp16_initialize(
enable_pure_fp16=True, models=models, optimizers=optimizers)
models = pure_fp16_initialize(models=models)

# supprot master_weight
for idx_opt in range(len(optimizers)):
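
At the user-facing level this code path is reached through AMP decoration. A hedged sketch using the public paddle.amp.decorate API; the model and optimizer are illustrative:

import paddle

model = paddle.nn.Linear(16, 16)
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                 parameters=model.parameters())

# level='O2' (pure fp16) goes through amp_decorate, which now calls
# pure_fp16_initialize(models=...) without touching the optimizer's
# parameter list.
model, optimizer = paddle.amp.decorate(
    models=model, optimizers=optimizer, level='O2')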
73 changes: 45 additions & 28 deletions python/paddle/fluid/dygraph/layers.py
@@ -121,9 +121,6 @@ def __init__(self, name_scope=None, dtype="float32"):
        self._forward_pre_hooks = collections.OrderedDict()
        self._forward_post_hooks = collections.OrderedDict()

        self._parameters_transform_map = {}
        self._buffers_transform_map = {}

        self._casted_by_pure_fp16 = False

        self._state_dict_hooks = collections.OrderedDict()
@@ -1473,24 +1470,14 @@ def _apply(self, func, device, dtype, blocking):
            if param is not None:
                with no_grad():
                    param_applied = func(param, device, dtype, blocking)
                    assert param.is_leaf
                    param_applied.stop_gradient = param.stop_gradient
                    self._parameters[key] = param_applied

                if param.grad is not None:
                    with no_grad():
                        grad_applied = func(param._grad_ivar(), device, dtype,
                                            blocking)

                        grad_applied.stop_gradient = param._grad_ivar(
                        ).stop_gradient
                        self._parameters[key]._set_grad_ivar(grad_applied)

                self._parameters_transform_map[id(param)] = [param_applied, key]

        for key, buf in self._buffers.items():
            self._buffers[key] = func(buf, device, dtype, blocking)
            self._buffers_transform_map[id(buf)] = [self._buffers[key], key]

    def to(self, device=None, dtype=None, blocking=None):
        '''
@@ -1568,24 +1555,54 @@ def transform(t, device, dtype, blocking):
            if dtype is None:
                dtype = t.dtype

            new_t = t._copy_to(device, blocking)
            if isinstance(t, framework.ParamBase):
                if dtype is not None and dtype != t.dtype:
                    framework._dygraph_tracer().trace_op(
                        type='cast',
                        inputs={'X': new_t},
                        outputs={'Out': new_t},
                        attrs={
                            'in_dtype': t.dtype,
                            'out_dtype': convert_np_dtype_to_dtype_(dtype)
                        })
            if type(dtype) is str:
                dtype = convert_np_dtype_to_dtype_(dtype)

            # 1. gpu place needs to determine whether the memory is sufficient for allocation:
            if t.place.is_gpu_place():
                # for gpu, minimum memory allocation unit is 256 bytes.
                size_dtype = core.size_of_dtype(dtype)
                # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes; waiting_alloc_memory computes the memory space occupied by 't'.
                # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
                waiting_alloc_memory = (
                    (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
                gpu_memory_available = core.gpu_memory_available()
                if gpu_memory_available < waiting_alloc_memory:
                    # Copy param / Tensor to cpu
                    t_used = t._copy_to(paddle.CPUPlace(),
                                        blocking)  # k-v type will error
                    # Release mem of t
                    t.value().get_tensor()._clear()
                else:
                    t_used = t
            else:
                t_used = t

            # 2. cast param / Tensor to dtype
            if dtype is not None and dtype != t_used.dtype:
                with paddle.fluid.framework._dygraph_place_guard(
                        place=t_used.place):
                    t_casted = t_used.cast(dtype=dtype)
            else:
                t_casted = t_used

            # 3. Copy casted cpu param / Tensor to device
            if device is not None and not t_casted.place._equals(device):
                new_t = t_casted._copy_to(device, blocking)
            else:
                if dtype is not None and dtype != t.dtype:
                    new_t = new_t.cast(dtype=dtype)
                new_t = t_casted

            # 4. share the converted Tensor back into the origin param / Tensor
            dst_tensor = t.value().get_tensor()
            src_tensor = new_t.value().get_tensor()
            dst_tensor._share_data_with(src_tensor)

            return t

            return new_t
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            self._apply(transform, device, dtype, blocking)

        self._apply(transform, device, dtype, blocking)
        self._dtype = dtype

    # [aliases] Compatible with old method names
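
The practical effect of step 4 is that Layer.to now converts parameters in place instead of replacing them, which is why the transform maps and the optimizer patching above could be removed. A small sketch, assuming a float16-capable Paddle build; the Linear layer and SGD optimizer are illustrative:

import paddle

linear = paddle.nn.Linear(8, 8)
opt = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=linear.parameters())

ids_before = [id(p) for p in linear.parameters()]

# Cast in place: the original ParamBase objects are kept and only their
# underlying tensors are converted and re-shared via _share_data_with.
linear.to(dtype='float16')

ids_after = [id(p) for p in linear.parameters()]
assert ids_before == ids_after
# Since parameter identity is unchanged, the optimizer's parameter list
# still refers to valid, converted parameters.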