diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 7089ddffa7ceb..3f85c637d7ff3 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -527,8 +527,13 @@ PYBIND11_MODULE(core_noavx, m) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("cudnn_version", &platform::CudnnVersion);
+  m.def("gpu_memory_available", []() {
+    size_t available = 0;
+    size_t total = 0;
+    paddle::platform::GpuMemoryUsage(&available, &total);
+    return available;
+  });
 #endif
-
 #ifdef PADDLE_WITH_NCCL
   m.def("nccl_version", &GetNCCLVersion);
 #endif
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index d218e6b7490d9..d1fcef4f71d0e 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -119,10 +119,7 @@ def _in_amp_guard():
 
 
 @dygraph_only
-def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
-    if not enable_pure_fp16:
-        return models, optimizers
-
+def pure_fp16_initialize(models):
     for idx in range(len(models)):
         for layer in models[idx].sublayers(include_self=True):
             layer._casted_by_pure_fp16 = True
@@ -132,43 +129,7 @@ def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
                     paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
                 continue
             layer.to(dtype='float16')
-
-    for idx_opt in range(len(optimizers)):
-        # update _param_groups
-        if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
-                optimizers[idx_opt]._param_groups[0], dict):
-            for param_group in optimizers[idx_opt]._param_groups:
-                for i, param in enumerate(param_group['params']):
-                    for idx_model in range(len(models)):
-                        for layer in models[idx_model].sublayers(
-                                include_self=True):
-                            if id(param) in layer._parameters_transform_map:
-                                param_group['params'][
-                                    i] = layer._parameters_transform_map[id(
-                                        param)][0]
-            for param_group in optimizers[idx_opt]._parameter_list:
-                params = param_group['params']
-                for i, param in enumerate(params):
-                    for idx_model in range(len(models)):
-                        for layer in models[idx_model].sublayers(
-                                include_self=True):
-                            if id(param) in layer._parameters_transform_map:
-                                params[i] = layer._parameters_transform_map[id(
-                                    param)][0]
-        # update _parameter_list
-        else:
-            for i, param in enumerate(optimizers[idx_opt]._parameter_list):
-                for idx_model in range(len(models)):
-                    for layer in models[idx_model].sublayers(include_self=True):
-                        if id(param) in layer._parameters_transform_map:
-                            optimizers[idx_opt]._parameter_list[
-                                i] = layer._parameters_transform_map[id(param)][
-                                    0]
-                            if hasattr(optimizers[idx_opt], '_param_groups'):
-                                optimizers[idx_opt]._param_groups[
-                                    i] = layer._parameters_transform_map[id(
-                                        param)][0]
-    return models, optimizers
+    return models
 
 
 def check_models(models):
@@ -397,8 +358,7 @@ def amp_decorate(models,
             "optimizers must be either a single optimizer or a list of optimizers."
         )
 
-    models, optimizers = pure_fp16_initialize(
-        enable_pure_fp16=True, models=models, optimizers=optimizers)
+    models = pure_fp16_initialize(models=models)
 
     # supprot master_weight
     for idx_opt in range(len(optimizers)):
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 65fa9bc5a679b..b40ebb92534b9 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -121,9 +121,6 @@ def __init__(self, name_scope=None, dtype="float32"):
         self._forward_pre_hooks = collections.OrderedDict()
         self._forward_post_hooks = collections.OrderedDict()
 
-        self._parameters_transform_map = {}
-        self._buffers_transform_map = {}
-
         self._casted_by_pure_fp16 = False
 
         self._state_dict_hooks = collections.OrderedDict()
@@ -1473,24 +1470,14 @@ def _apply(self, func, device, dtype, blocking):
             if param is not None:
                 with no_grad():
                     param_applied = func(param, device, dtype, blocking)
-                    assert param.is_leaf
-                    param_applied.stop_gradient = param.stop_gradient
-                    self._parameters[key] = param_applied
 
                 if param.grad is not None:
                     with no_grad():
                         grad_applied = func(param._grad_ivar(), device, dtype,
                                             blocking)
-                        grad_applied.stop_gradient = param._grad_ivar(
-                        ).stop_gradient
-                        self._parameters[key]._set_grad_ivar(grad_applied)
-
-                self._parameters_transform_map[id(param)] = [param_applied, key]
-
         for key, buf in self._buffers.items():
             self._buffers[key] = func(buf, device, dtype, blocking)
-            self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
 
     def to(self, device=None, dtype=None, blocking=None):
         '''
@@ -1568,24 +1555,54 @@ def transform(t, device, dtype, blocking):
             if dtype is None:
                 dtype = t.dtype
 
-            new_t = t._copy_to(device, blocking)
-            if isinstance(t, framework.ParamBase):
-                if dtype is not None and dtype != t.dtype:
-                    framework._dygraph_tracer().trace_op(
-                        type='cast',
-                        inputs={'X': new_t},
-                        outputs={'Out': new_t},
-                        attrs={
-                            'in_dtype': t.dtype,
-                            'out_dtype': convert_np_dtype_to_dtype_(dtype)
-                        })
+            if type(dtype) is str:
+                dtype = convert_np_dtype_to_dtype_(dtype)
+
+            # 1. For gpu place, determine whether the memory is sufficient for the allocation:
+            if t.place.is_gpu_place():
+                # for gpu, the minimum memory allocation unit is 256 bytes.
+                size_dtype = core.size_of_dtype(dtype)
+                # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes; waiting_alloc_memory estimates the memory space occupied by 't'.
+                # Coefficient 1.2 is used to avoid OOM that may occur when the available memory is only just enough.
+                waiting_alloc_memory = (
+                    (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
+                gpu_memory_available = core.gpu_memory_available()
+                if gpu_memory_available < waiting_alloc_memory:
+                    # Copy param / Tensor to cpu
+                    t_used = t._copy_to(paddle.CPUPlace(),
+                                        blocking)  # k-v type will error
+                    # Release mem of t
+                    t.value().get_tensor()._clear()
+                else:
+                    t_used = t
+            else:
+                t_used = t
+
+            # 2. Cast param / Tensor to dtype
+            if dtype is not None and dtype != t_used.dtype:
+                with paddle.fluid.framework._dygraph_place_guard(
+                        place=t_used.place):
+                    t_casted = t_used.cast(dtype=dtype)
+            else:
+                t_casted = t_used
+
+            # 3. Copy the casted cpu param / Tensor to the device
+            if device is not None and not t_casted.place._equals(device):
+                new_t = t_casted._copy_to(device, blocking)
             else:
-                if dtype is not None and dtype != t.dtype:
-                    new_t = new_t.cast(dtype=dtype)
+                new_t = t_casted
+
+            # 4. Share the new Tensor back into the original param / Tensor
+            dst_tensor = t.value().get_tensor()
+            src_tensor = new_t.value().get_tensor()
+            dst_tensor._share_data_with(src_tensor)
+
+            return t
 
-            return new_t
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=UserWarning)
+            self._apply(transform, device, dtype, blocking)
 
-        self._apply(transform, device, dtype, blocking)
         self._dtype = dtype
 
     # [aliases] Compatible with old method names
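For reference, a worked example of the `waiting_alloc_memory` estimate used in `transform()` above. This is illustrative only; the tensor shape and dtype size are made-up inputs, not values from the diff.

```python
import numpy as np

# Illustrative arithmetic for the memory check in Layer.to()'s transform():
# a float32 tensor of shape [1024, 1024] occupies 1024 * 1024 * 4 bytes.
shape = [1024, 1024]
size_dtype = 4  # bytes per float32 element

# Round the size up to the next 256-byte allocation unit, then apply the
# 1.2 headroom factor before comparing against core.gpu_memory_available().
waiting_alloc_memory = ((np.prod(shape) * size_dtype) / 256 + 1) * 256 * 1.2
print(waiting_alloc_memory)  # 5033472.0 bytes, roughly 4.8 MB
```

If the reported available GPU memory is below this threshold, `transform()` stages the tensor on the CPU, releases the original GPU storage, and only copies the casted result back to the device.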
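A minimal usage sketch of the code path touched by this diff, assuming a CUDA build of Paddle; the model and optimizer below are hypothetical and only serve to exercise `paddle.amp.decorate`, which calls `amp_decorate` and, in turn, the new `pure_fp16_initialize(models=models)` that casts layers in place through `Layer.to(dtype='float16')`.

```python
import paddle

# Hypothetical example model/optimizer, not part of the diff.
model = paddle.nn.Sequential(paddle.nn.Linear(8, 8), paddle.nn.LayerNorm(8))
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                 parameters=model.parameters())

# Pure fp16 (O2) decoration: Linear weights are cast to float16 in place via
# Layer.to(dtype='float16'); LayerNorm / BatchNorm layers are skipped and
# stay float32, as in pure_fp16_initialize above.
model, optimizer = paddle.amp.decorate(
    models=model, optimizers=optimizer, level='O2')

for param in model.parameters():
    print(param.name, param.dtype)
```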