From 4821ce21f08f69d0a146c2c82d938fb24ac024f4 Mon Sep 17 00:00:00 2001 From: SylarTiaNII <15840554235@163.com> Date: Tue, 7 May 2024 15:55:43 +0800 Subject: [PATCH] [Trainer] remove redundant memory metrics and set enable as default --- paddlenlp/trainer/trainer.py | 52 ++++++++++++++------------------ scripts/distribute/ci_case_dy.sh | 2 +- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 1582fa3fd605..def303ecddf1 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -101,7 +101,7 @@ SAFE_PEFT_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME, ) -from ..utils.import_utils import is_datasets_available, is_paddle_cuda_available +from ..utils.import_utils import is_datasets_available from ..utils.log import logger from .argparser import strtobool from .integrations import get_reporting_integration_callbacks @@ -1259,19 +1259,27 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate())) logs["global_step"] = int(self.state.global_step) - divisor = 2**30 - # TODO(@gexiao): replace these codes with unified APIs in Paddle - current_device = framework._current_expected_place_() - if str(current_device) != "Place(cpu)": - device_id = current_device.get_device_id() - current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id) - current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id) - max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id) - max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id) - logs["current_memory_allocated"] = current_memory_allocated / divisor - logs["current_memory_reserved"] = current_memory_reserved / divisor - logs["max_memory_allocated"] = max_memory_allocated / divisor - logs["max_memory_reserved"] = max_memory_reserved / divisor + # Add additional memory in log. + if not self.args.skip_memory_metrics: + shift_bits_for_MB = 20 + logs.update( + { + "cpu_mem_used": self._memory_tracker.cpu_mem_used() >> shift_bits_for_MB, + "cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> shift_bits_for_MB, + } + ) + # TODO(@gexiao): replace these codes with unified APIs in Paddle + current_device = framework._current_expected_place_() + if str(current_device) != "Place(cpu)": + device_id = current_device.get_device_id() + current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id) + current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id) + max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id) + max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id) + logs["current_memory_allocated"] = current_memory_allocated >> shift_bits_for_MB + logs["current_memory_reserved"] = current_memory_reserved >> shift_bits_for_MB + logs["max_memory_allocated"] = max_memory_allocated >> shift_bits_for_MB + logs["max_memory_reserved"] = max_memory_reserved >> shift_bits_for_MB total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size @@ -1294,22 +1302,6 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, self._globalstep_last_logged = self.state.global_step self._globalstep_last_start_time = time.time() - # Add additional memory in log. - if not self.args.skip_memory_metrics: - logs.update( - { - "cpu_mem_used": self._memory_tracker.cpu_mem_used() >> 20, - "cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> 20, - } - ) - if is_paddle_cuda_available(): - logs.update( - { - "gpu_max_memory_allocated": paddle.device.cuda.max_memory_allocated() >> 20, - "gpu_max_memory_reserved": paddle.device.cuda.max_memory_reserved() >> 20, - } - ) - self.log(logs, **kwargs) metrics = None diff --git a/scripts/distribute/ci_case_dy.sh b/scripts/distribute/ci_case_dy.sh index 4e9697f26403..b57ae050c4ba 100644 --- a/scripts/distribute/ci_case_dy.sh +++ b/scripts/distribute/ci_case_dy.sh @@ -451,7 +451,7 @@ function llm_gpt_recompute_bs32_bf16_MP2-SD4-stage1() { >>${log_path}/$FUNCNAME 2>&1 loss=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'` - mem=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'gpu_max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'` + mem=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'` echo "result: loss=$loss ips=$ips mem=$mem" loss_base=8.93362617 ips_base=64.75564390065037