[Trainer] remove redundant memory metrics and set enable as default
SylarTiaNII committed May 8, 2024
1 parent 09a0ce7 commit 4821ce2
Showing 2 changed files with 23 additions and 31 deletions.
52 changes: 22 additions & 30 deletions paddlenlp/trainer/trainer.py
@@ -101,7 +101,7 @@
     SAFE_PEFT_WEIGHTS_INDEX_NAME,
     SAFE_WEIGHTS_INDEX_NAME,
 )
-from ..utils.import_utils import is_datasets_available, is_paddle_cuda_available
+from ..utils.import_utils import is_datasets_available
 from ..utils.log import logger
 from .argparser import strtobool
 from .integrations import get_reporting_integration_callbacks
@@ -1259,19 +1259,27 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
             logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate()))
             logs["global_step"] = int(self.state.global_step)

-            divisor = 2**30
-            # TODO(@gexiao): replace these codes with unified APIs in Paddle
-            current_device = framework._current_expected_place_()
-            if str(current_device) != "Place(cpu)":
-                device_id = current_device.get_device_id()
-                current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id)
-                current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id)
-                max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id)
-                max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id)
-                logs["current_memory_allocated"] = current_memory_allocated / divisor
-                logs["current_memory_reserved"] = current_memory_reserved / divisor
-                logs["max_memory_allocated"] = max_memory_allocated / divisor
-                logs["max_memory_reserved"] = max_memory_reserved / divisor
+            # Add additional memory in log.
+            if not self.args.skip_memory_metrics:
+                shift_bits_for_MB = 20
+                logs.update(
+                    {
+                        "cpu_mem_used": self._memory_tracker.cpu_mem_used() >> shift_bits_for_MB,
+                        "cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> shift_bits_for_MB,
+                    }
+                )
+                # TODO(@gexiao): replace these codes with unified APIs in Paddle
+                current_device = framework._current_expected_place_()
+                if str(current_device) != "Place(cpu)":
+                    device_id = current_device.get_device_id()
+                    current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id)
+                    current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id)
+                    max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id)
+                    max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id)
+                    logs["current_memory_allocated"] = current_memory_allocated >> shift_bits_for_MB
+                    logs["current_memory_reserved"] = current_memory_reserved >> shift_bits_for_MB
+                    logs["max_memory_allocated"] = max_memory_allocated >> shift_bits_for_MB
+                    logs["max_memory_reserved"] = max_memory_reserved >> shift_bits_for_MB


             total_train_batch_size = (
                 self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size
@@ -1294,22 +1302,6 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
             self._globalstep_last_logged = self.state.global_step
             self._globalstep_last_start_time = time.time()

-            # Add additional memory in log.
-            if not self.args.skip_memory_metrics:
-                logs.update(
-                    {
-                        "cpu_mem_used": self._memory_tracker.cpu_mem_used() >> 20,
-                        "cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> 20,
-                    }
-                )
-                if is_paddle_cuda_available():
-                    logs.update(
-                        {
-                            "gpu_max_memory_allocated": paddle.device.cuda.max_memory_allocated() >> 20,
-                            "gpu_max_memory_reserved": paddle.device.cuda.max_memory_reserved() >> 20,
-                        }
-                    )
-
             self.log(logs, **kwargs)

         metrics = None
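A note for readers comparing the removed and added blocks above (commentary, not part of the commit): the old code reported device memory in GiB by dividing byte counts by 2**30 and only logged it on non-CPU places, while the new code logs memory whenever args.skip_memory_metrics is off, reports every value in MiB via a 20-bit right shift, and drops the separate gpu_max_memory_* keys from the second removed block. A minimal sketch of the unit conversion, using made-up byte counts:

    # Sketch only: illustrates the MiB conversion used by the new logging code.
    # The byte counts below are hypothetical, not taken from any real run.
    shift_bits_for_MB = 20                      # 1 MiB == 2**20 bytes
    current_memory_allocated = 3_221_225_472    # 3 GiB in bytes (made up)
    max_memory_reserved = 4_294_967_296         # 4 GiB in bytes (made up)

    logs = {
        "current_memory_allocated": current_memory_allocated >> shift_bits_for_MB,  # 3072 (MiB)
        "max_memory_reserved": max_memory_reserved >> shift_bits_for_MB,            # 4096 (MiB)
    }
    # The removed code divided by 2**30 instead, reporting fractional GiB:
    # 3_221_225_472 / 2**30 == 3.0

Since the right shift truncates, the logged values are whole MiB counts rather than fractional GiB.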
2 changes: 1 addition & 1 deletion scripts/distribute/ci_case_dy.sh
@@ -451,7 +451,7 @@ function llm_gpt_recompute_bs32_bf16_MP2-SD4-stage1() {
         >>${log_path}/$FUNCNAME 2>&1
     loss=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
     ips=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'`
-    mem=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'gpu_max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+    mem=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
     echo "result: loss=$loss ips=$ips mem=$mem"
     loss_base=8.93362617
     ips_base=64.75564390065037
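To make the CI change concrete (again commentary, not part of the commit): the awk pipeline splits the step-30 worker log line on the metric name and keeps the number that follows, so it has to match the renamed key max_memory_reserved rather than the removed gpu_max_memory_reserved. A rough Python equivalent of that extraction, against a hypothetical log line written in the style of the new keys:

    import re

    # Hypothetical log line; the numbers are invented for illustration.
    line = "global_step: 30, loss: 8.93, interval_samples_per_second: 64.75, max_memory_reserved: 23456"

    match = re.search(r"max_memory_reserved: ([^,]+)", line)
    mem = match.group(1) if match else None
    print(mem)  # 23456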
