import subprocess
import time


def _run_safe_cmd(self, cmd, redirect_stderr=False, retry_times=5):
    """
    Run a filesystem command as an argument vector, without a shell.

    The command is executed with ``subprocess.run`` on a list of
    arguments, so no part of ``cmd`` (for example a user-supplied path)
    is interpreted by a shell.

    Args:
        cmd (str|list[str]): Arguments appended after ``self._base_cmd``.
            A string is split on whitespace; a list/tuple is used as-is,
            which keeps arguments containing spaces intact
            (e.g. ``["-ls", fs_path]``).
        redirect_stderr (bool): When True, stderr is merged into the
            captured output; otherwise stderr is captured and discarded.
        retry_times (int): Number of additional attempts made after a
            non-zero exit status.

    Returns:
        tuple[int, list[str]]: The exit status of the last attempt and the
        captured output split into lines. The status is 127 when the
        command could not be started at all.

    Raises:
        FSShellCmdAborted: If the final exit status is 134.
    """
    base_cmd = self._base_cmd
    # NOTE(review): assumes a string `_base_cmd` is a whitespace-separated
    # command prefix (executable plus options), mirroring how a string
    # `cmd` is tokenized below -- confirm against where it is assigned.
    if isinstance(base_cmd, str):
        base_args = base_cmd.split()
    else:
        base_args = list(base_cmd)
    cmd_args = cmd.split() if isinstance(cmd, str) else list(cmd)
    exe_cmd = base_args + cmd_args

    ret = 0
    output = ""
    retry_sleep_second = 3
    for attempt in range(retry_times + 1):
        try:
            process = subprocess.run(
                exe_cmd,
                check=False,
                stdout=subprocess.PIPE,
                stderr=(
                    subprocess.STDOUT if redirect_stderr else subprocess.PIPE
                ),
                text=True,
            )
        except (OSError, ValueError) as err:
            # The command could not be launched (missing executable,
            # permission problem, invalid argument) or its output could
            # not be decoded. Retrying cannot help, and reporting success
            # here would hide the failure from the caller.
            ret = 127
            output = str(err)
            break

        # Always take the status of the latest attempt so that a
        # successful retry is not reported with an earlier failure code.
        ret = process.returncode
        output = process.stdout or ""
        if ret == 0:
            break
        # Only wait when another attempt will actually follow.
        if attempt < retry_times:
            time.sleep(retry_sleep_second)

    if ret == 134:
        raise FSShellCmdAborted(cmd)

    return ret, output.splitlines()