From e52d89e3f5da3f92f4345b31a96dcd234a6a7395 Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Sun, 9 Oct 2022 19:39:32 +0800 Subject: [PATCH 1/6] trigger CI From 75f9cf00544a724b8104b98ab87b334866d72575 Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 16:39:35 +0800 Subject: [PATCH 2/6] fix cache dir in job watcher --- .dev_scripts/job_watcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev_scripts/job_watcher.py b/.dev_scripts/job_watcher.py index eaf4d3cfba..2ceaff5e1c 100644 --- a/.dev_scripts/job_watcher.py +++ b/.dev_scripts/job_watcher.py @@ -9,7 +9,7 @@ from pygments.util import ClassNotFound from simple_term_menu import TerminalMenu -CACHE_DIR = '~/.task_watcher' +CACHE_DIR = osp.join(osp.expanduser('~'), '.task_watcher') def show_job_out(name, root, job_name_list): From c4e0761e4cfd8bc0df200f4d6336bbe45908ec23 Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 16:43:55 +0800 Subject: [PATCH 3/6] support rerun failure or canceled jobs in train benchmark --- .dev_scripts/train_benchmark.py | 26 +++++-- .dev_scripts/utils.py | 118 ++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 5 deletions(-) create mode 100644 .dev_scripts/utils.py diff --git a/.dev_scripts/train_benchmark.py b/.dev_scripts/train_benchmark.py index a7acc7e759..556e79f4bd 100644 --- a/.dev_scripts/train_benchmark.py +++ b/.dev_scripts/train_benchmark.py @@ -13,6 +13,7 @@ from rich.syntax import Syntax from rich.table import Table from tqdm import tqdm +from utils import filter_jobs, parse_job_list_from_file console = Console() MMEDIT_ROOT = Path(__file__).absolute().parents[1] @@ -91,6 +92,10 @@ def parse_args(): parser.add_argument('--skip', type=str, default=None) parser.add_argument('--skip-list', default=None) parser.add_argument('--rerun', type=str, default=None) + parser.add_argument( + '--rerun-fail', action='store_true', help='only rerun failed tasks') + parser.add_argument( + '--rerun-cancel', 
action='store_true', help='only rerun cancel tasks') parser.add_argument('--rerun-list', default=None) parser.add_argument('--gpus-per-job', type=int, default=None) parser.add_argument( @@ -145,11 +150,22 @@ def parse_args(): args.skip_list = skip_list print('skip_list: ', args.skip_list) elif args.rerun is not None: - with open(args.rerun, 'r') as fp: - rerun_list = fp.readlines() - rerun_list = [j.split('\n')[0] for j in rerun_list] - args.rerun_list = rerun_list - print('rerun_list: ', args.rerun_list) + job_id_list_full, job_name_list_full = parse_job_list_from_file( + args.rerun) + filter_target = [] + + if args.rerun_fail: + filter_target += ['FAILED'] + if args.rerun_cancel: + filter_target += ['CANCELLED'] + + _, job_name_list = filter_jobs( + job_id_list_full, + job_name_list_full, + filter_target, + show_table=True, + table_name='Rerun List') + args.rerun_list = job_name_list return args diff --git a/.dev_scripts/utils.py b/.dev_scripts/utils.py new file mode 100644 index 0000000000..42198a5475 --- /dev/null +++ b/.dev_scripts/utils.py @@ -0,0 +1,118 @@ +import os +import os.path as osp +from typing import Tuple + +from rich import print as pprint +from rich.table import Table + + +def parse_job_list(job_list) -> Tuple[list, list]: + """Parse task name and job id from list. All elements in `job_list` must. + + be formatted as `JOBID @ JOBNAME`. + + Args: + job_list (list[str]): Job list. + + Returns: + Tuple[list, list]: Job ID list and Job name list. + """ + assert all([ + ' @ ' in job for job in job_list + ]), ('Each line of job list must be formatted like \'JOBID @ JOBNAME\'.') + job_id_list, job_name_list = [], [] + for job_info in job_list: + job_id, job_name = job_info.split(' @ ') + job_id_list.append(job_id) + job_name_list.append(job_name) + return job_id_list, job_name_list + + +def parse_job_list_from_file(job_list_file: str) -> Tuple[list, list]: + """Parse job list from file and return a tuple contains list of job id and + job name. 
+ + Args: + job_list_file (str): The path to the file list. + + Returns: + Tuple[list, list]: A tuple contains list of job id and job name. + """ + if not osp.exists(job_list_file): + return False + with open(job_list_file, 'r') as file: + job_list = [job.strip() for job in file.readlines()] + return parse_job_list(job_list) + + +def get_info_from_id(job_id: str) -> dict: + """Get the basic information of a job id with `swatch examine` command. + + Args: + job_id (str): The ID of the job. + + Returns: + dict: A dict contains information of the corresponding job id. + """ + # NOTE: do not have exception handling here + info_stream = os.popen(f'swatch examine {job_id}') + info_str = [line.strip() for line in info_stream.readlines()] + status_info = info_str[2].split() + try: + status_dict = { + 'JobID': status_info[0], + 'JobName': status_info[1], + 'Partition': status_info[2], + 'NNodes': status_info[3], + 'AllocCPUS': status_info[4], + 'State': status_info[5] + } + except Exception: + print(job_id) + print(info_str) + return status_dict + + +def filter_jobs(job_id_list: list, + job_name_list: list, + select: list = ['FAILED'], + show_table: bool = False, + table_name: str = 'Filter Results') -> Tuple[list, list]: + """Filter the job which status not belong to :attr:`select`. + + Args: + job_id_list (list): The list of job ids. + job_name_list (list): The list of job names. + select (list, optional): Which kind of jobs will be selected. + Defaults to ['FAILED']. + show_table (bool, optional): Whether display the filter result in a + table. Defaults to False. + table_name (str, optional): The name of the table. Defaults to + 'Filter Results'. + + Returns: + Tuple[list]: A tuple contains selected job ids and job names. 
+ """ + # if ignore is not passed, return the original id list and name list + if not select: + return job_id_list, job_name_list + filtered_id_list, filtered_name_list = [], [] + job_info_list = [] + for id_, name_ in zip(job_id_list, job_name_list): + info = get_info_from_id(id_) + job_info_list.append(info) + if info['State'] in select: + filtered_id_list.append(id_) + filtered_name_list.append(name_) + + if show_table: + filter_table = Table(title=table_name) + for field in ['Name', 'ID', 'State', 'Is Selected']: + filter_table.add_column(field) + for id_, name_, info_ in zip(job_id_list, job_name_list, + job_info_list): + selected = '[green]True' \ + if info_['State'] in select else '[red]False' + filter_table.add_row(name_, id_, info_['State'], selected) + pprint(filter_table) + return filtered_id_list, filtered_name_list From b16b6bd459e48a71fe79e3b15cd6f4fb8c52fc7d Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 16:55:08 +0800 Subject: [PATCH 4/6] add use case and readme for rerun --- .dev_scripts/README.md | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/.dev_scripts/README.md b/.dev_scripts/README.md index 2a7e43e6a2..891ed92a4e 100644 --- a/.dev_scripts/README.md +++ b/.dev_scripts/README.md @@ -184,7 +184,34 @@ python .dev_scripts/train_benchmark.py mm_lol \ Specifically, you need to enable `--skip`, and specify the list of models to skip by `--skip-list` -## Automatically check links +## 7. Rerun failed or canceled jobs + +If you want to rerun failed or canceled jobs in the last run, you can combine the `--rerun` flag with the `--rerun-fail` and `--rerun-cancel` flags. + +For example, the log file of the last run is `train-20221009-211904.log`, and now you want to rerun the failed jobs. 
You can use the following command: +```bash +python .dev_scripts/train_benchmark.py mm_lol \ + --job-name RERUN \ + --rerun train-20221009-211904.log \ + --rerun-fail \ + --run +``` + +We can combine `--rerun-fail` and `--rerun-cancel` with flag `--models` to rerun a **subset** of failed or canceled models. + +```bash +python .dev_scripts/train_benchmark.py mm_lol \ + --job-name RERUN \ + --rerun train-20221009-211904.log \ + --rerun-fail \ + --models sagan \ # only rerun 'sagan' models in all failed tasks + --run +``` + +Specifically, `--rerun-fail` and `--rerun-cancel` can be used together to rerun both failed and canceled jobs. + +## 8. Automatically check links Use the following script to check whether the links in documentations are valid: From 332420e3ee274f3e2a86f4dae3c23394f9b1352b Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 20:04:15 +0800 Subject: [PATCH 5/6] avoid trigger circle CI pr-stage-test when .dev_script is modified --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index ac6568d996..e7650e63ea 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -26,6 +26,7 @@ workflows: tools/.* lint_only false configs/.* lint_only false .circleci/.* lint_only false + .dev_scripts/.* lint_only true base-revision: 1.x # this is the path of the configuration we should trigger once # path filtering and pipeline parameter value updates are From c25f27fab4ea21d4bc00325d0548aed2ae3c8cef Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 20:27:36 +0800 Subject: [PATCH 6/6] support change cpus-per-task in train benchmark --- .dev_scripts/train_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.dev_scripts/train_benchmark.py b/.dev_scripts/train_benchmark.py index 556e79f4bd..212e8e8249 100644 --- a/.dev_scripts/train_benchmark.py +++ b/.dev_scripts/train_benchmark.py @@ -98,6 +98,7 @@ def parse_args(): '--rerun-cancel', 
action='store_true', help='only rerun cancel tasks') parser.add_argument('--rerun-list', default=None) parser.add_argument('--gpus-per-job', type=int, default=None) + parser.add_argument('--cpus-per-job', type=int, default=16) parser.add_argument( '--amp', action='store_true', help='Whether to use amp.') parser.add_argument( @@ -238,7 +239,7 @@ def create_train_job_batch(commands, model_info, args, port, script_name): job_script += (f'#SBATCH --gres=gpu:{n_gpus}\n' f'#SBATCH --ntasks-per-node={min(n_gpus, 8)}\n' f'#SBATCH --ntasks={n_gpus}\n' - f'#SBATCH --cpus-per-task=5\n\n') + f'#SBATCH --cpus-per-task={args.cpus_per_job}\n\n') else: job_script += '\n\n' + 'export CUDA_VISIBLE_DEVICES=-1\n'