From e52d89e3f5da3f92f4345b31a96dcd234a6a7395 Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Sun, 9 Oct 2022 19:39:32 +0800 Subject: [PATCH 1/6] trigger CI From 75f9cf00544a724b8104b98ab87b334866d72575 Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 16:39:35 +0800 Subject: [PATCH 2/6] fix cache dir in job watcher --- .dev_scripts/job_watcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev_scripts/job_watcher.py b/.dev_scripts/job_watcher.py index eaf4d3cfba..2ceaff5e1c 100644 --- a/.dev_scripts/job_watcher.py +++ b/.dev_scripts/job_watcher.py @@ -9,7 +9,7 @@ from pygments.util import ClassNotFound from simple_term_menu import TerminalMenu -CACHE_DIR = '~/.task_watcher' +CACHE_DIR = osp.join(osp.expanduser('~'), '.task_watcher') def show_job_out(name, root, job_name_list): From c4e0761e4cfd8bc0df200f4d6336bbe45908ec23 Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 16:43:55 +0800 Subject: [PATCH 3/6] support rerun failure or canceled jobs in train benchmark --- .dev_scripts/train_benchmark.py | 26 +++++-- .dev_scripts/utils.py | 118 ++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 5 deletions(-) create mode 100644 .dev_scripts/utils.py diff --git a/.dev_scripts/train_benchmark.py b/.dev_scripts/train_benchmark.py index a7acc7e759..556e79f4bd 100644 --- a/.dev_scripts/train_benchmark.py +++ b/.dev_scripts/train_benchmark.py @@ -13,6 +13,7 @@ from rich.syntax import Syntax from rich.table import Table from tqdm import tqdm +from utils import filter_jobs, parse_job_list_from_file console = Console() MMEDIT_ROOT = Path(__file__).absolute().parents[1] @@ -91,6 +92,10 @@ def parse_args(): parser.add_argument('--skip', type=str, default=None) parser.add_argument('--skip-list', default=None) parser.add_argument('--rerun', type=str, default=None) + parser.add_argument( + '--rerun-fail', action='store_true', help='only rerun failed tasks') + parser.add_argument( + '--rerun-cancel', 
action='store_true', help='only rerun cancel tasks') parser.add_argument('--rerun-list', default=None) parser.add_argument('--gpus-per-job', type=int, default=None) parser.add_argument( @@ -145,11 +150,22 @@ def parse_args(): args.skip_list = skip_list print('skip_list: ', args.skip_list) elif args.rerun is not None: - with open(args.rerun, 'r') as fp: - rerun_list = fp.readlines() - rerun_list = [j.split('\n')[0] for j in rerun_list] - args.rerun_list = rerun_list - print('rerun_list: ', args.rerun_list) + job_id_list_full, job_name_list_full = parse_job_list_from_file( + args.rerun) + filter_target = [] + + if args.rerun_fail: + filter_target += ['FAILED'] + if args.rerun_cancel: + filter_target += ['CANCELLED'] + + _, job_name_list = filter_jobs( + job_id_list_full, + job_name_list_full, + filter_target, + show_table=True, + table_name='Rerun List') + args.rerun_list = job_name_list return args diff --git a/.dev_scripts/utils.py b/.dev_scripts/utils.py new file mode 100644 index 0000000000..42198a5475 --- /dev/null +++ b/.dev_scripts/utils.py @@ -0,0 +1,118 @@ +import os +import os.path as osp +from typing import Tuple + +from rich import print as pprint +from rich.table import Table + + +def parse_job_list(job_list) -> Tuple[list, list]: + """Parse task name and job id from list. All elements in `job_list` must. + + be formatted as `JOBID @ JOBNAME`. + + Args: + job_list (list[str]): Job list. + + Returns: + Tuple[list, list]: Job ID list and Job name list. + """ + assert all([ + ' @ ' in job for job in job_list + ]), ('Each line of job list must be formatted like \'JOBID @ JOBNAME\'.') + job_id_list, job_name_list = [], [] + for job_info in job_list: + job_id, job_name = job_info.split(' @ ') + job_id_list.append(job_id) + job_name_list.append(job_name) + return job_id_list, job_name_list + + +def parse_job_list_from_file(job_list_file: str) -> Tuple[list, list]: + """Parse job list from file and return a tuple contains list of job id and + job name. 
+ + Args: + job_list_file (str): The path to the file list. + + Returns: + Tuple[list, list]: A tuple contains list of job id and job name. + """ + if not osp.exists(job_list_file): + return False + with open(job_list_file, 'r') as file: + job_list = [job.strip() for job in file.readlines()] + return parse_job_list(job_list) + + +def get_info_from_id(job_id: str) -> dict: + """Get the basic information of a job id with `swatch examine` command. + + Args: + job_id (str): The ID of the job. + + Returns: + dict: A dict contains information of the corresponding job id. + """ + # NOTE: do not have exception handling here + info_stream = os.popen(f'swatch examine {job_id}') + info_str = [line.strip() for line in info_stream.readlines()] + status_info = info_str[2].split() + try: + status_dict = { + 'JobID': status_info[0], + 'JobName': status_info[1], + 'Partition': status_info[2], + 'NNodes': status_info[3], + 'AllocCPUS': status_info[4], + 'State': status_info[5] + } + except Exception: + print(job_id) + print(info_str) + return status_dict + + +def filter_jobs(job_id_list: list, + job_name_list: list, + select: list = ['FAILED'], + show_table: bool = False, + table_name: str = 'Filter Results') -> Tuple[list, list]: + """Filter the job which status not belong to :attr:`select`. + + Args: + job_id_list (list): The list of job ids. + job_name_list (list): The list of job names. + select (list, optional): Which kind of jobs will be selected. + Defaults to ['FAILED']. + show_table (bool, optional): Whether display the filter result in a + table. Defaults to False. + table_name (str, optional): The name of the table. Defaults to + 'Filter Results'. + + Returns: + Tuple[list]: A tuple contains selected job ids and job names. 
+ """ + # if ignore is not passed, return the original id list and name list + if not select: + return job_id_list, job_name_list + filtered_id_list, filtered_name_list = [], [] + job_info_list = [] + for id_, name_ in zip(job_id_list, job_name_list): + info = get_info_from_id(id_) + job_info_list.append(info) + if info['State'] in select: + filtered_id_list.append(id_) + filtered_name_list.append(name_) + + if show_table: + filter_table = Table(title=table_name) + for field in ['Name', 'ID', 'State', 'Is Selected']: + filter_table.add_column(field) + for id_, name_, info_ in zip(job_id_list, job_name_list, + job_info_list): + selected = '[green]True' \ + if info_['State'] in select else '[red]False' + filter_table.add_row(name_, id_, info_['State'], selected) + pprint(filter_table) + return filtered_id_list, filtered_name_list From b16b6bd459e48a71fe79e3b15cd6f4fb8c52fc7d Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 16:55:08 +0800 Subject: [PATCH 4/6] add use case and readme for rerun --- .dev_scripts/README.md | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/.dev_scripts/README.md b/.dev_scripts/README.md index 2a7e43e6a2..891ed92a4e 100644 --- a/.dev_scripts/README.md +++ b/.dev_scripts/README.md @@ -184,7 +184,34 @@ python .dev_scripts/train_benchmark.py mm_lol \ Specifically, you need to enable `--skip`, and specify the list of models to skip by `--skip-list` -## Automatically check links +## 7. Rerun failed or canceled jobs + +If you want to rerun failed or canceled jobs in the last run, you can combine the `--rerun` flag with the `--rerun-fail` and `--rerun-cancel` flags. + +For example, the log file of the last run is `train-20221009-211904.log`, and now you want to rerun the failed jobs. 
You can use the following command: +```bash +python .dev_scripts/train_benchmark.py mm_lol \ + --job-name RERUN \ + --rerun train-20221009-211904.log \ + --rerun-fail \ + --run +``` + +We can combine `--rerun-fail` and `--rerun-cancel` with flag `--models` to rerun a **subset** of failed or canceled models. + +```bash +python .dev_scripts/train_benchmark.py mm_lol \ + --job-name RERUN \ + --rerun train-20221009-211904.log \ + --rerun-fail \ + --models sagan \ # only rerun 'sagan' models in all failed tasks + --run +``` + +Specifically, `--rerun-fail` and `--rerun-cancel` can be used together to rerun both failed and canceled jobs. + +## 8. Automatically check links Use the following script to check whether the links in documentations are valid: From 332420e3ee274f3e2a86f4dae3c23394f9b1352b Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 20:04:15 +0800 Subject: [PATCH 5/6] avoid trigger circle CI pr-stage-test when .dev_script is modified --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index ac6568d996..e7650e63ea 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -26,6 +26,7 @@ workflows: tools/.* lint_only false configs/.* lint_only false .circleci/.* lint_only false + .dev_scripts/.* lint_only true base-revision: 1.x # this is the path of the configuration we should trigger once # path filtering and pipeline parameter value updates are From c25f27fab4ea21d4bc00325d0548aed2ae3c8cef Mon Sep 17 00:00:00 2001 From: LeoXing1996 Date: Tue, 11 Oct 2022 20:27:36 +0800 Subject: [PATCH 6/6] support change cpus-per-task in train benchmark --- .dev_scripts/train_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.dev_scripts/train_benchmark.py b/.dev_scripts/train_benchmark.py index 556e79f4bd..212e8e8249 100644 --- a/.dev_scripts/train_benchmark.py +++ b/.dev_scripts/train_benchmark.py @@ -98,6 +98,7 @@ def parse_args(): '--rerun-cancel', 
action='store_true', help='only rerun cancel tasks') parser.add_argument('--rerun-list', default=None) parser.add_argument('--gpus-per-job', type=int, default=None) + parser.add_argument('--cpus-per-job', type=int, default=16) parser.add_argument( '--amp', action='store_true', help='Whether to use amp.') parser.add_argument( @@ -238,7 +239,7 @@ def create_train_job_batch(commands, model_info, args, port, script_name): job_script += (f'#SBATCH --gres=gpu:{n_gpus}\n' f'#SBATCH --ntasks-per-node={min(n_gpus, 8)}\n' f'#SBATCH --ntasks={n_gpus}\n' - f'#SBATCH --cpus-per-task=5\n\n') + f'#SBATCH --cpus-per-task={args.cpus_per_job}\n\n') else: job_script += '\n\n' + 'export CUDA_VISIBLE_DEVICES=-1\n'