diff --git a/.dev_scripts/README.md b/.dev_scripts/README.md index 891ed92a4e..deb066b41f 100644 --- a/.dev_scripts/README.md +++ b/.dev_scripts/README.md @@ -211,7 +211,15 @@ python .dev_scripts/train_benchmark.py mm_lol \ Specifically, `--rerun-fail` and `--rerun-cancel` can be used together to rerun both failed and cancaled jobs. -## 8. Automatically check links +## 8. `deterministic` training + +Setting `torch.backends.cudnn.deterministic = True` and `torch.backends.cudnn.benchmark = False` can remove randomness operations in PyTorch training. You can add the `--deterministic` flag when starting your benchmark training to remove the influence of randomness operations. + +```shell +python .dev_scripts/train_benchmark.py mm_lol --job-name xzn --models pix2pix --cpus-per-job 16 --run --deterministic +``` + +## 9. Automatically check links Use the following script to check whether the links in documentations are valid: diff --git a/.dev_scripts/train_benchmark.py b/.dev_scripts/train_benchmark.py index 212e8e8249..acb900cc15 100644 --- a/.dev_scripts/train_benchmark.py +++ b/.dev_scripts/train_benchmark.py @@ -117,6 +117,10 @@ def parse_args(): '--work-dir', default='work_dirs/benchmark_train', help='the dir to save metric') + parser.add_argument( + '--deterministic', + action='store_true', + help='Whether set `deterministic` during training.') parser.add_argument( '--run', action='store_true', help='run script directly') parser.add_argument( @@ -239,10 +243,14 @@ def create_train_job_batch(commands, model_info, args, port, script_name): job_script += (f'#SBATCH --gres=gpu:{n_gpus}\n' f'#SBATCH --ntasks-per-node={min(n_gpus, 8)}\n' f'#SBATCH --ntasks={n_gpus}\n' - f'#SBATCH --cpus-per-task={args.cpus_per_job}\n\n') + f'#SBATCH --cpus-per-task={args.cpus_per_job}\n' + f'#SBATCH --kill-on-bad-exit=1\n\n') else: job_script += '\n\n' + 'export CUDA_VISIBLE_DEVICES=-1\n' + if args.deterministic: + job_script += 'export CUBLAS_WORKSPACE_CONFIG=:4096:8\n' + job_script += (f'export 
MASTER_PORT={port}\n' f'{runner} -u {script_name} {config} ' f'--work-dir={work_dir} ' @@ -254,6 +262,9 @@ def create_train_job_batch(commands, model_info, args, port, script_name): if args.amp: job_script += ' --amp ' + if args.deterministic: + job_script += ' --cfg-options randomness.deterministic=True' + job_script += '\n' with open(work_dir / 'job.sh', 'w') as f: