forked from KIT-CMS/KingMaker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ML_train_luigi.cfg
72 lines (66 loc) · 2.25 KB
/
ML_train_luigi.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
; General settings of the [core] section (the file name suggests luigi reads this file - confirm).
[core]
; NOTE(review): presumably disables the process lock so that several instances may run side by side - confirm
no_lock = True
; verbose logging for debugging
log_level = DEBUG
; Worker behaviour settings.
[worker]
; NOTE(review): presumably lets the worker stop instead of idling once nothing is left to run - confirm
keep_alive = False
; NOTE(review): both intervals are presumably given in seconds - confirm against the luigi documentation
ping_interval = 20
wait_interval = 20
; NOTE(review): presumably the maximum number of times a task is rescheduled - confirm
max_reschedules = 3
; Defaults shared by the task sections below; a task section may override any of these keys
; (assumes configparser-style [DEFAULT] fallback - confirm).
[DEFAULT]
; name of the analysis; also used as the storage subdirectory (see wlcg_path)
name = ML_train
; NOTE(review): presumably the software environment tasks run in unless a section overrides it
; ([RunTraining] sets ML_LAW) - confirm
ENV_NAME = BaseWRoot
; grid storage protocol and path, usable from the submitting machine and from the cluster worker nodes
; job input and output are stored in wlcg_path, in a subdirectory named after the analysis (name)
; NOTE(review): ${USER} is presumably filled in from the environment by the config parser - confirm
wlcg_path = root://cmsxrootd-kit.gridka.de//store/user/${USER}/LAW_storage
; default htcondor job submission configuration (can be modified for each task)
htcondor_accounting_group = cms.higgs
htcondor_remote_job = True
htcondor_request_cpus = 1
; disabled: GPU request for all tasks
; htcondor_request_gpus = 1
; NOTE(review): the original comment here read "for all cores in total"; it does not describe the
; key below and looks orphaned from a removed resource request - confirm or remove
htcondor_universe = docker
; image without GPU libraries
htcondor_docker_image = mschnepf/slc7-condocker:latest
; disabled: image with GPU libraries
; htcondor_docker_image = tvoigtlaender/slc7-condocker-cuda-11.5-cudnn8:base
; create log files in htcondor jobs
transfer_logs = True
; use the local scheduler
local_scheduler = True
; tolerance for workflow success with failed branches
; NOTE(review): both values are presumably fractions of the workflow's branches - confirm
tolerance = 0.00
acceptance = 1.00
; submit only the missing htcondor workflow branches (should always be True)
only_missing = True
; bootstrap file sourced at the beginning of htcondor jobs (path relative to framework.py)
bootstrap_file = setup_law_remote.sh
; data-taking era; allowed values: 2016, 2017, 2018, all_eras
era = 2016
; channels, written as a bracketed list of quoted strings; allowed entries: tt, et, mt
channels = ["et"]
; masses, same list syntax; allowed entries:
; 240, 280, 320, 360, 400, 450, 500, 550, 600, 700, 800, 900, 1000, heavier
masses = ["500"]
; batch numbers, same list syntax; allowed entries: 1, 2, 3, 4, 5, 6, 7
batch_nums = ["2"]
; The two sections below set no keys of their own; only the [DEFAULT] values apply to them
; (assumes configparser-style fallback - confirm).
[PuppetMaster]
[CreateTrainingDataShardConfig]
; Job settings for CreateTrainingDataShard; keys not listed here come from [DEFAULT].
[CreateTrainingDataShard]
; same value as in [DEFAULT], repeated explicitly
htcondor_request_cpus = 1
; NOTE(review): units are not stated in this file - presumably seconds for the walltime,
; MB for memory and KB for disk; confirm against the htcondor job configuration
htcondor_walltime = 3600
htcondor_request_memory = 2000
htcondor_request_disk = 2000000
; extra files for the job, as a bracketed list of quoted paths
additional_files = ["sm-htt-analysis/ml_datasets/create_training_datashard.py"]
; NOTE(review): presumably the number of tasks bundled into one htcondor job - confirm
tasks_per_job = 12
; No keys of its own; only the [DEFAULT] values apply (assumes configparser-style fallback - confirm).
[CreateTrainingConfig]
; Job settings for RunTraining: overrides the [DEFAULT] environment, CPU count and docker image
; and requests a GPU.
[RunTraining]
; replaces the [DEFAULT] value BaseWRoot
ENV_NAME = ML_LAW
htcondor_request_cpus = 2
htcondor_request_gpus = 1
; the image with GPU libraries, in place of the [DEFAULT] image without them
htcondor_docker_image = tvoigtlaender/slc7-condocker-cuda-11.5-cudnn8:base
; NOTE(review): units are not stated in this file - presumably seconds for the walltime,
; MB for memory and KB for disk; confirm against the htcondor job configuration
htcondor_walltime = 3600
htcondor_request_memory = 5000
; restricts jobs to machines whose CloudSite attribute is "topas"
htcondor_requirements = ( TARGET.CloudSite =?= "topas" )
; disabled extra clause that would additionally pin one specific machine:
; && (Machine =?= "f03-001-179-e.gridka.de")
htcondor_request_disk = 4000000
; extra files and directories for the job, as a bracketed list of quoted paths
additional_files = ["sm-htt-analysis/ml_trainings", "sm-htt-analysis/utils/get_processes.py", "sm-htt-analysis/utils/setup_python.sh"]
; No keys of its own; only the [DEFAULT] values apply (assumes configparser-style fallback - confirm).
[RunAllTrainings]