Skip to content

Commit

Permalink
Detect failure to launch pbs job in egs-parallel
Browse files Browse the repository at this point in the history
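Detect PBS jobs that fail to launch in egs-parallel by looking at the
job pid echoed by qsub: if the part before the first dot is not an
integer, the submission failed. In egs-parallel-pbs, a failed job is
reported in the log, and if it is the first job the script quits
immediately, so subsequent jobs are not launched. In egs-parallel-pbsdsh,
a failed submission always quits. Also adjust the format of a few log
messages.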
  • Loading branch information
ftessier committed Sep 1, 2020
1 parent f59c2af commit 258bb94
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 7 deletions.
4 changes: 2 additions & 2 deletions HEN_HOUSE/scripts/bin/egs-parallel
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ done
opt_options="${opt_options_array[@]}"

### begin script
log "BEGIN $0"
log "BEGIN $(basename $0)"

### EGSnrc environment variables
log "EGSnrc environment:"
Expand Down Expand Up @@ -238,7 +238,7 @@ log " options = $opt_options"
logfile=$egs_home/$cmd_app/$basename.egsparallel
/bin/mv egs-parallel-$$.log $logfile
exec 3>>$logfile
log "egs-parallel log: $logfile"
log "log file: $logfile"

### go to egs application directory
log "cd $egs_home/$cmd_app"
Expand Down
3 changes: 2 additions & 1 deletion HEN_HOUSE/scripts/egs-parallel-dshtask
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ jobstr=$(printf "job %04d" $job)
log "$jobstr <- $taskstr"

# log the host and pid of this job
log "$jobstr: host=$(hostname) BEGIN pid=$$"
log "$jobstr: host=$(hostname)"
log "$jobstr: BEGIN pid=$$"

### manage jobs to avoid bottleneck and race conditions
if [ $job -eq 1 ]; then
Expand Down
12 changes: 10 additions & 2 deletions HEN_HOUSE/scripts/egs-parallel-pbs
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,11 @@ for job in $(seq 1 $nthread); do
fi

### launch the job
pbscommand="qsub -q $queue $scheduler_options"
runcommand="$command -b -P $nthread -j $job -f $first"
log "$jobstr: SUBMIT $runcommand"
jobpid=$(qsub -q $queue $scheduler_options <<EOF
log "$jobstr: SUBMIT $pbscommand"
log "$jobstr: LAUNCH $runcommand"
jobpid=$(eval "$pbscommand" <<EOF
#!/bin/sh
#PBS -j eo
#PBS -e ${basename}_w$job.eo
Expand Down Expand Up @@ -252,6 +254,12 @@ if [ $job -eq 1 ]; then
fi
EOF
)
if ! [[ "${jobpid%%.*}" =~ ^[0-9]+$ ]] ; then
log "FAILED to launch job $job"
if [[ "$job" = "1" ]]; then
quit $LINENO "FAILED to submit first job"
fi
fi
echo $jobpid

done
9 changes: 7 additions & 2 deletions HEN_HOUSE/scripts/egs-parallel-pbsdsh
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,9 @@ fi

### launch pbsdsh tasks
task_script=$HEN_HOUSE/scripts/egs-parallel-dshtask
jobpid=$(qsub -q $queue $scheduler_options <<EOF
pbscommand="qsub -q $queue $scheduler_options"
log "SUBMIT $pbscommand"
jobpid=$(eval "$pbscommand" <<EOF
#!/bin/sh
#PBS -j eo
#PBS -e ${basename}.eo
Expand All @@ -150,7 +152,10 @@ jobpid=$(qsub -q $queue $scheduler_options <<EOF
pbsdsh $task_script $pbsdsh_dir $basename $nthread $first $delay '$command'
EOF
)
log "SUBMIT $nthread pbsdsh tasks on $jobpid"
if ! [[ "${jobpid%%.*}" =~ ^[0-9]+$ ]] ; then
quit $LINENO "FAILED to submit job"
fi
log "LAUNCH $nthread pbsdsh tasks on $jobpid"
log "pbsdsh task logs will be collated in ${basename}.eo"

### print pid
Expand Down

0 comments on commit 258bb94

Please sign in to comment.