aboutsummaryrefslogtreecommitdiff
path: root/utils/submit-grid-jobs
diff options
context:
space:
mode:
author: tlatorre <tlatorre@uchicago.edu> 2020-04-15 11:37:56 -0500
committer: tlatorre <tlatorre@uchicago.edu> 2020-04-15 11:37:56 -0500
commit: 1b66f2744b8ebe41ef81ffe8664ea5b37f42c847 (patch)
tree: 6044646bfdb235ce469c6f5029ced07dbd5b87c7 /utils/submit-grid-jobs
parent: 3be1c94dfdaba60899343080d8b91fb2c602e549 (diff)
download: sddm-1b66f2744b8ebe41ef81ffe8664ea5b37f42c847.tar.gz
sddm-1b66f2744b8ebe41ef81ffe8664ea5b37f42c847.tar.bz2
sddm-1b66f2744b8ebe41ef81ffe8664ea5b37f42c847.zip
add ability to retry jobs in submit-grid-jobs
This commit updates submit-grid-jobs to look for jobs in the RETRY state and to retry them if the number of retries is less than --max-retries. This way, if you ever want to retry a bunch of jobs, you can update the database:

    $ sqlite3 ~/state.db
    sqlite> update state set state = 'RETRY' where state == 'FAILED';

And then rerun submit-grid-jobs with more retries:

    $ submit-grid-jobs --max-retries 10 --auto
Diffstat (limited to 'utils/submit-grid-jobs')
-rwxr-xr-xutils/submit-grid-jobs17
1 files changed, 13 insertions, 4 deletions
diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs
index 30d9983..dd44d14 100755
--- a/utils/submit-grid-jobs
+++ b/utils/submit-grid-jobs
@@ -529,31 +529,40 @@ def main(conn, dqxx_dir, max_retries, max_jobs):
if nretry < max_retries:
log.notice("Releasing job %i" % id)
if release_job(submit_file) != 0:
- log.warn("Failed to release job %i. Setting it to failed state." % id)
+ log.warn("Failed to release job %i. Setting it to FAILED state." % id)
c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to release job",id))
else:
log.verbose("Job %i has now been retried %i times" % (id,nretry+1))
c.execute("UPDATE state SET nretry = nretry + 1 WHERE id = ?", (id,))
else:
- log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to failed state." % (id,nretry))
+ log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to FAILED state." % (id,nretry))
remove_job(submit_file)
c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id))
elif job_status == 7:
# Retry if nretry < max_retries
if nretry < max_retries:
if submit_job(submit_file):
- log.warn("Failed to resubmit job %i. Setting it to failed state." % id)
+ log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id)
c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id))
else:
log.notice("Resubmitted job %i" % id)
njobs += 1
c.execute("UPDATE state SET state = 'RUNNING', nretry = ? WHERE id = ?", (nretry+1,id))
else:
- log.warn("Job %i has failed %i times. Setting it to failed state." % (id,nretry))
+ log.warn("Job %i has failed %i times. Setting it to FAILED state." % (id,nretry))
c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id))
else:
# Don't know what to do here for Removed or Submission_err
log.warn("Job %i is in the state %i. Don't know what to do." % (id, job_status))
+ elif state == 'RETRY' and nretry < max_retries:
+ log.notice("Resubmitting job %i from RETRY state" % id)
+ if submit_job(submit_file):
+ log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id)
+ c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id))
+ else:
+ log.notice("Resubmitted job %i" % id)
+ njobs += 1
+ c.execute("UPDATE state SET state = 'RUNNING', message = 'resubmitted from RETRY state', nretry = ? WHERE id = ?", (nretry+1,id))
elif state in ('SUCCESS','FAILED'):
# Nothing to do here
pass