From 1b66f2744b8ebe41ef81ffe8664ea5b37f42c847 Mon Sep 17 00:00:00 2001 From: tlatorre Date: Wed, 15 Apr 2020 11:37:56 -0500 Subject: add ability to retry jobs in submit-grid-jobs This commit updates submit-grid-jobs to look for jobs in the RETRY state and to retry them if the number of retries is less than --max-retries. This way if you ever want to retry a bunch of jobs you can update the database: $ sqlite3 ~/state.db sqlite> update state set state = 'RETRY' where state == 'FAILED'; And then rerun submit-grid-jobs with more retries: $ submit-grid-jobs --max-retries 10 --auto --- utils/submit-grid-jobs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'utils/submit-grid-jobs') diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs index 30d9983..dd44d14 100755 --- a/utils/submit-grid-jobs +++ b/utils/submit-grid-jobs @@ -529,31 +529,40 @@ def main(conn, dqxx_dir, max_retries, max_jobs): if nretry < max_retries: log.notice("Releasing job %i" % id) if release_job(submit_file) != 0: - log.warn("Failed to release job %i. Setting it to failed state." % id) + log.warn("Failed to release job %i. Setting it to FAILED state." % id) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to release job",id)) else: log.verbose("Job %i has now been retried %i times" % (id,nretry+1)) c.execute("UPDATE state SET nretry = nretry + 1 WHERE id = ?", (id,)) else: - log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to failed state." % (id,nretry)) + log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to FAILED state." % (id,nretry)) remove_job(submit_file) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id)) elif job_status == 7: # Retry if nretry < max_retries if nretry < max_retries: if submit_job(submit_file): - log.warn("Failed to resubmit job %i. Setting it to failed state." 
% id) + log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id)) else: log.notice("Resubmitted job %i" % id) njobs += 1 c.execute("UPDATE state SET state = 'RUNNING', nretry = ? WHERE id = ?", (nretry+1,id)) else: - log.warn("Job %i has failed %i times. Setting it to failed state." % (id,nretry)) + log.warn("Job %i has failed %i times. Setting it to FAILED state." % (id,nretry)) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id)) else: # Don't know what to do here for Removed or Submission_err log.warn("Job %i is in the state %i. Don't know what to do." % (id, job_status)) + elif state == 'RETRY' and nretry < max_retries: + log.notice("Resubmitting job %i from RETRY state" % id) + if submit_job(submit_file): + log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id) + c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id)) + else: + log.notice("Resubmitted job %i" % id) + njobs += 1 + c.execute("UPDATE state SET state = 'RUNNING', message = 'resubmitted from RETRY state', nretry = ? WHERE id = ?", (nretry+1,id)) elif state in ('SUCCESS','FAILED'): # Nothing to do here pass -- cgit