diff options
author | tlatorre <tlatorre@uchicago.edu> | 2020-04-15 11:37:56 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2020-04-15 11:37:56 -0500 |
commit | 1b66f2744b8ebe41ef81ffe8664ea5b37f42c847 (patch) | |
tree | 6044646bfdb235ce469c6f5029ced07dbd5b87c7 | |
parent | 3be1c94dfdaba60899343080d8b91fb2c602e549 (diff) | |
download | sddm-1b66f2744b8ebe41ef81ffe8664ea5b37f42c847.tar.gz sddm-1b66f2744b8ebe41ef81ffe8664ea5b37f42c847.tar.bz2 sddm-1b66f2744b8ebe41ef81ffe8664ea5b37f42c847.zip |
add ability to retry jobs in submit-grid-jobs
This commit updates submit-grid-jobs to look for jobs in the RETRY state and to
retry them if the number of retries is less than --max-retries. This way if you
ever want to retry a bunch of jobs you can update the database:
$ sqlite3 ~/state.db
sqlite> update state set state = 'RETRY' where state == 'FAILED';
And then rerun submit-grid-jobs with more retries:
$ submit-grid-jobs --max-retries 10 --auto
-rwxr-xr-x | utils/submit-grid-jobs | 17 |
1 files changed, 13 insertions, 4 deletions
diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs index 30d9983..dd44d14 100755 --- a/utils/submit-grid-jobs +++ b/utils/submit-grid-jobs @@ -529,31 +529,40 @@ def main(conn, dqxx_dir, max_retries, max_jobs): if nretry < max_retries: log.notice("Releasing job %i" % id) if release_job(submit_file) != 0: - log.warn("Failed to release job %i. Setting it to failed state." % id) + log.warn("Failed to release job %i. Setting it to FAILED state." % id) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to release job",id)) else: log.verbose("Job %i has now been retried %i times" % (id,nretry+1)) c.execute("UPDATE state SET nretry = nretry + 1 WHERE id = ?", (id,)) else: - log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to failed state." % (id,nretry)) + log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to FAILED state." % (id,nretry)) remove_job(submit_file) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id)) elif job_status == 7: # Retry if nretry < max_retries if nretry < max_retries: if submit_job(submit_file): - log.warn("Failed to resubmit job %i. Setting it to failed state." % id) + log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id)) else: log.notice("Resubmitted job %i" % id) njobs += 1 c.execute("UPDATE state SET state = 'RUNNING', nretry = ? WHERE id = ?", (nretry+1,id)) else: - log.warn("Job %i has failed %i times. Setting it to failed state." % (id,nretry)) + log.warn("Job %i has failed %i times. Setting it to FAILED state." % (id,nretry)) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id)) else: # Don't know what to do here for Removed or Submission_err log.warn("Job %i is in the state %i. Don't know what to do." 
% (id, job_status)) + elif state == 'RETRY' and nretry < max_retries: + log.notice("Resubmitting job %i from RETRY state" % id) + if submit_job(submit_file): + log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id) + c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id)) + else: + log.notice("Resubmitted job %i" % id) + njobs += 1 + c.execute("UPDATE state SET state = 'RUNNING', message = 'resubmitted from RETRY state', nretry = ? WHERE id = ?", (nretry+1,id)) elif state in ('SUCCESS','FAILED'): # Nothing to do here pass |