diff options
-rwxr-xr-x | utils/submit-grid-jobs | 17 |
1 files changed, 13 insertions, 4 deletions
diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs index 30d9983..dd44d14 100755 --- a/utils/submit-grid-jobs +++ b/utils/submit-grid-jobs @@ -529,31 +529,40 @@ def main(conn, dqxx_dir, max_retries, max_jobs): if nretry < max_retries: log.notice("Releasing job %i" % id) if release_job(submit_file) != 0: - log.warn("Failed to release job %i. Setting it to failed state." % id) + log.warn("Failed to release job %i. Setting it to FAILED state." % id) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to release job",id)) else: log.verbose("Job %i has now been retried %i times" % (id,nretry+1)) c.execute("UPDATE state SET nretry = nretry + 1 WHERE id = ?", (id,)) else: - log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to failed state." % (id,nretry)) + log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to FAILED state." % (id,nretry)) remove_job(submit_file) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id)) elif job_status == 7: # Retry if nretry < max_retries if nretry < max_retries: if submit_job(submit_file): - log.warn("Failed to resubmit job %i. Setting it to failed state." % id) + log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id)) else: log.notice("Resubmitted job %i" % id) njobs += 1 c.execute("UPDATE state SET state = 'RUNNING', nretry = ? WHERE id = ?", (nretry+1,id)) else: - log.warn("Job %i has failed %i times. Setting it to failed state." % (id,nretry)) + log.warn("Job %i has failed %i times. Setting it to FAILED state." % (id,nretry)) c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id)) else: # Don't know what to do here for Removed or Submission_err log.warn("Job %i is in the state %i. Don't know what to do." % (id, job_status)) + elif state == 'RETRY' and nretry < max_retries: + log.notice("Resubmitting job %i from RETRY state" % id) + if submit_job(submit_file): + log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id) + c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id)) + else: + log.notice("Resubmitted job %i" % id) + njobs += 1 + c.execute("UPDATE state SET state = 'RUNNING', message = 'resubmitted from RETRY state', nretry = ? WHERE id = ?", (nretry+1,id)) elif state in ('SUCCESS','FAILED'): # Nothing to do here pass |