about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x utils/submit-grid-jobs 17
1 files changed, 13 insertions, 4 deletions
diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs
index 30d9983..dd44d14 100755
--- a/utils/submit-grid-jobs
+++ b/utils/submit-grid-jobs
@@ -529,31 +529,40 @@ def main(conn, dqxx_dir, max_retries, max_jobs):
if nretry < max_retries:
log.notice("Releasing job %i" % id)
if release_job(submit_file) != 0:
- log.warn("Failed to release job %i. Setting it to failed state." % id)
+ log.warn("Failed to release job %i. Setting it to FAILED state." % id)
c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to release job",id))
else:
log.verbose("Job %i has now been retried %i times" % (id,nretry+1))
c.execute("UPDATE state SET nretry = nretry + 1 WHERE id = ?", (id,))
else:
- log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to failed state." % (id,nretry))
+ log.warn("Job %i has failed %i times. Clearing it from the queue. Setting it to FAILED state." % (id,nretry))
remove_job(submit_file)
c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id))
elif job_status == 7:
# Retry if nretry < max_retries
if nretry < max_retries:
if submit_job(submit_file):
- log.warn("Failed to resubmit job %i. Setting it to failed state." % id)
+ log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id)
c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id))
else:
log.notice("Resubmitted job %i" % id)
njobs += 1
c.execute("UPDATE state SET state = 'RUNNING', nretry = ? WHERE id = ?", (nretry+1,id))
else:
- log.warn("Job %i has failed %i times. Setting it to failed state." % (id,nretry))
+ log.warn("Job %i has failed %i times. Setting it to FAILED state." % (id,nretry))
c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed too many times", id))
else:
# Don't know what to do here for Removed or Submission_err
log.warn("Job %i is in the state %i. Don't know what to do." % (id, job_status))
+ elif state == 'RETRY' and nretry < max_retries:
+ log.notice("Resubmitting job %i from RETRY state" % id)
+ if submit_job(submit_file):
+ log.warn("Failed to resubmit job %i. Setting it to FAILED state." % id)
+ c.execute("UPDATE state SET state = 'FAILED', message = ? WHERE id = ?", ("failed to resubmit job",id))
+ else:
+ log.notice("Resubmitted job %i" % id)
+ njobs += 1
+ c.execute("UPDATE state SET state = 'RUNNING', message = 'resubmitted from RETRY state', nretry = ? WHERE id = ?", (nretry+1,id))
elif state in ('SUCCESS','FAILED'):
# Nothing to do here
pass