aboutsummaryrefslogtreecommitdiff
path: root/utils/cat-grid-jobs
diff options
context:
space:
mode:
author tlatorre <tlatorre@uchicago.edu> 2019-07-11 09:42:23 -0500
committer tlatorre <tlatorre@uchicago.edu> 2019-07-11 09:42:23 -0500
commit 21491ca1ca2afd6951e9b5b1e74b1c919c602b36 (patch)
tree b21b772612125c574928e4fb37221077d6a012d3 /utils/cat-grid-jobs
parent 034253ab63f1029291fa046ce15760aae72ae5c5 (diff)
download sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.gz
sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.bz2
sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.zip
switch from YAML output to HDF5 to speed things up
Diffstat (limited to 'utils/cat-grid-jobs')
-rwxr-xr-x utils/cat-grid-jobs 163
1 files changed, 114 insertions, 49 deletions
diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs
index ba06290..83a8004 100755
--- a/utils/cat-grid-jobs
+++ b/utils/cat-grid-jobs
@@ -23,7 +23,7 @@ as YAML to stdout.
Example:
- $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
+ $ cat-grid-jobs ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
"""
@@ -33,80 +33,145 @@ try:
from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
from yaml import Loader, Dumper
+import os
+import sys
+
+# Report whether `fn` exists, is not a directory, and passes os.access()
+# for the given `mode`.
+# Directories are rejected explicitly because on Windows they pass the
+# os.access check.
+def _access_check(fn, mode):
+    if os.path.isdir(fn):
+        return False
+    return os.path.exists(fn) and os.access(fn, mode)
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+    """Search for the command `cmd` and return its path, or None.
+
+    NOTE(review): this looks like a local copy of the standard library's
+    shutil.which -- confirm before diverging from it.
+
+    If `cmd` has a directory part (e.g. ./script) it is checked directly
+    and the search path is never consulted. Otherwise every directory of
+    the search path is tried in order and the first candidate accepted by
+    _access_check(candidate, mode) is returned.
+
+    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the PATH
+    environment variable; when PATH is unset, os.confstr("CS_PATH") is
+    used, falling back to os.defpath. An empty search path returns None.
+    """
+    # A command with a directory component is looked up as given, which
+    # includes paths relative to the current directory.
+    if os.path.dirname(cmd):
+        return cmd if _access_check(cmd, mode) else None
+
+    if path is None:
+        path = os.environ.get("PATH", None)
+        if path is None:
+            try:
+                path = os.confstr("CS_PATH")
+            except (AttributeError, ValueError):
+                # os.confstr() or CS_PATH is not available
+                path = os.defpath
+        # bpo-35755: Don't use os.defpath if the PATH environment variable is
+        # set to an empty string
+
+    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+    if not path:
+        return None
+
+    search_dirs = path.split(os.pathsep)
+
+    if sys.platform == "win32":
+        # The current directory takes precedence on Windows.
+        if os.curdir not in search_dirs:
+            search_dirs.insert(0, os.curdir)
+
+        # PATHEXT lists the suffixes Windows treats as executable. When
+        # `cmd` already ends with one of them (e.g. "python.exe") only
+        # that exact name is tried; otherwise every suffix is appended.
+        pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+        lowered_cmd = cmd.lower()
+        if any(lowered_cmd.endswith(ext.lower()) for ext in pathext):
+            candidates = [cmd]
+        else:
+            candidates = [cmd + ext for ext in pathext]
+    else:
+        # Other platforms have no PATHEXT equivalent, so use cmd as-is.
+        candidates = [cmd]
+
+    # Skip directories that normalise to one already searched.
+    visited = set()
+    for directory in search_dirs:
+        key = os.path.normcase(directory)
+        if key in visited:
+            continue
+        visited.add(key)
+        for candidate in candidates:
+            full_name = os.path.join(directory, candidate)
+            if _access_check(full_name, mode):
+                return full_name
+    return None
+
+# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python
+class bcolors:
+    """ANSI escape sequences used to colour and style terminal output.
+
+    Prefix text with one of the codes and terminate it with ENDC to
+    restore the terminal's default rendering.
+    """
+    HEADER = '\033[95m'     # bright magenta foreground
+    OKBLUE = '\033[94m'     # bright blue foreground
+    OKGREEN = '\033[92m'    # bright green foreground
+    WARNING = '\033[93m'    # bright yellow foreground
+    FAIL = '\033[91m'       # bright red foreground
+    ENDC = '\033[0m'        # reset all attributes
+    BOLD = '\033[1m'        # bold
+    UNDERLINE = '\033[4m'   # underline
+
+def print_warning(msg):
+    """Print `msg` to stderr wrapped in bcolors.FAIL ... bcolors.ENDC."""
+    highlighted = bcolors.FAIL + msg + bcolors.ENDC
+    print(highlighted, file=sys.stderr)
if __name__ == '__main__':
+    # Script entry point: run zdab-cat on the zdab file with "-o <output>"
+    # (presumably this writes the HDF5 file holding the 'ev' and 'fits'
+    # datasets -- confirm against zdab-cat), then append to 'fits' every fit
+    # from the input files whose (run, gtid) matches an event in 'ev'.
    import argparse
    import matplotlib.pyplot as plt
    import numpy as np
-    import subprocess
+    from subprocess import check_call
    from os.path import join
    import os
    import sys
+    import h5py
    parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file")
-    parser.add_argument("--dir", type=str, help="fitter directory", required=True)
    parser.add_argument("zdab", help="zdab input file")
    parser.add_argument("filenames", nargs='+', help="input files")
+    parser.add_argument("-o", "--output", type=str, help="output filename", required=True)
    args = parser.parse_args()
-    fit_results = {}
+    zdab_cat = which("zdab-cat")
-    # First we create a dictionary mapping (run, gtid) -> fit results.
-    for filename in args.filenames:
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)
-        if data is None:
-            continue
-
-        for event in data['data']:
-            if event['ev'] is None:
-                continue
-
-            # if the ev branch is filled in, it means the event was fit
-            for ev in event['ev']:
-                # add the git SHA1 hash to the fit results since technically it
-                # could be different than the version in zdab-cat
-                ev['fit']['git_sha1'] = data['git_sha1']
-                ev['fit']['git_dirty'] = data['git_dirty']
-                fit_results[(ev['run'],ev['gtid'])] = ev['fit']
-
-    # Next we get the full event list along with the data cleaning word, FTP
+    # First we get the full event list along with the data cleaning word, FTP
    # position, FTK, and RSP energy from the original zdab and then add the fit
    # results.
    #
    # Note: We send stderr to /dev/null since there can be a lot of warnings
    # about PMT types and fit results
    with open(os.devnull, 'w') as f:
-        popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f)
+        check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f)
    total_events = 0
    events_with_fit = 0
-
-    doc = {'data': []}
-
-    for data in yaml.load_all(popen.stdout,Loader=Loader):
-        if 'ev' not in data:
-            doc.update(data)
-            continue
-
-        for ev in data['ev']:
-            run = ev['run']
-            gtid = ev['gtid']
-
-            if (run,gtid) in fit_results:
-                ev['fit'] = fit_results[(run,gtid)]
-                events_with_fit += 1
-
-            total_events += 1
-
-        doc['data'].append(data)
-
-    popen.wait()
+    total_fits = 0
+
+    with h5py.File(args.output,"a") as fout:
+        # The event keys do not change while fits are appended, so read them
+        # once instead of once per input file.
+        ev_keys = fout['ev'][:][['run','gtid']]
+        total_events = fout['ev'].shape[0]
+        for filename in args.filenames:
+            # Open the inputs read-only explicitly: older h5py releases
+            # default to a read/write mode that creates a missing file.
+            with h5py.File(filename,"r") as f:
+                # Check to see if the git sha1 matches
+                if fout.attrs['git_sha1'] != f.attrs['git_sha1']:
+                    print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename))
+                fits = f['fits'][:]
+                fit_keys = fits[['run','gtid']]
+                # Count every distinct (run, gtid) in this file, matched or not
+                total_fits += len(np.unique(fit_keys))
+                # get fits which match up with the events
+                valid_fits = fits[np.isin(fit_keys,ev_keys)]
+                n_valid = valid_fits.shape[0]
+                if n_valid == 0:
+                    # Nothing to append. Note that a slice starting at
+                    # -0 would select the whole dataset, so never index
+                    # from the end with a possibly-zero count.
+                    continue
+                # Add the fit results after the rows already present
+                n_existing = fout['fits'].shape[0]
+                fout['fits'].resize((n_existing+n_valid,))
+                fout['fits'][n_existing:] = valid_fits
+                # NOTE: uniqueness is per input file, so an event fit in
+                # several files is counted once for each of them.
+                events_with_fit += len(np.unique(valid_fits[['run','gtid']]))
    # Print out number of fit results that were added. Hopefully, this will
    # make it easy to catch an error if, for example, this gets run with a
    # mismatching zdab and fit results
-    print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr)
-
-    print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper))
+    print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr)