diff options
author | tlatorre <tlatorre@uchicago.edu> | 2019-07-11 09:42:23 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2019-07-11 09:42:23 -0500 |
commit | 21491ca1ca2afd6951e9b5b1e74b1c919c602b36 (patch) | |
tree | b21b772612125c574928e4fb37221077d6a012d3 /utils/cat-grid-jobs | |
parent | 034253ab63f1029291fa046ce15760aae72ae5c5 (diff) | |
download | sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.gz sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.bz2 sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.zip |
switch from YAML output to HDF5 to speed things up
Diffstat (limited to 'utils/cat-grid-jobs')
-rwxr-xr-x | utils/cat-grid-jobs | 163 |
1 file changed, 114 insertions, 49 deletions
diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs index ba06290..83a8004 100755 --- a/utils/cat-grid-jobs +++ b/utils/cat-grid-jobs @@ -23,7 +23,7 @@ as YAML to stdout. Example: - $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt + $ cat-grid-jobs ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt """ @@ -33,80 +33,145 @@ try: from yaml import CLoader as Loader, CDumper as Dumper except ImportError: from yaml import Loader, Dumper +import os +import sys + +# Check that a given file can be accessed with the correct mode. +# Additionally check that `file` is not a directory, as on Windows +# directories pass the os.access check. +def _access_check(fn, mode): + return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn)) + +def which(cmd, mode=os.F_OK | os.X_OK, path=None): + """Given a command, mode, and a PATH string, return the path which + conforms to the given mode on the PATH, or None if there is no such + file. + `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result + of os.environ.get("PATH"), or can be overridden with a custom search + path. + """ + # If we're given a path with a directory part, look it up directly rather + # than referring to PATH directories. This includes checking relative to the + # current directory, e.g. 
./script + if os.path.dirname(cmd): + if _access_check(cmd, mode): + return cmd + return None + + if path is None: + path = os.environ.get("PATH", None) + if path is None: + try: + path = os.confstr("CS_PATH") + except (AttributeError, ValueError): + # os.confstr() or CS_PATH is not available + path = os.defpath + # bpo-35755: Don't use os.defpath if the PATH environment variable is + # set to an empty string + + # PATH='' doesn't match, whereas PATH=':' looks in the current directory + if not path: + return None + + path = path.split(os.pathsep) + + if sys.platform == "win32": + # The current directory takes precedence on Windows. + curdir = os.curdir + if curdir not in path: + path.insert(0, curdir) + + # PATHEXT is necessary to check on Windows. + pathext = os.environ.get("PATHEXT", "").split(os.pathsep) + # See if the given file matches any of the expected path extensions. + # This will allow us to short circuit when given "python.exe". + # If it does match, only test that one, otherwise we have to try + # others. + if any(cmd.lower().endswith(ext.lower()) for ext in pathext): + files = [cmd] + else: + files = [cmd + ext for ext in pathext] + else: + # On other platforms you don't have things like PATHEXT to tell you + # what file suffixes are executable, so just pass on cmd as-is. 
+ files = [cmd] + + seen = set() + for dir in path: + normdir = os.path.normcase(dir) + if not normdir in seen: + seen.add(normdir) + for thefile in files: + name = os.path.join(dir, thefile) + if _access_check(name, mode): + return name + return None + +# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +def print_warning(msg): + print(bcolors.FAIL + msg + bcolors.ENDC,file=sys.stderr) if __name__ == '__main__': import argparse import matplotlib.pyplot as plt import numpy as np - import subprocess + from subprocess import check_call from os.path import join import os import sys + import h5py parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file") - parser.add_argument("--dir", type=str, help="fitter directory", required=True) parser.add_argument("zdab", help="zdab input file") parser.add_argument("filenames", nargs='+', help="input files") + parser.add_argument("-o", "--output", type=str, help="output filename", required=True) args = parser.parse_args() - fit_results = {} + zdab_cat = which("zdab-cat") - # First we create a dictionary mapping (run, gtid) -> fit results. 
- for filename in args.filenames: - with open(filename) as f: - data = yaml.load(f,Loader=Loader) + if zdab_cat is None: + print("couldn't find zdab-cat in path!",file=sys.stderr) + sys.exit(1) - if data is None: - continue - - for event in data['data']: - if event['ev'] is None: - continue - - # if the ev branch is filled in, it means the event was fit - for ev in event['ev']: - # add the git SHA1 hash to the fit results since technically it - # could be different than the version in zdab-cat - ev['fit']['git_sha1'] = data['git_sha1'] - ev['fit']['git_dirty'] = data['git_dirty'] - fit_results[(ev['run'],ev['gtid'])] = ev['fit'] - - # Next we get the full event list along with the data cleaning word, FTP + # First we get the full event list along with the data cleaning word, FTP # position, FTK, and RSP energy from the original zdab and then add the fit # results. # # Note: We send stderr to /dev/null since there can be a lot of warnings # about PMT types and fit results with open(os.devnull, 'w') as f: - popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f) + check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f) total_events = 0 events_with_fit = 0 - - doc = {'data': []} - - for data in yaml.load_all(popen.stdout,Loader=Loader): - if 'ev' not in data: - doc.update(data) - continue - - for ev in data['ev']: - run = ev['run'] - gtid = ev['gtid'] - - if (run,gtid) in fit_results: - ev['fit'] = fit_results[(run,gtid)] - events_with_fit += 1 - - total_events += 1 - - doc['data'].append(data) - - popen.wait() + total_fits = 0 + + with h5py.File(args.output,"a") as fout: + total_events = fout['ev'].shape[0] + for filename in args.filenames: + with h5py.File(filename) as f: + # Check to see if the git sha1 match + if fout.attrs['git_sha1'] != f.attrs['git_sha1']: + print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename)) + # get fits which match up with the 
events + valid_fits = f['fits'][np.isin(f['fits'][:][['run','gtid']],fout['ev'][:][['run','gtid']])] + # Add the fit results + fout['fits'].resize((fout['fits'].shape[0]+valid_fits.shape[0],)) + fout['fits'][-valid_fits.shape[0]:] = valid_fits + events_with_fit += len(np.unique(valid_fits[['run','gtid']])) + total_fits += len(np.unique(f['fits']['run','gtid'])) # Print out number of fit results that were added. Hopefully, this will # make it easy to catch an error if, for example, this gets run with a # mismatching zdab and fit results - print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr) - - print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper)) + print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr) |