switch from YAML output to HDF5 to speed things up

author: tlatorre <tlatorre@uchicago.edu> 2019-07-11 09:42:23 -0500
committer: tlatorre <tlatorre@uchicago.edu> 2019-07-11 09:42:23 -0500
commit: 21491ca1ca2afd6951e9b5b1e74b1c919c602b36 (patch)
tree: b21b772612125c574928e4fb37221077d6a012d3 /utils/cat-grid-jobs
parent: 034253ab63f1029291fa046ce15760aae72ae5c5 (diff)
download: sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.gz
sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.bz2
sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.zip
1 files changed, 114 insertions, 49 deletions
diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs
index ba06290..83a8004 100755
--- a/utils/cat-grid-jobs
+++ b/utils/cat-grid-jobs
@@ -23,7 +23,7 @@ as YAML to stdout.
 
 Example:
 
-    $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
+    $ cat-grid-jobs -o output.hdf5 ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt
 
 """
 
@@ -33,80 +33,145 @@ try:
     from yaml import CLoader as Loader, CDumper as Dumper
 except ImportError:
     from yaml import Loader, Dumper
+import os
+import sys
+
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `fn` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+    """Locate the command `cmd` and return its path, or None if no file
+    satisfying `mode` is found. A `cmd` with a directory part is checked
+    directly; otherwise each directory listed in `path` is searched.
+    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the PATH
+    environment variable, falling back to os.confstr("CS_PATH") or
+    os.defpath when PATH is unset; pass a string to override it.
+    """
+    # If we're given a path with a directory part, look it up directly rather
+    # than referring to PATH directories. This includes checking relative to the
+    # current directory, e.g. ./script
+    if os.path.dirname(cmd):
+        if _access_check(cmd, mode):
+            return cmd
+        return None
+
+    if path is None:
+        path = os.environ.get("PATH", None)
+        if path is None:
+            try:
+                path = os.confstr("CS_PATH")
+            except (AttributeError, ValueError):
+                # os.confstr() or CS_PATH is not available
+                path = os.defpath
+        # bpo-35755: Don't use os.defpath if the PATH environment variable is
+        # set to an empty string
+
+    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+    if not path:
+        return None
+
+    path = path.split(os.pathsep)
+
+    if sys.platform == "win32":
+        # The current directory takes precedence on Windows.
+        curdir = os.curdir
+        if curdir not in path:
+            path.insert(0, curdir)
+
+        # PATHEXT is necessary to check on Windows.
+        pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+        # See if the given file matches any of the expected path extensions.
+        # This will allow us to short circuit when given "python.exe".
+        # If it does match, only test that one, otherwise we have to try
+        # others.
+        if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+            files = [cmd]
+        else:
+            files = [cmd + ext for ext in pathext]
+    else:
+        # On other platforms you don't have things like PATHEXT to tell you
+        # what file suffixes are executable, so just pass on cmd as-is.
+        files = [cmd]
+
+    seen = set()
+    for dir in path:
+        normdir = os.path.normcase(dir)
+        if not normdir in seen:  # search each distinct directory only once
+            seen.add(normdir)
+            for thefile in files:
+                name = os.path.join(dir, thefile)
+                if _access_check(name, mode):
+                    return name  # first match in search order wins
+    return None
+
+# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python
+class bcolors:  # escape-sequence string constants for colored terminal text
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'  # prefix that print_warning() puts before its message
+    ENDC = '\033[0m'  # suffix that print_warning() puts after its message
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+def print_warning(msg):  # write msg to stderr wrapped in bcolors.FAIL ... bcolors.ENDC
+    print(bcolors.FAIL + msg + bcolors.ENDC,file=sys.stderr)
 
 if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
-    import subprocess
+    from subprocess import check_call
     from os.path import join
     import os
     import sys
+    import h5py
 
     parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file")
-    parser.add_argument("--dir", type=str, help="fitter directory", required=True)
     parser.add_argument("zdab", help="zdab input file")
     parser.add_argument("filenames", nargs='+', help="input files")
+    parser.add_argument("-o", "--output", type=str, help="output filename", required=True)
     args = parser.parse_args()
 
-    fit_results = {}
+    zdab_cat = which("zdab-cat")
 
-    # First we create a dictionary mapping (run, gtid) -> fit results.
-    for filename in args.filenames:
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)
 
-        if data is None:
-            continue
-
-        for event in data['data']:
-            if event['ev'] is None:
-                continue
-
-            # if the ev branch is filled in, it means the event was fit
-            for ev in event['ev']:
-                # add the git SHA1 hash to the fit results since technically it
-                # could be different than the version in zdab-cat
-                ev['fit']['git_sha1'] = data['git_sha1']
-                ev['fit']['git_dirty'] = data['git_dirty']
-                fit_results[(ev['run'],ev['gtid'])] = ev['fit']
-
-    # Next we get the full event list along with the data cleaning word, FTP
+    # First we get the full event list along with the data cleaning word, FTP
     # position, FTK, and RSP energy from the original zdab and then add the fit
     # results.
     #
     # Note: We send stderr to /dev/null since there can be a lot of warnings
     # about PMT types and fit results
     with open(os.devnull, 'w') as f:
-        popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f)
+        check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f)
 
     total_events = 0
     events_with_fit = 0
-
-    doc = {'data': []}
-
-    for data in yaml.load_all(popen.stdout,Loader=Loader):
-        if 'ev' not in data:
-            doc.update(data)
-            continue
-
-        for ev in data['ev']:
-            run = ev['run']
-            gtid = ev['gtid']
-
-            if (run,gtid) in fit_results:
-                ev['fit'] = fit_results[(run,gtid)]
-                events_with_fit += 1
-
-            total_events += 1
-
-        doc['data'].append(data)
-
-    popen.wait()
+    total_fits = 0
+
+    with h5py.File(args.output,"a") as fout:
+        total_events = fout['ev'].shape[0]
+        for filename in args.filenames:
+            with h5py.File(filename,"r") as f: # inputs are only read, so open them read-only
+                # Warn (but keep going) if the fit results come from a different git sha1
+                if fout.attrs['git_sha1'] != f.attrs['git_sha1']:
+                    print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename))
+                # get fits whose (run, gtid) match up with an event in the output file
+                valid_fits = f['fits'][np.isin(f['fits'][:][['run','gtid']],fout['ev'][:][['run','gtid']])]
+                nfits = fout['fits'].shape[0] # explicit start index: [-0:] would select the whole dataset
+                fout['fits'].resize((nfits+valid_fits.shape[0],))
+                fout['fits'][nfits:] = valid_fits
+                events_with_fit += len(np.unique(valid_fits[['run','gtid']]))
+                total_fits += len(np.unique(f['fits']['run','gtid']))
 
     # Print out number of fit results that were added. Hopefully, this will
     # make it easy to catch an error if, for example, this gets run with a
     # mismatching zdab and fit results
-    print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr)
-
-    print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper))
+    print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr)
author	tlatorre <tlatorre@uchicago.edu>	2019-07-11 09:42:23 -0500
committer	tlatorre <tlatorre@uchicago.edu>	2019-07-11 09:42:23 -0500
commit	21491ca1ca2afd6951e9b5b1e74b1c919c602b36 (patch)
tree	b21b772612125c574928e4fb37221077d6a012d3 /utils/cat-grid-jobs
parent	034253ab63f1029291fa046ce15760aae72ae5c5 (diff)
download	sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.gz sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.bz2 sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.zip