Diffstat (limited to 'utils')
-rw-r--r--  utils/Makefile         |   1
-rwxr-xr-x  utils/cat-grid-jobs    | 163
-rw-r--r--  utils/fit-wrapper      |   7
-rwxr-xr-x  utils/plot             | 171
-rwxr-xr-x  utils/plot-energy      | 294
-rwxr-xr-x  utils/plot-fit-results | 297
-rwxr-xr-x  utils/submit-grid-jobs | 152

7 files changed, 564 insertions, 521 deletions
diff --git a/utils/Makefile b/utils/Makefile
index ef21a09..011a723 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -13,5 +13,6 @@ install:
 	$(INSTALL) plot-likelihood $(INSTALL_BIN)
 	$(INSTALL) submit-grid-jobs $(INSTALL_BIN)
 	$(INSTALL) gen-dark-matter $(INSTALL_BIN)
+	$(INSTALL) fit-wrapper $(INSTALL_BIN)
 
 .PHONY: install
diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs
index ba06290..83a8004 100755
--- a/utils/cat-grid-jobs
+++ b/utils/cat-grid-jobs
@@ -23,7 +23,7 @@ as YAML to stdout.
 
 Example:
 
-    $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
+    $ cat-grid-jobs ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
 
 """
@@ -33,80 +33,145 @@ try:
     from yaml import CLoader as Loader, CDumper as Dumper
 except ImportError:
     from yaml import Loader, Dumper
+import os
+import sys
+
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `file` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+    """Given a command, mode, and a PATH string, return the path which
+    conforms to the given mode on the PATH, or None if there is no such
+    file.
+    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result
+    of os.environ.get("PATH"), or can be overridden with a custom search
+    path.
+    """
+    # If we're given a path with a directory part, look it up directly rather
+    # than referring to PATH directories. This includes checking relative to the
+    # current directory, e.g. ./script
+    if os.path.dirname(cmd):
+        if _access_check(cmd, mode):
+            return cmd
+        return None
+
+    if path is None:
+        path = os.environ.get("PATH", None)
+        if path is None:
+            try:
+                path = os.confstr("CS_PATH")
+            except (AttributeError, ValueError):
+                # os.confstr() or CS_PATH is not available
+                path = os.defpath
+        # bpo-35755: Don't use os.defpath if the PATH environment variable is
+        # set to an empty string
+
+    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+    if not path:
+        return None
+
+    path = path.split(os.pathsep)
+
+    if sys.platform == "win32":
+        # The current directory takes precedence on Windows.
+        curdir = os.curdir
+        if curdir not in path:
+            path.insert(0, curdir)
+
+        # PATHEXT is necessary to check on Windows.
+        pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+        # See if the given file matches any of the expected path extensions.
+        # This will allow us to short circuit when given "python.exe".
+        # If it does match, only test that one, otherwise we have to try
+        # others.
+        if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+            files = [cmd]
+        else:
+            files = [cmd + ext for ext in pathext]
+    else:
+        # On other platforms you don't have things like PATHEXT to tell you
+        # what file suffixes are executable, so just pass on cmd as-is.
+        files = [cmd]
+
+    seen = set()
+    for dir in path:
+        normdir = os.path.normcase(dir)
+        if not normdir in seen:
+            seen.add(normdir)
+            for thefile in files:
+                name = os.path.join(dir, thefile)
+                if _access_check(name, mode):
+                    return name
+    return None
+
+# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+def print_warning(msg):
+    print(bcolors.FAIL + msg + bcolors.ENDC,file=sys.stderr)
 
 if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
-    import subprocess
+    from subprocess import check_call
     from os.path import join
     import os
     import sys
+    import h5py
 
     parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file")
-    parser.add_argument("--dir", type=str, help="fitter directory", required=True)
     parser.add_argument("zdab", help="zdab input file")
     parser.add_argument("filenames", nargs='+', help="input files")
+    parser.add_argument("-o", "--output", type=str, help="output filename", required=True)
     args = parser.parse_args()
 
-    fit_results = {}
+    zdab_cat = which("zdab-cat")
 
-    # First we create a dictionary mapping (run, gtid) -> fit results.
-    for filename in args.filenames:
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)
 
-        if data is None:
-            continue
-
-        for event in data['data']:
-            if event['ev'] is None:
-                continue
-
-            # if the ev branch is filled in, it means the event was fit
-            for ev in event['ev']:
-                # add the git SHA1 hash to the fit results since technically it
-                # could be different than the version in zdab-cat
-                ev['fit']['git_sha1'] = data['git_sha1']
-                ev['fit']['git_dirty'] = data['git_dirty']
-                fit_results[(ev['run'],ev['gtid'])] = ev['fit']
-
-    # Next we get the full event list along with the data cleaning word, FTP
+    # First we get the full event list along with the data cleaning word, FTP
     # position, FTK, and RSP energy from the original zdab and then add the fit
     # results.
     #
     # Note: We send stderr to /dev/null since there can be a lot of warnings
     # about PMT types and fit results
     with open(os.devnull, 'w') as f:
-        popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f)
+        check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f)
 
     total_events = 0
     events_with_fit = 0
-
-    doc = {'data': []}
-
-    for data in yaml.load_all(popen.stdout,Loader=Loader):
-        if 'ev' not in data:
-            doc.update(data)
-            continue
-
-        for ev in data['ev']:
-            run = ev['run']
-            gtid = ev['gtid']
-
-            if (run,gtid) in fit_results:
-                ev['fit'] = fit_results[(run,gtid)]
-                events_with_fit += 1
-
-            total_events += 1
-
-        doc['data'].append(data)
-
-    popen.wait()
+    total_fits = 0
+
+    with h5py.File(args.output,"a") as fout:
+        total_events = fout['ev'].shape[0]
+        for filename in args.filenames:
+            with h5py.File(filename) as f:
+                # Check to see if the git sha1 match
+                if fout.attrs['git_sha1'] != f.attrs['git_sha1']:
+                    print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename))
+                # get fits which match up with the events
+                valid_fits = f['fits'][np.isin(f['fits'][:][['run','gtid']],fout['ev'][:][['run','gtid']])]
+                # Add the fit results
+                fout['fits'].resize((fout['fits'].shape[0]+valid_fits.shape[0],))
+                fout['fits'][-valid_fits.shape[0]:] = valid_fits
+                events_with_fit += len(np.unique(valid_fits[['run','gtid']]))
+                total_fits += len(np.unique(f['fits']['run','gtid']))
 
     # Print out number of fit results that were added. Hopefully, this will
     # make it easy to catch an error if, for example, this gets run with a
     # mismatching zdab and fit results
-    print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr)
-
-    print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper))
+    print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr)
diff --git a/utils/fit-wrapper b/utils/fit-wrapper
new file mode 100644
index 0000000..940b807
--- /dev/null
+++ b/utils/fit-wrapper
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Simple wrapper script to run jobs on the grid
+
+module load hdf5
+module load zlib
+./fit "$@"
diff --git a/utils/plot b/utils/plot
--- a/utils/plot
+++ b/utils/plot
@@ -74,6 +74,8 @@ if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
+    import h5py
+    import pandas as pd
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
@@ -81,121 +83,113 @@ if __name__ == '__main__':
 
     for filename in args.filenames:
         print(filename)
-        with open(filename) as f:
-            data = yaml.load(f.read(),Loader=Loader)
-
-        dx = []
-        dy = []
-        dz = []
-        dT = []
-        thetas = []
-        likelihood_ratio = []
-        t_electron = []
-        t_muon = []
-        psi = []
-        for event in data['data']:
-            # get the particle ID
-            id = event['mcgn'][0]['id']
-
-            if 'ev' not in event:
-                continue
-
-            if 'fit' not in event['ev'][0]:
-                # if nhit < 100 we don't fit the event
-                continue
-
-            if id not in event['ev'][0]['fit']:
-                continue
-
-            fit = event['ev'][0]['fit']
-
-            mass = SNOMAN_MASS[id]
-            # for some reason it's the *second* track which seems to contain the
-            # initial energy
-            true_energy = event['mcgn'][0]['energy']
-            # The MCTK bank has the particle's total energy (except for neutrons)
-            # so we need to convert it into kinetic energy
-            ke = true_energy - mass
-            energy = fit[id]['energy']
-            dT.append(energy-ke)
-            true_posx = event['mcgn'][0]['posx']
-            posx = fit[id]['posx']
-            dx.append(posx-true_posx)
-            true_posy = event['mcgn'][0]['posy']
-            posy = fit[id]['posy']
-            dy.append(posy-true_posy)
-            true_posz = event['mcgn'][0]['posz']
-            posz = fit[id]['posz']
-            dz.append(posz-true_posz)
-            dirx = event['mcgn'][0]['dirx']
-            diry = event['mcgn'][0]['diry']
-            dirz = event['mcgn'][0]['dirz']
-            true_dir = [dirx,diry,dirz]
-            true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
-            theta = fit[id]['theta']
-            phi = fit[id]['phi']
-            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
-            dir = np.array(dir)/np.linalg.norm(dir)
-            thetas.append(np.degrees(np.arccos(np.dot(true_dir,dir))))
-            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
-                fmin_electron = fit[IDP_E_MINUS]['fmin']
-                fmin_muon = fit[IDP_MU_MINUS]['fmin']
-                likelihood_ratio.append(fmin_muon-fmin_electron)
-            if IDP_E_MINUS in fit:
-                t_electron.append(fit[IDP_E_MINUS]['time'])
-            if IDP_MU_MINUS in fit:
-                t_muon.append(fit[IDP_MU_MINUS]['time'])
-
-            if 'nhit' in event['ev'][0]:
-                nhit = event['ev'][0]['nhit']
-                psi.append(fit[id]['psi']/nhit)
-
-        if len(t_electron) < 2 or len(t_muon) < 2:
+
+        with h5py.File(filename) as f:
+            ev = pd.read_hdf(filename, "ev")
+            mcgn = pd.read_hdf(filename, "mcgn")
+            fits = pd.read_hdf(filename, "fits")
+
+        # get rid of 2nd events like Michel electrons
+        ev = ev.sort_values(['run','gtid']).groupby(['evn'],as_index=False).first()
+
+        # Now, we merge all three datasets together to produce a single
+        # dataframe. To do so, we join the ev dataframe with the mcgn frame
+        # on the evn column, and then join with the fits on the run and
+        # gtid columns.
+        #
+        # At the end we will have a single dataframe with one row for each
+        # fit, i.e. it will look like:
+        #
+        # >>> data
+        #   run gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+        #
+        # Before merging, we prefix the primary seed track table with mcgn_
+        # and the fit table with fit_ just to make things easier.
+
+        # Prefix track and fit frames
+        mcgn = mcgn.add_prefix("mcgn_")
+        fits = fits.add_prefix("fit_")
+
+        # merge ev and mcgn on evn
+        data = ev.merge(mcgn,left_on=['evn'],right_on=['mcgn_evn'])
+        # merge data and fits on run and gtid
+        data = data.merge(fits,left_on=['run','gtid'],right_on=['fit_run','fit_gtid'])
+
+        # calculate true kinetic energy
+        mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+        data['T'] = data['mcgn_energy'].values - mass
+        data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+        data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+        data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+        data['dT'] = data['fit_energy1'].values - data['T'].values
+
+        true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+        dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+                         np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+                         np.cos(data['fit_theta1']))).squeeze()
+
+        data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+        # only select fits which have at least 2 fits
+        data = data.groupby(['run','gtid']).filter(lambda x: len(x) > 1)
+        data_true = data[data['fit_id1'] == data['mcgn_id']]
+        data_e = data[data['fit_id1'] == IDP_E_MINUS]
+        data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+        data_true = data_true.set_index(['run','gtid'])
+        data_e = data_e.set_index(['run','gtid'])
+        data_mu = data_mu.set_index(['run','gtid'])
+
+        data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+        data_true['te'] = data_e['fit_time']
+        data_true['tm'] = data_mu['fit_time']
+        data_true['Te'] = data_e['fit_energy1']
+
+        if len(data_true) < 2:
             continue
 
-        mean, mean_error, std, std_error = get_stats(dT)
+        mean, mean_error, std, std_error = get_stats(data_true.dT)
 
         print("dT = %.2g +/- %.2g" % (mean, mean_error))
         print("std(dT) = %.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dx)
+        mean, mean_error, std, std_error = get_stats(data_true.dx)
         print("dx = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dx) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dy)
+        mean, mean_error, std, std_error = get_stats(data_true.dy)
         print("dy = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dy) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dz)
+        mean, mean_error, std, std_error = get_stats(data_true.dz)
         print("dz = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dz) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(thetas)
+        mean, mean_error, std, std_error = get_stats(data_true.theta)
         print("std(theta) = %4.2g +/- %.2g" % (std, std_error))
 
         plt.figure(1)
-        plot_hist(dT, label=filename)
+        plot_hist(data_true.dT, label=filename)
         plt.xlabel("Kinetic Energy difference (MeV)")
         plt.figure(2)
-        plot_hist(dx, label=filename)
+        plot_hist(data_true.dx, label=filename)
         plt.xlabel("X Position difference (cm)")
         plt.figure(3)
-        plot_hist(dy, label=filename)
+        plot_hist(data_true.dy, label=filename)
         plt.xlabel("Y Position difference (cm)")
         plt.figure(4)
-        plot_hist(dz, label=filename)
+        plot_hist(data_true.dz, label=filename)
         plt.xlabel("Z Position difference (cm)")
         plt.figure(5)
-        plot_hist(thetas, label=filename)
+        plot_hist(data_true.theta, label=filename)
         plt.xlabel(r"$\theta$ (deg)")
         plt.figure(6)
-        plot_hist(likelihood_ratio, label=filename)
+        plot_hist(data_true.ratio, label=filename)
         plt.xlabel(r"Log Likelihood Ratio ($e/\mu$)")
         plt.figure(7)
-        plot_hist(np.array(t_electron)/1e3/60.0, label=filename)
+        plot_hist(data_true.te/1e3/60.0, label=filename)
         plt.xlabel(r"Electron Fit time (minutes)")
         plt.figure(8)
-        plot_hist(np.array(t_muon)/1e3/60.0, label=filename)
+        plot_hist(data_true.tm/1e3/60.0, label=filename)
         plt.xlabel(r"Muon Fit time (minutes)")
-        if len(psi):
-            plt.figure(9)
-            plot_hist(psi, label=filename)
-            plt.xlabel(r"$\Psi$/Nhit")
+        plt.figure(9)
+        plot_hist(data_true.fit_psi/data_true.nhit, label=filename)
+        plt.xlabel(r"$\Psi$/Nhit")
 
     plot_legend(1)
     plot_legend(2)
@@ -205,6 +199,5 @@ if __name__ == '__main__':
     plot_legend(6)
     plot_legend(7)
     plot_legend(8)
-    if len(psi):
-        plot_legend(9)
+    plot_legend(9)
     plt.show()
diff --git a/utils/plot-energy b/utils/plot-energy
index a0274b7..4a8521b 100755
--- a/utils/plot-energy
+++ b/utils/plot-energy
@@ -107,140 +107,69 @@ if __name__ == '__main__':
     import numpy as np
     import pandas as pd
     import sys
+    import h5py
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()
 
-    events = []
-    fit_results = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
-
-        for i, event in enumerate(data['data']):
-            for ev in event['ev']:
-                events.append((
-                    ev['run'],
-                    ev['gtr'],
-                    ev['nhit'],
-                    ev['gtid'],
-                    ev['dc'],
-                    ev['ftp']['x'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['y'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['z'] if 'ftp' in ev else np.nan,
-                    ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan,
-                ))
-
-                if 'fit' not in ev:
-                    continue
-
-                for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]:
-                    # FIXME: Should I just store the particle ids in the YAML
-                    # output as a list of particle ids instead of a single
-                    # integer?
-                    ids = map(int,chunks(str(id),2))
-                    energy = 0.0
-                    skip = False
-                    for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])):
-                        energy += ke + SNOMAN_MASS[i]
-
-                        # This is a bit of a hack. It appears that many times
-                        # the fit will actually do much better by including a
-                        # very low energy electron or muon. I believe the
-                        # reason for this is that of course my likelihood
-                        # function is not perfect (for example, I don't include
-                        # the correct angular distribution for Rayleigh
-                        # scattered light), and so the fitter often wants to
-                        # add a very low energy electron or muon to fix things.
-                        #
-                        # Ideally I would fix the likelihood function, but for
-                        # now we just discard any fit results which have a very
-                        # low energy electron or muon.
-                        if len(ids) > 1 and i == 20 and ke < 20.0:
-                            skip = True
-
-                        if len(ids) > 1 and i == 22 and ke < 200.0:
-                            skip = True
-
-                    if skip:
-                        continue
-
-                    # Calculate the approximate Ockham factor.
-                    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
-                    #
-                    # Note: This is a really approximate form by assuming that
-                    # the shape of the likelihood space is equal to the average
-                    # uncertainty in the different parameters.
-                    w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi))
-
-                    fit_results.append((
-                        ev['run'],
-                        ev['gtid'],
-                        id,
-                        fit_result['posx'],
-                        fit_result['posy'],
-                        fit_result['posz'],
-                        fit_result['t0'],
-                        energy,
-                        np.atleast_1d(fit_result['theta'])[0],
-                        np.atleast_1d(fit_result['phi'])[0],
-                        fit_result['fmin'] - w,
-                        fit_result['psi']/ev['nhit']))
-
-    # create a dataframe
-    # note: we have to first create a numpy structured array since there is no
-    # way to pass a list of data types to the DataFrame constructor. See
-    # https://github.com/pandas-dev/pandas/issues/4464
-    array = np.array(fit_results,
-                     dtype=[('run',np.int),      # run number
-                            ('gtid',np.int),     # gtid
-                            ('id',np.int),       # particle id
-                            ('x', np.double),    # x
-                            ('y',np.double),     # y
-                            ('z',np.double),     # z
-                            ('t0',np.double),    # t0
-                            ('ke',np.double),    # kinetic energy
-                            ('theta',np.double), # direction of 1st particle
-                            ('phi',np.double),   # direction of 2nd particle
-                            ('fmin',np.double),  # negative log likelihood
-                            ('psi',np.double)]   # goodness of fit parameter
-                     )
-    df = pd.DataFrame.from_records(array)
-
-    array = np.array(events,
-                     dtype=[('run',np.int),           # run number
-                            ('gtr',np.double),        # 50 MHz clock in ns
-                            ('nhit',np.int),          # number of PMTs hit
-                            ('gtid',np.int),          # gtid
-                            ('dc',np.int),            # data cleaning word
-                            ('ftpx',np.double),       # FTP fitter X position
-                            ('ftpy',np.double),       # FTP fitter Y position
-                            ('ftpz',np.double),       # FTP fitter Z position
-                            ('rsp_energy',np.double)] # RSP energy
-                     )
-    df_ev = pd.DataFrame.from_records(array)
+    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames])
+
+    # This is a bit of a hack. It appears that many times the fit will
+    # actually do much better by including a very low energy electron or
+    # muon. I believe the reason for this is that of course my likelihood
+    # function is not perfect (for example, I don't include the correct
+    # angular distribution for Rayleigh scattered light), and so the fitter
+    # often wants to add a very low energy electron or muon to fix things.
+    #
+    # Ideally I would fix the likelihood function, but for now we just
+    # discard any fit results which have a very low energy electron or
+    # muon.
+    #
+    # FIXME: Test this since query() is new to pandas
+    fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))')
+    fits = fits.query('not (n > 1 and ((id2 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')
+
+    # Calculate the approximate Ockham factor.
+    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
+    #
+    # Note: This is a really approximate form by assuming that the shape of
+    # the likelihood space is equal to the average uncertainty in the
+    # different parameters.
+    fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi))
+
+    # Note: we index on the left hand site with loc to avoid a copy error
+    #
+    # See https://www.dataquest.io/blog/settingwithcopywarning/
+    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2'])
+    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3'])
+
+    fits['fmin'] = fits['fmin'] - fits['w']
+
+    fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit']
+    fits['ke'] = fits['energy1']
+    fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000
+    fits['theta'] = fits['theta1']
 
     # Make sure events are in order. We use run number and GTID here which
     # should be in time order as well except for bit flips in the GTID
     # This is important for later when we look at time differences in the 50
     # MHz clock. We should perhaps do a check here based on the 10 MHz clock to
     # make sure things are in order
-    df_ev = df_ev.sort_values(by=['run','gtid'])
+    ev = ev.sort_values(by=['run','gtid'])
 
-    print("number of events = %i" % len(df_ev))
+    print("number of events = %i" % len(ev))
 
     # First, do basic data cleaning which is done for all events.
-    df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
+    ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
 
-    print("number of events after data cleaning = %i" % len(df_ev))
+    print("number of events after data cleaning = %i" % len(ev))
 
     # Now, we select events tagged by the muon tag which should tag only
     # external muons. We keep the sample of muons since it's needed later to
     # identify Michel electrons and to apply the muon follower cut
-    muons = df_ev[(df_ev.dc & DC_MUON) != 0]
+    muons = ev[(ev.dc & DC_MUON) != 0]
 
     print("number of muons = %i" % len(muons))
 
@@ -251,12 +180,12 @@ if __name__ == '__main__':
     #
     # Should I use 10 MHz clock here since the time isn't critical and then I
     # don't have to worry about 50 MHz clock rollover?
-    prompt = df_ev[df_ev.nhit >= 100]
+    prompt = ev[ev.nhit >= 100]
     prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6))
 
     # Michel electrons and neutrons can be any event which is not a prompt
     # event
-    follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]])
+    follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]])
 
     # Apply the prompt mask
     prompt = prompt[prompt_mask]
 
@@ -275,39 +204,78 @@ if __name__ == '__main__':
     if prompt_plus_muons.size and follower.size:
         # require Michel events to pass more of the SNO data cleaning cuts
         michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
+        michel = michel[michel.nhit >= 100]
 
-        # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them
-        # The 800 nanoseconds cut comes from Richie's thesis. He also mentions
-        # that the In Time Channel Spread Cut is very effective at cutting
-        # electrons events caused by muons, so I should implement that
-        michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \
-                               (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \
-                               (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+        # Accept events which had a muon more than 800 nanoseconds but less
+        # than 20 microseconds before them. The 800 nanoseconds cut comes from
+        # Richie's thesis. He also mentions that the In Time Channel Spread Cut
+        # is very effective at cutting electrons events caused by muons, so I
+        # should implement that.
+        #
+        # Note: We currently don't look across run boundaries. This should be a
+        # *very* small effect, and the logic to do so would be very complicated
+        # since I would have to deal with 50 MHz clock rollovers, etc.
+        #
+        # Do a simple python for loop here over the runs since we create
+        # temporary arrays with shape (michel.size,prompt_plus_muons.size)
+        # which could be too big for the full dataset.
+        #
+        # There might be some clever way to do this without the loop in Pandas,
+        # but I don't know how.
+        michel_sum = None
+        for run, michel_run in michel.groupby(['run']):
+            prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run]
+            michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \
+                                           (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+            if michel_sum is None:
+                michel_sum = michel_run
+            else:
+                michel_sum = michel_sum.append(michel_run)
+
+        michel = michel_sum
     else:
         michel = pd.DataFrame(columns=follower.columns)
 
     if prompt.size and follower.size:
         # neutron followers have to obey stricter set of data cleaning cuts
         neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
-        neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)]
-        r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2)
+        neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)]
+        r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2)
         neutron = neutron[r < AV_RADIUS]
         neutron = neutron[neutron.rsp_energy > 4.0]
+
         # neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)
         # FIXME: need to deal with 50 MHz clock rollover
-        neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \
-                              (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \
-                              (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1)
-        df_atm = prompt[neutron_mask]
-        prompt = prompt[~neutron_mask]
+        prompt_sum = None
+        atm_sum = None
+        for run, prompt_run in prompt.groupby(['run']):
+            neutron_run = neutron[neutron['run'] == run]
+
+            neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \
+                                  (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1)
+
+            if prompt_sum is None:
+                prompt_sum = prompt_run[~neutron_mask]
+            else:
+                prompt_sum = prompt_sum.append(prompt_run[~neutron_mask])
+
+            if atm_sum is None:
+                atm_sum = prompt_run[neutron_mask]
+            else:
+                atm_sum = atm_sum.append(prompt_run[neutron_mask])
+
+        atm = atm_sum
+        prompt = prompt_sum
     else:
-        df_atm = pd.DataFrame(columns=prompt.columns)
+        atm = pd.DataFrame(columns=prompt.columns)
 
     print("number of events after neutron follower cut = %i" % len(prompt))
 
     # Get rid of muon events in our main event sample
     prompt = prompt[(prompt.dc & DC_MUON) == 0]
-    df_atm = df_atm[(df_atm.dc & DC_MUON) == 0]
+    atm = atm[(atm.dc & DC_MUON) == 0]
 
     print("number of events after muon cut = %i" % len(prompt))
 
@@ -317,52 +285,52 @@ if __name__ == '__main__':
     prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \
                             (prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \
                             (prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
-    df_atm = df_atm[~np.any((df_atm.run.values == muons.run.values[:,np.newaxis]) & \
-                            (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
-                            (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
+    atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \
+                      (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
+                      (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
 
     # Check to see if there are any events with missing fit information
-    df_atm_ra = df_atm[['run','gtid']].to_records(index=False)
+    atm_ra = atm[['run','gtid']].to_records(index=False)
     muons_ra = muons[['run','gtid']].to_records(index=False)
     prompt_ra = prompt[['run','gtid']].to_records(index=False)
     michel_ra = michel[['run','gtid']].to_records(index=False)
-    df_ra = df[['run','gtid']].to_records(index=False)
+    fits_ra = fits[['run','gtid']].to_records(index=False)
 
-    if np.count_nonzero(~np.isin(df_atm_ra,df_ra)):
-        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra)))
+    if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)):
+        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(muons_ra,df_ra)):
-        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra)))
+    if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)):
+        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(prompt_ra,df_ra)):
-        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra)))
+    if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)):
+        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(michel_ra,df_ra)):
-        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra)))
+    if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)):
+        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra)))
 
     # Now, we merge the event info with the fitter info.
     #
     # Note: This means that the dataframe now contains multiple rows for each
     # event, one for each fit hypothesis.
-    df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid'])
-    muons = pd.merge(df,muons,how='inner',on=['run','gtid'])
-    michel = pd.merge(df,michel,how='inner',on=['run','gtid'])
-    df = pd.merge(df,prompt,how='inner',on=['run','gtid'])
+    atm = pd.merge(fits,atm,how='inner',on=['run','gtid'])
+    muons = pd.merge(fits,muons,how='inner',on=['run','gtid'])
+    michel = pd.merge(fits,michel,how='inner',on=['run','gtid'])
+    prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid'])
 
     # get rid of events which don't have a fit
-    nan = np.isnan(df.fmin.values)
+    nan = np.isnan(prompt.fmin.values)
 
     if np.count_nonzero(nan):
-        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid'])))
+        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid'])))
 
-    df = df[~nan]
+    prompt = prompt[~nan]
 
-    nan_atm = np.isnan(df_atm.fmin.values)
+    nan_atm = np.isnan(atm.fmin.values)
 
     if np.count_nonzero(nan_atm):
-        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid'])))
+        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid'])))
 
-    df_atm = df_atm[~nan_atm]
+    atm = atm[~nan_atm]
 
     nan_muon = np.isnan(muons.fmin.values)
 
@@ -379,24 +347,24 @@ if __name__ == '__main__':
     michel = michel[~nan_michel]
 
     # get the best fit
-    df = df.sort_values('fmin').groupby(['run','gtid']).first()
-    df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first()
+    prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()
+    atm = atm.sort_values('fmin').groupby(['run','gtid']).first()
     michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first()
     muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first()
     muons = muons[muons.id == 22]
 
     # require r < 6 meters
-    df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS]
-    df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS]
+    prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS]
+    atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS]
 
-    print("number of events after radius cut = %i" % len(df))
+    print("number of events after radius cut = %i" % len(prompt))
 
     # Note: Need to design and apply a psi based cut here
 
     plt.figure()
-    plot_hist(df,"Without Neutron Follower")
+    plot_hist(prompt,"Without Neutron Follower")
     plt.figure()
-    plot_hist(df_atm,"With Neutron Follower")
+    plot_hist(atm,"With Neutron Follower")
     plt.figure()
     plot_hist(michel_best_fit,"Michel Electrons")
     plt.figure()
diff --git a/utils/plot-fit-results b/utils/plot-fit-results
index 773a0dc..7115b81 100755
--- a/utils/plot-fit-results
+++ b/utils/plot-fit-results
@@ -15,11 +15,6 @@
 # this program. If not, see <https://www.gnu.org/licenses/>.
 
 from __future__ import print_function, division
 
-import yaml
-try:
-    from yaml import CLoader as Loader
-except ImportError:
-    from yaml.loader import SafeLoader as Loader
 import numpy as np
 from scipy.stats import iqr
 from matplotlib.lines import Line2D
@@ -71,228 +66,137 @@ def get_stats(x):
     error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
     return mean, std/np.sqrt(n), std, error
 
+def std_err(x):
+    x = x.dropna()
+    mean = np.mean(x)
+    std = np.std(x)
+    n = len(x)
+    if n == 0:
+        return np.nan
+    u4 = np.mean((x-mean)**4)
+    error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
+    return error
+
 if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
+    import h5py
+    import pandas as pd
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()
 
-    events = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f.read(),Loader=Loader)
-
-        a = np.ma.empty(len(data['data']),
-                        dtype=[('id',np.int),       # particle id
-                               ('T', np.double),    # true energy
-                               ('dx',np.double),    # dx
-                               ('dy',np.double),    # dy
-                               ('dz',np.double),    # dz
-                               ('dT',np.double),    # dT
-                               ('theta',np.double), # theta
-                               ('ratio',np.double), # likelihood ratio
-                               ('te',np.double),    # time electron
-                               ('tm',np.double),    # time muon
-                               ('Te',np.double)]    # electron energy
-                        )
-
-        for i, event in enumerate(data['data']):
-            # get the particle ID
-            id = event['mcgn'][0]['id']
-
-            a[i]['id'] = id
-
-            if 'fit' not in event['ev'][0]:
-                # if nhit < 100 we don't fit the event
-                continue
-
-            if id not in event['ev'][0]['fit']:
-                a[i] = np.ma.masked
-                continue
-
-            mass = SNOMAN_MASS[id]
-            # for some reason it's the *second* track which seems to contain the
-            # initial energy
-            true_energy = event['mcgn'][0]['energy']
-            # The MCTK bank has the particle's total energy (except for neutrons)
-            # so we need to convert it into kinetic energy
-            ke = true_energy - mass
-
-            fit = event['ev'][0]['fit']
-
-            a[i]['T'] = ke
-            energy = fit[id]['energy']
-            a[i]['dT'] = energy-ke
-
-            # store the fit position residuals
-            true_posx = event['mcgn'][0]['posx']
-            posx = fit[id]['posx']
-            a[i]['dx'] = posx-true_posx
-            true_posy = event['mcgn'][0]['posy']
-            posy = fit[id]['posy']
-            a[i]['dy'] = posy-true_posy
-            true_posz = event['mcgn'][0]['posz']
-            posz = fit[id]['posz']
-            a[i]['dz'] = posz-true_posz
-
-            # compute the angle between the fit direction and the true
-            # direction
-            dirx = event['mcgn'][0]['dirx']
-            diry = event['mcgn'][0]['diry']
-            dirz = event['mcgn'][0]['dirz']
-            true_dir = [dirx,diry,dirz]
-            true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
-            theta = fit[id]['theta']
-            phi = fit[id]['phi']
-            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
-            dir = np.array(dir)/np.linalg.norm(dir)
-            a[i]['theta'] = np.degrees(np.arccos(np.dot(true_dir,dir)))
-
-            # compute the log likelihood ratio
-            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
-                fmin_electron = fit[IDP_E_MINUS]['fmin']
-                fmin_muon = fit[IDP_MU_MINUS]['fmin']
-                a[i]['ratio'] = fmin_muon-fmin_electron
-            else:
-                a[i]['ratio'] = np.ma.masked
-
-            # store the time taken for electron and muon fits
-            if IDP_E_MINUS in fit:
-                a[i]['te'] = fit[IDP_E_MINUS]['time']
-                a[i]['Te'] = fit[IDP_E_MINUS]['energy']
-            else:
-                a[i]['te'] = np.ma.masked
-                a[i]['Te'] = np.ma.masked
-            if IDP_MU_MINUS in fit:
-                a[i]['tm'] = fit[IDP_MU_MINUS]['time']
-            else:
-                a[i]['tm'] = np.ma.masked
-
-        events.append(a)
-
-    a = np.ma.concatenate(events)
-
+    # Read in all the data.
+    #
+    # Note: We have to add the filename as a column to each dataset since this
+    # script is used to analyze MC data which all has the same run number.
+    ev = pd.concat([pd.read_hdf(filename, "ev").assign(filename=filename) for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits").assign(filename=filename) for filename in args.filenames])
+    mcgn = pd.concat([pd.read_hdf(filename, "mcgn").assign(filename=filename) for filename in args.filenames])
+
+    # get rid of 2nd events like Michel electrons
+    ev = ev.sort_values(['run','gtid']).groupby(['filename','evn'],as_index=False).first()
+
+    # Now, we merge all three datasets together to produce a single
+    # dataframe. To do so, we join the ev dataframe with the mcgn frame
+    # on the evn column, and then join with the fits on the run and
+    # gtid columns.
+    #
+    # At the end we will have a single dataframe with one row for each
+    # fit, i.e. it will look like:
+    #
+    # >>> data
+    #   run gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+    #
+    # Before merging, we prefix the primary seed track table with mcgn_
+    # and the fit table with fit_ just to make things easier.
+
+    # Prefix track and fit frames
+    mcgn = mcgn.add_prefix("mcgn_")
+    fits = fits.add_prefix("fit_")
+
+    # merge ev and mcgn on evn
+    data = ev.merge(mcgn,left_on=['filename','evn'],right_on=['mcgn_filename','mcgn_evn'])
+    # merge data and fits on run and gtid
+    data = data.merge(fits,left_on=['filename','run','gtid'],right_on=['fit_filename','fit_run','fit_gtid'])
+
+    # calculate true kinetic energy
+    mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+    data['T'] = data['mcgn_energy'].values - mass
+    data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+    data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+    data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+    data['dT'] = data['fit_energy1'].values - data['T'].values
+
+    true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+    dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+                     np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+                     np.cos(data['fit_theta1']))).squeeze()
+
+    data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+    # only select fits which have at least 2 fits
+    data = data.groupby(['filename','run','gtid']).filter(lambda x: len(x) > 1)
+    data_true = data[data['fit_id1'] == data['mcgn_id']]
+    data_e = data[data['fit_id1'] == IDP_E_MINUS]
+    data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+    data_true = data_true.set_index(['filename','run','gtid'])
+    data_e = data_e.set_index(['filename','run','gtid'])
+    data_mu = data_mu.set_index(['filename','run','gtid'])
+
+    data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+    data_true['te'] = data_e['fit_time']
+    data_true['tm'] = data_mu['fit_time']
+    data_true['Te'] = data_e['fit_energy1']
+
+    # 100 MeV bins between 50 MeV and 1 GeV
     bins = np.arange(50,1000,100)
 
-    stats_array = np.ma.empty(len(bins)-1,
-                              dtype=[('T', np.double),
-                                     ('dT', np.double),
-                                     ('dT_err', np.double),
-                                     ('dT_std', np.double),
-                                     ('dT_std_err', np.double),
-                                     ('dx', np.double),
-                                     ('dx_err', np.double),
-                                     ('dx_std', np.double),
-                                     ('dx_std_err', np.double),
-                                     ('dy', np.double),
-                                     ('dy_err', np.double),
-                                     ('dy_std', np.double),
-                                     ('dy_std_err', np.double),
-                                     ('dz', np.double),
-                                     ('dz_err', np.double),
-                                     ('dz_std', np.double),
-                                     ('dz_std_err', np.double),
-                                     ('theta', np.double),
-                                     ('theta_err', np.double),
-                                     ('theta_std', np.double),
-                                     ('theta_std_err', np.double)])
-
-    stats = {IDP_E_MINUS: stats_array, IDP_MU_MINUS: stats_array.copy()}
-
-    for id in stats:
-        electron_events = a[a['id'] == id]
-
-        for i, (ablah, b) in enumerate(zip(bins[:-1], bins[1:])):
-            events = electron_events[(electron_events['T'] >= ablah) & (electron_events['T'] < b)]
-
-            if len(events) < 2:
-                stats[id][i] = np.ma.masked
-                continue
-
-            stats[id][i]['T'] = (ablah+b)/2
-            mean, mean_error, std, std_error = get_stats(events['dT'].compressed())
-            stats[id][i]['dT'] = mean
-            stats[id][i]['dT_err'] = mean_error
-            stats[id][i]['dT_std'] = std
-            stats[id][i]['dT_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dx'].compressed())
-            stats[id][i]['dx'] = mean
-            stats[id][i]['dx_err'] = mean_error
-            stats[id][i]['dx_std'] = std
-            stats[id][i]['dx_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dy'].compressed())
-            stats[id][i]['dy'] = mean
-            stats[id][i]['dy_err'] = mean_error
-            stats[id][i]['dy_std'] = std
-            stats[id][i]['dy_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dz'].compressed())
-            stats[id][i]['dz'] = mean
-            stats[id][i]['dz_err'] = mean_error
-            stats[id][i]['dz_std'] = std
-            stats[id][i]['dz_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['theta'].compressed())
-            stats[id][i]['theta'] = mean
-            stats[id][i]['theta_err'] = mean_error
-            stats[id][i]['theta_std'] = std
-            stats[id][i]['theta_std_err'] = std_error
-
-    for id in stats:
+    for id in [IDP_E_MINUS, IDP_MU_MINUS]:
+        events = data_true[data_true['mcgn_id'] == id]
+
+        pd_bins = pd.cut(events['T'],bins)
+
+        dT = events.groupby(pd_bins)['dT'].agg(['mean','sem','std',std_err])
+        dx = events.groupby(pd_bins)['dx'].agg(['mean','sem','std',std_err])
+        dy = events.groupby(pd_bins)['dy'].agg(['mean','sem','std',std_err])
+        dz = events.groupby(pd_bins)['dz'].agg(['mean','sem','std',std_err])
+        theta = events.groupby(pd_bins)['theta'].agg(['mean','sem','std',std_err])
+
         label = 'Muon' if id == IDP_MU_MINUS else 'Electron'
 
-        T = stats[id]['T']
-        dT = stats[id]['dT']
-        dT_err = stats[id]['dT_err']
-        std_dT = stats[id]['dT_std']
-        std_dT_err = stats[id]['dT_std_err']
-        dx = stats[id]['dx']
-        dx_err = stats[id]['dx_err']
-        std_dx = stats[id]['dx_std']
-        std_dx_err = stats[id]['dx_std_err']
-        dy = stats[id]['dy']
-        dy_err = stats[id]['dy_err']
-        std_dy = stats[id]['dy_std']
-        std_dy_err = stats[id]['dy_std_err']
-        dz = stats[id]['dz']
-        dz_err = stats[id]['dz_err']
-        std_dz = stats[id]['dz_std']
-        std_dz_err = stats[id]['dz_std_err']
-        theta = stats[id]['theta']
-        theta_err = stats[id]['theta_err']
-        std_theta = stats[id]['theta_std']
-        std_theta_err = stats[id]['theta_std_err']
+        T = (bins[1:] + bins[:-1])/2
 
         plt.figure(1)
-        plt.errorbar(T,dT*100/T,yerr=dT_err*100/T,fmt='o',label=label)
+        plt.errorbar(T,dT['mean']*100/T,yerr=dT['sem']*100/T,fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Energy bias (%)")
         plt.title("Energy Bias")
         plt.legend()
 
         plt.figure(2)
-        plt.errorbar(T,std_dT*100/T,yerr=std_dT_err*100/T,fmt='o',label=label)
+        plt.errorbar(T,dT['std']*100/T,yerr=dT['std_err']*100/T,fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Energy resolution (%)")
         plt.title("Energy Resolution")
         plt.legend()
 
         plt.figure(3)
-        plt.errorbar(T,dx,yerr=dx_err,fmt='o',label='%s (x)' % label)
-        plt.errorbar(T,dy,yerr=dy_err,fmt='o',label='%s (y)' % label)
-        plt.errorbar(T,dz,yerr=dz_err,fmt='o',label='%s (z)' % label)
+        plt.errorbar(T,dx['mean'],yerr=dx['sem'],fmt='o',label='%s (x)' % label)
+        plt.errorbar(T,dy['mean'],yerr=dy['sem'],fmt='o',label='%s (y)' % label)
+        plt.errorbar(T,dz['mean'],yerr=dz['sem'],fmt='o',label='%s (z)' % label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Position bias (cm)")
         plt.title("Position Bias")
         plt.legend()
 
         plt.figure(4)
-        plt.errorbar(T,std_dx,yerr=std_dx_err,fmt='o',label='%s (x)' % label)
-        plt.errorbar(T,std_dy,yerr=std_dy_err,fmt='o',label='%s (y)' % label)
-        plt.errorbar(T,std_dz,yerr=std_dz_err,fmt='o',label='%s (z)' % label)
+        plt.errorbar(T,dx['std'],yerr=dx['std_err'],fmt='o',label='%s (x)' % label)
+        plt.errorbar(T,dy['std'],yerr=dy['std_err'],fmt='o',label='%s (y)' % label)
+        plt.errorbar(T,dz['std'],yerr=dz['std_err'],fmt='o',label='%s (z)' % label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Position resolution (cm)")
         plt.title("Position Resolution")
@@ -300,7 +204,7 @@ if __name__ == '__main__':
         plt.legend()
 
         plt.figure(5)
-        plt.errorbar(T,std_theta,yerr=std_theta_err,fmt='o',label=label)
+        plt.errorbar(T,theta['std'],yerr=theta['std_err'],fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Angular resolution (deg)")
         plt.title("Angular Resolution")
@@ -308,9 +212,10 @@ if __name__ == '__main__':
         plt.legend()
 
         plt.figure(6)
-        plt.scatter(a[a['id'] == id]['Te'],a[a['id'] == id]['ratio'],label=label)
+        plt.scatter(events['Te'],events['ratio'],label=label)
         plt.xlabel("Reconstructed Electron Energy (MeV)")
         plt.ylabel(r"Log Likelihood Ratio (e/$\mu$)")
         plt.title("Log Likelihood Ratio vs Reconstructed Electron Energy")
         plt.legend()
+
     plt.show()
diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs
index 670172d..63cbcb7 100755
--- a/utils/submit-grid-jobs
+++ b/utils/submit-grid-jobs
@@ -24,6 +24,83 @@ import string
 from os.path import split, splitext, join, abspath
 import uuid
 from subprocess import check_call
+import os
+import sys
+
+# Next two functions are a backport of the shutil.which() function from Python
+# 3.3 from Lib/shutil.py in the CPython code. See
+# https://github.com/python/cpython/blob/master/Lib/shutil.py.
+
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `file` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+    """Given a command, mode, and a PATH string, return the path which
+    conforms to the given mode on the PATH, or None if there is no such
+    file.
+    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result
+    of os.environ.get("PATH"), or can be overridden with a custom search
+    path.
+    """
+    # If we're given a path with a directory part, look it up directly rather
+    # than referring to PATH directories. This includes checking relative to the
+    # current directory, e.g. ./script
+    if os.path.dirname(cmd):
+        if _access_check(cmd, mode):
+            return cmd
+        return None
+
+    if path is None:
+        path = os.environ.get("PATH", None)
+        if path is None:
+            try:
+                path = os.confstr("CS_PATH")
+            except (AttributeError, ValueError):
+                # os.confstr() or CS_PATH is not available
+                path = os.defpath
+        # bpo-35755: Don't use os.defpath if the PATH environment variable is
+        # set to an empty string
+
+    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+    if not path:
+        return None
+
+    path = path.split(os.pathsep)
+
+    if sys.platform == "win32":
+        # The current directory takes precedence on Windows.
+        curdir = os.curdir
+        if curdir not in path:
+            path.insert(0, curdir)
+
+        # PATHEXT is necessary to check on Windows.
+        pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+        # See if the given file matches any of the expected path extensions.
+        # This will allow us to short circuit when given "python.exe".
+        # If it does match, only test that one, otherwise we have to try
+        # others.
+        if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+            files = [cmd]
+        else:
+            files = [cmd + ext for ext in pathext]
+    else:
+        # On other platforms you don't have things like PATHEXT to tell you
+        # what file suffixes are executable, so just pass on cmd as-is.
+        files = [cmd]
+
+    seen = set()
+    for dir in path:
+        normdir = os.path.normcase(dir)
+        if not normdir in seen:
+            seen.add(normdir)
+            for thefile in files:
+                name = os.path.join(dir, thefile)
+                if _access_check(name, mode):
+                    return name
+    return None
 
 CONDOR_TEMPLATE = \
 """
@@ -42,7 +119,7 @@ log = @log
 
 # The below are good base requirements for first testing jobs on OSG,
 # if you don't have a good idea of memory and disk usage.
-requirements = (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")
+requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")
 request_cpus = 1
 request_memory = 1 GB
 request_disk = 1 GB
@@ -72,14 +149,24 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
     prefix = "%s_%08i_%s" % (root,gtid,ID.hex)
 
     # fit output filename
-    output = "%s.txt" % prefix
+    output = "%s.hdf5" % prefix
     # condor submit filename
     condor_submit = "%s.submit" % prefix
 
     # set up the arguments for the template
-    executable = join(dir,"fit")
+    executable = which("fit")
+    wrapper = which("fit-wrapper")
+
+    if executable is None:
+        print("couldn't find fit in path!",file=sys.stderr)
+        sys.exit(1)
+
+    if wrapper is None:
+        print("couldn't find fit-wrapper in path!",file=sys.stderr)
+        sys.exit(1)
+
     args = [tail,"-o",output,"--gtid",gtid,"--min-nhit",min_nhit,"--max-particles",max_particles]
-    transfer_input_files = ",".join([filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])
+    transfer_input_files = ",".join([executable,filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])
     transfer_output_files = ",".join([output])
 
     condor_error = "%s.error" % prefix
@@ -89,7 +176,7 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
     template = MyTemplate(CONDOR_TEMPLATE)
 
     submit_string = template.safe_substitute(
-            executable=executable,
+            executable=wrapper,
             args=" ".join(map(str,args)),
             transfer_input_files=transfer_input_files,
             transfer_output_files=transfer_output_files,
@@ -106,13 +193,13 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
 
 if __name__ == '__main__':
     import argparse
-    import subprocess
+    from subprocess import check_call
     import os
+    import tempfile
+    import h5py
 
     parser = argparse.ArgumentParser("submit grid jobs")
     parser.add_argument("filenames", nargs='+', help="input files")
-    parser.add_argument("--dir", type=str, help="fitter directory", required=True)
-    parser.add_argument("--dqxx-dir", type=str, help="dqxx directory", required=True)
     parser.add_argument("--min-nhit", type=int, help="minimum nhit to fit an event", default=100)
     parser.add_argument("--max-particles", type=int, help="maximum number of particles to fit for", default=3)
     parser.add_argument("--skip-second-event", action='store_true', help="only fit the first event after a MAST bank", default=False)
@@ -123,14 +210,32 @@ if __name__ == '__main__':
         home = os.path.expanduser("~")
         args.state = join(home,'state.txt')
 
+    if 'SDDM_DATA' not in os.environ:
+        print("Please set the SDDM_DATA environment variable to point to the fitter source code location", file=sys.stderr)
+        sys.exit(1)
+
+    dir = os.environ['SDDM_DATA']
+
+    if 'DQXX_DIR' not in os.environ:
+        print("Please set the DQXX_DIR environment variable to point to the directory with the DQXX files", file=sys.stderr)
+        sys.exit(1)
+
+    dqxx_dir = os.environ['DQXX_DIR']
+
     args.state = abspath(args.state)
 
     # get the current working directory
    home_dir = os.getcwd()
 
     # get absolute paths since we are going to create a new directory
-    args.dir = abspath(args.dir)
-    args.dqxx_dir = abspath(args.dqxx_dir)
+    dir = abspath(dir)
+    dqxx_dir = abspath(dqxx_dir)
+
+    zdab_cat = which("zdab-cat")
+
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)
 
     for filename in args.filenames:
         filename = abspath(filename)
@@ -152,29 +257,28 @@ if __name__ == '__main__':
             continue
 
         with open(os.devnull, 'w') as f:
+            # Create a temporary file to store the events. Docs recommended
+            # this method instead of mkstemp(), but I think they're the same.
+            output = tempfile.NamedTemporaryFile(suffix='.hdf5',delete=False)
+            output.close()
+
             if args.skip_second_event:
-                popen = subprocess.Popen([join(args.dir,"zdab-cat"),"--skip-second-event",filename],stdout=subprocess.PIPE,stderr=f)
+                check_call([zdab_cat,"--skip-second-event",filename,"-o",output.name],stderr=f)
             else:
-                popen = subprocess.Popen([join(args.dir,"zdab-cat"),filename],stdout=subprocess.PIPE,stderr=f)
+                check_call([zdab_cat,filename,"-o",output.name],stderr=f)
 
         new_dir = "%s_%s" % (root,ID.hex)
 
         os.mkdir(new_dir)
        os.chdir(new_dir)
 
-        for data in yaml.load_all(popen.stdout,Loader=Loader):
-            if 'ev' not in data:
-                continue
-
-            for ev in data['ev']:
-                run = ev['run']
-                gtid = ev['gtid']
-                nhit = ev['nhit']
-
-                if nhit >= args.min_nhit:
-                    submit_job(filename, run, gtid, args.dir, args.dqxx_dir, args.min_nhit, args.max_particles)
+        with h5py.File(output.name) as f:
+            for ev in f['ev']:
+                if ev['nhit'] >= args.min_nhit:
+                    submit_job(filename, ev['run'], ev['gtid'], dir, dqxx_dir, args.min_nhit, args.max_particles)
 
-        popen.wait()
+        # Delete temporary HDF5 file
+        os.unlink(output.name)
 
         state.append(tail)
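
The new cat-grid-jobs merge leans on a NumPy trick worth calling out: indexing a structured array with a list of field names yields another structured array, so np.isin can match fits to events on the composite (run, gtid) key in a single vectorized call. Below is a minimal sketch of that idea with made-up toy data; the run/gtid/fmin field names mirror the tables in the diff, everything else is illustrative. One hedge: on NumPy >= 1.16 a multi-field selection is a view that keeps padding bytes from the dropped columns, so repack_fields() is needed for the two key arrays' dtypes to compare equal — the diff itself relies on the copy semantics of the older NumPy releases these scripts were written against.

    import numpy as np
    from numpy.lib import recfunctions as rfn

    # Toy "ev" and "fits" tables keyed by (run, gtid), mirroring the HDF5
    # datasets above. The values are made up for illustration.
    ev = np.array([(10000, 1), (10000, 2), (10001, 7)],
                  dtype=[('run', 'i4'), ('gtid', 'i4')])
    fits = np.array([(10000, 2, 123.0), (10001, 7, 456.0), (10002, 3, 789.0)],
                    dtype=[('run', 'i4'), ('gtid', 'i4'), ('fmin', 'f8')])

    # Multi-field indexing produces a structured array of just the key
    # columns, so np.isin compares whole (run, gtid) pairs rather than
    # individual columns. repack_fields() strips the padding the
    # fits[['run','gtid']] view inherits from the dropped f8 column.
    keys_fits = rfn.repack_fields(fits[['run', 'gtid']])
    keys_ev = rfn.repack_fields(ev[['run', 'gtid']])

    valid_fits = fits[np.isin(keys_fits, keys_ev)]
    print(valid_fits)  # keeps (10000, 2) and (10001, 7); drops (10002, 3)

The same pattern is what lets cat-grid-jobs append only the fit rows whose (run, gtid) pair actually appears in the output file's event list before resizing the fits dataset.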