From 21491ca1ca2afd6951e9b5b1e74b1c919c602b36 Mon Sep 17 00:00:00 2001
From: tlatorre <tlatorre@uchicago.edu>
Date: Thu, 11 Jul 2019 09:42:23 -0500
Subject: switch from YAML output to HDF5 to speed things up

---
 README                 |   4 +-
 src/Makefile           |   7 +-
 src/fit.c              | 194 ++++++++++++++++++--------------
 src/hdf5_utils.c       | 193 ++++++++++++++++++++++++++++++++
 src/hdf5_utils.h       |  85 ++++++++++++++
 src/zdab-cat.c         | 117 +++++++++----------
 src/zdab_utils.c       |  18 +++
 src/zdab_utils.h       |  32 ++++++
 utils/Makefile         |   1 +
 utils/cat-grid-jobs    | 163 +++++++++++++++++++--------
 utils/fit-wrapper      |   7 ++
 utils/plot             | 171 ++++++++++++++--------------
 utils/plot-energy      | 294 ++++++++++++++++++++++--------------------------
 utils/plot-fit-results | 297 +++++++++++++++++--------------------------------
 utils/submit-grid-jobs | 152 +++++++++++++++++++++----
 15 files changed, 1063 insertions(+), 672 deletions(-)
 create mode 100644 src/hdf5_utils.c
 create mode 100644 src/hdf5_utils.h
 create mode 100644 utils/fit-wrapper

diff --git a/README b/README
index e1b6f60..ce5dbc9 100644
--- a/README
+++ b/README
@@ -1,10 +1,10 @@
 Building
 ========
 
-First you have to install the gsl and nlopt libraries. On Fedora you can
+First you have to install the gsl, nlopt, and hdf5 libraries. On Fedora you can
 install these with the following command:
 
-    $ yum install gsl gsl-devel NLopt NLopt-devel
+    $ yum install gsl gsl-devel NLopt NLopt-devel hdf5 hdf5-devel
 
 You can also install these packages yourself. For instructions, see below
 (Installing GSL and Installing NLopt).
diff --git a/src/Makefile b/src/Makefile
index 99cff5e..e5ca1cd 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,7 +1,7 @@
 release_hdr := $(shell sh -c './mkreleasehdr.sh')
 
 CFLAGS=-fdiagnostics-color -O2 -Wall -g -DSWAP_BYTES
-LDLIBS=-fdiagnostics-color -lm -lgsl -lgslcblas -lnlopt_cxx -lstdc++ -lz
+LDLIBS=-fdiagnostics-color -lm -lgsl -lgslcblas -lnlopt_cxx -lstdc++ -lz -lhdf5
 
 PREFIX?=$(HOME)/local
 INSTALL_BIN=$(PREFIX)/bin
@@ -32,17 +32,18 @@ test-time-pdf: test-time-pdf.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o
 
 test-zebra: test-zebra.o zebra.o pack2b.o
 
-fit: fit.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o quad.o dc.o sort.o util.o
+fit: fit.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o quad.o dc.o sort.o util.o hdf5_utils.o
 
 test-find-peaks: test-find-peaks.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o util.o quad.o
 
 calculate-csda-range: calculate-csda-range.o
 
-zdab-cat: zdab-cat.o zebra.o pmt.o vector.o misc.o zdab_utils.o pack2b.o db.o dqxx.o dict.o siphash.o release.o dc.o sort.o util.o
+zdab-cat: zdab-cat.o zebra.o pmt.o vector.o misc.o zdab_utils.o pack2b.o db.o dqxx.o dict.o siphash.o release.o dc.o sort.o util.o hdf5_utils.o
 
 install:
 	@mkdir -p $(INSTALL_BIN)
 	$(INSTALL) fit $(INSTALL_BIN)
+	$(INSTALL) zdab-cat $(INSTALL_BIN)
 
 clean:
 	rm -f *.o calculate_limits test test-vector test-likelihood fit test-charge test-path calculate-csda-range test-time-pdf test-zebra test-find-peaks Makefile.dep zdab-cat
diff --git a/src/fit.c b/src/fit.c
index adc6ebc..0360a8f 100644
--- a/src/fit.c
+++ b/src/fit.c
@@ -47,12 +47,21 @@
 #include "electron.h"
 #include "muon.h"
 #include "proton.h"
+#include "hdf5.h"
+#include "hdf5_utils.h"
 
 /* Maximum number of fit parameters. Should be at least 4 + 3*MAX_VERTICES. */
 #define MAX_PARS 100
+
 /* Maximum number of peaks to search for in Hough transform. */
 #define MAX_NPEAKS 10
 
+/* Maximum number of events, primary seed particle tracks, and fits.
+ *
+ * FIXME: I should write out the events, tracks, and fits as they come up
+ * instead of storing them until the end. */
+#define MAX_NEVENTS 1000000
+
 char *GitSHA1(void);
 char *GitDirty(void);
 
@@ -5869,12 +5878,13 @@ int main(int argc, char **argv)
 {
     int i, j, k;
     zebraFile *f;
-    zebraBank bmast, bmc, bmcgn, mctk, b;
+    zebraBank bmast, mc, bmcgn, mctk, b;
     int rv;
     EVBank bev;
     FTPVBank bftpv;
     FTXKBank bftxk;
     RSPBank bftxr;
+    MCBank bmc;
     MCTKBank bmctk;
     MCVXBank bmcvx;
     event ev = {0};
@@ -5882,7 +5892,6 @@ int main(int argc, char **argv)
     double xopt[MAX_PARS];
     char *filename = NULL;
     char *output = NULL;
-    FILE *fout = NULL;
     int skip_second_event = 0;
     struct timeval tv_start, tv_stop;
     long long elapsed;
@@ -5892,12 +5901,29 @@ int main(int argc, char **argv)
     int id[MAX_VERTICES];
     size_t combos[100];
     size_t len;
-    char tmp[256];
-    size_t nhit, min_nhit = 100;
+    size_t min_nhit = 100;
     int last_run;
     char dqxx_file[256];
     int32_t gtid = -1;
 
+    int nevents = 0;
+    /* Array of events to write out to HDF5 file.
+     *
+     * Note: Declared static since otherwise it will cause a stack overflow. */
+    static HDF5Event hdf5_events[MAX_NEVENTS];
+
+    int nmcgn = 0;
+    /* Array of primary seed tracks to write out to HDF5 file.
+     *
+     * Note: Declared static since otherwise it will cause a stack overflow. */
+    static HDF5MCGN hdf5_mcgn[MAX_NEVENTS];
+
+    int nfits = 0;
+    /* Array of fit results to write out to HDF5 file.
+     *
+     * Note: Declared static since otherwise it will cause a stack overflow. */
+    static HDF5Fit hdf5_fits[MAX_NEVENTS];
+
     for (i = 1; i < argc; i++) {
         if (strlen(argv[i]) >= 2 && !strncmp(argv[i], "--", 2)) {
             if (!strcmp(argv[i]+2,"skip-second-event")) {
@@ -5953,22 +5979,8 @@ int main(int argc, char **argv)
         return 1;
     }
 
-    if (output) {
-        fout = fopen(output, "w");
-
-        if (!fout) {
-            fprintf(stderr, "failed to open '%s': %s\n", output, strerror(errno));
-            return 1;
-        }
-
-        fprintf(fout, "git_sha1: %s\n", GitSHA1());
-        fprintf(fout, "git_dirty: %s\n", GitDirty());
-        fprintf(fout, "data:\n");
-    }
-
     if (load_pmt_info()) {
         zebra_close(f);
-        if (output) fclose(fout);
         return 1;
     }
 
@@ -6038,30 +6050,29 @@ int main(int argc, char **argv)
             continue;
         }
 
-        if (fout) fprintf(fout, "  -\n");
-
         if (bmast.links[KMAST_MC-1] == 0) goto skip_mc;
 
-        rv = zebra_get_bank(f,&bmc,bmast.links[KMAST_MC-1]);
+        rv = zebra_get_bank(f,&mc,bmast.links[KMAST_MC-1]);
 
         if (rv) {
             fprintf(stderr, "error getting MC bank: %s\n", zebra_err);
             goto err;
         }
 
+        unpack_mc(mc.data, &bmc);
+
         if (bmast.links[KMC_MCGN-1] == 0) {
             fprintf(stderr, "MCGN link is zero!\n");
             goto err;
         }
 
-        rv = zebra_get_bank(f,&bmcgn,bmc.links[KMC_MCGN-1]);
+        rv = zebra_get_bank(f,&bmcgn,mc.links[KMC_MCGN-1]);
 
         if (rv) {
             fprintf(stderr, "error getting MCGN bank: %s\n", zebra_err);
             goto err;
         }
 
-        if (fout) fprintf(fout, "    mcgn:\n");
         while (1) {
             if (bmcgn.links[KMCGN_MCTK-1] == 0) {
                 fprintf(stderr, "MCTK link is zero!\n");
@@ -6110,17 +6121,18 @@ int main(int argc, char **argv)
 
             unpack_mcvx(b.data, &bmcvx);
 
-            if (fout) {
-                fprintf(fout, "      -\n");
-                fprintf(fout, "        id: %" PRIu32 "\n", bmctk.idp);
-                fprintf(fout, "        energy: %.2f\n", bmctk.ene);
-                fprintf(fout, "        posx: %.2f\n", bmcvx.x);
-                fprintf(fout, "        posy: %.2f\n", bmcvx.y);
-                fprintf(fout, "        posz: %.2f\n", bmcvx.z);
-                fprintf(fout, "        dirx: %.4f\n", bmctk.drx);
-                fprintf(fout, "        diry: %.4f\n", bmctk.dry);
-                fprintf(fout, "        dirz: %.4f\n", bmctk.drz);
-                fprintf(fout, "        time: %.4f\n", bmcvx.tim);
+            if (output) {
+                hdf5_mcgn[nmcgn].evn = bmc.evn;
+                hdf5_mcgn[nmcgn].id = bmctk.idp;
+                hdf5_mcgn[nmcgn].energy = bmctk.ene;
+                hdf5_mcgn[nmcgn].x = bmcvx.x;
+                hdf5_mcgn[nmcgn].y = bmcvx.y;
+                hdf5_mcgn[nmcgn].z = bmcvx.z;
+                hdf5_mcgn[nmcgn].dirx = bmctk.drx;
+                hdf5_mcgn[nmcgn].diry = bmctk.dry;
+                hdf5_mcgn[nmcgn].dirz = bmctk.drz;
+                hdf5_mcgn[nmcgn].time = bmcvx.tim;
+                nmcgn++;
             }
 
             if (bmcgn.next) {
@@ -6156,7 +6168,6 @@ skip_mc:
             }
         }
 
-        if (fout) fprintf(fout, "    ev:\n");
         while (1) {
             unpack_ev(b.data, &bev);
             ev.run = bev.run;
@@ -6185,49 +6196,44 @@ skip_mc:
 
             if (get_event(f,&ev,&b)) goto skip_event;
 
-            nhit = get_nhit(&ev);
+            if (output) {
+                hdf5_events[nevents].run = ev.run;
+                hdf5_events[nevents].evn = bmc.evn;
+                hdf5_events[nevents].gtr = ev.trigger_time;
+                hdf5_events[nevents].nhit = ev.nhit;
+                hdf5_events[nevents].gtid = ev.gtid;
+                hdf5_events[nevents].trg_type = ev.trigger_type;
+                hdf5_events[nevents].dc = get_dc_word(&ev, f, &bmast, &b);
 
-            if (fout) {
-                fprintf(fout, "      - run: %i\n", ev.run);
-                fprintf(fout, "        gtr: %.0f\n", ev.trigger_time);
-                fprintf(fout, "        nhit: %zu\n", nhit);
-                fprintf(fout, "        gtid: %i\n", ev.gtid);
-                fprintf(fout, "        trg_type: 0x%08x\n", ev.trigger_type);
-                fprintf(fout, "        dc: 0x%08x\n", get_dc_word(&ev, f, &bmast, &b));
-            }
-
-            if (fout) {
                 if (get_ftpv(f,&b,&bftpv)) {
                     fprintf(stderr, "%s\n", zdab_err);
+                    hdf5_events[nevents].ftp_x = NAN;
+                    hdf5_events[nevents].ftp_y = NAN;
+                    hdf5_events[nevents].ftp_z = NAN;
                 } else {
-                    fprintf(fout, "        ftp:\n");
-                    fprintf(fout, "            x: %.2f\n", bftpv.x);
-                    fprintf(fout, "            y: %.2f\n", bftpv.y);
-                    fprintf(fout, "            z: %.2f\n", bftpv.z);
+                    hdf5_events[nevents].ftp_x = bftpv.x;
+                    hdf5_events[nevents].ftp_y = bftpv.y;
+                    hdf5_events[nevents].ftp_z = bftpv.z;
                 }
-            }
 
-            if (fout) {
                 if (get_ftxk(f,&b,&bftxk)) {
                     fprintf(stderr, "%s\n", zdab_err);
+                    hdf5_events[nevents].ftk_energy = NAN;
                 } else {
-                    fprintf(fout, "        ftk:\n");
-                    fprintf(fout, "            energy: %.2f\n", bftxk.energy);
+                    hdf5_events[nevents].ftk_energy = bftxk.energy;
                 }
-            }
 
-            if (fout) {
                 if (get_rsp(f,&b,&bftxr)) {
                     fprintf(stderr, "%s\n", zdab_err);
+                    hdf5_events[nevents].rsp_energy = NAN;
                 } else {
-                    fprintf(fout, "        rsp:\n");
-                    fprintf(fout, "            energy: %.2f\n", bftxr.ene);
+                    hdf5_events[nevents].rsp_energy = bftxr.ene;
                 }
-            }
 
-            if (nhit < min_nhit) goto skip_event;
+                nevents++;
+            }
 
-            if (fout) fprintf(fout, "        fit:\n");
+            if (ev.nhit < min_nhit) goto skip_event;
 
             fmin_best = nll_best(&ev);
 
@@ -6251,25 +6257,47 @@ skip_mc:
 
                     elapsed = (tv_stop.tv_sec - tv_start.tv_sec)*1000 + (tv_stop.tv_usec - tv_start.tv_usec)/1000;
 
-                    if (fout) {
-                        sprintf_particle_string(id,i,tmp);
-                        fprintf(fout, "          %s:\n", tmp);
-                        fprintf(fout, "            posx: %.2f\n", xopt[0]);
-                        fprintf(fout, "            posy: %.2f\n", xopt[1]);
-                        fprintf(fout, "            posz: %.2f\n", xopt[2]);
-                        fprintf(fout, "            t0: %.2f\n", xopt[3]);
-                        sprintf_yaml_list(xopt+4,i,3,tmp);
-                        fprintf(fout, "            energy: %s\n", tmp);
-                        sprintf_yaml_list(xopt+5,i,3,tmp);
-                        fprintf(fout, "            theta: %s\n", tmp);
-                        sprintf_yaml_list(xopt+6,i,3,tmp);
-                        fprintf(fout, "            phi: %s\n", tmp);
-                        //fprintf(fout, "            z1: %.2f\n", xopt[7]);
-                        //fprintf(fout, "            z2: %.2f\n", xopt[8]);
-                        fprintf(fout, "            fmin: %.2f\n", fmin);
-                        fprintf(fout, "            time: %lld\n", elapsed);
-                        fprintf(fout, "            psi: %.2f\n", fmin-fmin_best);
-                        fflush(fout);
+                    if (output) {
+                        hdf5_fits[nfits].run = ev.run;
+                        hdf5_fits[nfits].gtid = ev.gtid;
+                        hdf5_fits[nfits].n = i;
+                        hdf5_fits[nfits].x = xopt[0];
+                        hdf5_fits[nfits].y = xopt[1];
+                        hdf5_fits[nfits].z = xopt[2];
+                        hdf5_fits[nfits].t0 = xopt[3];
+                        hdf5_fits[nfits].id1 = id[0];
+                        hdf5_fits[nfits].energy1 = xopt[4];
+                        hdf5_fits[nfits].theta1 = xopt[5];
+                        hdf5_fits[nfits].phi1 = xopt[6];
+
+                        if (i > 1) {
+                            hdf5_fits[nfits].id2 = id[1];
+                            hdf5_fits[nfits].energy2 = xopt[7];
+                            hdf5_fits[nfits].theta2 = xopt[8];
+                            hdf5_fits[nfits].phi2 = xopt[9];
+                        } else {
+                            hdf5_fits[nfits].id2 = 0;
+                            hdf5_fits[nfits].energy2 = NAN;
+                            hdf5_fits[nfits].theta2 = NAN;
+                            hdf5_fits[nfits].phi2 = NAN;
+                        }
+
+                        if (i > 2) {
+                            hdf5_fits[nfits].id3 = id[2];
+                            hdf5_fits[nfits].energy3 = xopt[10];
+                            hdf5_fits[nfits].theta3 = xopt[11];
+                            hdf5_fits[nfits].phi3 = xopt[12];
+                        } else {
+                            hdf5_fits[nfits].id3 = 0;
+                            hdf5_fits[nfits].energy3 = NAN;
+                            hdf5_fits[nfits].theta3 = NAN;
+                            hdf5_fits[nfits].phi3 = NAN;
+                        }
+
+                        hdf5_fits[nfits].fmin = fmin;
+                        hdf5_fits[nfits].time = elapsed;
+                        hdf5_fits[nfits].psi = fmin-fmin_best;
+                        nfits++;
                     }
                 }
             }
@@ -6292,13 +6320,13 @@ skip_event:
 
 end:
 
+    if (output) save_output(output, hdf5_events, nevents, hdf5_mcgn, nmcgn, hdf5_fits, nfits);
+
     free_interpolation();
     pmt_response_free();
 
     db_free(db);
 
-    if (fout) fclose(fout);
-
     zebra_close(f);
 
     return 0;
@@ -6309,8 +6337,6 @@ err:
 
     db_free(db);
 
-    if (fout) fclose(fout);
-
     zebra_close(f);
 
     return 1;
diff --git a/src/hdf5_utils.c b/src/hdf5_utils.c
new file mode 100644
index 0000000..05d1002
--- /dev/null
+++ b/src/hdf5_utils.c
@@ -0,0 +1,193 @@
+#include "hdf5_utils.h"
+#include "hdf5.h"
+#include <string.h> /* for strlen() */
+
+char *GitSHA1(void);
+char *GitDirty(void);
+
+/* Write out data to an HDF5 file.
+ *
+ * This function writes out three datasets to an HDF5 file:
+ *
+ * 1. ev: the event dataset
+ *
+ *    This dataset contains event level information similar to the EV bank in
+ *    SNOMAN. It also contains a data cleaning word and the FTP, FTK, and RSP
+ *    fit values for the position and energy.
+ *
+ * 2. mcgn: the primary particle tracks
+ *
+ *     This dataset contains vertex and track information for all the primary
+ *     particle tracks used to simulate the event.
+ *
+ * 3. fits: the fit information
+ *
+ *     This dataset contains the reconstructed fit information for each event.
+ *     Note that this dataset will in general contain multiple rows for a
+ *     single event since we fit for different particle hypotheses.
+ *
+ * Currently the ev and mcgn datasets are fixed size, and the fits dataset is
+ * chunked and extensible to allow the cat-grid-jobs script to work.
+ *
+ */
+int save_output(const char *output, HDF5Event *hdf5_events, size_t nevents, HDF5MCGN *hdf5_mcgn, size_t nmcgn, HDF5Fit *hdf5_fits, size_t nfits)
+{
+    hid_t      hdf5_events_tid;     /* File datatype identifier */
+    hid_t      file, dataset, space; /* Handles */
+    herr_t     status;
+    hsize_t    dim[1];   /* Dataspace dimensions */
+    hid_t      hdf5_mcgn_tid;     /* File datatype identifier */
+    hid_t      dataset_mcgn, space_mcgn; /* Handles */
+    hsize_t    dim_mcgn[1];   /* Dataspace dimensions */
+    hid_t      hdf5_fits_tid;     /* File datatype identifier */
+    hid_t      dataset_fits, space_fits; /* Handles */
+    hsize_t    dim_fits[1];   /* Dataspace dimensions */
+    hsize_t    chunk_dims[1];   /* Dataspace dimensions */
+    hid_t      dcpl_id; /* Handles */
+    hsize_t    maxdims[1] = {H5S_UNLIMITED};
+
+    /*
+     * Create the data space.
+     */
+    dim[0] = nevents;
+    space = H5Screate_simple(1, dim, NULL);
+    dim_mcgn[0] = nmcgn;
+    space_mcgn = H5Screate_simple(1, dim_mcgn, NULL);
+    dim_fits[0] = nfits;
+    space_fits = H5Screate_simple(1, dim_fits, maxdims);
+    chunk_dims[0] = 10;
+    dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
+    H5Pset_chunk(dcpl_id, 1, chunk_dims);
+
+    /*
+     * Create the file.
+     */
+    file = H5Fcreate(output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+
+    hid_t   attr1, attr2; /* Attribute identifiers */
+    hid_t   aid1, aid2;    /* Attribute dataspace identifiers */
+    hid_t   atype1, atype2;    /* Attribute type */
+
+    char *git_sha1 = GitSHA1();
+    char *git_dirty = GitDirty();
+
+    /*
+     * Create string attribute.
+     */
+    aid1  = H5Screate(H5S_SCALAR);
+    atype1 = H5Tcopy(H5T_C_S1);
+             H5Tset_size(atype1, strlen(git_sha1));
+             H5Tset_strpad(atype1,H5T_STR_NULLTERM);
+    attr1 = H5Acreate2(file, "git_sha1", atype1, aid1, H5P_DEFAULT, H5P_DEFAULT);
+
+    /*
+     * Create string attribute.
+     */
+    aid2  = H5Screate(H5S_SCALAR);
+    atype2 = H5Tcopy(H5T_C_S1);
+             H5Tset_size(atype2, strlen(git_dirty));
+             H5Tset_strpad(atype2,H5T_STR_NULLTERM);
+    attr2 = H5Acreate2(file, "git_dirty", atype2, aid2, H5P_DEFAULT, H5P_DEFAULT);
+
+    herr_t  ret;                /* Return value */
+
+    /*
+     * Write string attribute.
+     */
+    ret = H5Awrite(attr1, atype1, git_sha1);
+    ret = H5Awrite(attr2, atype2, git_dirty);
+
+    /*
+     * Create the memory data type.
+     */
+    hdf5_events_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5Event));
+    H5Tinsert(hdf5_events_tid, "run",        HOFFSET(HDF5Event, run),        H5T_NATIVE_INT);
+    H5Tinsert(hdf5_events_tid, "evn",        HOFFSET(HDF5Event, evn),        H5T_NATIVE_INT);
+    H5Tinsert(hdf5_events_tid, "gtr",        HOFFSET(HDF5Event, gtr),        H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_events_tid, "nhit",       HOFFSET(HDF5Event, nhit),       H5T_NATIVE_INT);
+    H5Tinsert(hdf5_events_tid, "gtid",       HOFFSET(HDF5Event, gtid),       H5T_NATIVE_UINT32);
+    H5Tinsert(hdf5_events_tid, "trg_type",   HOFFSET(HDF5Event, trg_type),   H5T_NATIVE_UINT32);
+    H5Tinsert(hdf5_events_tid, "dc",         HOFFSET(HDF5Event, dc),         H5T_NATIVE_UINT32);
+    H5Tinsert(hdf5_events_tid, "ftp_x",      HOFFSET(HDF5Event, ftp_x),      H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_events_tid, "ftp_y",      HOFFSET(HDF5Event, ftp_y),      H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_events_tid, "ftp_z",      HOFFSET(HDF5Event, ftp_z),      H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_events_tid, "ftk_energy", HOFFSET(HDF5Event, ftk_energy), H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_events_tid, "rsp_energy", HOFFSET(HDF5Event, rsp_energy), H5T_NATIVE_DOUBLE);
+
+    hdf5_mcgn_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5MCGN));
+    H5Tinsert(hdf5_mcgn_tid, "run",    HOFFSET(HDF5MCGN, run),    H5T_NATIVE_INT);
+    H5Tinsert(hdf5_mcgn_tid, "evn",    HOFFSET(HDF5MCGN, evn),    H5T_NATIVE_INT);
+    H5Tinsert(hdf5_mcgn_tid, "id",     HOFFSET(HDF5MCGN, id),     H5T_NATIVE_INT);
+    H5Tinsert(hdf5_mcgn_tid, "energy", HOFFSET(HDF5MCGN, energy), H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_mcgn_tid, "x",      HOFFSET(HDF5MCGN, x),      H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_mcgn_tid, "y",      HOFFSET(HDF5MCGN, y),      H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_mcgn_tid, "z",      HOFFSET(HDF5MCGN, z),      H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_mcgn_tid, "dirx",   HOFFSET(HDF5MCGN, dirx),   H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_mcgn_tid, "diry",   HOFFSET(HDF5MCGN, diry),   H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_mcgn_tid, "dirz",   HOFFSET(HDF5MCGN, dirz),   H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_mcgn_tid, "time",   HOFFSET(HDF5MCGN, time),   H5T_NATIVE_DOUBLE);
+
+    hdf5_fits_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5Fit));
+    H5Tinsert(hdf5_fits_tid, "run",     HOFFSET(HDF5Fit, run),     H5T_NATIVE_INT);
+    H5Tinsert(hdf5_fits_tid, "gtid",    HOFFSET(HDF5Fit, gtid),    H5T_NATIVE_UINT32);
+    H5Tinsert(hdf5_fits_tid, "n",       HOFFSET(HDF5Fit, n),       H5T_NATIVE_INT);
+    H5Tinsert(hdf5_fits_tid, "x",       HOFFSET(HDF5Fit, x),       H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "y",       HOFFSET(HDF5Fit, y),       H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "z",       HOFFSET(HDF5Fit, z),       H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "t0",      HOFFSET(HDF5Fit, t0),      H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "id1",     HOFFSET(HDF5Fit, id1),     H5T_NATIVE_INT);
+    H5Tinsert(hdf5_fits_tid, "energy1", HOFFSET(HDF5Fit, energy1), H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "theta1",  HOFFSET(HDF5Fit, theta1),  H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "phi1",    HOFFSET(HDF5Fit, phi1),    H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "id2",     HOFFSET(HDF5Fit, id2),     H5T_NATIVE_INT);
+    H5Tinsert(hdf5_fits_tid, "energy2", HOFFSET(HDF5Fit, energy2), H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "theta2",  HOFFSET(HDF5Fit, theta2),  H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "phi2",    HOFFSET(HDF5Fit, phi2),    H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "id3",     HOFFSET(HDF5Fit, id3),     H5T_NATIVE_INT);
+    H5Tinsert(hdf5_fits_tid, "energy3", HOFFSET(HDF5Fit, energy3), H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "theta3",  HOFFSET(HDF5Fit, theta3),  H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "phi3",    HOFFSET(HDF5Fit, phi3),    H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "fmin",    HOFFSET(HDF5Fit, fmin),    H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "time",    HOFFSET(HDF5Fit, time),    H5T_NATIVE_DOUBLE);
+    H5Tinsert(hdf5_fits_tid, "psi",     HOFFSET(HDF5Fit, psi),     H5T_NATIVE_DOUBLE);
+
+    /*
+     * Create the dataset.
+     */
+    dataset = H5Dcreate2(file, "ev", hdf5_events_tid, space, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+    dataset_mcgn = H5Dcreate2(file, "mcgn", hdf5_mcgn_tid, space_mcgn, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+    dataset_fits = H5Dcreate2(file, "fits", hdf5_fits_tid, space_fits, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+
+    /*
+     * Wtite data to the dataset;
+     */
+    if (nevents)
+        status = H5Dwrite(dataset, hdf5_events_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_events);
+    if (nmcgn)
+        status = H5Dwrite(dataset_mcgn, hdf5_mcgn_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_mcgn);
+    if (nfits)
+        status = H5Dwrite(dataset_fits, hdf5_fits_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_fits);
+
+    /*
+     * Release resources
+     */
+    H5Sclose(aid1);
+    H5Sclose(aid2);
+    H5Tclose(atype1);
+    H5Tclose(atype2);
+    H5Aclose(attr1);
+    H5Aclose(attr2);
+    H5Tclose(hdf5_events_tid);
+    H5Tclose(hdf5_mcgn_tid);
+    H5Tclose(hdf5_fits_tid);
+    H5Sclose(space);
+    H5Sclose(space_mcgn);
+    H5Sclose(space_fits);
+    H5Pclose(dcpl_id);
+    H5Dclose(dataset);
+    H5Dclose(dataset_mcgn);
+    H5Dclose(dataset_fits);
+    H5Fclose(file);
+
+    return 0;
+}
diff --git a/src/hdf5_utils.h b/src/hdf5_utils.h
new file mode 100644
index 0000000..4f6ad32
--- /dev/null
+++ b/src/hdf5_utils.h
@@ -0,0 +1,85 @@
+#ifndef HDF5_UTILS
+#define HDF5_UTILS
+
+#include <stdint.h>
+#include <stdlib.h> /* for size_t */
+
+/* Struct to hold information for each triggered event (similar to the EV bank,
+ * but with some reconstructed information).
+ *
+ * This is the struct that is written out to the 'ev' dataset in the HDF5 file.
+ *
+ * Note: If there is no FTP, FTK, or RSP info in the zdab file these values
+ * will be set to nan. */
+typedef struct HDF5Event {
+    int run;
+    int evn;
+    double gtr;
+    int nhit;
+    uint32_t gtid;
+    uint32_t trg_type;
+    uint32_t dc;
+    double ftp_x;
+    double ftp_y;
+    double ftp_z;
+    double ftk_energy;
+    double rsp_energy;
+} HDF5Event;
+
+/* Struct to hold information for each primary seed tracks that is written out
+ * as the `mcgn` dataset in the HDF5 file. To figure out which track
+ * corresponds to a given GTID you can match them up based on the evn column,
+ * which is filled with the MC event number in both datasets. */
+typedef struct HDF5MCGN {
+    int run;
+    int evn;
+    int id;
+    double energy;
+    double x;
+    double y;
+    double z;
+    double dirx;
+    double diry;
+    double dirz;
+    double time;
+} HDF5MCGN;
+
+/* Struct to hold information for each fit that is written out
+ * as the `fits` dataset in the HDF5 file.
+ *
+ * Note that the energy2, theta2, phi2 fields will only be filled in for fits
+ * with 2 tracks and similarly for energy3, theta3, and phi3 for 3 track fits. */
+typedef struct HDF5Fit {
+    int run;
+    uint32_t gtid;
+    double x;
+    double y;
+    double z;
+    double t0;
+
+    /* Number of tracks. */
+    int n;
+
+    int id1;
+    double energy1;
+    double theta1;
+    double phi1;
+
+    int id2;
+    double energy2;
+    double theta2;
+    double phi2;
+
+    int id3;
+    double energy3;
+    double theta3;
+    double phi3;
+
+    double fmin;
+    double time;
+    double psi;
+} HDF5Fit;
+
+int save_output(const char *output, HDF5Event *hdf5_events, size_t nevents, HDF5MCGN *hdf5_mcgn, size_t nmcgn, HDF5Fit *hdf5_fits, size_t nfits);
+
+#endif
diff --git a/src/zdab-cat.c b/src/zdab-cat.c
index 5b9bb3d..278236c 100644
--- a/src/zdab-cat.c
+++ b/src/zdab-cat.c
@@ -26,6 +26,15 @@
 #include <errno.h> /* for errno */
 #include "release.h"
 #include "dc.h"
+#include "hdf5.h"
+#include <math.h>
+#include "hdf5_utils.h"
+
+/* Maximum number of events, primary seed particle tracks, and fits.
+ *
+ * FIXME: I should write out the events, tracks, and fits as they come up
+ * instead of storing them until the end. */
+#define MAX_NEVENTS 1000000
 
 #define EV_RECORD      0x45562020 // 'EV  '
 #define MCTK_RECORD    0x4d43544b // 'MCTK'
@@ -47,22 +56,25 @@ int main(int argc, char **argv)
 {
     int i;
     zebraFile *f;
-    zebraBank bmast, bmc, bmcgn, mctk, b;
+    zebraBank bmast, mc, bmcgn, mctk, b;
     int rv;
     EVBank bev;
     FTPVBank bftpv;
     FTXKBank bftxk;
     RSPBank bftxr;
+    MCBank bmc;
     MCTKBank bmctk;
     MCVXBank bmcvx;
     event ev = {0};
     char *filename = NULL;
     char *output = NULL;
-    FILE *fout = stdout;
     int skip_second_event = 0;
-    size_t nhit;
     int last_run;
     char dqxx_file[256];
+    int nevents = 0;
+    static HDF5Event hdf5_events[MAX_NEVENTS];
+    int nmcgn = 0;
+    static HDF5MCGN hdf5_mcgn[MAX_NEVENTS];
 
     for (i = 1; i < argc; i++) {
         if (strlen(argv[i]) >= 2 && !strncmp(argv[i], "--", 2)) {
@@ -96,23 +108,8 @@ int main(int argc, char **argv)
         return 1;
     }
 
-    if (output) {
-        fout = fopen(output, "w");
-
-        if (!fout) {
-            fprintf(stderr, "failed to open '%s': %s\n", output, strerror(errno));
-            return 1;
-        }
-    }
-
-    if (fout) {
-        fprintf(fout, "git_sha1: %s\n", GitSHA1());
-        fprintf(fout, "git_dirty: %s\n", GitDirty());
-    }
-
     if (load_pmt_info()) {
         zebra_close(f);
-        if (output) fclose(fout);
         return 1;
     }
 
@@ -160,11 +157,9 @@ int main(int argc, char **argv)
             continue;
         }
 
-        if (fout) fprintf(fout, "---\n");
-
         if (bmast.links[KMAST_MC-1] == 0) goto skip_mc;
 
-        rv = zebra_get_bank(f,&bmc,bmast.links[KMAST_MC-1]);
+        rv = zebra_get_bank(f,&mc,bmast.links[KMAST_MC-1]);
 
         if (rv) {
             fprintf(stderr, "error getting MC bank: %s\n", zebra_err);
@@ -176,14 +171,15 @@ int main(int argc, char **argv)
             goto err;
         }
 
-        rv = zebra_get_bank(f,&bmcgn,bmc.links[KMC_MCGN-1]);
+        unpack_mc(mc.data, &bmc);
+
+        rv = zebra_get_bank(f,&bmcgn,mc.links[KMC_MCGN-1]);
 
         if (rv) {
             fprintf(stderr, "error getting MCGN bank: %s\n", zebra_err);
             goto err;
         }
 
-        if (fout) fprintf(fout, "    mcgn:\n");
         while (1) {
             if (bmcgn.links[KMCGN_MCTK-1] == 0) {
                 fprintf(stderr, "MCTK link is zero!\n");
@@ -232,17 +228,18 @@ int main(int argc, char **argv)
 
             unpack_mcvx(b.data, &bmcvx);
 
-            if (fout) {
-                fprintf(fout, "      -\n");
-                fprintf(fout, "        id: %" PRIu32 "\n", bmctk.idp);
-                fprintf(fout, "        energy: %.2f\n", bmctk.ene);
-                fprintf(fout, "        posx: %.2f\n", bmcvx.x);
-                fprintf(fout, "        posy: %.2f\n", bmcvx.y);
-                fprintf(fout, "        posz: %.2f\n", bmcvx.z);
-                fprintf(fout, "        dirx: %.4f\n", bmctk.drx);
-                fprintf(fout, "        diry: %.4f\n", bmctk.dry);
-                fprintf(fout, "        dirz: %.4f\n", bmctk.drz);
-                fprintf(fout, "        time: %.4f\n", bmcvx.tim);
+            if (output) {
+                hdf5_mcgn[nmcgn].evn = bmc.evn;
+                hdf5_mcgn[nmcgn].id = bmctk.idp;
+                hdf5_mcgn[nmcgn].energy = bmctk.ene;
+                hdf5_mcgn[nmcgn].x = bmcvx.x;
+                hdf5_mcgn[nmcgn].y = bmcvx.y;
+                hdf5_mcgn[nmcgn].z = bmcvx.z;
+                hdf5_mcgn[nmcgn].dirx = bmctk.drx;
+                hdf5_mcgn[nmcgn].diry = bmctk.dry;
+                hdf5_mcgn[nmcgn].dirz = bmctk.drz;
+                hdf5_mcgn[nmcgn].time = bmcvx.tim;
+                nmcgn++;
             }
 
             if (bmcgn.next) {
@@ -278,7 +275,6 @@ skip_mc:
             }
         }
 
-        if (fout) fprintf(fout, "    ev:\n");
         while (1) {
             unpack_ev(b.data, &bev);
             ev.run = bev.run;
@@ -305,44 +301,41 @@ skip_mc:
 
             rv = get_event(f,&ev,&b);
 
-            nhit = get_nhit(&ev);
+            if (output) {
+                hdf5_events[nevents].run = ev.run;
+                hdf5_events[nevents].evn = bmc.evn;
+                hdf5_events[nevents].gtr = ev.trigger_time;
+                hdf5_events[nevents].nhit = ev.nhit;
+                hdf5_events[nevents].gtid = ev.gtid;
+                hdf5_events[nevents].trg_type = ev.trigger_type;
+                hdf5_events[nevents].dc = get_dc_word(&ev, f, &bmast, &b);
 
-            if (fout) {
-                fprintf(fout, "      - run: %i\n", ev.run);
-                fprintf(fout, "        gtr: %.0f\n", ev.trigger_time);
-                fprintf(fout, "        nhit: %zu\n", nhit);
-                fprintf(fout, "        gtid: %i\n", ev.gtid);
-                fprintf(fout, "        trg_type: 0x%08x\n", ev.trigger_type);
-                fprintf(fout, "        dc: 0x%08x\n", get_dc_word(&ev, f, &bmast, &b));
-            }
-
-            if (fout) {
                 if (get_ftpv(f,&b,&bftpv)) {
                     fprintf(stderr, "%s\n", zdab_err);
+                    hdf5_events[nevents].ftp_x = NAN;
+                    hdf5_events[nevents].ftp_y = NAN;
+                    hdf5_events[nevents].ftp_z = NAN;
                 } else {
-                    fprintf(fout, "        ftp:\n");
-                    fprintf(fout, "            x: %.2f\n", bftpv.x);
-                    fprintf(fout, "            y: %.2f\n", bftpv.y);
-                    fprintf(fout, "            z: %.2f\n", bftpv.z);
+                    hdf5_events[nevents].ftp_x = bftpv.x;
+                    hdf5_events[nevents].ftp_y = bftpv.y;
+                    hdf5_events[nevents].ftp_z = bftpv.z;
                 }
-            }
 
-            if (fout) {
                 if (get_ftxk(f,&b,&bftxk)) {
                     fprintf(stderr, "%s\n", zdab_err);
+                    hdf5_events[nevents].ftk_energy = NAN;
                 } else {
-                    fprintf(fout, "        ftk:\n");
-                    fprintf(fout, "            energy: %.2f\n", bftxk.energy);
+                    hdf5_events[nevents].ftk_energy = bftxk.energy;
                 }
-            }
 
-            if (fout) {
                 if (get_rsp(f,&b,&bftxr)) {
                     fprintf(stderr, "%s\n", zdab_err);
+                    hdf5_events[nevents].rsp_energy = NAN;
                 } else {
-                    fprintf(fout, "        rsp:\n");
-                    fprintf(fout, "            energy: %.2f\n", bftxr.ene);
+                    hdf5_events[nevents].rsp_energy = bftxr.ene;
                 }
+
+                nevents++;
             }
 
             /* Note the origin link for the first EV bank points back to the
@@ -359,9 +352,11 @@ skip_mc:
         }
     }
 
-    db_free(db);
+    if (output) {
+        save_output(output, hdf5_events, nevents, hdf5_mcgn, nmcgn, NULL, 0);
+    }
 
-    if (fout) fclose(fout);
+    db_free(db);
 
     zebra_close(f);
 
@@ -370,8 +365,6 @@ skip_mc:
 err:
     db_free(db);
 
-    if (fout) fclose(fout);
-
     zebra_close(f);
 
     return 1;
diff --git a/src/zdab_utils.c b/src/zdab_utils.c
index 716503c..416bd79 100644
--- a/src/zdab_utils.c
+++ b/src/zdab_utils.c
@@ -580,6 +580,24 @@ void unpack_ftk(uint32_t *data, FTKBank *b)
     unpack((uint8_t *) (data+25), "f",&b->spare5);
 }
 
+void unpack_mc(uint32_t *data, MCBank *b)
+{
+    unpack((uint8_t *) data,      "l",&b->jdy);
+    unpack((uint8_t *) (data+1),  "l",&b->ut1);
+    unpack((uint8_t *) (data+2),  "l",&b->ut2);
+    unpack((uint8_t *) (data+3),  "l",&b->dte);
+    unpack((uint8_t *) (data+4),  "l",&b->hmsc);
+    unpack((uint8_t *) (data+5),  "l",&b->seed1);
+    unpack((uint8_t *) (data+6),  "l",&b->seed2);
+    unpack((uint8_t *) (data+7),  "l",&b->seed_num);
+    unpack((uint8_t *) (data+8),  "l",&b->dtp);
+    unpack((uint8_t *) (data+9),  "f",&b->mcver);
+    unpack((uint8_t *) (data+10), "l",&b->evn);
+    unpack((uint8_t *) (data+11), "F",&b->gtr);
+    unpack((uint8_t *) (data+13), "l",&b->num_ge_err);
+    unpack((uint8_t *) (data+14), "f",&b->pmt_eff);
+}
+
 void unpack_mcgn(uint32_t *data, MCGNBank *b)
 {
     unpack((uint8_t *) data,      "l",&b->id);
diff --git a/src/zdab_utils.h b/src/zdab_utils.h
index d4411d9..043e185 100644
--- a/src/zdab_utils.h
+++ b/src/zdab_utils.h
@@ -378,6 +378,37 @@ typedef struct FTKBank {
     float spare5;
 } FTKBank;
 
+typedef struct MCBank {
+    /* Julian date. */
+    uint32_t jdy;
+    /* Universal time seconds. */
+    uint32_t ut1;
+    /* Universal time nanoseconds. */
+    uint32_t ut2;
+    /* Date (format: yyyymmdd). */
+    uint32_t dte;
+    /* Time (format: hhmmsscc  - cc is centisec). */
+    uint32_t hmsc;
+    /* First random number seed. */
+    uint32_t seed1;
+    /* Second random number seed. */
+    uint32_t seed2;
+    /* Random number number. */
+    uint32_t seed_num;
+    /* Data type. */
+    uint32_t dtp;
+    /* SNOMAN MC version number. */
+    float mcver;
+    /* Monte Carlo event number. */
+    uint32_t evn;
+    /* Generation time in nsec (first word). */
+    double gtr;
+    /* Number of suppressed Cerenkov photon tracking errors. */
+    uint32_t num_ge_err;
+    /* The current PMT collection efficiency. */
+    float pmt_eff;
+} MCBank;
+
 typedef struct MCGNBank {
     /* Particle id. */
     uint32_t id;
@@ -638,6 +669,7 @@ void unpack_ftpt(uint32_t *data, FTPTBank *b);
 void unpack_ftpv(uint32_t *data, FTPVBank *b);
 void unpack_ftxk(uint32_t *data, FTXKBank *b);
 void unpack_ftk(uint32_t *data, FTKBank *b);
+void unpack_mc(uint32_t *data, MCBank *b);
 void unpack_mcgn(uint32_t *data, MCGNBank *b);
 void unpack_mcvx(uint32_t *data, MCVXBank *b);
 void unpack_mctk(uint32_t *data, MCTKBank *b);
diff --git a/utils/Makefile b/utils/Makefile
index ef21a09..011a723 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -13,5 +13,6 @@ install:
 	$(INSTALL) plot-likelihood $(INSTALL_BIN)
 	$(INSTALL) submit-grid-jobs $(INSTALL_BIN)
 	$(INSTALL) gen-dark-matter $(INSTALL_BIN)
+	$(INSTALL) fit-wrapper $(INSTALL_BIN)
 
 .PHONY: install
diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs
index ba06290..83a8004 100755
--- a/utils/cat-grid-jobs
+++ b/utils/cat-grid-jobs
@@ -23,7 +23,7 @@ as YAML to stdout.
 
 Example:
 
-    $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
+    $ cat-grid-jobs ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
 
 """
 
@@ -33,80 +33,145 @@ try:
     from yaml import CLoader as Loader, CDumper as Dumper
 except ImportError:
     from yaml import Loader, Dumper
+import os
+import sys
+
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `file` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+    """Given a command, mode, and a PATH string, return the path which
+    conforms to the given mode on the PATH, or None if there is no such
+    file.
+    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result
+    of os.environ.get("PATH"), or can be overridden with a custom search
+    path.
+    """
+    # If we're given a path with a directory part, look it up directly rather
+    # than referring to PATH directories. This includes checking relative to the
+    # current directory, e.g. ./script
+    if os.path.dirname(cmd):
+        if _access_check(cmd, mode):
+            return cmd
+        return None
+
+    if path is None:
+        path = os.environ.get("PATH", None)
+        if path is None:
+            try:
+                path = os.confstr("CS_PATH")
+            except (AttributeError, ValueError):
+                # os.confstr() or CS_PATH is not available
+                path = os.defpath
+        # bpo-35755: Don't use os.defpath if the PATH environment variable is
+        # set to an empty string
+
+    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+    if not path:
+        return None
+
+    path = path.split(os.pathsep)
+
+    if sys.platform == "win32":
+        # The current directory takes precedence on Windows.
+        curdir = os.curdir
+        if curdir not in path:
+            path.insert(0, curdir)
+
+        # PATHEXT is necessary to check on Windows.
+        pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+        # See if the given file matches any of the expected path extensions.
+        # This will allow us to short circuit when given "python.exe".
+        # If it does match, only test that one, otherwise we have to try
+        # others.
+        if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+            files = [cmd]
+        else:
+            files = [cmd + ext for ext in pathext]
+    else:
+        # On other platforms you don't have things like PATHEXT to tell you
+        # what file suffixes are executable, so just pass on cmd as-is.
+        files = [cmd]
+
+    seen = set()
+    for dir in path:
+        normdir = os.path.normcase(dir)
+        if not normdir in seen:
+            seen.add(normdir)
+            for thefile in files:
+                name = os.path.join(dir, thefile)
+                if _access_check(name, mode):
+                    return name
+    return None
+
+# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+def print_warning(msg):
+    print(bcolors.FAIL + msg + bcolors.ENDC,file=sys.stderr)
 
 if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
-    import subprocess
+    from subprocess import check_call
     from os.path import join
     import os
     import sys
+    import h5py
 
     parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file")
-    parser.add_argument("--dir", type=str, help="fitter directory", required=True)
     parser.add_argument("zdab", help="zdab input file")
     parser.add_argument("filenames", nargs='+', help="input files")
+    parser.add_argument("-o", "--output", type=str, help="output filename", required=True)
     args = parser.parse_args()
 
-    fit_results = {}
+    zdab_cat = which("zdab-cat")
 
-    # First we create a dictionary mapping (run, gtid) -> fit results.
-    for filename in args.filenames:
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)
 
-        if data is None:
-            continue
-
-        for event in data['data']:
-            if event['ev'] is None:
-                continue
-
-            # if the ev branch is filled in, it means the event was fit
-            for ev in event['ev']:
-                # add the git SHA1 hash to the fit results since technically it
-                # could be different than the version in zdab-cat
-                ev['fit']['git_sha1'] = data['git_sha1']
-                ev['fit']['git_dirty'] = data['git_dirty']
-                fit_results[(ev['run'],ev['gtid'])] = ev['fit']
-
-    # Next we get the full event list along with the data cleaning word, FTP
+    # First we get the full event list along with the data cleaning word, FTP
     # position, FTK, and RSP energy from the original zdab and then add the fit
     # results.
     #
     # Note: We send stderr to /dev/null since there can be a lot of warnings
     # about PMT types and fit results
     with open(os.devnull, 'w') as f:
-        popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f)
+        check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f)
 
     total_events = 0
     events_with_fit = 0
-
-    doc = {'data': []}
-
-    for data in yaml.load_all(popen.stdout,Loader=Loader):
-        if 'ev' not in data:
-            doc.update(data)
-            continue
-
-        for ev in data['ev']:
-            run = ev['run']
-            gtid = ev['gtid']
-
-            if (run,gtid) in fit_results:
-                ev['fit'] = fit_results[(run,gtid)]
-                events_with_fit += 1
-
-            total_events += 1
-
-        doc['data'].append(data)
-
-    popen.wait()
+    total_fits = 0
+
+    with h5py.File(args.output,"a") as fout:
+        total_events = fout['ev'].shape[0]
+        for filename in args.filenames:
+            with h5py.File(filename) as f:
+                # Check to see if the git sha1 match
+                if fout.attrs['git_sha1'] != f.attrs['git_sha1']:
+                    print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename))
+                # get fits which match up with the events
+                valid_fits = f['fits'][np.isin(f['fits'][:][['run','gtid']],fout['ev'][:][['run','gtid']])]
+                # Add the fit results
+                fout['fits'].resize((fout['fits'].shape[0]+valid_fits.shape[0],))
+                fout['fits'][-valid_fits.shape[0]:] = valid_fits
+                events_with_fit += len(np.unique(valid_fits[['run','gtid']]))
+                total_fits += len(np.unique(f['fits']['run','gtid']))
 
     # Print out number of fit results that were added. Hopefully, this will
     # make it easy to catch an error if, for example, this gets run with a
     # mismatching zdab and fit results
-    print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr)
-
-    print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper))
+    print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr)
diff --git a/utils/fit-wrapper b/utils/fit-wrapper
new file mode 100644
index 0000000..940b807
--- /dev/null
+++ b/utils/fit-wrapper
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Simple wrapper script to run jobs on the grid
+
+module load hdf5
+module load zlib
+./fit "$@"
diff --git a/utils/plot b/utils/plot
index ac93eda..1bac500 100755
--- a/utils/plot
+++ b/utils/plot
@@ -74,6 +74,8 @@ if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
+    import h5py
+    import pandas as pd
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
@@ -81,121 +83,113 @@ if __name__ == '__main__':
 
     for filename in args.filenames:
         print(filename)
-        with open(filename) as f:
-            data = yaml.load(f.read(),Loader=Loader)
-
-        dx = []
-        dy = []
-        dz = []
-        dT = []
-        thetas = []
-        likelihood_ratio = []
-        t_electron = []
-        t_muon = []
-        psi = []
-        for event in data['data']:
-            # get the particle ID
-            id = event['mcgn'][0]['id']
-
-            if 'ev' not in event:
-                continue
-
-            if 'fit' not in event['ev'][0]:
-                # if nhit < 100 we don't fit the event
-                continue
-
-            if id not in event['ev'][0]['fit']:
-                continue
-
-            fit = event['ev'][0]['fit']
-
-            mass = SNOMAN_MASS[id]
-            # for some reason it's the *second* track which seems to contain the
-            # initial energy
-            true_energy = event['mcgn'][0]['energy']
-            # The MCTK bank has the particle's total energy (except for neutrons)
-            # so we need to convert it into kinetic energy
-            ke = true_energy - mass
-            energy = fit[id]['energy']
-            dT.append(energy-ke)
-            true_posx = event['mcgn'][0]['posx']
-            posx = fit[id]['posx']
-            dx.append(posx-true_posx)
-            true_posy = event['mcgn'][0]['posy']
-            posy = fit[id]['posy']
-            dy.append(posy-true_posy)
-            true_posz = event['mcgn'][0]['posz']
-            posz = fit[id]['posz']
-            dz.append(posz-true_posz)
-            dirx = event['mcgn'][0]['dirx']
-            diry = event['mcgn'][0]['diry']
-            dirz = event['mcgn'][0]['dirz']
-            true_dir = [dirx,diry,dirz]
-            true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
-            theta = fit[id]['theta']
-            phi = fit[id]['phi']
-            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
-            dir = np.array(dir)/np.linalg.norm(dir)
-            thetas.append(np.degrees(np.arccos(np.dot(true_dir,dir))))
-            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
-                fmin_electron = fit[IDP_E_MINUS]['fmin']
-                fmin_muon = fit[IDP_MU_MINUS]['fmin']
-                likelihood_ratio.append(fmin_muon-fmin_electron)
-            if IDP_E_MINUS in fit:
-                t_electron.append(fit[IDP_E_MINUS]['time'])
-            if IDP_MU_MINUS in fit:
-                t_muon.append(fit[IDP_MU_MINUS]['time'])
-
-            if 'nhit' in event['ev'][0]:
-                nhit = event['ev'][0]['nhit']
-                psi.append(fit[id]['psi']/nhit)
-
-        if len(t_electron) < 2 or len(t_muon) < 2:
+        
+        with h5py.File(filename) as f:
+            ev = pd.read_hdf(filename, "ev")
+            mcgn = pd.read_hdf(filename, "mcgn")
+            fits = pd.read_hdf(filename, "fits")
+
+            # get rid of 2nd events like Michel electrons
+            ev = ev.sort_values(['run','gtid']).groupby(['evn'],as_index=False).first()
+
+            # Now, we merge all three datasets together to produce a single
+            # dataframe. To do so, we join the ev dataframe with the mcgn frame
+            # on the evn column, and then join with the fits on the run and
+            # gtid columns.
+            #
+            # At the end we will have a single dataframe with one row for each
+            # fit, i.e. it will look like:
+            #
+            # >>> data
+            #   run   gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+            #
+            # Before merging, we prefix the primary seed track table with mcgn_
+            # and the fit table with fit_ just to make things easier.
+
+            # Prefix track and fit frames
+            mcgn = mcgn.add_prefix("mcgn_")
+            fits = fits.add_prefix("fit_")
+
+            # merge ev and mcgn on evn
+            data = ev.merge(mcgn,left_on=['evn'],right_on=['mcgn_evn'])
+            # merge data and fits on run and gtid
+            data = data.merge(fits,left_on=['run','gtid'],right_on=['fit_run','fit_gtid'])
+
+            # calculate true kinetic energy
+            mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+            data['T'] = data['mcgn_energy'].values - mass
+            data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+            data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+            data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+            data['dT'] = data['fit_energy1'].values - data['T'].values
+
+            true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+            dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+                             np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+                             np.cos(data['fit_theta1']))).squeeze()
+
+            data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+            # only select fits which have at least 2 fits
+            data = data.groupby(['run','gtid']).filter(lambda x: len(x) > 1)
+            data_true = data[data['fit_id1'] == data['mcgn_id']]
+            data_e = data[data['fit_id1'] == IDP_E_MINUS]
+            data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+            data_true = data_true.set_index(['run','gtid'])
+            data_e = data_e.set_index(['run','gtid'])
+            data_mu = data_mu.set_index(['run','gtid'])
+
+            data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+            data_true['te'] = data_e['fit_time']
+            data_true['tm'] = data_mu['fit_time']
+            data_true['Te'] = data_e['fit_energy1']
+
+        if len(data_true) < 2:
             continue
 
-        mean, mean_error, std, std_error = get_stats(dT)
+        mean, mean_error, std, std_error = get_stats(data_true.dT)
         print("dT      = %.2g +/- %.2g" % (mean, mean_error))
         print("std(dT) = %.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dx)
+        mean, mean_error, std, std_error = get_stats(data_true.dx)
         print("dx      = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dx) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dy)
+        mean, mean_error, std, std_error = get_stats(data_true.dy)
         print("dy      = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dy) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dz)
+        mean, mean_error, std, std_error = get_stats(data_true.dz)
         print("dz      = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dz) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(thetas)
+        mean, mean_error, std, std_error = get_stats(data_true.theta)
         print("std(theta) = %4.2g +/- %.2g" % (std, std_error))
 
         plt.figure(1)
-        plot_hist(dT, label=filename)
+        plot_hist(data_true.dT, label=filename)
         plt.xlabel("Kinetic Energy difference (MeV)")
         plt.figure(2)
-        plot_hist(dx, label=filename)
+        plot_hist(data_true.dx, label=filename)
         plt.xlabel("X Position difference (cm)")
         plt.figure(3)
-        plot_hist(dy, label=filename)
+        plot_hist(data_true.dy, label=filename)
         plt.xlabel("Y Position difference (cm)")
         plt.figure(4)
-        plot_hist(dz, label=filename)
+        plot_hist(data_true.dz, label=filename)
         plt.xlabel("Z Position difference (cm)")
         plt.figure(5)
-        plot_hist(thetas, label=filename)
+        plot_hist(data_true.theta, label=filename)
         plt.xlabel(r"$\theta$ (deg)")
         plt.figure(6)
-        plot_hist(likelihood_ratio, label=filename)
+        plot_hist(data_true.ratio, label=filename)
         plt.xlabel(r"Log Likelihood Ratio ($e/\mu$)")
         plt.figure(7)
-        plot_hist(np.array(t_electron)/1e3/60.0, label=filename)
+        plot_hist(data_true.te/1e3/60.0, label=filename)
         plt.xlabel(r"Electron Fit time (minutes)")
         plt.figure(8)
-        plot_hist(np.array(t_muon)/1e3/60.0, label=filename)
+        plot_hist(data_true.tm/1e3/60.0, label=filename)
         plt.xlabel(r"Muon Fit time (minutes)")
-        if len(psi):
-            plt.figure(9)
-            plot_hist(psi, label=filename)
-            plt.xlabel(r"$\Psi$/Nhit")
+        plt.figure(9)
+        plot_hist(data_true.fit_psi/data_true.nhit, label=filename)
+        plt.xlabel(r"$\Psi$/Nhit")
 
     plot_legend(1)
     plot_legend(2)
@@ -205,6 +199,5 @@ if __name__ == '__main__':
     plot_legend(6)
     plot_legend(7)
     plot_legend(8)
-    if len(psi):
-        plot_legend(9)
+    plot_legend(9)
     plt.show()
diff --git a/utils/plot-energy b/utils/plot-energy
index a0274b7..4a8521b 100755
--- a/utils/plot-energy
+++ b/utils/plot-energy
@@ -107,140 +107,69 @@ if __name__ == '__main__':
     import numpy as np
     import pandas as pd
     import sys
+    import h5py
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()
 
-    events = []
-    fit_results = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
-
-        for i, event in enumerate(data['data']):
-            for ev in event['ev']:
-                events.append((
-                    ev['run'],
-                    ev['gtr'],
-                    ev['nhit'],
-                    ev['gtid'],
-                    ev['dc'],
-                    ev['ftp']['x'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['y'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['z'] if 'ftp' in ev else np.nan,
-                    ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan,
-                    ))
-
-                if 'fit' not in ev:
-                    continue
-
-                for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]:
-                    # FIXME: Should I just store the particle ids in the YAML
-                    # output as a list of particle ids instead of a single
-                    # integer?
-                    ids = map(int,chunks(str(id),2))
-                    energy = 0.0
-                    skip = False
-                    for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])):
-                        energy += ke + SNOMAN_MASS[i]
-
-                        # This is a bit of a hack. It appears that many times
-                        # the fit will actually do much better by including a
-                        # very low energy electron or muon. I believe the
-                        # reason for this is that of course my likelihood
-                        # function is not perfect (for example, I don't include
-                        # the correct angular distribution for Rayleigh
-                        # scattered light), and so the fitter often wants to
-                        # add a very low energy electron or muon to fix things.
-                        #
-                        # Ideally I would fix the likelihood function, but for
-                        # now we just discard any fit results which have a very
-                        # low energy electron or muon.
-                        if len(ids) > 1 and i == 20 and ke < 20.0:
-                            skip = True
-
-                        if len(ids) > 1 and i == 22 and ke < 200.0:
-                            skip = True
-
-                    if skip:
-                        continue
-
-                    # Calculate the approximate Ockham factor.
-                    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
-                    #
-                    # Note: This is a really approximate form by assuming that
-                    # the shape of the likelihood space is equal to the average
-                    # uncertainty in the different parameters.
-                    w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi))
-
-                    fit_results.append((
-                        ev['run'],
-                        ev['gtid'],
-                        id,
-                        fit_result['posx'],
-                        fit_result['posy'],
-                        fit_result['posz'],
-                        fit_result['t0'],
-                        energy,
-                        np.atleast_1d(fit_result['theta'])[0],
-                        np.atleast_1d(fit_result['phi'])[0],
-                        fit_result['fmin'] - w,
-                        fit_result['psi']/ev['nhit']))
-
-    # create a dataframe
-    # note: we have to first create a numpy structured array since there is no
-    # way to pass a list of data types to the DataFrame constructor. See
-    # https://github.com/pandas-dev/pandas/issues/4464
-    array = np.array(fit_results,
-                     dtype=[('run',np.int),      # run number
-                            ('gtid',np.int),     # gtid
-                            ('id',np.int),       # particle id
-                            ('x', np.double),    # x
-                            ('y',np.double),     # y
-                            ('z',np.double),     # z
-                            ('t0',np.double),    # t0
-                            ('ke',np.double),    # kinetic energy
-                            ('theta',np.double), # direction of 1st particle
-                            ('phi',np.double),   # direction of 2nd particle
-                            ('fmin',np.double),  # negative log likelihood
-                            ('psi',np.double)]   # goodness of fit parameter
-                   )
-    df = pd.DataFrame.from_records(array)
-
-    array = np.array(events,
-                     dtype=[('run',np.int),           # run number
-                            ('gtr',np.double),        # 50 MHz clock in ns
-                            ('nhit',np.int),          # number of PMTs hit
-                            ('gtid',np.int),          # gtid
-                            ('dc',np.int),            # data cleaning word
-                            ('ftpx',np.double),       # FTP fitter X position
-                            ('ftpy',np.double),       # FTP fitter Y position
-                            ('ftpz',np.double),       # FTP fitter Z position
-                            ('rsp_energy',np.double)] # RSP energy
-                   )
-    df_ev = pd.DataFrame.from_records(array)
+    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames])
+
+    # This is a bit of a hack. It appears that many times the fit will
+    # actually do much better by including a very low energy electron or
+    # muon. I believe the reason for this is that of course my likelihood
+    # function is not perfect (for example, I don't include the correct
+    # angular distribution for Rayleigh scattered light), and so the fitter
+    # often wants to add a very low energy electron or muon to fix things.
+    #
+    # Ideally I would fix the likelihood function, but for now we just
+    # discard any fit results which have a very low energy electron or
+    # muon.
+    #
+    # FIXME: Test this since query() is new to pandas
+    fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20)  or (id2 == 20 and energy2 < 20)  or (id3 == 20 and energy3 < 20)))')
+    fits = fits.query('not (n > 1 and ((id2 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')
+
+    # Calculate the approximate Ockham factor.
+    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
+    #
+    # Note: This is a really approximate form by assuming that the shape of
+    # the likelihood space is equal to the average uncertainty in the
+    # different parameters.
+    fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi))
+
+    # Note: we index on the left hand site with loc to avoid a copy error
+    #
+    # See https://www.dataquest.io/blog/settingwithcopywarning/
+    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2'])
+    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3'])
+
+    fits['fmin'] = fits['fmin'] - fits['w']
+
+    fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit']
+    fits['ke'] = fits['energy1']
+    fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000
+    fits['theta'] = fits['theta1']
 
     # Make sure events are in order. We use run number and GTID here which
     # should be in time order as well except for bit flips in the GTID
     # This is important for later when we look at time differences in the 50
     # MHz clock. We should perhaps do a check here based on the 10 MHz clock to
     # make sure things are in order
-    df_ev = df_ev.sort_values(by=['run','gtid'])
+    ev = ev.sort_values(by=['run','gtid'])
 
-    print("number of events = %i" % len(df_ev))
+    print("number of events = %i" % len(ev))
 
     # First, do basic data cleaning which is done for all events.
-    df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
+    ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
 
-    print("number of events after data cleaning = %i" % len(df_ev))
+    print("number of events after data cleaning = %i" % len(ev))
 
     # Now, we select events tagged by the muon tag which should tag only
     # external muons. We keep the sample of muons since it's needed later to
     # identify Michel electrons and to apply the muon follower cut
-    muons = df_ev[(df_ev.dc & DC_MUON) != 0]
+    muons = ev[(ev.dc & DC_MUON) != 0]
 
     print("number of muons = %i" % len(muons))
 
@@ -251,12 +180,12 @@ if __name__ == '__main__':
     #
     # Should I use 10 MHz clock here since the time isn't critical and then I
     # don't have to worry about 50 MHz clock rollover?
-    prompt = df_ev[df_ev.nhit >= 100]
+    prompt = ev[ev.nhit >= 100]
     prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6))
 
     # Michel electrons and neutrons can be any event which is not a prompt
     # event
-    follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]])
+    follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]])
 
     # Apply the prompt mask
     prompt = prompt[prompt_mask]
@@ -275,39 +204,78 @@ if __name__ == '__main__':
     if prompt_plus_muons.size and follower.size:
         # require Michel events to pass more of the SNO data cleaning cuts
         michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
+
         michel = michel[michel.nhit >= 100]
-        # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them
-        # The 800 nanoseconds cut comes from Richie's thesis. He also mentions
-        # that the In Time Channel Spread Cut is very effective at cutting
-        # electrons events caused by muons, so I should implement that
-        michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \
-                               (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \
-                               (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+        # Accept events which had a muon more than 800 nanoseconds but less
+        # than 20 microseconds before them. The 800 nanoseconds cut comes from
+        # Richie's thesis. He also mentions that the In Time Channel Spread Cut
+        # is very effective at cutting electrons events caused by muons, so I
+        # should implement that.
+        #
+        # Note: We currently don't look across run boundaries. This should be a
+        # *very* small effect, and the logic to do so would be very complicated
+        # since I would have to deal with 50 MHz clock rollovers, etc.
+        #
+        # Do a simple python for loop here over the runs since we create
+        # temporary arrays with shape (michel.size,prompt_plus_muons.size)
+        # which could be too big for the full dataset.
+        #
+        # There might be some clever way to do this without the loop in Pandas,
+        # but I don't know how.
+        michel_sum = None
+        for run, michel_run in michel.groupby(['run']):
+            prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run]
+            michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \
+                                           (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+            if michel_sum is None:
+                michel_sum = michel_run
+            else:
+                michel_sum = michel_sum.append(michel_run)
+
+        michel = michel_sum
     else:
         michel = pd.DataFrame(columns=follower.columns)
 
     if prompt.size and follower.size:
         # neutron followers have to obey stricter set of data cleaning cuts
         neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
-        neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)]
-        r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2)
+        neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)]
+        r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2)
         neutron = neutron[r < AV_RADIUS]
         neutron = neutron[neutron.rsp_energy > 4.0]
+
         # neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)
         # FIXME: need to deal with 50 MHz clock rollover
-        neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \
-                              (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \
-                              (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1)
-        df_atm = prompt[neutron_mask]
-        prompt = prompt[~neutron_mask]
+        prompt_sum = None
+        atm_sum = None
+        for run, prompt_run in prompt.groupby(['run']):
+            neutron_run = neutron[neutron['run'] == run]
+
+            neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \
+                                  (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1)
+
+            if prompt_sum is None:
+                prompt_sum = prompt_run[~neutron_mask]
+            else:
+                prompt_sum = prompt_sum.append(prompt_run[~neutron_mask])
+
+            if atm_sum is None:
+                atm_sum = prompt_run[neutron_mask]
+            else:
+                atm_sum = atm_sum.append(prompt_run[neutron_mask])
+
+        atm = atm_sum
+        prompt = prompt_sum
     else:
-        df_atm = pd.DataFrame(columns=prompt.columns)
+        atm = pd.DataFrame(columns=prompt.columns)
 
     print("number of events after neutron follower cut = %i" % len(prompt))
 
     # Get rid of muon events in our main event sample
     prompt = prompt[(prompt.dc & DC_MUON) == 0]
-    df_atm = df_atm[(df_atm.dc & DC_MUON) == 0]
+    atm = atm[(atm.dc & DC_MUON) == 0]
 
     print("number of events after muon cut = %i" % len(prompt))
 
@@ -317,52 +285,52 @@ if __name__ == '__main__':
         prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \
                                 (prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \
                                 (prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
-        df_atm = df_atm[~np.any((df_atm.run.values == muons.run.values[:,np.newaxis]) & \
-                                (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
-                                (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
+        atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \
+                          (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
+                          (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
 
     # Check to see if there are any events with missing fit information
-    df_atm_ra = df_atm[['run','gtid']].to_records(index=False)
+    atm_ra = atm[['run','gtid']].to_records(index=False)
     muons_ra = muons[['run','gtid']].to_records(index=False)
     prompt_ra = prompt[['run','gtid']].to_records(index=False)
     michel_ra = michel[['run','gtid']].to_records(index=False)
-    df_ra = df[['run','gtid']].to_records(index=False)
+    fits_ra = fits[['run','gtid']].to_records(index=False)
 
-    if np.count_nonzero(~np.isin(df_atm_ra,df_ra)):
-        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra)))
+    if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)):
+        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(muons_ra,df_ra)):
-        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra)))
+    if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)):
+        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(prompt_ra,df_ra)):
-        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra)))
+    if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)):
+        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(michel_ra,df_ra)):
-        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra)))
+    if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)):
+        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra)))
 
     # Now, we merge the event info with the fitter info.
     #
     # Note: This means that the dataframe now contains multiple rows for each
     # event, one for each fit hypothesis.
-    df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid'])
-    muons = pd.merge(df,muons,how='inner',on=['run','gtid'])
-    michel = pd.merge(df,michel,how='inner',on=['run','gtid'])
-    df = pd.merge(df,prompt,how='inner',on=['run','gtid'])
+    atm = pd.merge(fits,atm,how='inner',on=['run','gtid'])
+    muons = pd.merge(fits,muons,how='inner',on=['run','gtid'])
+    michel = pd.merge(fits,michel,how='inner',on=['run','gtid'])
+    prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid'])
 
     # get rid of events which don't have a fit
-    nan = np.isnan(df.fmin.values)
+    nan = np.isnan(prompt.fmin.values)
 
     if np.count_nonzero(nan):
-        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid'])))
+        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid'])))
 
-    df = df[~nan]
+    prompt = prompt[~nan]
 
-    nan_atm = np.isnan(df_atm.fmin.values)
+    nan_atm = np.isnan(atm.fmin.values)
 
     if np.count_nonzero(nan_atm):
-        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid'])))
+        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid'])))
 
-    df_atm = df_atm[~nan_atm]
+    atm = atm[~nan_atm]
 
     nan_muon = np.isnan(muons.fmin.values)
 
@@ -379,24 +347,24 @@ if __name__ == '__main__':
     michel = michel[~nan_michel]
 
     # get the best fit
-    df = df.sort_values('fmin').groupby(['run','gtid']).first()
-    df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first()
+    prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()
+    atm = atm.sort_values('fmin').groupby(['run','gtid']).first()
     michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first()
     muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first()
     muons = muons[muons.id == 22]
 
     # require r < 6 meters
-    df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS]
-    df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS]
+    prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS]
+    atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS]
 
-    print("number of events after radius cut = %i" % len(df))
+    print("number of events after radius cut = %i" % len(prompt))
 
     # Note: Need to design and apply a psi based cut here
 
     plt.figure()
-    plot_hist(df,"Without Neutron Follower")
+    plot_hist(prompt,"Without Neutron Follower")
     plt.figure()
-    plot_hist(df_atm,"With Neutron Follower")
+    plot_hist(atm,"With Neutron Follower")
     plt.figure()
     plot_hist(michel_best_fit,"Michel Electrons")
     plt.figure()
diff --git a/utils/plot-fit-results b/utils/plot-fit-results
index 773a0dc..7115b81 100755
--- a/utils/plot-fit-results
+++ b/utils/plot-fit-results
@@ -15,11 +15,6 @@
 # this program. If not, see <https://www.gnu.org/licenses/>.
 
 from __future__ import print_function, division
-import yaml
-try:
-    from yaml import CLoader as Loader
-except ImportError:
-    from yaml.loader import SafeLoader as Loader
 import numpy as np
 from scipy.stats import iqr
 from matplotlib.lines import Line2D
@@ -71,228 +66,137 @@ def get_stats(x):
     error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
     return mean, std/np.sqrt(n), std, error
 
+def std_err(x):
+    x = x.dropna()
+    mean = np.mean(x)
+    std = np.std(x)
+    n = len(x)
+    if n == 0:
+        return np.nan
+    u4 = np.mean((x-mean)**4)
+    error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
+    return error
+
 if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
+    import h5py
+    import pandas as pd
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()
 
-    events = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f.read(),Loader=Loader)
-
-        a = np.ma.empty(len(data['data']),
-                        dtype=[('id',np.int),       # particle id
-                               ('T', np.double),    # true energy
-                               ('dx',np.double),    # dx
-                               ('dy',np.double),    # dy
-                               ('dz',np.double),    # dz
-                               ('dT',np.double),    # dT
-                               ('theta',np.double), # theta
-                               ('ratio',np.double), # likelihood ratio
-                               ('te',np.double),    # time electron
-                               ('tm',np.double),    # time muon
-                               ('Te',np.double)]    # electron energy
-                    )
-
-        for i, event in enumerate(data['data']):
-            # get the particle ID
-            id = event['mcgn'][0]['id']
-
-            a[i]['id'] = id
-
-            if 'fit' not in event['ev'][0]:
-                # if nhit < 100 we don't fit the event
-                continue
-
-            if id not in event['ev'][0]['fit']:
-                a[i] = np.ma.masked
-                continue
-
-            mass = SNOMAN_MASS[id]
-            # for some reason it's the *second* track which seems to contain the
-            # initial energy
-            true_energy = event['mcgn'][0]['energy']
-            # The MCTK bank has the particle's total energy (except for neutrons)
-            # so we need to convert it into kinetic energy
-            ke = true_energy - mass
-
-            fit = event['ev'][0]['fit']
-
-            a[i]['T'] = ke
-            energy = fit[id]['energy']
-            a[i]['dT'] = energy-ke
-
-            # store the fit position residuals
-            true_posx = event['mcgn'][0]['posx']
-            posx = fit[id]['posx']
-            a[i]['dx'] = posx-true_posx
-            true_posy = event['mcgn'][0]['posy']
-            posy = fit[id]['posy']
-            a[i]['dy'] = posy-true_posy
-            true_posz = event['mcgn'][0]['posz']
-            posz = fit[id]['posz']
-            a[i]['dz'] = posz-true_posz
-
-            # compute the angle between the fit direction and the true
-            # direction
-            dirx = event['mcgn'][0]['dirx']
-            diry = event['mcgn'][0]['diry']
-            dirz = event['mcgn'][0]['dirz']
-            true_dir = [dirx,diry,dirz]
-            true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
-            theta = fit[id]['theta']
-            phi = fit[id]['phi']
-            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
-            dir = np.array(dir)/np.linalg.norm(dir)
-            a[i]['theta'] = np.degrees(np.arccos(np.dot(true_dir,dir)))
-
-            # compute the log likelihood ratio
-            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
-                fmin_electron = fit[IDP_E_MINUS]['fmin']
-                fmin_muon = fit[IDP_MU_MINUS]['fmin']
-                a[i]['ratio'] = fmin_muon-fmin_electron
-            else:
-                a[i]['ratio'] = np.ma.masked
-
-            # store the time taken for electron and muon fits
-            if IDP_E_MINUS in fit:
-                a[i]['te'] = fit[IDP_E_MINUS]['time']
-                a[i]['Te'] = fit[IDP_E_MINUS]['energy']
-            else:
-                a[i]['te'] = np.ma.masked
-                a[i]['Te'] = np.ma.masked
-            if IDP_MU_MINUS in fit:
-                a[i]['tm'] = fit[IDP_MU_MINUS]['time']
-            else:
-                a[i]['tm'] = np.ma.masked
-
-        events.append(a)
-
-    a = np.ma.concatenate(events)
-
+    # Read in all the data.
+    #
+    # Note: We have to add the filename as a column to each dataset since this
+    # script is used to analyze MC data which all has the same run number.
+    ev = pd.concat([pd.read_hdf(filename, "ev").assign(filename=filename) for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits").assign(filename=filename) for filename in args.filenames])
+    mcgn = pd.concat([pd.read_hdf(filename, "mcgn").assign(filename=filename) for filename in args.filenames])
+
+    # get rid of 2nd events like Michel electrons
+    ev = ev.sort_values(['run','gtid']).groupby(['filename','evn'],as_index=False).first()
+
+    # Now, we merge all three datasets together to produce a single
+    # dataframe. To do so, we join the ev dataframe with the mcgn frame
+    # on the evn column, and then join with the fits on the run and
+    # gtid columns.
+    #
+    # At the end we will have a single dataframe with one row for each
+    # fit, i.e. it will look like:
+    #
+    # >>> data
+    #   run   gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+    #
+    # Before merging, we prefix the primary seed track table with mcgn_
+    # and the fit table with fit_ just to make things easier.
+
+    # Prefix track and fit frames
+    mcgn = mcgn.add_prefix("mcgn_")
+    fits = fits.add_prefix("fit_")
+
+    # merge ev and mcgn on evn
+    data = ev.merge(mcgn,left_on=['filename','evn'],right_on=['mcgn_filename','mcgn_evn'])
+    # merge data and fits on run and gtid
+    data = data.merge(fits,left_on=['filename','run','gtid'],right_on=['fit_filename','fit_run','fit_gtid'])
+
+    # calculate true kinetic energy
+    mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+    data['T'] = data['mcgn_energy'].values - mass
+    data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+    data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+    data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+    data['dT'] = data['fit_energy1'].values - data['T'].values
+
+    true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+    dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+                     np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+                     np.cos(data['fit_theta1']))).squeeze()
+
+    data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+    # only select fits which have at least 2 fits
+    data = data.groupby(['filename','run','gtid']).filter(lambda x: len(x) > 1)
+    data_true = data[data['fit_id1'] == data['mcgn_id']]
+    data_e = data[data['fit_id1'] == IDP_E_MINUS]
+    data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+    data_true = data_true.set_index(['filename','run','gtid'])
+    data_e = data_e.set_index(['filename','run','gtid'])
+    data_mu = data_mu.set_index(['filename','run','gtid'])
+
+    data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+    data_true['te'] = data_e['fit_time']
+    data_true['tm'] = data_mu['fit_time']
+    data_true['Te'] = data_e['fit_energy1']
+
+    # 100 bins between 50 MeV and 1 GeV
     bins = np.arange(50,1000,100)
 
-    stats_array = np.ma.empty(len(bins)-1,
-                     dtype=[('T',             np.double),
-                            ('dT',            np.double),
-                            ('dT_err',        np.double),
-                            ('dT_std',        np.double),
-                            ('dT_std_err',    np.double),
-                            ('dx',            np.double),
-                            ('dx_err',        np.double),
-                            ('dx_std',        np.double),
-                            ('dx_std_err',    np.double),
-                            ('dy',            np.double),
-                            ('dy_err',        np.double),
-                            ('dy_std',        np.double),
-                            ('dy_std_err',    np.double),
-                            ('dz',            np.double),
-                            ('dz_err',        np.double),
-                            ('dz_std',        np.double),
-                            ('dz_std_err',    np.double),
-                            ('theta',         np.double),
-                            ('theta_err',     np.double),
-                            ('theta_std',     np.double),
-                            ('theta_std_err', np.double)])
-
-    stats = {IDP_E_MINUS: stats_array, IDP_MU_MINUS: stats_array.copy()}
-
-    for id in stats:
-        electron_events = a[a['id'] == id]
-
-        for i, (ablah, b) in enumerate(zip(bins[:-1], bins[1:])):
-            events = electron_events[(electron_events['T'] >= ablah) & (electron_events['T'] < b)]
-
-            if len(events) < 2:
-                stats[id][i] = np.ma.masked
-                continue
-
-            stats[id][i]['T'] = (ablah+b)/2
-            mean, mean_error, std, std_error = get_stats(events['dT'].compressed())
-            stats[id][i]['dT'] = mean
-            stats[id][i]['dT_err'] = mean_error
-            stats[id][i]['dT_std'] = std
-            stats[id][i]['dT_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dx'].compressed())
-            stats[id][i]['dx'] = mean
-            stats[id][i]['dx_err'] = mean_error
-            stats[id][i]['dx_std'] = std
-            stats[id][i]['dx_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dy'].compressed())
-            stats[id][i]['dy'] = mean
-            stats[id][i]['dy_err'] = mean_error
-            stats[id][i]['dy_std'] = std
-            stats[id][i]['dy_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dz'].compressed())
-            stats[id][i]['dz'] = mean
-            stats[id][i]['dz_err'] = mean_error
-            stats[id][i]['dz_std'] = std
-            stats[id][i]['dz_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['theta'].compressed())
-            stats[id][i]['theta'] = mean
-            stats[id][i]['theta_err'] = mean_error
-            stats[id][i]['theta_std'] = std
-            stats[id][i]['theta_std_err'] = std_error
-
-    for id in stats:
+    for id in [IDP_E_MINUS, IDP_MU_MINUS]:
+        events = data_true[data_true['mcgn_id'] == id]
+
+        pd_bins = pd.cut(events['T'],bins)
+
+        dT = events.groupby(pd_bins)['dT'].agg(['mean','sem','std',std_err])
+        dx = events.groupby(pd_bins)['dx'].agg(['mean','sem','std',std_err])
+        dy = events.groupby(pd_bins)['dy'].agg(['mean','sem','std',std_err])
+        dz = events.groupby(pd_bins)['dz'].agg(['mean','sem','std',std_err])
+        theta = events.groupby(pd_bins)['theta'].agg(['mean','sem','std',std_err])
+
         label = 'Muon' if id == IDP_MU_MINUS else 'Electron'
 
-        T = stats[id]['T']
-        dT = stats[id]['dT']
-        dT_err = stats[id]['dT_err']
-        std_dT = stats[id]['dT_std']
-        std_dT_err = stats[id]['dT_std_err']
-        dx = stats[id]['dx']
-        dx_err = stats[id]['dx_err']
-        std_dx = stats[id]['dx_std']
-        std_dx_err = stats[id]['dx_std_err']
-        dy = stats[id]['dy']
-        dy_err = stats[id]['dy_err']
-        std_dy = stats[id]['dy_std']
-        std_dy_err = stats[id]['dy_std_err']
-        dz = stats[id]['dz']
-        dz_err = stats[id]['dz_err']
-        std_dz = stats[id]['dz_std']
-        std_dz_err = stats[id]['dz_std_err']
-        theta = stats[id]['theta']
-        theta_err = stats[id]['theta_err']
-        std_theta = stats[id]['theta_std']
-        std_theta_err = stats[id]['theta_std_err']
+        T = (bins[1:] + bins[:-1])/2
 
         plt.figure(1)
-        plt.errorbar(T,dT*100/T,yerr=dT_err*100/T,fmt='o',label=label)
+        plt.errorbar(T,dT['mean']*100/T,yerr=dT['sem']*100/T,fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Energy bias (%)")
         plt.title("Energy Bias")
         plt.legend()
 
         plt.figure(2)
-        plt.errorbar(T,std_dT*100/T,yerr=std_dT_err*100/T,fmt='o',label=label)
+        plt.errorbar(T,dT['std']*100/T,yerr=dT['std_err']*100/T,fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Energy resolution (%)")
         plt.title("Energy Resolution")
         plt.legend()
 
         plt.figure(3)
-        plt.errorbar(T,dx,yerr=dx_err,fmt='o',label='%s (x)' % label)
-        plt.errorbar(T,dy,yerr=dy_err,fmt='o',label='%s (y)' % label)
-        plt.errorbar(T,dz,yerr=dz_err,fmt='o',label='%s (z)' % label)
+        plt.errorbar(T,dx['mean'],yerr=dx['sem'],fmt='o',label='%s (x)' % label)
+        plt.errorbar(T,dy['mean'],yerr=dy['sem'],fmt='o',label='%s (y)' % label)
+        plt.errorbar(T,dz['mean'],yerr=dz['sem'],fmt='o',label='%s (z)' % label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Position bias (cm)")
         plt.title("Position Bias")
         plt.legend()
 
         plt.figure(4)
-        plt.errorbar(T,std_dx,yerr=std_dx_err,fmt='o',label='%s (x)' % label)
-        plt.errorbar(T,std_dy,yerr=std_dy_err,fmt='o',label='%s (y)' % label)
-        plt.errorbar(T,std_dz,yerr=std_dz_err,fmt='o',label='%s (z)' % label)
+        plt.errorbar(T,dx['std'],yerr=dx['std_err'],fmt='o',label='%s (x)' % label)
+        plt.errorbar(T,dy['std'],yerr=dy['std_err'],fmt='o',label='%s (y)' % label)
+        plt.errorbar(T,dz['std'],yerr=dz['std_err'],fmt='o',label='%s (z)' % label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Position resolution (cm)")
         plt.title("Position Resolution")
@@ -300,7 +204,7 @@ if __name__ == '__main__':
         plt.legend()
 
         plt.figure(5)
-        plt.errorbar(T,std_theta,yerr=std_theta_err,fmt='o',label=label)
+        plt.errorbar(T,theta['std'],yerr=theta['std_err'],fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Angular resolution (deg)")
         plt.title("Angular Resolution")
@@ -308,9 +212,10 @@ if __name__ == '__main__':
         plt.legend()
 
         plt.figure(6)
-        plt.scatter(a[a['id'] == id]['Te'],a[a['id'] == id]['ratio'],label=label)
+        plt.scatter(events['Te'],events['ratio'],label=label)
         plt.xlabel("Reconstructed Electron Energy (MeV)")
         plt.ylabel(r"Log Likelihood Ratio (e/$\mu$)")
         plt.title("Log Likelihood Ratio vs Reconstructed Electron Energy")
         plt.legend()
+
     plt.show()
diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs
index 670172d..63cbcb7 100755
--- a/utils/submit-grid-jobs
+++ b/utils/submit-grid-jobs
@@ -24,6 +24,83 @@ import string
 from os.path import split, splitext, join, abspath
 import uuid
 from subprocess import check_call
+import os
+import sys
+
+# Next two functions are a backport of the shutil.which() function from Python
+# 3.3 from Lib/shutil.py in the CPython code See
+# https://github.com/python/cpython/blob/master/Lib/shutil.py.
+
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `file` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+    """Given a command, mode, and a PATH string, return the path which
+    conforms to the given mode on the PATH, or None if there is no such
+    file.
+    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result
+    of os.environ.get("PATH"), or can be overridden with a custom search
+    path.
+    """
+    # If we're given a path with a directory part, look it up directly rather
+    # than referring to PATH directories. This includes checking relative to the
+    # current directory, e.g. ./script
+    if os.path.dirname(cmd):
+        if _access_check(cmd, mode):
+            return cmd
+        return None
+
+    if path is None:
+        path = os.environ.get("PATH", None)
+        if path is None:
+            try:
+                path = os.confstr("CS_PATH")
+            except (AttributeError, ValueError):
+                # os.confstr() or CS_PATH is not available
+                path = os.defpath
+        # bpo-35755: Don't use os.defpath if the PATH environment variable is
+        # set to an empty string
+
+    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+    if not path:
+        return None
+
+    path = path.split(os.pathsep)
+
+    if sys.platform == "win32":
+        # The current directory takes precedence on Windows.
+        curdir = os.curdir
+        if curdir not in path:
+            path.insert(0, curdir)
+
+        # PATHEXT is necessary to check on Windows.
+        pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+        # See if the given file matches any of the expected path extensions.
+        # This will allow us to short circuit when given "python.exe".
+        # If it does match, only test that one, otherwise we have to try
+        # others.
+        if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+            files = [cmd]
+        else:
+            files = [cmd + ext for ext in pathext]
+    else:
+        # On other platforms you don't have things like PATHEXT to tell you
+        # what file suffixes are executable, so just pass on cmd as-is.
+        files = [cmd]
+
+    seen = set()
+    for dir in path:
+        normdir = os.path.normcase(dir)
+        if not normdir in seen:
+            seen.add(normdir)
+            for thefile in files:
+                name = os.path.join(dir, thefile)
+                if _access_check(name, mode):
+                    return name
+    return None
 
 CONDOR_TEMPLATE = \
 """
@@ -42,7 +119,7 @@ log = @log
 
 # The below are good base requirements for first testing jobs on OSG, 
 #  if you don't have a good idea of memory and disk usage.
-requirements = (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")
+requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")
 request_cpus = 1
 request_memory = 1 GB
 request_disk = 1 GB
@@ -72,14 +149,24 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
     prefix = "%s_%08i_%s" % (root,gtid,ID.hex)
 
     # fit output filename
-    output = "%s.txt" % prefix
+    output = "%s.hdf5" % prefix
     # condor submit filename
     condor_submit = "%s.submit" % prefix
 
     # set up the arguments for the template
-    executable = join(dir,"fit")
+    executable = which("fit")
+    wrapper = which("fit-wrapper")
+
+    if executable is None:
+        print("couldn't find fit in path!",file=sys.stderr)
+        sys.exit(1)
+
+    if wrapper is None:
+        print("couldn't find fit-wrapper in path!",file=sys.stderr)
+        sys.exit(1)
+
     args = [tail,"-o",output,"--gtid",gtid,"--min-nhit",min_nhit,"--max-particles",max_particles]
-    transfer_input_files = ",".join([filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])
+    transfer_input_files = ",".join([executable,filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])
     transfer_output_files = ",".join([output])
 
     condor_error = "%s.error" % prefix
@@ -89,7 +176,7 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
     template = MyTemplate(CONDOR_TEMPLATE)
 
     submit_string = template.safe_substitute(
-            executable=executable,
+            executable=wrapper,
             args=" ".join(map(str,args)),
             transfer_input_files=transfer_input_files,
             transfer_output_files=transfer_output_files,
@@ -106,13 +193,13 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
 
 if __name__ == '__main__':
     import argparse
-    import subprocess
+    from subprocess import check_call
     import os
+    import tempfile
+    import h5py
 
     parser = argparse.ArgumentParser("submit grid jobs")
     parser.add_argument("filenames", nargs='+', help="input files")
-    parser.add_argument("--dir", type=str, help="fitter directory", required=True)
-    parser.add_argument("--dqxx-dir", type=str, help="dqxx directory", required=True)
     parser.add_argument("--min-nhit", type=int, help="minimum nhit to fit an event", default=100)
     parser.add_argument("--max-particles", type=int, help="maximum number of particles to fit for", default=3)
     parser.add_argument("--skip-second-event", action='store_true', help="only fit the first event after a MAST bank", default=False)
@@ -123,14 +210,32 @@ if __name__ == '__main__':
         home = os.path.expanduser("~")
         args.state = join(home,'state.txt')
 
+    if 'SDDM_DATA' not in os.environ:
+        print("Please set the SDDM_DATA environment variable to point to the fitter source code location", file=sys.stderr)
+        sys.exit(1)
+
+    dir = os.environ['SDDM_DATA']
+
+    if 'DQXX_DIR' not in os.environ:
+        print("Please set the DQXX_DIR environment variable to point to the directory with the DQXX files", file=sys.stderr)
+        sys.exit(1)
+
+    dqxx_dir = os.environ['DQXX_DIR']
+
     args.state = abspath(args.state)
 
     # get the current working directory
     home_dir = os.getcwd()
 
     # get absolute paths since we are going to create a new directory
-    args.dir = abspath(args.dir)
-    args.dqxx_dir = abspath(args.dqxx_dir)
+    dir = abspath(dir)
+    dqxx_dir = abspath(dqxx_dir)
+
+    zdab_cat = which("zdab-cat")
+
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)
 
     for filename in args.filenames:
         filename = abspath(filename)
@@ -152,29 +257,28 @@ if __name__ == '__main__':
             continue
 
         with open(os.devnull, 'w') as f:
+            # Create a temporary file to store the events. Docs recommended
+            # this method instead of mkstemp(), but I think they're the same.
+            output = tempfile.NamedTemporaryFile(suffix='.hdf5',delete=False)
+            output.close()
+
             if args.skip_second_event:
-                popen = subprocess.Popen([join(args.dir,"zdab-cat"),"--skip-second-event",filename],stdout=subprocess.PIPE,stderr=f)
+                check_call([zdab_cat,"--skip-second-event",filename,"-o",output.name],stderr=f)
             else:
-                popen = subprocess.Popen([join(args.dir,"zdab-cat"),filename],stdout=subprocess.PIPE,stderr=f)
+                check_call([zdab_cat,filename,"-o",output.name],stderr=f)
 
         new_dir = "%s_%s" % (root,ID.hex)
 
         os.mkdir(new_dir)
         os.chdir(new_dir)
 
-        for data in yaml.load_all(popen.stdout,Loader=Loader):
-            if 'ev' not in data:
-                continue
-
-            for ev in data['ev']:
-                run = ev['run']
-                gtid = ev['gtid']
-                nhit = ev['nhit']
-
-                if nhit >= args.min_nhit:
-                    submit_job(filename, run, gtid, args.dir, args.dqxx_dir, args.min_nhit, args.max_particles)
+        with h5py.File(output.name) as f:
+            for ev in f['ev']:
+                if ev['nhit'] >= args.min_nhit:
+                    submit_job(filename, ev['run'], ev['gtid'], dir, dqxx_dir, args.min_nhit, args.max_particles)
 
-        popen.wait()
+        # Delete temporary HDF5 file
+        os.unlink(output.name)
 
         state.append(tail)
 
-- 
cgit