-rw-r--r--  README                 |   4
-rw-r--r--  src/Makefile           |   7
-rw-r--r--  src/fit.c              | 194
-rw-r--r--  src/hdf5_utils.c       | 193
-rw-r--r--  src/hdf5_utils.h       |  85
-rw-r--r--  src/zdab-cat.c         | 117
-rw-r--r--  src/zdab_utils.c       |  18
-rw-r--r--  src/zdab_utils.h       |  32
-rw-r--r--  utils/Makefile         |   1
-rwxr-xr-x  utils/cat-grid-jobs    | 163
-rw-r--r--  utils/fit-wrapper      |   7
-rwxr-xr-x  utils/plot             | 171
-rwxr-xr-x  utils/plot-energy      | 294
-rwxr-xr-x  utils/plot-fit-results | 297
-rwxr-xr-x  utils/submit-grid-jobs | 152
15 files changed, 1063 insertions, 672 deletions
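This commit replaces the fitter's YAML output with HDF5: save_output() in src/hdf5_utils.c (below) writes 'ev', 'mcgn', and 'fits' datasets plus 'git_sha1' and 'git_dirty' file attributes. A minimal sketch of reading that format back with h5py; the filename and the final selection are illustrative, not part of the commit:

    import h5py

    # Open a file produced by `fit -o` or `zdab-cat -o`.
    # "fit_results.hdf5" is a made-up example filename.
    with h5py.File("fit_results.hdf5", "r") as f:
        # The file attributes record which version of the code made the file.
        print("git_sha1:", f.attrs["git_sha1"])
        print("git_dirty:", f.attrs["git_dirty"])

        # Each dataset is a 1D array of compound records; slicing with [:]
        # yields a numpy structured array.
        ev = f["ev"][:]      # one row per triggered event
        mcgn = f["mcgn"][:]  # one row per primary seed track
        fits = f["fits"][:]  # one or more rows per fitted event

    # Example selection: single-track electron fits (SNOMAN id 20).
    electron_fits = fits[(fits["n"] == 1) & (fits["id1"] == 20)]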
diff --git a/README b/README --- a/README +++ b/README @@ -1,10 +1,10 @@  Building  ======== -First you have to install the gsl and nlopt libraries. On Fedora you can +First you have to install the gsl, nlopt, and hdf5 libraries. On Fedora you can  install these with the following command: -    $ yum install gsl gsl-devel NLopt NLopt-devel +    $ yum install gsl gsl-devel NLopt NLopt-devel hdf5 hdf5-devel  You can also install these packages yourself. For instructions, see below  (Installing GSL and Installing NLopt).
diff --git a/src/Makefile b/src/Makefile index 99cff5e..e5ca1cd 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@  release_hdr := $(shell sh -c './mkreleasehdr.sh')  CFLAGS=-fdiagnostics-color -O2 -Wall -g -DSWAP_BYTES -LDLIBS=-fdiagnostics-color -lm -lgsl -lgslcblas -lnlopt_cxx -lstdc++ -lz +LDLIBS=-fdiagnostics-color -lm -lgsl -lgslcblas -lnlopt_cxx -lstdc++ -lz -lhdf5  PREFIX?=$(HOME)/local  INSTALL_BIN=$(PREFIX)/bin @@ -32,17 +32,18 @@ test-time-pdf: test-time-pdf.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o  test-zebra: test-zebra.o zebra.o pack2b.o -fit: fit.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o quad.o dc.o sort.o util.o +fit: fit.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o quad.o dc.o sort.o util.o hdf5_utils.o  test-find-peaks: test-find-peaks.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o util.o quad.o  calculate-csda-range: calculate-csda-range.o -zdab-cat: zdab-cat.o zebra.o pmt.o vector.o misc.o zdab_utils.o pack2b.o db.o dqxx.o dict.o siphash.o release.o dc.o sort.o util.o +zdab-cat: zdab-cat.o zebra.o pmt.o vector.o misc.o zdab_utils.o pack2b.o db.o dqxx.o dict.o siphash.o release.o dc.o sort.o util.o hdf5_utils.o  install:  	@mkdir -p $(INSTALL_BIN)  	$(INSTALL) fit $(INSTALL_BIN) +	$(INSTALL) zdab-cat $(INSTALL_BIN)  clean:  	rm -f *.o calculate_limits test test-vector test-likelihood fit test-charge test-path calculate-csda-range test-time-pdf test-zebra test-find-peaks Makefile.dep zdab-cat
diff --git a/src/fit.c b/src/fit.c --- a/src/fit.c +++ b/src/fit.c @@ -47,12 +47,21 @@  #include "electron.h"  #include "muon.h"  #include "proton.h" +#include "hdf5.h" +#include "hdf5_utils.h"  /* Maximum number of fit parameters. Should be at least 4 + 3*MAX_VERTICES. */  #define MAX_PARS 100 +  /* Maximum number of peaks to search for in Hough transform. */  #define MAX_NPEAKS 10 +/* Maximum number of events, primary seed particle tracks, and fits. + * + * FIXME: I should write out the events, tracks, and fits as they come up + * instead of storing them until the end. 
*/ +#define MAX_NEVENTS 1000000 +  char *GitSHA1(void);  char *GitDirty(void); @@ -5869,12 +5878,13 @@ int main(int argc, char **argv)  {      int i, j, k;      zebraFile *f; -    zebraBank bmast, bmc, bmcgn, mctk, b; +    zebraBank bmast, mc, bmcgn, mctk, b;      int rv;      EVBank bev;      FTPVBank bftpv;      FTXKBank bftxk;      RSPBank bftxr; +    MCBank bmc;      MCTKBank bmctk;      MCVXBank bmcvx;      event ev = {0}; @@ -5882,7 +5892,6 @@ int main(int argc, char **argv)      double xopt[MAX_PARS];      char *filename = NULL;      char *output = NULL; -    FILE *fout = NULL;      int skip_second_event = 0;      struct timeval tv_start, tv_stop;      long long elapsed; @@ -5892,12 +5901,29 @@ int main(int argc, char **argv)      int id[MAX_VERTICES];      size_t combos[100];      size_t len; -    char tmp[256]; -    size_t nhit, min_nhit = 100; +    size_t min_nhit = 100;      int last_run;      char dqxx_file[256];      int32_t gtid = -1; +    int nevents = 0; +    /* Array of events to write out to HDF5 file. +     * +     * Note: Declared static since otherwise it will cause a stack overflow. */ +    static HDF5Event hdf5_events[MAX_NEVENTS]; + +    int nmcgn = 0; +    /* Array of primary seed tracks to write out to HDF5 file. +     * +     * Note: Declared static since otherwise it will cause a stack overflow. */ +    static HDF5MCGN hdf5_mcgn[MAX_NEVENTS]; + +    int nfits = 0; +    /* Array of fit results to write out to HDF5 file. +     * +     * Note: Declared static since otherwise it will cause a stack overflow. */ +    static HDF5Fit hdf5_fits[MAX_NEVENTS]; +      for (i = 1; i < argc; i++) {          if (strlen(argv[i]) >= 2 && !strncmp(argv[i], "--", 2)) {              if (!strcmp(argv[i]+2,"skip-second-event")) { @@ -5953,22 +5979,8 @@ int main(int argc, char **argv)          return 1;      } -    if (output) { -        fout = fopen(output, "w"); - -        if (!fout) { -            fprintf(stderr, "failed to open '%s': %s\n", output, strerror(errno)); -            return 1; -        } - -        fprintf(fout, "git_sha1: %s\n", GitSHA1()); -        fprintf(fout, "git_dirty: %s\n", GitDirty()); -        fprintf(fout, "data:\n"); -    } -      if (load_pmt_info()) {          zebra_close(f); -        if (output) fclose(fout);          return 1;      } @@ -6038,30 +6050,29 @@ int main(int argc, char **argv)              continue;          } -        if (fout) fprintf(fout, "  -\n"); -          if (bmast.links[KMAST_MC-1] == 0) goto skip_mc; -        rv = zebra_get_bank(f,&bmc,bmast.links[KMAST_MC-1]); +        rv = zebra_get_bank(f,&mc,bmast.links[KMAST_MC-1]);          if (rv) {              fprintf(stderr, "error getting MC bank: %s\n", zebra_err);              goto err;          } +        unpack_mc(mc.data, &bmc); +          if (bmast.links[KMC_MCGN-1] == 0) {              fprintf(stderr, "MCGN link is zero!\n");              goto err;          } -        rv = zebra_get_bank(f,&bmcgn,bmc.links[KMC_MCGN-1]); +        rv = zebra_get_bank(f,&bmcgn,mc.links[KMC_MCGN-1]);          if (rv) {              fprintf(stderr, "error getting MCGN bank: %s\n", zebra_err);              goto err;          } -        if (fout) fprintf(fout, "    mcgn:\n");          while (1) {              if (bmcgn.links[KMCGN_MCTK-1] == 0) {                  fprintf(stderr, "MCTK link is zero!\n"); @@ -6110,17 +6121,18 @@ int main(int argc, char **argv)              unpack_mcvx(b.data, &bmcvx); -            if (fout) { -                fprintf(fout, "      -\n"); -                fprintf(fout, "    
    id: %" PRIu32 "\n", bmctk.idp); -                fprintf(fout, "        energy: %.2f\n", bmctk.ene); -                fprintf(fout, "        posx: %.2f\n", bmcvx.x); -                fprintf(fout, "        posy: %.2f\n", bmcvx.y); -                fprintf(fout, "        posz: %.2f\n", bmcvx.z); -                fprintf(fout, "        dirx: %.4f\n", bmctk.drx); -                fprintf(fout, "        diry: %.4f\n", bmctk.dry); -                fprintf(fout, "        dirz: %.4f\n", bmctk.drz); -                fprintf(fout, "        time: %.4f\n", bmcvx.tim); +            if (output) { +                hdf5_mcgn[nmcgn].evn = bmc.evn; +                hdf5_mcgn[nmcgn].id = bmctk.idp; +                hdf5_mcgn[nmcgn].energy = bmctk.ene; +                hdf5_mcgn[nmcgn].x = bmcvx.x; +                hdf5_mcgn[nmcgn].y = bmcvx.y; +                hdf5_mcgn[nmcgn].z = bmcvx.z; +                hdf5_mcgn[nmcgn].dirx = bmctk.drx; +                hdf5_mcgn[nmcgn].diry = bmctk.dry; +                hdf5_mcgn[nmcgn].dirz = bmctk.drz; +                hdf5_mcgn[nmcgn].time = bmcvx.tim; +                nmcgn++;              }              if (bmcgn.next) { @@ -6156,7 +6168,6 @@ skip_mc:              }          } -        if (fout) fprintf(fout, "    ev:\n");          while (1) {              unpack_ev(b.data, &bev);              ev.run = bev.run; @@ -6185,49 +6196,44 @@ skip_mc:              if (get_event(f,&ev,&b)) goto skip_event; -            nhit = get_nhit(&ev); +            if (output) { +                hdf5_events[nevents].run = ev.run; +                hdf5_events[nevents].evn = bmc.evn; +                hdf5_events[nevents].gtr = ev.trigger_time; +                hdf5_events[nevents].nhit = ev.nhit; +                hdf5_events[nevents].gtid = ev.gtid; +                hdf5_events[nevents].trg_type = ev.trigger_type; +                hdf5_events[nevents].dc = get_dc_word(&ev, f, &bmast, &b); -            if (fout) { -                fprintf(fout, "      - run: %i\n", ev.run); -                fprintf(fout, "        gtr: %.0f\n", ev.trigger_time); -                fprintf(fout, "        nhit: %zu\n", nhit); -                fprintf(fout, "        gtid: %i\n", ev.gtid); -                fprintf(fout, "        trg_type: 0x%08x\n", ev.trigger_type); -                fprintf(fout, "        dc: 0x%08x\n", get_dc_word(&ev, f, &bmast, &b)); -            } - -            if (fout) {                  if (get_ftpv(f,&b,&bftpv)) {                      fprintf(stderr, "%s\n", zdab_err); +                    hdf5_events[nevents].ftp_x = NAN; +                    hdf5_events[nevents].ftp_y = NAN; +                    hdf5_events[nevents].ftp_z = NAN;                  } else { -                    fprintf(fout, "        ftp:\n"); -                    fprintf(fout, "            x: %.2f\n", bftpv.x); -                    fprintf(fout, "            y: %.2f\n", bftpv.y); -                    fprintf(fout, "            z: %.2f\n", bftpv.z); +                    hdf5_events[nevents].ftp_x = bftpv.x; +                    hdf5_events[nevents].ftp_y = bftpv.y; +                    hdf5_events[nevents].ftp_z = bftpv.z;                  } -            } -            if (fout) {                  if (get_ftxk(f,&b,&bftxk)) {                      fprintf(stderr, "%s\n", zdab_err); +                    hdf5_events[nevents].ftk_energy = NAN;                  } else { -                    fprintf(fout, "        ftk:\n"); -                    fprintf(fout, "            energy: %.2f\n", bftxk.energy); +                  
  hdf5_events[nevents].ftk_energy = bftxk.energy;                  } -            } -            if (fout) {                  if (get_rsp(f,&b,&bftxr)) {                      fprintf(stderr, "%s\n", zdab_err); +                    hdf5_events[nevents].rsp_energy = NAN;                  } else { -                    fprintf(fout, "        rsp:\n"); -                    fprintf(fout, "            energy: %.2f\n", bftxr.ene); +                    hdf5_events[nevents].rsp_energy = bftxr.ene;                  } -            } -            if (nhit < min_nhit) goto skip_event; +                nevents++; +            } -            if (fout) fprintf(fout, "        fit:\n"); +            if (ev.nhit < min_nhit) goto skip_event;              fmin_best = nll_best(&ev); @@ -6251,25 +6257,47 @@ skip_mc:                      elapsed = (tv_stop.tv_sec - tv_start.tv_sec)*1000 + (tv_stop.tv_usec - tv_start.tv_usec)/1000; -                    if (fout) { -                        sprintf_particle_string(id,i,tmp); -                        fprintf(fout, "          %s:\n", tmp); -                        fprintf(fout, "            posx: %.2f\n", xopt[0]); -                        fprintf(fout, "            posy: %.2f\n", xopt[1]); -                        fprintf(fout, "            posz: %.2f\n", xopt[2]); -                        fprintf(fout, "            t0: %.2f\n", xopt[3]); -                        sprintf_yaml_list(xopt+4,i,3,tmp); -                        fprintf(fout, "            energy: %s\n", tmp); -                        sprintf_yaml_list(xopt+5,i,3,tmp); -                        fprintf(fout, "            theta: %s\n", tmp); -                        sprintf_yaml_list(xopt+6,i,3,tmp); -                        fprintf(fout, "            phi: %s\n", tmp); -                        //fprintf(fout, "            z1: %.2f\n", xopt[7]); -                        //fprintf(fout, "            z2: %.2f\n", xopt[8]); -                        fprintf(fout, "            fmin: %.2f\n", fmin); -                        fprintf(fout, "            time: %lld\n", elapsed); -                        fprintf(fout, "            psi: %.2f\n", fmin-fmin_best); -                        fflush(fout); +                    if (output) { +                        hdf5_fits[nfits].run = ev.run; +                        hdf5_fits[nfits].gtid = ev.gtid; +                        hdf5_fits[nfits].n = i; +                        hdf5_fits[nfits].x = xopt[0]; +                        hdf5_fits[nfits].y = xopt[1]; +                        hdf5_fits[nfits].z = xopt[2]; +                        hdf5_fits[nfits].t0 = xopt[3]; +                        hdf5_fits[nfits].id1 = id[0]; +                        hdf5_fits[nfits].energy1 = xopt[4]; +                        hdf5_fits[nfits].theta1 = xopt[5]; +                        hdf5_fits[nfits].phi1 = xopt[6]; + +                        if (i > 1) { +                            hdf5_fits[nfits].id2 = id[1]; +                            hdf5_fits[nfits].energy2 = xopt[7]; +                            hdf5_fits[nfits].theta2 = xopt[8]; +                            hdf5_fits[nfits].phi2 = xopt[9]; +                        } else { +                            hdf5_fits[nfits].id2 = 0; +                            hdf5_fits[nfits].energy2 = NAN; +                            hdf5_fits[nfits].theta2 = NAN; +                            hdf5_fits[nfits].phi2 = NAN; +                        } + +                        if (i > 2) { +                            hdf5_fits[nfits].id3 = id[2]; +            
                hdf5_fits[nfits].energy3 = xopt[10]; +                            hdf5_fits[nfits].theta3 = xopt[11]; +                            hdf5_fits[nfits].phi3 = xopt[12]; +                        } else { +                            hdf5_fits[nfits].id3 = 0; +                            hdf5_fits[nfits].energy3 = NAN; +                            hdf5_fits[nfits].theta3 = NAN; +                            hdf5_fits[nfits].phi3 = NAN; +                        } + +                        hdf5_fits[nfits].fmin = fmin; +                        hdf5_fits[nfits].time = elapsed; +                        hdf5_fits[nfits].psi = fmin-fmin_best; +                        nfits++;                      }                  }              } @@ -6292,13 +6320,13 @@ skip_event:  end: +    if (output) save_output(output, hdf5_events, nevents, hdf5_mcgn, nmcgn, hdf5_fits, nfits); +      free_interpolation();      pmt_response_free();      db_free(db); -    if (fout) fclose(fout); -      zebra_close(f);      return 0; @@ -6309,8 +6337,6 @@ err:      db_free(db); -    if (fout) fclose(fout); -      zebra_close(f);      return 1; diff --git a/src/hdf5_utils.c b/src/hdf5_utils.c new file mode 100644 index 0000000..05d1002 --- /dev/null +++ b/src/hdf5_utils.c @@ -0,0 +1,193 @@ +#include "hdf5_utils.h" +#include "hdf5.h" +#include <string.h> /* for strlen() */ + +char *GitSHA1(void); +char *GitDirty(void); + +/* Write out data to an HDF5 file. + * + * This function writes out three datasets to an HDF5 file: + * + * 1. ev: the event dataset + * + *    This dataset contains event level information similar to the EV bank in + *    SNOMAN. It also contains a data cleaning word and the FTP, FTK, and RSP + *    fit values for the position and energy. + * + * 2. mcgn: the primary particle tracks + * + *     This dataset contains vertex and track information for all the primary + *     particle tracks used to simulate the event. + * + * 3. fits: the fit information + * + *     This dataset contains the reconstructed fit information for each event. + *     Note that this dataset will in general contain multiple rows for a + *     single event since we fit for different particle hypotheses. + * + * Currently the ev and mcgn datasets are fixed size, and the fits dataset is + * chunked and extensible to allow the cat-grid-jobs script to work. + * + */ +int save_output(const char *output, HDF5Event *hdf5_events, size_t nevents, HDF5MCGN *hdf5_mcgn, size_t nmcgn, HDF5Fit *hdf5_fits, size_t nfits) +{ +    hid_t      hdf5_events_tid;     /* File datatype identifier */ +    hid_t      file, dataset, space; /* Handles */ +    herr_t     status; +    hsize_t    dim[1];   /* Dataspace dimensions */ +    hid_t      hdf5_mcgn_tid;     /* File datatype identifier */ +    hid_t      dataset_mcgn, space_mcgn; /* Handles */ +    hsize_t    dim_mcgn[1];   /* Dataspace dimensions */ +    hid_t      hdf5_fits_tid;     /* File datatype identifier */ +    hid_t      dataset_fits, space_fits; /* Handles */ +    hsize_t    dim_fits[1];   /* Dataspace dimensions */ +    hsize_t    chunk_dims[1];   /* Dataspace dimensions */ +    hid_t      dcpl_id; /* Handles */ +    hsize_t    maxdims[1] = {H5S_UNLIMITED}; + +    /* +     * Create the data space. 
+     */ +    dim[0] = nevents; +    space = H5Screate_simple(1, dim, NULL); +    dim_mcgn[0] = nmcgn; +    space_mcgn = H5Screate_simple(1, dim_mcgn, NULL); +    dim_fits[0] = nfits; +    space_fits = H5Screate_simple(1, dim_fits, maxdims); +    chunk_dims[0] = 10; +    dcpl_id = H5Pcreate(H5P_DATASET_CREATE); +    H5Pset_chunk(dcpl_id, 1, chunk_dims); + +    /* +     * Create the file. +     */ +    file = H5Fcreate(output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + +    hid_t   attr1, attr2; /* Attribute identifiers */ +    hid_t   aid1, aid2;    /* Attribute dataspace identifiers */ +    hid_t   atype1, atype2;    /* Attribute type */ + +    char *git_sha1 = GitSHA1(); +    char *git_dirty = GitDirty(); + +    /* +     * Create string attribute. +     */ +    aid1  = H5Screate(H5S_SCALAR); +    atype1 = H5Tcopy(H5T_C_S1); +             H5Tset_size(atype1, strlen(git_sha1)); +             H5Tset_strpad(atype1,H5T_STR_NULLTERM); +    attr1 = H5Acreate2(file, "git_sha1", atype1, aid1, H5P_DEFAULT, H5P_DEFAULT); + +    /* +     * Create string attribute. +     */ +    aid2  = H5Screate(H5S_SCALAR); +    atype2 = H5Tcopy(H5T_C_S1); +             H5Tset_size(atype2, strlen(git_dirty)); +             H5Tset_strpad(atype2,H5T_STR_NULLTERM); +    attr2 = H5Acreate2(file, "git_dirty", atype2, aid2, H5P_DEFAULT, H5P_DEFAULT); + +    herr_t  ret;                /* Return value */ + +    /* +     * Write string attribute. +     */ +    ret = H5Awrite(attr1, atype1, git_sha1); +    ret = H5Awrite(attr2, atype2, git_dirty); + +    /* +     * Create the memory data type. +     */ +    hdf5_events_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5Event)); +    H5Tinsert(hdf5_events_tid, "run",        HOFFSET(HDF5Event, run),        H5T_NATIVE_INT); +    H5Tinsert(hdf5_events_tid, "evn",        HOFFSET(HDF5Event, evn),        H5T_NATIVE_INT); +    H5Tinsert(hdf5_events_tid, "gtr",        HOFFSET(HDF5Event, gtr),        H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_events_tid, "nhit",       HOFFSET(HDF5Event, nhit),       H5T_NATIVE_INT); +    H5Tinsert(hdf5_events_tid, "gtid",       HOFFSET(HDF5Event, gtid),       H5T_NATIVE_UINT32); +    H5Tinsert(hdf5_events_tid, "trg_type",   HOFFSET(HDF5Event, trg_type),   H5T_NATIVE_UINT32); +    H5Tinsert(hdf5_events_tid, "dc",         HOFFSET(HDF5Event, dc),         H5T_NATIVE_UINT32); +    H5Tinsert(hdf5_events_tid, "ftp_x",      HOFFSET(HDF5Event, ftp_x),      H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_events_tid, "ftp_y",      HOFFSET(HDF5Event, ftp_y),      H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_events_tid, "ftp_z",      HOFFSET(HDF5Event, ftp_z),      H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_events_tid, "ftk_energy", HOFFSET(HDF5Event, ftk_energy), H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_events_tid, "rsp_energy", HOFFSET(HDF5Event, rsp_energy), H5T_NATIVE_DOUBLE); + +    hdf5_mcgn_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5MCGN)); +    H5Tinsert(hdf5_mcgn_tid, "run",    HOFFSET(HDF5MCGN, run),    H5T_NATIVE_INT); +    H5Tinsert(hdf5_mcgn_tid, "evn",    HOFFSET(HDF5MCGN, evn),    H5T_NATIVE_INT); +    H5Tinsert(hdf5_mcgn_tid, "id",     HOFFSET(HDF5MCGN, id),     H5T_NATIVE_INT); +    H5Tinsert(hdf5_mcgn_tid, "energy", HOFFSET(HDF5MCGN, energy), H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_mcgn_tid, "x",      HOFFSET(HDF5MCGN, x),      H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_mcgn_tid, "y",      HOFFSET(HDF5MCGN, y),      H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_mcgn_tid, "z",      HOFFSET(HDF5MCGN, z),      H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_mcgn_tid, "dirx",   HOFFSET(HDF5MCGN, 
dirx),   H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_mcgn_tid, "diry",   HOFFSET(HDF5MCGN, diry),   H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_mcgn_tid, "dirz",   HOFFSET(HDF5MCGN, dirz),   H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_mcgn_tid, "time",   HOFFSET(HDF5MCGN, time),   H5T_NATIVE_DOUBLE); + +    hdf5_fits_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5Fit)); +    H5Tinsert(hdf5_fits_tid, "run",     HOFFSET(HDF5Fit, run),     H5T_NATIVE_INT); +    H5Tinsert(hdf5_fits_tid, "gtid",    HOFFSET(HDF5Fit, gtid),    H5T_NATIVE_UINT32); +    H5Tinsert(hdf5_fits_tid, "n",       HOFFSET(HDF5Fit, n),       H5T_NATIVE_INT); +    H5Tinsert(hdf5_fits_tid, "x",       HOFFSET(HDF5Fit, x),       H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "y",       HOFFSET(HDF5Fit, y),       H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "z",       HOFFSET(HDF5Fit, z),       H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "t0",      HOFFSET(HDF5Fit, t0),      H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "id1",     HOFFSET(HDF5Fit, id1),     H5T_NATIVE_INT); +    H5Tinsert(hdf5_fits_tid, "energy1", HOFFSET(HDF5Fit, energy1), H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "theta1",  HOFFSET(HDF5Fit, theta1),  H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "phi1",    HOFFSET(HDF5Fit, phi1),    H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "id2",     HOFFSET(HDF5Fit, id2),     H5T_NATIVE_INT); +    H5Tinsert(hdf5_fits_tid, "energy2", HOFFSET(HDF5Fit, energy2), H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "theta2",  HOFFSET(HDF5Fit, theta2),  H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "phi2",    HOFFSET(HDF5Fit, phi2),    H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "id3",     HOFFSET(HDF5Fit, id3),     H5T_NATIVE_INT); +    H5Tinsert(hdf5_fits_tid, "energy3", HOFFSET(HDF5Fit, energy3), H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "theta3",  HOFFSET(HDF5Fit, theta3),  H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "phi3",    HOFFSET(HDF5Fit, phi3),    H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "fmin",    HOFFSET(HDF5Fit, fmin),    H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "time",    HOFFSET(HDF5Fit, time),    H5T_NATIVE_DOUBLE); +    H5Tinsert(hdf5_fits_tid, "psi",     HOFFSET(HDF5Fit, psi),     H5T_NATIVE_DOUBLE); + +    /* +     * Create the dataset. 
+     */ +    dataset = H5Dcreate2(file, "ev", hdf5_events_tid, space, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); +    dataset_mcgn = H5Dcreate2(file, "mcgn", hdf5_mcgn_tid, space_mcgn, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); +    dataset_fits = H5Dcreate2(file, "fits", hdf5_fits_tid, space_fits, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + +    /* +     * Write data to the dataset. +     */ +    if (nevents) +        status = H5Dwrite(dataset, hdf5_events_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_events); +    if (nmcgn) +        status = H5Dwrite(dataset_mcgn, hdf5_mcgn_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_mcgn); +    if (nfits) +        status = H5Dwrite(dataset_fits, hdf5_fits_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_fits); + +    /* +     * Release resources +     */ +    H5Sclose(aid1); +    H5Sclose(aid2); +    H5Tclose(atype1); +    H5Tclose(atype2); +    H5Aclose(attr1); +    H5Aclose(attr2); +    H5Tclose(hdf5_events_tid); +    H5Tclose(hdf5_mcgn_tid); +    H5Tclose(hdf5_fits_tid); +    H5Sclose(space); +    H5Sclose(space_mcgn); +    H5Sclose(space_fits); +    H5Pclose(dcpl_id); +    H5Dclose(dataset); +    H5Dclose(dataset_mcgn); +    H5Dclose(dataset_fits); +    H5Fclose(file); + +    return 0; +}
diff --git a/src/hdf5_utils.h b/src/hdf5_utils.h new file mode 100644 index 0000000..4f6ad32 --- /dev/null +++ b/src/hdf5_utils.h @@ -0,0 +1,85 @@ +#ifndef HDF5_UTILS +#define HDF5_UTILS + +#include <stdint.h> +#include <stdlib.h> /* for size_t */ + +/* Struct to hold information for each triggered event (similar to the EV bank, + * but with some reconstructed information). + * + * This is the struct that is written out to the 'ev' dataset in the HDF5 file. + * + * Note: If there is no FTP, FTK, or RSP info in the zdab file these values + * will be set to nan. */ +typedef struct HDF5Event { +    int run; +    int evn; +    double gtr; +    int nhit; +    uint32_t gtid; +    uint32_t trg_type; +    uint32_t dc; +    double ftp_x; +    double ftp_y; +    double ftp_z; +    double ftk_energy; +    double rsp_energy; +} HDF5Event; + +/* Struct to hold information for each primary seed track that is written out + * as the `mcgn` dataset in the HDF5 file. To figure out which track + * corresponds to a given GTID you can match them up based on the evn column, + * which is filled with the MC event number in both datasets. */ +typedef struct HDF5MCGN { +    int run; +    int evn; +    int id; +    double energy; +    double x; +    double y; +    double z; +    double dirx; +    double diry; +    double dirz; +    double time; +} HDF5MCGN; + +/* Struct to hold information for each fit that is written out + * as the `fits` dataset in the HDF5 file. + * + * Note that the energy2, theta2, phi2 fields will only be filled in for fits + * with 2 tracks and similarly for energy3, theta3, and phi3 for 3 track fits. */ +typedef struct HDF5Fit { +    int run; +    uint32_t gtid; +    double x; +    double y; +    double z; +    double t0; + +    /* Number of tracks. 
*/ +    int n; + +    int id1; +    double energy1; +    double theta1; +    double phi1; + +    int id2; +    double energy2; +    double theta2; +    double phi2; + +    int id3; +    double energy3; +    double theta3; +    double phi3; + +    double fmin; +    double time; +    double psi; +} HDF5Fit; + +int save_output(const char *output, HDF5Event *hdf5_events, size_t nevents, HDF5MCGN *hdf5_mcgn, size_t nmcgn, HDF5Fit *hdf5_fits, size_t nfits); + +#endif diff --git a/src/zdab-cat.c b/src/zdab-cat.c index 5b9bb3d..278236c 100644 --- a/src/zdab-cat.c +++ b/src/zdab-cat.c @@ -26,6 +26,15 @@  #include <errno.h> /* for errno */  #include "release.h"  #include "dc.h" +#include "hdf5.h" +#include <math.h> +#include "hdf5_utils.h" + +/* Maximum number of events, primary seed particle tracks, and fits. + * + * FIXME: I should write out the events, tracks, and fits as they come up + * instead of storing them until the end. */ +#define MAX_NEVENTS 1000000  #define EV_RECORD      0x45562020 // 'EV  '  #define MCTK_RECORD    0x4d43544b // 'MCTK' @@ -47,22 +56,25 @@ int main(int argc, char **argv)  {      int i;      zebraFile *f; -    zebraBank bmast, bmc, bmcgn, mctk, b; +    zebraBank bmast, mc, bmcgn, mctk, b;      int rv;      EVBank bev;      FTPVBank bftpv;      FTXKBank bftxk;      RSPBank bftxr; +    MCBank bmc;      MCTKBank bmctk;      MCVXBank bmcvx;      event ev = {0};      char *filename = NULL;      char *output = NULL; -    FILE *fout = stdout;      int skip_second_event = 0; -    size_t nhit;      int last_run;      char dqxx_file[256]; +    int nevents = 0; +    static HDF5Event hdf5_events[MAX_NEVENTS]; +    int nmcgn = 0; +    static HDF5MCGN hdf5_mcgn[MAX_NEVENTS];      for (i = 1; i < argc; i++) {          if (strlen(argv[i]) >= 2 && !strncmp(argv[i], "--", 2)) { @@ -96,23 +108,8 @@ int main(int argc, char **argv)          return 1;      } -    if (output) { -        fout = fopen(output, "w"); - -        if (!fout) { -            fprintf(stderr, "failed to open '%s': %s\n", output, strerror(errno)); -            return 1; -        } -    } - -    if (fout) { -        fprintf(fout, "git_sha1: %s\n", GitSHA1()); -        fprintf(fout, "git_dirty: %s\n", GitDirty()); -    } -      if (load_pmt_info()) {          zebra_close(f); -        if (output) fclose(fout);          return 1;      } @@ -160,11 +157,9 @@ int main(int argc, char **argv)              continue;          } -        if (fout) fprintf(fout, "---\n"); -          if (bmast.links[KMAST_MC-1] == 0) goto skip_mc; -        rv = zebra_get_bank(f,&bmc,bmast.links[KMAST_MC-1]); +        rv = zebra_get_bank(f,&mc,bmast.links[KMAST_MC-1]);          if (rv) {              fprintf(stderr, "error getting MC bank: %s\n", zebra_err); @@ -176,14 +171,15 @@ int main(int argc, char **argv)              goto err;          } -        rv = zebra_get_bank(f,&bmcgn,bmc.links[KMC_MCGN-1]); +        unpack_mc(mc.data, &bmc); + +        rv = zebra_get_bank(f,&bmcgn,mc.links[KMC_MCGN-1]);          if (rv) {              fprintf(stderr, "error getting MCGN bank: %s\n", zebra_err);              goto err;          } -        if (fout) fprintf(fout, "    mcgn:\n");          while (1) {              if (bmcgn.links[KMCGN_MCTK-1] == 0) {                  fprintf(stderr, "MCTK link is zero!\n"); @@ -232,17 +228,18 @@ int main(int argc, char **argv)              unpack_mcvx(b.data, &bmcvx); -            if (fout) { -                fprintf(fout, "      -\n"); -                fprintf(fout, "        id: %" PRIu32 "\n", bmctk.idp); -                
fprintf(fout, "        energy: %.2f\n", bmctk.ene); -                fprintf(fout, "        posx: %.2f\n", bmcvx.x); -                fprintf(fout, "        posy: %.2f\n", bmcvx.y); -                fprintf(fout, "        posz: %.2f\n", bmcvx.z); -                fprintf(fout, "        dirx: %.4f\n", bmctk.drx); -                fprintf(fout, "        diry: %.4f\n", bmctk.dry); -                fprintf(fout, "        dirz: %.4f\n", bmctk.drz); -                fprintf(fout, "        time: %.4f\n", bmcvx.tim); +            if (output) { +                hdf5_mcgn[nmcgn].evn = bmc.evn; +                hdf5_mcgn[nmcgn].id = bmctk.idp; +                hdf5_mcgn[nmcgn].energy = bmctk.ene; +                hdf5_mcgn[nmcgn].x = bmcvx.x; +                hdf5_mcgn[nmcgn].y = bmcvx.y; +                hdf5_mcgn[nmcgn].z = bmcvx.z; +                hdf5_mcgn[nmcgn].dirx = bmctk.drx; +                hdf5_mcgn[nmcgn].diry = bmctk.dry; +                hdf5_mcgn[nmcgn].dirz = bmctk.drz; +                hdf5_mcgn[nmcgn].time = bmcvx.tim; +                nmcgn++;              }              if (bmcgn.next) { @@ -278,7 +275,6 @@ skip_mc:              }          } -        if (fout) fprintf(fout, "    ev:\n");          while (1) {              unpack_ev(b.data, &bev);              ev.run = bev.run; @@ -305,44 +301,41 @@ skip_mc:              rv = get_event(f,&ev,&b); -            nhit = get_nhit(&ev); +            if (output) { +                hdf5_events[nevents].run = ev.run; +                hdf5_events[nevents].evn = bmc.evn; +                hdf5_events[nevents].gtr = ev.trigger_time; +                hdf5_events[nevents].nhit = ev.nhit; +                hdf5_events[nevents].gtid = ev.gtid; +                hdf5_events[nevents].trg_type = ev.trigger_type; +                hdf5_events[nevents].dc = get_dc_word(&ev, f, &bmast, &b); -            if (fout) { -                fprintf(fout, "      - run: %i\n", ev.run); -                fprintf(fout, "        gtr: %.0f\n", ev.trigger_time); -                fprintf(fout, "        nhit: %zu\n", nhit); -                fprintf(fout, "        gtid: %i\n", ev.gtid); -                fprintf(fout, "        trg_type: 0x%08x\n", ev.trigger_type); -                fprintf(fout, "        dc: 0x%08x\n", get_dc_word(&ev, f, &bmast, &b)); -            } - -            if (fout) {                  if (get_ftpv(f,&b,&bftpv)) {                      fprintf(stderr, "%s\n", zdab_err); +                    hdf5_events[nevents].ftp_x = NAN; +                    hdf5_events[nevents].ftp_y = NAN; +                    hdf5_events[nevents].ftp_z = NAN;                  } else { -                    fprintf(fout, "        ftp:\n"); -                    fprintf(fout, "            x: %.2f\n", bftpv.x); -                    fprintf(fout, "            y: %.2f\n", bftpv.y); -                    fprintf(fout, "            z: %.2f\n", bftpv.z); +                    hdf5_events[nevents].ftp_x = bftpv.x; +                    hdf5_events[nevents].ftp_y = bftpv.y; +                    hdf5_events[nevents].ftp_z = bftpv.z;                  } -            } -            if (fout) {                  if (get_ftxk(f,&b,&bftxk)) {                      fprintf(stderr, "%s\n", zdab_err); +                    hdf5_events[nevents].ftk_energy = NAN;                  } else { -                    fprintf(fout, "        ftk:\n"); -                    fprintf(fout, "            energy: %.2f\n", bftxk.energy); +                    hdf5_events[nevents].ftk_energy = bftxk.energy;                  } -   
         } -            if (fout) {                  if (get_rsp(f,&b,&bftxr)) {                      fprintf(stderr, "%s\n", zdab_err); +                    hdf5_events[nevents].rsp_energy = NAN;                  } else { -                    fprintf(fout, "        rsp:\n"); -                    fprintf(fout, "            energy: %.2f\n", bftxr.ene); +                    hdf5_events[nevents].rsp_energy = bftxr.ene;                  } + +                nevents++;              }              /* Note the origin link for the first EV bank points back to the @@ -359,9 +352,11 @@ skip_mc:          }      } -    db_free(db); +    if (output) { +        save_output(output, hdf5_events, nevents, hdf5_mcgn, nmcgn, NULL, 0); +    } -    if (fout) fclose(fout); +    db_free(db);      zebra_close(f); @@ -370,8 +365,6 @@ skip_mc:  err:      db_free(db); -    if (fout) fclose(fout); -      zebra_close(f);      return 1; diff --git a/src/zdab_utils.c b/src/zdab_utils.c index 716503c..416bd79 100644 --- a/src/zdab_utils.c +++ b/src/zdab_utils.c @@ -580,6 +580,24 @@ void unpack_ftk(uint32_t *data, FTKBank *b)      unpack((uint8_t *) (data+25), "f",&b->spare5);  } +void unpack_mc(uint32_t *data, MCBank *b) +{ +    unpack((uint8_t *) data,      "l",&b->jdy); +    unpack((uint8_t *) (data+1),  "l",&b->ut1); +    unpack((uint8_t *) (data+2),  "l",&b->ut2); +    unpack((uint8_t *) (data+3),  "l",&b->dte); +    unpack((uint8_t *) (data+4),  "l",&b->hmsc); +    unpack((uint8_t *) (data+5),  "l",&b->seed1); +    unpack((uint8_t *) (data+6),  "l",&b->seed2); +    unpack((uint8_t *) (data+7),  "l",&b->seed_num); +    unpack((uint8_t *) (data+8),  "l",&b->dtp); +    unpack((uint8_t *) (data+9),  "f",&b->mcver); +    unpack((uint8_t *) (data+10), "l",&b->evn); +    unpack((uint8_t *) (data+11), "F",&b->gtr); +    unpack((uint8_t *) (data+13), "l",&b->num_ge_err); +    unpack((uint8_t *) (data+14), "f",&b->pmt_eff); +} +  void unpack_mcgn(uint32_t *data, MCGNBank *b)  {      unpack((uint8_t *) data,      "l",&b->id); diff --git a/src/zdab_utils.h b/src/zdab_utils.h index d4411d9..043e185 100644 --- a/src/zdab_utils.h +++ b/src/zdab_utils.h @@ -378,6 +378,37 @@ typedef struct FTKBank {      float spare5;  } FTKBank; +typedef struct MCBank { +    /* Julian date. */ +    uint32_t jdy; +    /* Universal time seconds. */ +    uint32_t ut1; +    /* Universal time nanoseconds. */ +    uint32_t ut2; +    /* Date (format: yyyymmdd). */ +    uint32_t dte; +    /* Time (format: hhmmsscc  - cc is centisec). */ +    uint32_t hmsc; +    /* First random number seed. */ +    uint32_t seed1; +    /* Second random number seed. */ +    uint32_t seed2; +    /* Random number number. */ +    uint32_t seed_num; +    /* Data type. */ +    uint32_t dtp; +    /* SNOMAN MC version number. */ +    float mcver; +    /* Monte Carlo event number. */ +    uint32_t evn; +    /* Generation time in nsec (first word). */ +    double gtr; +    /* Number of suppressed Cerenkov photon tracking errors. */ +    uint32_t num_ge_err; +    /* The current PMT collection efficiency. */ +    float pmt_eff; +} MCBank; +  typedef struct MCGNBank {      /* Particle id. 
*/      uint32_t id; @@ -638,6 +669,7 @@ void unpack_ftpt(uint32_t *data, FTPTBank *b);  void unpack_ftpv(uint32_t *data, FTPVBank *b);  void unpack_ftxk(uint32_t *data, FTXKBank *b);  void unpack_ftk(uint32_t *data, FTKBank *b); +void unpack_mc(uint32_t *data, MCBank *b);  void unpack_mcgn(uint32_t *data, MCGNBank *b);  void unpack_mcvx(uint32_t *data, MCVXBank *b);  void unpack_mctk(uint32_t *data, MCTKBank *b); diff --git a/utils/Makefile b/utils/Makefile index ef21a09..011a723 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -13,5 +13,6 @@ install:  	$(INSTALL) plot-likelihood $(INSTALL_BIN)  	$(INSTALL) submit-grid-jobs $(INSTALL_BIN)  	$(INSTALL) gen-dark-matter $(INSTALL_BIN) +	$(INSTALL) fit-wrapper $(INSTALL_BIN)  .PHONY: install diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs index ba06290..83a8004 100755 --- a/utils/cat-grid-jobs +++ b/utils/cat-grid-jobs @@ -23,7 +23,7 @@ as YAML to stdout.  Example: -    $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt +    $ cat-grid-jobs ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt  """ @@ -33,80 +33,145 @@ try:      from yaml import CLoader as Loader, CDumper as Dumper  except ImportError:      from yaml import Loader, Dumper +import os +import sys + +# Check that a given file can be accessed with the correct mode. +# Additionally check that `file` is not a directory, as on Windows +# directories pass the os.access check. +def _access_check(fn, mode): +    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn)) + +def which(cmd, mode=os.F_OK | os.X_OK, path=None): +    """Given a command, mode, and a PATH string, return the path which +    conforms to the given mode on the PATH, or None if there is no such +    file. +    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result +    of os.environ.get("PATH"), or can be overridden with a custom search +    path. +    """ +    # If we're given a path with a directory part, look it up directly rather +    # than referring to PATH directories. This includes checking relative to the +    # current directory, e.g. ./script +    if os.path.dirname(cmd): +        if _access_check(cmd, mode): +            return cmd +        return None + +    if path is None: +        path = os.environ.get("PATH", None) +        if path is None: +            try: +                path = os.confstr("CS_PATH") +            except (AttributeError, ValueError): +                # os.confstr() or CS_PATH is not available +                path = os.defpath +        # bpo-35755: Don't use os.defpath if the PATH environment variable is +        # set to an empty string + +    # PATH='' doesn't match, whereas PATH=':' looks in the current directory +    if not path: +        return None + +    path = path.split(os.pathsep) + +    if sys.platform == "win32": +        # The current directory takes precedence on Windows. +        curdir = os.curdir +        if curdir not in path: +            path.insert(0, curdir) + +        # PATHEXT is necessary to check on Windows. +        pathext = os.environ.get("PATHEXT", "").split(os.pathsep) +        # See if the given file matches any of the expected path extensions. +        # This will allow us to short circuit when given "python.exe". +        # If it does match, only test that one, otherwise we have to try +        # others. 
+        if any(cmd.lower().endswith(ext.lower()) for ext in pathext): +            files = [cmd] +        else: +            files = [cmd + ext for ext in pathext] +    else: +        # On other platforms you don't have things like PATHEXT to tell you +        # what file suffixes are executable, so just pass on cmd as-is. +        files = [cmd] + +    seen = set() +    for dir in path: +        normdir = os.path.normcase(dir) +        if not normdir in seen: +            seen.add(normdir) +            for thefile in files: +                name = os.path.join(dir, thefile) +                if _access_check(name, mode): +                    return name +    return None + +# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python +class bcolors: +    HEADER = '\033[95m' +    OKBLUE = '\033[94m' +    OKGREEN = '\033[92m' +    WARNING = '\033[93m' +    FAIL = '\033[91m' +    ENDC = '\033[0m' +    BOLD = '\033[1m' +    UNDERLINE = '\033[4m' + +def print_warning(msg): +    print(bcolors.FAIL + msg + bcolors.ENDC,file=sys.stderr)  if __name__ == '__main__':      import argparse      import matplotlib.pyplot as plt      import numpy as np -    import subprocess +    from subprocess import check_call      from os.path import join      import os      import sys +    import h5py      parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file") -    parser.add_argument("--dir", type=str, help="fitter directory", required=True)      parser.add_argument("zdab", help="zdab input file")      parser.add_argument("filenames", nargs='+', help="input files") +    parser.add_argument("-o", "--output", type=str, help="output filename", required=True)      args = parser.parse_args() -    fit_results = {} +    zdab_cat = which("zdab-cat") -    # First we create a dictionary mapping (run, gtid) -> fit results. -    for filename in args.filenames: -        with open(filename) as f: -            data = yaml.load(f,Loader=Loader) +    if zdab_cat is None: +        print("couldn't find zdab-cat in path!",file=sys.stderr) +        sys.exit(1) -        if data is None: -            continue - -        for event in data['data']: -            if event['ev'] is None: -                continue - -            # if the ev branch is filled in, it means the event was fit -            for ev in event['ev']: -                # add the git SHA1 hash to the fit results since technically it -                # could be different than the version in zdab-cat -                ev['fit']['git_sha1'] = data['git_sha1'] -                ev['fit']['git_dirty'] = data['git_dirty'] -                fit_results[(ev['run'],ev['gtid'])] = ev['fit'] - -    # Next we get the full event list along with the data cleaning word, FTP +    # First we get the full event list along with the data cleaning word, FTP      # position, FTK, and RSP energy from the original zdab and then add the fit      # results.      
#      # Note: We send stderr to /dev/null since there can be a lot of warnings      # about PMT types and fit results      with open(os.devnull, 'w') as f: -        popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f) +        check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f)      total_events = 0      events_with_fit = 0 - -    doc = {'data': []} - -    for data in yaml.load_all(popen.stdout,Loader=Loader): -        if 'ev' not in data: -            doc.update(data) -            continue - -        for ev in data['ev']: -            run = ev['run'] -            gtid = ev['gtid'] - -            if (run,gtid) in fit_results: -                ev['fit'] = fit_results[(run,gtid)] -                events_with_fit += 1 - -            total_events += 1 - -        doc['data'].append(data) - -    popen.wait() +    total_fits = 0 + +    with h5py.File(args.output,"a") as fout: +        total_events = fout['ev'].shape[0] +        for filename in args.filenames: +            with h5py.File(filename) as f: +                # Check to see if the git sha1s match +                if fout.attrs['git_sha1'] != f.attrs['git_sha1']: +                    print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename)) +                # get fits which match up with the events +                valid_fits = f['fits'][np.isin(f['fits'][:][['run','gtid']],fout['ev'][:][['run','gtid']])] +                # Add the fit results +                fout['fits'].resize((fout['fits'].shape[0]+valid_fits.shape[0],)) +                fout['fits'][-valid_fits.shape[0]:] = valid_fits +                events_with_fit += len(np.unique(valid_fits[['run','gtid']])) +                total_fits += len(np.unique(f['fits']['run','gtid']))      # Print out number of fit results that were added. 
Hopefully, this will      # make it easy to catch an error if, for example, this gets run with a      # mismatching zdab and fit results -    print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr) - -    print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper)) +    print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr)
diff --git a/utils/fit-wrapper b/utils/fit-wrapper new file mode 100644 index 0000000..940b807 --- /dev/null +++ b/utils/fit-wrapper @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Simple wrapper script to run jobs on the grid + +module load hdf5 +module load zlib +./fit "$@"
diff --git a/utils/plot b/utils/plot --- a/utils/plot +++ b/utils/plot @@ -74,6 +74,8 @@  if __name__ == '__main__':      import argparse      import matplotlib.pyplot as plt      import numpy as np +    import h5py +    import pandas as pd      parser = argparse.ArgumentParser("plot fit results")      parser.add_argument("filenames", nargs='+', help="input files") @@ -81,121 +83,113 @@      for filename in args.filenames:          print(filename) -        with open(filename) as f: -            data = yaml.load(f.read(),Loader=Loader) - -        dx = [] -        dy = [] -        dz = [] -        dT = [] -        thetas = [] -        likelihood_ratio = [] -        t_electron = [] -        t_muon = [] -        psi = [] -        for event in data['data']: -            # get the particle ID -            id = event['mcgn'][0]['id'] - -            if 'ev' not in event: -                continue - -            if 'fit' not in event['ev'][0]: -                # if nhit < 100 we don't fit the event -                continue - -            if id not in event['ev'][0]['fit']: -                continue - -            fit = event['ev'][0]['fit'] - -            mass = SNOMAN_MASS[id] -            # for some reason it's the *second* track which seems to contain the -            # initial energy -            true_energy = event['mcgn'][0]['energy'] -            # The MCTK bank has the particle's total energy (except for neutrons) -            # so we need to convert it into kinetic energy -            ke = true_energy - mass -            energy = fit[id]['energy'] -            dT.append(energy-ke) -            true_posx = event['mcgn'][0]['posx'] -            posx = fit[id]['posx'] -            dx.append(posx-true_posx) -            true_posy = event['mcgn'][0]['posy'] -            posy = fit[id]['posy'] -            dy.append(posy-true_posy) -            true_posz = event['mcgn'][0]['posz'] -            posz = fit[id]['posz'] -            dz.append(posz-true_posz) -            dirx = event['mcgn'][0]['dirx'] -            diry = event['mcgn'][0]['diry'] -            dirz = event['mcgn'][0]['dirz'] -            true_dir = [dirx,diry,dirz] -            true_dir = np.array(true_dir)/np.linalg.norm(true_dir) -            theta = fit[id]['theta'] -            phi = fit[id]['phi'] -            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)] -            dir = np.array(dir)/np.linalg.norm(dir) -            thetas.append(np.degrees(np.arccos(np.dot(true_dir,dir)))) -            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit: -                fmin_electron = fit[IDP_E_MINUS]['fmin'] -                fmin_muon = fit[IDP_MU_MINUS]['fmin'] -                likelihood_ratio.append(fmin_muon-fmin_electron) -            if IDP_E_MINUS in fit: -                t_electron.append(fit[IDP_E_MINUS]['time']) -            if 
IDP_MU_MINUS in fit: -                t_muon.append(fit[IDP_MU_MINUS]['time']) - -            if 'nhit' in event['ev'][0]: -                nhit = event['ev'][0]['nhit'] -                psi.append(fit[id]['psi']/nhit) - -        if len(t_electron) < 2 or len(t_muon) < 2: +         +        with h5py.File(filename) as f: +            ev = pd.read_hdf(filename, "ev") +            mcgn = pd.read_hdf(filename, "mcgn") +            fits = pd.read_hdf(filename, "fits") + +            # get rid of 2nd events like Michel electrons +            ev = ev.sort_values(['run','gtid']).groupby(['evn'],as_index=False).first() + +            # Now, we merge all three datasets together to produce a single +            # dataframe. To do so, we join the ev dataframe with the mcgn frame +            # on the evn column, and then join with the fits on the run and +            # gtid columns. +            # +            # At the end we will have a single dataframe with one row for each +            # fit, i.e. it will look like: +            # +            # >>> data +            #   run   gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ... +            # +            # Before merging, we prefix the primary seed track table with mcgn_ +            # and the fit table with fit_ just to make things easier. + +            # Prefix track and fit frames +            mcgn = mcgn.add_prefix("mcgn_") +            fits = fits.add_prefix("fit_") + +            # merge ev and mcgn on evn +            data = ev.merge(mcgn,left_on=['evn'],right_on=['mcgn_evn']) +            # merge data and fits on run and gtid +            data = data.merge(fits,left_on=['run','gtid'],right_on=['fit_run','fit_gtid']) + +            # calculate true kinetic energy +            mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values] +            data['T'] = data['mcgn_energy'].values - mass +            data['dx'] = data['fit_x'].values - data['mcgn_x'].values +            data['dy'] = data['fit_y'].values - data['mcgn_y'].values +            data['dz'] = data['fit_z'].values - data['mcgn_z'].values +            data['dT'] = data['fit_energy1'].values - data['T'].values + +            true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze() +            dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']), +                             np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']), +                             np.cos(data['fit_theta1']))).squeeze() + +            data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1))) + +            # only select fits which have at least 2 fits +            data = data.groupby(['run','gtid']).filter(lambda x: len(x) > 1) +            data_true = data[data['fit_id1'] == data['mcgn_id']] +            data_e = data[data['fit_id1'] == IDP_E_MINUS] +            data_mu = data[data['fit_id1'] == IDP_MU_MINUS] + +            data_true = data_true.set_index(['run','gtid']) +            data_e = data_e.set_index(['run','gtid']) +            data_mu = data_mu.set_index(['run','gtid']) + +            data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin'] +            data_true['te'] = data_e['fit_time'] +            data_true['tm'] = data_mu['fit_time'] +            data_true['Te'] = data_e['fit_energy1'] + +        if len(data_true) < 2:              continue -        mean, mean_error, std, std_error = get_stats(dT) +        mean, mean_error, std, std_error = get_stats(data_true.dT)          print("dT      = %.2g +/- 
%.2g" % (mean, mean_error))          print("std(dT) = %.2g +/- %.2g" % (std, std_error)) -        mean, mean_error, std, std_error = get_stats(dx) +        mean, mean_error, std, std_error = get_stats(data_true.dx)          print("dx      = %4.2g +/- %.2g" % (mean, mean_error))          print("std(dx) = %4.2g +/- %.2g" % (std, std_error)) -        mean, mean_error, std, std_error = get_stats(dy) +        mean, mean_error, std, std_error = get_stats(data_true.dy)          print("dy      = %4.2g +/- %.2g" % (mean, mean_error))          print("std(dy) = %4.2g +/- %.2g" % (std, std_error)) -        mean, mean_error, std, std_error = get_stats(dz) +        mean, mean_error, std, std_error = get_stats(data_true.dz)          print("dz      = %4.2g +/- %.2g" % (mean, mean_error))          print("std(dz) = %4.2g +/- %.2g" % (std, std_error)) -        mean, mean_error, std, std_error = get_stats(thetas) +        mean, mean_error, std, std_error = get_stats(data_true.theta)          print("std(theta) = %4.2g +/- %.2g" % (std, std_error))          plt.figure(1) -        plot_hist(dT, label=filename) +        plot_hist(data_true.dT, label=filename)          plt.xlabel("Kinetic Energy difference (MeV)")          plt.figure(2) -        plot_hist(dx, label=filename) +        plot_hist(data_true.dx, label=filename)          plt.xlabel("X Position difference (cm)")          plt.figure(3) -        plot_hist(dy, label=filename) +        plot_hist(data_true.dy, label=filename)          plt.xlabel("Y Position difference (cm)")          plt.figure(4) -        plot_hist(dz, label=filename) +        plot_hist(data_true.dz, label=filename)          plt.xlabel("Z Position difference (cm)")          plt.figure(5) -        plot_hist(thetas, label=filename) +        plot_hist(data_true.theta, label=filename)          plt.xlabel(r"$\theta$ (deg)")          plt.figure(6) -        plot_hist(likelihood_ratio, label=filename) +        plot_hist(data_true.ratio, label=filename)          plt.xlabel(r"Log Likelihood Ratio ($e/\mu$)")          plt.figure(7) -        plot_hist(np.array(t_electron)/1e3/60.0, label=filename) +        plot_hist(data_true.te/1e3/60.0, label=filename)          plt.xlabel(r"Electron Fit time (minutes)")          plt.figure(8) -        plot_hist(np.array(t_muon)/1e3/60.0, label=filename) +        plot_hist(data_true.tm/1e3/60.0, label=filename)          plt.xlabel(r"Muon Fit time (minutes)") -        if len(psi): -            plt.figure(9) -            plot_hist(psi, label=filename) -            plt.xlabel(r"$\Psi$/Nhit") +        plt.figure(9) +        plot_hist(data_true.fit_psi/data_true.nhit, label=filename) +        plt.xlabel(r"$\Psi$/Nhit")      plot_legend(1)      plot_legend(2) @@ -205,6 +199,5 @@ if __name__ == '__main__':      plot_legend(6)      plot_legend(7)      plot_legend(8) -    if len(psi): -        plot_legend(9) +    plot_legend(9)      plt.show() diff --git a/utils/plot-energy b/utils/plot-energy index a0274b7..4a8521b 100755 --- a/utils/plot-energy +++ b/utils/plot-energy @@ -107,140 +107,69 @@ if __name__ == '__main__':      import numpy as np      import pandas as pd      import sys +    import h5py      parser = argparse.ArgumentParser("plot fit results")      parser.add_argument("filenames", nargs='+', help="input files")      args = parser.parse_args() -    events = [] -    fit_results = [] - -    for filename in args.filenames: -        print(filename) -        with open(filename) as f: -            data = yaml.load(f,Loader=Loader) - -        for i, event in 
enumerate(data['data']): -            for ev in event['ev']: -                events.append(( -                    ev['run'], -                    ev['gtr'], -                    ev['nhit'], -                    ev['gtid'], -                    ev['dc'], -                    ev['ftp']['x'] if 'ftp' in ev else np.nan, -                    ev['ftp']['y'] if 'ftp' in ev else np.nan, -                    ev['ftp']['z'] if 'ftp' in ev else np.nan, -                    ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan, -                    )) - -                if 'fit' not in ev: -                    continue - -                for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]: -                    # FIXME: Should I just store the particle ids in the YAML -                    # output as a list of particle ids instead of a single -                    # integer? -                    ids = map(int,chunks(str(id),2)) -                    energy = 0.0 -                    skip = False -                    for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])): -                        energy += ke + SNOMAN_MASS[i] - -                        # This is a bit of a hack. It appears that many times -                        # the fit will actually do much better by including a -                        # very low energy electron or muon. I believe the -                        # reason for this is that of course my likelihood -                        # function is not perfect (for example, I don't include -                        # the correct angular distribution for Rayleigh -                        # scattered light), and so the fitter often wants to -                        # add a very low energy electron or muon to fix things. -                        # -                        # Ideally I would fix the likelihood function, but for -                        # now we just discard any fit results which have a very -                        # low energy electron or muon. -                        if len(ids) > 1 and i == 20 and ke < 20.0: -                            skip = True - -                        if len(ids) > 1 and i == 22 and ke < 200.0: -                            skip = True - -                    if skip: -                        continue - -                    # Calculate the approximate Ockham factor. -                    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes -                    # -                    # Note: This is a really approximate form by assuming that -                    # the shape of the likelihood space is equal to the average -                    # uncertainty in the different parameters. 
-                    w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi)) - -                    fit_results.append(( -                        ev['run'], -                        ev['gtid'], -                        id, -                        fit_result['posx'], -                        fit_result['posy'], -                        fit_result['posz'], -                        fit_result['t0'], -                        energy, -                        np.atleast_1d(fit_result['theta'])[0], -                        np.atleast_1d(fit_result['phi'])[0], -                        fit_result['fmin'] - w, -                        fit_result['psi']/ev['nhit'])) - -    # create a dataframe -    # note: we have to first create a numpy structured array since there is no -    # way to pass a list of data types to the DataFrame constructor. See -    # https://github.com/pandas-dev/pandas/issues/4464 -    array = np.array(fit_results, -                     dtype=[('run',np.int),      # run number -                            ('gtid',np.int),     # gtid -                            ('id',np.int),       # particle id -                            ('x', np.double),    # x -                            ('y',np.double),     # y -                            ('z',np.double),     # z -                            ('t0',np.double),    # t0 -                            ('ke',np.double),    # kinetic energy -                            ('theta',np.double), # direction of 1st particle -                            ('phi',np.double),   # direction of 2nd particle -                            ('fmin',np.double),  # negative log likelihood -                            ('psi',np.double)]   # goodness of fit parameter -                   ) -    df = pd.DataFrame.from_records(array) - -    array = np.array(events, -                     dtype=[('run',np.int),           # run number -                            ('gtr',np.double),        # 50 MHz clock in ns -                            ('nhit',np.int),          # number of PMTs hit -                            ('gtid',np.int),          # gtid -                            ('dc',np.int),            # data cleaning word -                            ('ftpx',np.double),       # FTP fitter X position -                            ('ftpy',np.double),       # FTP fitter Y position -                            ('ftpz',np.double),       # FTP fitter Z position -                            ('rsp_energy',np.double)] # RSP energy -                   ) -    df_ev = pd.DataFrame.from_records(array) +    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames]) +    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames]) + +    # This is a bit of a hack. It appears that many times the fit will +    # actually do much better by including a very low energy electron or +    # muon. I believe the reason for this is that of course my likelihood +    # function is not perfect (for example, I don't include the correct +    # angular distribution for Rayleigh scattered light), and so the fitter +    # often wants to add a very low energy electron or muon to fix things. +    # +    # Ideally I would fix the likelihood function, but for now we just +    # discard any fit results which have a very low energy electron or +    # muon. 
+    #
+    # FIXME: Test this since query() is new to pandas
+    fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20)  or (id2 == 20 and energy2 < 20)  or (id3 == 20 and energy3 < 20)))')
+    fits = fits.query('not (n > 1 and ((id1 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')
+
+    # Calculate the approximate Ockham factor.
+    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
+    #
+    # Note: This is a really approximate form by assuming that the shape of
+    # the likelihood space is equal to the average uncertainty in the
+    # different parameters.
+    fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi))
+
+    # Note: we index on the left hand side with loc to avoid a SettingWithCopyWarning
+    #
+    # See https://www.dataquest.io/blog/settingwithcopywarning/
+    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2'])
+    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3'])
+
+    fits['fmin'] = fits['fmin'] - fits['w']
+
+    fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit']
+    fits['ke'] = fits['energy1']
+    fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000
+    fits['theta'] = fits['theta1']

     # Make sure events are in order. We use run number and GTID here which
     # should be in time order as well except for bit flips in the GTID
     # This is important for later when we look at time differences in the 50
     # MHz clock. We should perhaps do a check here based on the 10 MHz clock to
     # make sure things are in order
-    df_ev = df_ev.sort_values(by=['run','gtid'])
+    ev = ev.sort_values(by=['run','gtid'])

-    print("number of events = %i" % len(df_ev))
+    print("number of events = %i" % len(ev))

     # First, do basic data cleaning which is done for all events.
-    df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
+    ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]

-    print("number of events after data cleaning = %i" % len(df_ev))
+    print("number of events after data cleaning = %i" % len(ev))

     # Now, we select events tagged by the muon tag which should tag only
     # external muons. We keep the sample of muons since it's needed later to
     # identify Michel electrons and to apply the muon follower cut
-    muons = df_ev[(df_ev.dc & DC_MUON) != 0]
+    muons = ev[(ev.dc & DC_MUON) != 0]

     print("number of muons = %i" % len(muons))
@@ -251,12 +180,12 @@ if __name__ == '__main__':
     #
     # Should I use 10 MHz clock here since the time isn't critical and then I
     # don't have to worry about 50 MHz clock rollover?
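An aside on the Ockham weight assembled at the top of this hunk: writing n for the number of particles in the fit hypothesis and E_i for the fitted kinetic energies, and noting that 0.1*0.001 = 1e-4 and that the energy2/energy3 terms are only added when those particles exist, the three pieces combine to

    w = n*ln(1e-4) + sum_i ln(E_i) + n*ln(1e-4/(4*pi))

Because w becomes more negative as n grows, subtracting it from fmin (the negative log likelihood) penalizes hypotheses with more particles, which is the Ockham-factor behavior the comment describes. Reading the constants as prior widths on the vertex and direction parameters is my interpretation, not something stated in the code.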
-    prompt = df_ev[df_ev.nhit >= 100]
+    prompt = ev[ev.nhit >= 100]
     prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6))

     # Michel electrons and neutrons can be any event which is not a prompt
     # event
-    follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]])
+    follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]])

     # Apply the prompt mask
     prompt = prompt[prompt_mask]
@@ -275,39 +204,78 @@ if __name__ == '__main__':
     if prompt_plus_muons.size and follower.size:
         # require Michel events to pass more of the SNO data cleaning cuts
         michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
+
         michel = michel[michel.nhit >= 100]
-        # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them
-        # The 800 nanoseconds cut comes from Richie's thesis. He also mentions
-        # that the In Time Channel Spread Cut is very effective at cutting
-        # electrons events caused by muons, so I should implement that
-        michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \
-                               (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \
-                               (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+        # Accept events which had a muon more than 800 nanoseconds but less
+        # than 20 microseconds before them. The 800 nanoseconds cut comes from
+        # Richie's thesis. He also mentions that the In Time Channel Spread Cut
+        # is very effective at cutting electron events caused by muons, so I
+        # should implement that.
+        #
+        # Note: We currently don't look across run boundaries. This should be a
+        # *very* small effect, and the logic to do so would be very complicated
+        # since I would have to deal with 50 MHz clock rollovers, etc.
+        #
+        # Do a simple Python for loop here over the runs since we create
+        # temporary arrays with shape (michel.size,prompt_plus_muons.size)
+        # which could be too big for the full dataset.
+        #
+        # There might be some clever way to do this without the loop in Pandas,
+        # but I don't know how.
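The memory concern in that comment is easiest to see on toy data. Here is a minimal sketch (hypothetical times, not part of the patch) of the same windowed coincidence test that the per-run loop applies; the broadcast comparison allocates an (n_muon, n_follower) boolean array, which is exactly what blows up without the per-run split:

    import numpy as np

    # Hypothetical 50 MHz clock times in ns for two muons and three followers
    muon_gtr = np.array([0.0, 1e9])
    follower_gtr = np.array([5e3, 3e4, 1e9 + 1e4])

    # muon_gtr[:,np.newaxis] has shape (2,1), so the comparison broadcasts
    # to a (2,3) array: one row per muon, one column per follower.
    in_window = (follower_gtr > muon_gtr[:,np.newaxis] + 800) & \
                (follower_gtr < muon_gtr[:,np.newaxis] + 20e3)

    # A follower is Michel-like if it falls in the window of *any* muon.
    print(np.any(in_window, axis=0))  # [ True False  True]

Looping over runs bounds the intermediate array at the per-run event counts instead of the whole dataset.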
+        michel_sum = None +        for run, michel_run in michel.groupby(['run']): +            prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run] +            michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \ +                                           (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)] + +            if michel_sum is None: +                michel_sum = michel_run +            else: +                michel_sum = michel_sum.append(michel_run) + +        michel = michel_sum      else:          michel = pd.DataFrame(columns=follower.columns)      if prompt.size and follower.size:          # neutron followers have to obey stricter set of data cleaning cuts          neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS] -        neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)] -        r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2) +        neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)] +        r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2)          neutron = neutron[r < AV_RADIUS]          neutron = neutron[neutron.rsp_energy > 4.0] +          # neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)          # FIXME: need to deal with 50 MHz clock rollover -        neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \ -                              (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \ -                              (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1) -        df_atm = prompt[neutron_mask] -        prompt = prompt[~neutron_mask] +        prompt_sum = None +        atm_sum = None +        for run, prompt_run in prompt.groupby(['run']): +            neutron_run = neutron[neutron['run'] == run] + +            neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \ +                                  (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1) + +            if prompt_sum is None: +                prompt_sum = prompt_run[~neutron_mask] +            else: +                prompt_sum = prompt_sum.append(prompt_run[~neutron_mask]) + +            if atm_sum is None: +                atm_sum = prompt_run[neutron_mask] +            else: +                atm_sum = atm_sum.append(prompt_run[neutron_mask]) + +        atm = atm_sum +        prompt = prompt_sum      else: -        df_atm = pd.DataFrame(columns=prompt.columns) +        atm = pd.DataFrame(columns=prompt.columns)      print("number of events after neutron follower cut = %i" % len(prompt))      # Get rid of muon events in our main event sample      prompt = prompt[(prompt.dc & DC_MUON) == 0] -    df_atm = df_atm[(df_atm.dc & DC_MUON) == 0] +    atm = atm[(atm.dc & DC_MUON) == 0]      print("number of events after muon cut = %i" % len(prompt)) @@ -317,52 +285,52 @@ if __name__ == '__main__':          prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \                                  (prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \                                  (prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)] -        df_atm = df_atm[~np.any((df_atm.run.values 
== muons.run.values[:,np.newaxis]) & \ -                                (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \ -                                (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)] +        atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \ +                          (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \ +                          (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]      # Check to see if there are any events with missing fit information -    df_atm_ra = df_atm[['run','gtid']].to_records(index=False) +    atm_ra = atm[['run','gtid']].to_records(index=False)      muons_ra = muons[['run','gtid']].to_records(index=False)      prompt_ra = prompt[['run','gtid']].to_records(index=False)      michel_ra = michel[['run','gtid']].to_records(index=False) -    df_ra = df[['run','gtid']].to_records(index=False) +    fits_ra = fits[['run','gtid']].to_records(index=False) -    if np.count_nonzero(~np.isin(df_atm_ra,df_ra)): -        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra))) +    if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)): +        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra))) -    if np.count_nonzero(~np.isin(muons_ra,df_ra)): -        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra))) +    if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)): +        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra))) -    if np.count_nonzero(~np.isin(prompt_ra,df_ra)): -        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra))) +    if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)): +        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra))) -    if np.count_nonzero(~np.isin(michel_ra,df_ra)): -        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra))) +    if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)): +        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra)))      # Now, we merge the event info with the fitter info.      #      # Note: This means that the dataframe now contains multiple rows for each      # event, one for each fit hypothesis. 
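To make that note concrete, here is a toy illustration (hypothetical values and a trimmed column set) of the inner merge used below: every fit hypothesis of each surviving event becomes its own row.

    import pandas as pd

    fits = pd.DataFrame({'run': [10000, 10000], 'gtid': [1, 1],
                         'id': [20, 22], 'fmin': [50.0, 60.0]})
    prompt = pd.DataFrame({'run': [10000], 'gtid': [1], 'nhit': [150]})

    # One row per fit hypothesis: gtid 1 appears twice (id 20 and id 22).
    print(pd.merge(fits, prompt, how='inner', on=['run','gtid']))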
-    df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid']) -    muons = pd.merge(df,muons,how='inner',on=['run','gtid']) -    michel = pd.merge(df,michel,how='inner',on=['run','gtid']) -    df = pd.merge(df,prompt,how='inner',on=['run','gtid']) +    atm = pd.merge(fits,atm,how='inner',on=['run','gtid']) +    muons = pd.merge(fits,muons,how='inner',on=['run','gtid']) +    michel = pd.merge(fits,michel,how='inner',on=['run','gtid']) +    prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid'])      # get rid of events which don't have a fit -    nan = np.isnan(df.fmin.values) +    nan = np.isnan(prompt.fmin.values)      if np.count_nonzero(nan): -        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid']))) +        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid']))) -    df = df[~nan] +    prompt = prompt[~nan] -    nan_atm = np.isnan(df_atm.fmin.values) +    nan_atm = np.isnan(atm.fmin.values)      if np.count_nonzero(nan_atm): -        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid']))) +        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid']))) -    df_atm = df_atm[~nan_atm] +    atm = atm[~nan_atm]      nan_muon = np.isnan(muons.fmin.values) @@ -379,24 +347,24 @@ if __name__ == '__main__':      michel = michel[~nan_michel]      # get the best fit -    df = df.sort_values('fmin').groupby(['run','gtid']).first() -    df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first() +    prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first() +    atm = atm.sort_values('fmin').groupby(['run','gtid']).first()      michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first()      muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first()      muons = muons[muons.id == 22]      # require r < 6 meters -    df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS] -    df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS] +    prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS] +    atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS] -    print("number of events after radius cut = %i" % len(df)) +    print("number of events after radius cut = %i" % len(prompt))      # Note: Need to design and apply a psi based cut here      plt.figure() -    plot_hist(df,"Without Neutron Follower") +    plot_hist(prompt,"Without Neutron Follower")      plt.figure() -    plot_hist(df_atm,"With Neutron Follower") +    plot_hist(atm,"With Neutron Follower")      plt.figure()      plot_hist(michel_best_fit,"Michel Electrons")      plt.figure() diff --git a/utils/plot-fit-results b/utils/plot-fit-results index 773a0dc..7115b81 100755 --- a/utils/plot-fit-results +++ b/utils/plot-fit-results @@ -15,11 +15,6 @@  # this program. If not, see <https://www.gnu.org/licenses/>.  
from __future__ import print_function, division -import yaml -try: -    from yaml import CLoader as Loader -except ImportError: -    from yaml.loader import SafeLoader as Loader  import numpy as np  from scipy.stats import iqr  from matplotlib.lines import Line2D @@ -71,228 +66,137 @@ def get_stats(x):      error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)      return mean, std/np.sqrt(n), std, error +def std_err(x): +    x = x.dropna() +    mean = np.mean(x) +    std = np.std(x) +    n = len(x) +    if n == 0: +        return np.nan +    u4 = np.mean((x-mean)**4) +    error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std) +    return error +  if __name__ == '__main__':      import argparse      import matplotlib.pyplot as plt      import numpy as np +    import h5py +    import pandas as pd      parser = argparse.ArgumentParser("plot fit results")      parser.add_argument("filenames", nargs='+', help="input files")      args = parser.parse_args() -    events = [] - -    for filename in args.filenames: -        print(filename) -        with open(filename) as f: -            data = yaml.load(f.read(),Loader=Loader) - -        a = np.ma.empty(len(data['data']), -                        dtype=[('id',np.int),       # particle id -                               ('T', np.double),    # true energy -                               ('dx',np.double),    # dx -                               ('dy',np.double),    # dy -                               ('dz',np.double),    # dz -                               ('dT',np.double),    # dT -                               ('theta',np.double), # theta -                               ('ratio',np.double), # likelihood ratio -                               ('te',np.double),    # time electron -                               ('tm',np.double),    # time muon -                               ('Te',np.double)]    # electron energy -                    ) - -        for i, event in enumerate(data['data']): -            # get the particle ID -            id = event['mcgn'][0]['id'] - -            a[i]['id'] = id - -            if 'fit' not in event['ev'][0]: -                # if nhit < 100 we don't fit the event -                continue - -            if id not in event['ev'][0]['fit']: -                a[i] = np.ma.masked -                continue - -            mass = SNOMAN_MASS[id] -            # for some reason it's the *second* track which seems to contain the -            # initial energy -            true_energy = event['mcgn'][0]['energy'] -            # The MCTK bank has the particle's total energy (except for neutrons) -            # so we need to convert it into kinetic energy -            ke = true_energy - mass - -            fit = event['ev'][0]['fit'] - -            a[i]['T'] = ke -            energy = fit[id]['energy'] -            a[i]['dT'] = energy-ke - -            # store the fit position residuals -            true_posx = event['mcgn'][0]['posx'] -            posx = fit[id]['posx'] -            a[i]['dx'] = posx-true_posx -            true_posy = event['mcgn'][0]['posy'] -            posy = fit[id]['posy'] -            a[i]['dy'] = posy-true_posy -            true_posz = event['mcgn'][0]['posz'] -            posz = fit[id]['posz'] -            a[i]['dz'] = posz-true_posz - -            # compute the angle between the fit direction and the true -            # direction -            dirx = event['mcgn'][0]['dirx'] -            diry = event['mcgn'][0]['diry'] -            dirz = event['mcgn'][0]['dirz'] -            true_dir = [dirx,diry,dirz] -           
 true_dir = np.array(true_dir)/np.linalg.norm(true_dir) -            theta = fit[id]['theta'] -            phi = fit[id]['phi'] -            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)] -            dir = np.array(dir)/np.linalg.norm(dir) -            a[i]['theta'] = np.degrees(np.arccos(np.dot(true_dir,dir))) - -            # compute the log likelihood ratio -            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit: -                fmin_electron = fit[IDP_E_MINUS]['fmin'] -                fmin_muon = fit[IDP_MU_MINUS]['fmin'] -                a[i]['ratio'] = fmin_muon-fmin_electron -            else: -                a[i]['ratio'] = np.ma.masked - -            # store the time taken for electron and muon fits -            if IDP_E_MINUS in fit: -                a[i]['te'] = fit[IDP_E_MINUS]['time'] -                a[i]['Te'] = fit[IDP_E_MINUS]['energy'] -            else: -                a[i]['te'] = np.ma.masked -                a[i]['Te'] = np.ma.masked -            if IDP_MU_MINUS in fit: -                a[i]['tm'] = fit[IDP_MU_MINUS]['time'] -            else: -                a[i]['tm'] = np.ma.masked - -        events.append(a) - -    a = np.ma.concatenate(events) - +    # Read in all the data. +    # +    # Note: We have to add the filename as a column to each dataset since this +    # script is used to analyze MC data which all has the same run number. +    ev = pd.concat([pd.read_hdf(filename, "ev").assign(filename=filename) for filename in args.filenames]) +    fits = pd.concat([pd.read_hdf(filename, "fits").assign(filename=filename) for filename in args.filenames]) +    mcgn = pd.concat([pd.read_hdf(filename, "mcgn").assign(filename=filename) for filename in args.filenames]) + +    # get rid of 2nd events like Michel electrons +    ev = ev.sort_values(['run','gtid']).groupby(['filename','evn'],as_index=False).first() + +    # Now, we merge all three datasets together to produce a single +    # dataframe. To do so, we join the ev dataframe with the mcgn frame +    # on the evn column, and then join with the fits on the run and +    # gtid columns. +    # +    # At the end we will have a single dataframe with one row for each +    # fit, i.e. it will look like: +    # +    # >>> data +    #   run   gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ... +    # +    # Before merging, we prefix the primary seed track table with mcgn_ +    # and the fit table with fit_ just to make things easier. 
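As a small illustration of the prefixing step (toy frames, hypothetical columns): add_prefix() renames every column, so the ev, mcgn_, and fit_ columns stay distinguishable after the merges.

    import pandas as pd

    mcgn = pd.DataFrame({'evn': [1], 'x': [0.5]}).add_prefix('mcgn_')
    ev = pd.DataFrame({'evn': [1], 'nhit': [120]})

    # Columns after the merge: evn, nhit, mcgn_evn, mcgn_x
    print(ev.merge(mcgn, left_on='evn', right_on='mcgn_evn'))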
+
+    # Prefix track and fit frames
+    mcgn = mcgn.add_prefix("mcgn_")
+    fits = fits.add_prefix("fit_")
+
+    # merge ev and mcgn on evn
+    data = ev.merge(mcgn,left_on=['filename','evn'],right_on=['mcgn_filename','mcgn_evn'])
+    # merge data and fits on run and gtid
+    data = data.merge(fits,left_on=['filename','run','gtid'],right_on=['fit_filename','fit_run','fit_gtid'])
+
+    # calculate true kinetic energy
+    mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+    data['T'] = data['mcgn_energy'].values - mass
+    data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+    data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+    data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+    data['dT'] = data['fit_energy1'].values - data['T'].values
+
+    true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+    dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+                     np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+                     np.cos(data['fit_theta1']))).squeeze()
+
+    data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+    # only select events which have at least 2 fits
+    data = data.groupby(['filename','run','gtid']).filter(lambda x: len(x) > 1)
+    data_true = data[data['fit_id1'] == data['mcgn_id']]
+    data_e = data[data['fit_id1'] == IDP_E_MINUS]
+    data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+    data_true = data_true.set_index(['filename','run','gtid'])
+    data_e = data_e.set_index(['filename','run','gtid'])
+    data_mu = data_mu.set_index(['filename','run','gtid'])
+
+    data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+    data_true['te'] = data_e['fit_time']
+    data_true['tm'] = data_mu['fit_time']
+    data_true['Te'] = data_e['fit_energy1']
+
+    # 100 MeV wide bins from 50 MeV to 950 MeV
     bins = np.arange(50,1000,100)
-    stats_array = np.ma.empty(len(bins)-1,
-                     dtype=[('T',             np.double),
-                            ('dT',            np.double),
-                            ('dT_err',        np.double),
-                            ('dT_std',        np.double),
-                            ('dT_std_err',    np.double),
-                            ('dx',            np.double),
-                            ('dx_err',        np.double),
-                            ('dx_std',        np.double),
-                            ('dx_std_err',    np.double),
-                            ('dy',            np.double),
-                            ('dy_err',        np.double),
-                            ('dy_std',        np.double),
-                            ('dy_std_err',    np.double),
-                            ('dz',            np.double),
-                            ('dz_err',        np.double),
-                            ('dz_std',        np.double),
-                            ('dz_std_err',    np.double),
-                            ('theta',         np.double),
-                            ('theta_err',     np.double),
-                            ('theta_std',     np.double),
-                            ('theta_std_err', np.double)])
-
-    stats = {IDP_E_MINUS: stats_array, IDP_MU_MINUS: stats_array.copy()}
-
-    for id in stats:
-        electron_events = a[a['id'] == id]
-
-        for i, (ablah, b) in enumerate(zip(bins[:-1], bins[1:])):
-            events = electron_events[(electron_events['T'] >= ablah) & (electron_events['T'] < b)]
-
-            if 
len(events) < 2: -                stats[id][i] = np.ma.masked -                continue - -            stats[id][i]['T'] = (ablah+b)/2 -            mean, mean_error, std, std_error = get_stats(events['dT'].compressed()) -            stats[id][i]['dT'] = mean -            stats[id][i]['dT_err'] = mean_error -            stats[id][i]['dT_std'] = std -            stats[id][i]['dT_std_err'] = std_error -            mean, mean_error, std, std_error = get_stats(events['dx'].compressed()) -            stats[id][i]['dx'] = mean -            stats[id][i]['dx_err'] = mean_error -            stats[id][i]['dx_std'] = std -            stats[id][i]['dx_std_err'] = std_error -            mean, mean_error, std, std_error = get_stats(events['dy'].compressed()) -            stats[id][i]['dy'] = mean -            stats[id][i]['dy_err'] = mean_error -            stats[id][i]['dy_std'] = std -            stats[id][i]['dy_std_err'] = std_error -            mean, mean_error, std, std_error = get_stats(events['dz'].compressed()) -            stats[id][i]['dz'] = mean -            stats[id][i]['dz_err'] = mean_error -            stats[id][i]['dz_std'] = std -            stats[id][i]['dz_std_err'] = std_error -            mean, mean_error, std, std_error = get_stats(events['theta'].compressed()) -            stats[id][i]['theta'] = mean -            stats[id][i]['theta_err'] = mean_error -            stats[id][i]['theta_std'] = std -            stats[id][i]['theta_std_err'] = std_error - -    for id in stats: +    for id in [IDP_E_MINUS, IDP_MU_MINUS]: +        events = data_true[data_true['mcgn_id'] == id] + +        pd_bins = pd.cut(events['T'],bins) + +        dT = events.groupby(pd_bins)['dT'].agg(['mean','sem','std',std_err]) +        dx = events.groupby(pd_bins)['dx'].agg(['mean','sem','std',std_err]) +        dy = events.groupby(pd_bins)['dy'].agg(['mean','sem','std',std_err]) +        dz = events.groupby(pd_bins)['dz'].agg(['mean','sem','std',std_err]) +        theta = events.groupby(pd_bins)['theta'].agg(['mean','sem','std',std_err]) +          label = 'Muon' if id == IDP_MU_MINUS else 'Electron' -        T = stats[id]['T'] -        dT = stats[id]['dT'] -        dT_err = stats[id]['dT_err'] -        std_dT = stats[id]['dT_std'] -        std_dT_err = stats[id]['dT_std_err'] -        dx = stats[id]['dx'] -        dx_err = stats[id]['dx_err'] -        std_dx = stats[id]['dx_std'] -        std_dx_err = stats[id]['dx_std_err'] -        dy = stats[id]['dy'] -        dy_err = stats[id]['dy_err'] -        std_dy = stats[id]['dy_std'] -        std_dy_err = stats[id]['dy_std_err'] -        dz = stats[id]['dz'] -        dz_err = stats[id]['dz_err'] -        std_dz = stats[id]['dz_std'] -        std_dz_err = stats[id]['dz_std_err'] -        theta = stats[id]['theta'] -        theta_err = stats[id]['theta_err'] -        std_theta = stats[id]['theta_std'] -        std_theta_err = stats[id]['theta_std_err'] +        T = (bins[1:] + bins[:-1])/2          plt.figure(1) -        plt.errorbar(T,dT*100/T,yerr=dT_err*100/T,fmt='o',label=label) +        plt.errorbar(T,dT['mean']*100/T,yerr=dT['sem']*100/T,fmt='o',label=label)          plt.xlabel("Kinetic Energy (MeV)")          plt.ylabel("Energy bias (%)")          plt.title("Energy Bias")          plt.legend()          plt.figure(2) -        plt.errorbar(T,std_dT*100/T,yerr=std_dT_err*100/T,fmt='o',label=label) +        plt.errorbar(T,dT['std']*100/T,yerr=dT['std_err']*100/T,fmt='o',label=label)          plt.xlabel("Kinetic Energy (MeV)")          plt.ylabel("Energy 
resolution (%)")          plt.title("Energy Resolution")          plt.legend()          plt.figure(3) -        plt.errorbar(T,dx,yerr=dx_err,fmt='o',label='%s (x)' % label) -        plt.errorbar(T,dy,yerr=dy_err,fmt='o',label='%s (y)' % label) -        plt.errorbar(T,dz,yerr=dz_err,fmt='o',label='%s (z)' % label) +        plt.errorbar(T,dx['mean'],yerr=dx['sem'],fmt='o',label='%s (x)' % label) +        plt.errorbar(T,dy['mean'],yerr=dy['sem'],fmt='o',label='%s (y)' % label) +        plt.errorbar(T,dz['mean'],yerr=dz['sem'],fmt='o',label='%s (z)' % label)          plt.xlabel("Kinetic Energy (MeV)")          plt.ylabel("Position bias (cm)")          plt.title("Position Bias")          plt.legend()          plt.figure(4) -        plt.errorbar(T,std_dx,yerr=std_dx_err,fmt='o',label='%s (x)' % label) -        plt.errorbar(T,std_dy,yerr=std_dy_err,fmt='o',label='%s (y)' % label) -        plt.errorbar(T,std_dz,yerr=std_dz_err,fmt='o',label='%s (z)' % label) +        plt.errorbar(T,dx['std'],yerr=dx['std_err'],fmt='o',label='%s (x)' % label) +        plt.errorbar(T,dy['std'],yerr=dy['std_err'],fmt='o',label='%s (y)' % label) +        plt.errorbar(T,dz['std'],yerr=dz['std_err'],fmt='o',label='%s (z)' % label)          plt.xlabel("Kinetic Energy (MeV)")          plt.ylabel("Position resolution (cm)")          plt.title("Position Resolution") @@ -300,7 +204,7 @@ if __name__ == '__main__':          plt.legend()          plt.figure(5) -        plt.errorbar(T,std_theta,yerr=std_theta_err,fmt='o',label=label) +        plt.errorbar(T,theta['std'],yerr=theta['std_err'],fmt='o',label=label)          plt.xlabel("Kinetic Energy (MeV)")          plt.ylabel("Angular resolution (deg)")          plt.title("Angular Resolution") @@ -308,9 +212,10 @@ if __name__ == '__main__':          plt.legend()          plt.figure(6) -        plt.scatter(a[a['id'] == id]['Te'],a[a['id'] == id]['ratio'],label=label) +        plt.scatter(events['Te'],events['ratio'],label=label)          plt.xlabel("Reconstructed Electron Energy (MeV)")          plt.ylabel(r"Log Likelihood Ratio (e/$\mu$)")          plt.title("Log Likelihood Ratio vs Reconstructed Electron Energy")          plt.legend() +      plt.show() diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs index 670172d..63cbcb7 100755 --- a/utils/submit-grid-jobs +++ b/utils/submit-grid-jobs @@ -24,6 +24,83 @@ import string  from os.path import split, splitext, join, abspath  import uuid  from subprocess import check_call +import os +import sys + +# Next two functions are a backport of the shutil.which() function from Python +# 3.3 from Lib/shutil.py in the CPython code See +# https://github.com/python/cpython/blob/master/Lib/shutil.py. + +# Check that a given file can be accessed with the correct mode. +# Additionally check that `file` is not a directory, as on Windows +# directories pass the os.access check. +def _access_check(fn, mode): +    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn)) + +def which(cmd, mode=os.F_OK | os.X_OK, path=None): +    """Given a command, mode, and a PATH string, return the path which +    conforms to the given mode on the PATH, or None if there is no such +    file. +    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result +    of os.environ.get("PATH"), or can be overridden with a custom search +    path. +    """ +    # If we're given a path with a directory part, look it up directly rather +    # than referring to PATH directories. 
This includes checking relative to the +    # current directory, e.g. ./script +    if os.path.dirname(cmd): +        if _access_check(cmd, mode): +            return cmd +        return None + +    if path is None: +        path = os.environ.get("PATH", None) +        if path is None: +            try: +                path = os.confstr("CS_PATH") +            except (AttributeError, ValueError): +                # os.confstr() or CS_PATH is not available +                path = os.defpath +        # bpo-35755: Don't use os.defpath if the PATH environment variable is +        # set to an empty string + +    # PATH='' doesn't match, whereas PATH=':' looks in the current directory +    if not path: +        return None + +    path = path.split(os.pathsep) + +    if sys.platform == "win32": +        # The current directory takes precedence on Windows. +        curdir = os.curdir +        if curdir not in path: +            path.insert(0, curdir) + +        # PATHEXT is necessary to check on Windows. +        pathext = os.environ.get("PATHEXT", "").split(os.pathsep) +        # See if the given file matches any of the expected path extensions. +        # This will allow us to short circuit when given "python.exe". +        # If it does match, only test that one, otherwise we have to try +        # others. +        if any(cmd.lower().endswith(ext.lower()) for ext in pathext): +            files = [cmd] +        else: +            files = [cmd + ext for ext in pathext] +    else: +        # On other platforms you don't have things like PATHEXT to tell you +        # what file suffixes are executable, so just pass on cmd as-is. +        files = [cmd] + +    seen = set() +    for dir in path: +        normdir = os.path.normcase(dir) +        if not normdir in seen: +            seen.add(normdir) +            for thefile in files: +                name = os.path.join(dir, thefile) +                if _access_check(name, mode): +                    return name +    return None  CONDOR_TEMPLATE = \  """ @@ -42,7 +119,7 @@ log = @log  # The below are good base requirements for first testing jobs on OSG,   #  if you don't have a good idea of memory and disk usage. 
-requirements = (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX") +requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")  request_cpus = 1  request_memory = 1 GB  request_disk = 1 GB @@ -72,14 +149,24 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):      prefix = "%s_%08i_%s" % (root,gtid,ID.hex)      # fit output filename -    output = "%s.txt" % prefix +    output = "%s.hdf5" % prefix      # condor submit filename      condor_submit = "%s.submit" % prefix      # set up the arguments for the template -    executable = join(dir,"fit") +    executable = which("fit") +    wrapper = which("fit-wrapper") + +    if executable is None: +        print("couldn't find fit in path!",file=sys.stderr) +        sys.exit(1) + +    if wrapper is None: +        print("couldn't find fit-wrapper in path!",file=sys.stderr) +        sys.exit(1) +      args = [tail,"-o",output,"--gtid",gtid,"--min-nhit",min_nhit,"--max-particles",max_particles] -    transfer_input_files = ",".join([filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES]) +    transfer_input_files = ",".join([executable,filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])      transfer_output_files = ",".join([output])      condor_error = "%s.error" % prefix @@ -89,7 +176,7 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):      template = MyTemplate(CONDOR_TEMPLATE)      submit_string = template.safe_substitute( -            executable=executable, +            executable=wrapper,              args=" ".join(map(str,args)),              transfer_input_files=transfer_input_files,              transfer_output_files=transfer_output_files, @@ -106,13 +193,13 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):  if __name__ == '__main__':      import argparse -    import subprocess +    from subprocess import check_call      import os +    import tempfile +    import h5py      parser = argparse.ArgumentParser("submit grid jobs")      parser.add_argument("filenames", nargs='+', help="input files") -    parser.add_argument("--dir", type=str, help="fitter directory", required=True) -    parser.add_argument("--dqxx-dir", type=str, help="dqxx directory", required=True)      parser.add_argument("--min-nhit", type=int, help="minimum nhit to fit an event", default=100)      parser.add_argument("--max-particles", type=int, help="maximum number of particles to fit for", default=3)      parser.add_argument("--skip-second-event", action='store_true', help="only fit the first event after a MAST bank", default=False) @@ -123,14 +210,32 @@ if __name__ == '__main__':          home = os.path.expanduser("~")          args.state = join(home,'state.txt') +    if 'SDDM_DATA' not in os.environ: +        print("Please set the SDDM_DATA environment variable to point to the fitter source code location", file=sys.stderr) +        sys.exit(1) + +    dir = os.environ['SDDM_DATA'] + +    if 'DQXX_DIR' not in os.environ: +        print("Please set the DQXX_DIR environment variable to point to the directory with the DQXX files", file=sys.stderr) +        sys.exit(1) + +    dqxx_dir = os.environ['DQXX_DIR'] +      args.state = abspath(args.state)      # get the current working directory      home_dir = os.getcwd()      # get absolute paths since we are going to create a new directory -    args.dir = abspath(args.dir) -    args.dqxx_dir = abspath(args.dqxx_dir) +    
dir = abspath(dir)
+    dqxx_dir = abspath(dqxx_dir)
+
+    zdab_cat = which("zdab-cat")
+
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)

     for filename in args.filenames:
         filename = abspath(filename)
@@ -152,29 +257,28 @@ if __name__ == '__main__':
             continue

         with open(os.devnull, 'w') as f:
+            # Create a temporary file to store the events. The docs recommend
+            # this method instead of mkstemp(), but I think they're equivalent.
+            output = tempfile.NamedTemporaryFile(suffix='.hdf5',delete=False)
+            output.close()
+
             if args.skip_second_event:
-                popen = subprocess.Popen([join(args.dir,"zdab-cat"),"--skip-second-event",filename],stdout=subprocess.PIPE,stderr=f)
+                check_call([zdab_cat,"--skip-second-event",filename,"-o",output.name],stderr=f)
             else:
-                popen = subprocess.Popen([join(args.dir,"zdab-cat"),filename],stdout=subprocess.PIPE,stderr=f)
+                check_call([zdab_cat,filename,"-o",output.name],stderr=f)

         new_dir = "%s_%s" % (root,ID.hex)

         os.mkdir(new_dir)
         os.chdir(new_dir)

-        for data in yaml.load_all(popen.stdout,Loader=Loader):
-            if 'ev' not in data:
-                continue
-
-            for ev in data['ev']:
-                run = ev['run']
-                gtid = ev['gtid']
-                nhit = ev['nhit']
-
-                if nhit >= args.min_nhit:
-                    submit_job(filename, run, gtid, args.dir, args.dqxx_dir, args.min_nhit, args.max_particles)
+        with h5py.File(output.name) as f:
+            for ev in f['ev']:
+                if ev['nhit'] >= args.min_nhit:
+                    submit_job(filename, ev['run'], ev['gtid'], dir, dqxx_dir, args.min_nhit, args.max_particles)

-        popen.wait()
+        # Delete temporary HDF5 file
+        os.unlink(output.name)

         state.append(tail)
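With these changes, submit-grid-jobs no longer takes --dir and --dqxx-dir flags: the fitter source location and the DQXX files come from the SDDM_DATA and DQXX_DIR environment variables, and fit, fit-wrapper, and zdab-cat are located on PATH via the which() backport. A typical invocation might therefore look like this (the directories and the zdab filename here are hypothetical):

    $ export SDDM_DATA=$HOME/sddm/src
    $ export DQXX_DIR=$HOME/dqxx
    $ export PATH=$HOME/local/bin:$PATH
    $ submit-grid-jobs SNO_0000010000_000.zdab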
