From 21491ca1ca2afd6951e9b5b1e74b1c919c602b36 Mon Sep 17 00:00:00 2001 From: tlatorre Date: Thu, 11 Jul 2019 09:42:23 -0500 Subject: switch from YAML output to HDF5 to speed things up --- README | 4 +- src/Makefile | 7 +- src/fit.c | 194 ++++++++++++++++++-------------- src/hdf5_utils.c | 193 ++++++++++++++++++++++++++++++++ src/hdf5_utils.h | 85 ++++++++++++++ src/zdab-cat.c | 117 +++++++++---------- src/zdab_utils.c | 18 +++ src/zdab_utils.h | 32 ++++++ utils/Makefile | 1 + utils/cat-grid-jobs | 163 +++++++++++++++++++-------- utils/fit-wrapper | 7 ++ utils/plot | 171 ++++++++++++++-------------- utils/plot-energy | 294 ++++++++++++++++++++++-------------------------- utils/plot-fit-results | 297 +++++++++++++++++-------------------------------- utils/submit-grid-jobs | 152 +++++++++++++++++++++---- 15 files changed, 1063 insertions(+), 672 deletions(-) create mode 100644 src/hdf5_utils.c create mode 100644 src/hdf5_utils.h create mode 100644 utils/fit-wrapper diff --git a/README b/README index e1b6f60..ce5dbc9 100644 --- a/README +++ b/README @@ -1,10 +1,10 @@ Building ======== -First you have to install the gsl and nlopt libraries. On Fedora you can +First you have to install the gsl, nlopt, and hdf5 libraries. On Fedora you can install these with the following command: - $ yum install gsl gsl-devel NLopt NLopt-devel + $ yum install gsl gsl-devel NLopt NLopt-devel hdf5 hdf5-devel You can also install these packages yourself. For instructions, see below (Installing GSL and Installing NLopt). diff --git a/src/Makefile b/src/Makefile index 99cff5e..e5ca1cd 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') CFLAGS=-fdiagnostics-color -O2 -Wall -g -DSWAP_BYTES -LDLIBS=-fdiagnostics-color -lm -lgsl -lgslcblas -lnlopt_cxx -lstdc++ -lz +LDLIBS=-fdiagnostics-color -lm -lgsl -lgslcblas -lnlopt_cxx -lstdc++ -lz -lhdf5 PREFIX?=$(HOME)/local INSTALL_BIN=$(PREFIX)/bin @@ -32,17 +32,18 @@ test-time-pdf: test-time-pdf.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o test-zebra: test-zebra.o zebra.o pack2b.o -fit: fit.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o quad.o dc.o sort.o util.o +fit: fit.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o quad.o dc.o sort.o util.o hdf5_utils.o test-find-peaks: test-find-peaks.o zebra.o likelihood.o pmt.o vector.o misc.o muon.o optics.o quantum_efficiency.o solid_angle.o pdg.o scattering.o zdab_utils.o pack2b.o sno_charge.o db.o dqxx.o dict.o siphash.o path.o pmt_response.o release.o electron.o proton.o find_peaks.o util.o quad.o calculate-csda-range: calculate-csda-range.o -zdab-cat: zdab-cat.o zebra.o pmt.o vector.o misc.o zdab_utils.o pack2b.o db.o dqxx.o dict.o siphash.o release.o dc.o sort.o util.o +zdab-cat: zdab-cat.o zebra.o pmt.o vector.o misc.o zdab_utils.o pack2b.o db.o dqxx.o dict.o siphash.o release.o dc.o sort.o util.o hdf5_utils.o install: @mkdir -p $(INSTALL_BIN) $(INSTALL) fit $(INSTALL_BIN) + $(INSTALL) zdab-cat $(INSTALL_BIN) clean: rm -f *.o calculate_limits test test-vector test-likelihood fit test-charge test-path calculate-csda-range test-time-pdf test-zebra test-find-peaks Makefile.dep zdab-cat diff --git a/src/fit.c b/src/fit.c index adc6ebc..0360a8f 100644 --- a/src/fit.c +++ b/src/fit.c @@ -47,12 +47,21 @@ #include "electron.h" #include "muon.h" #include "proton.h" +#include "hdf5.h" +#include "hdf5_utils.h" /* Maximum number of fit parameters. Should be at least 4 + 3*MAX_VERTICES. */ #define MAX_PARS 100 + /* Maximum number of peaks to search for in Hough transform. */ #define MAX_NPEAKS 10 +/* Maximum number of events, primary seed particle tracks, and fits. + * + * FIXME: I should write out the events, tracks, and fits as they come up + * instead of storing them until the end. */ +#define MAX_NEVENTS 1000000 + char *GitSHA1(void); char *GitDirty(void); @@ -5869,12 +5878,13 @@ int main(int argc, char **argv) { int i, j, k; zebraFile *f; - zebraBank bmast, bmc, bmcgn, mctk, b; + zebraBank bmast, mc, bmcgn, mctk, b; int rv; EVBank bev; FTPVBank bftpv; FTXKBank bftxk; RSPBank bftxr; + MCBank bmc; MCTKBank bmctk; MCVXBank bmcvx; event ev = {0}; @@ -5882,7 +5892,6 @@ int main(int argc, char **argv) double xopt[MAX_PARS]; char *filename = NULL; char *output = NULL; - FILE *fout = NULL; int skip_second_event = 0; struct timeval tv_start, tv_stop; long long elapsed; @@ -5892,12 +5901,29 @@ int main(int argc, char **argv) int id[MAX_VERTICES]; size_t combos[100]; size_t len; - char tmp[256]; - size_t nhit, min_nhit = 100; + size_t min_nhit = 100; int last_run; char dqxx_file[256]; int32_t gtid = -1; + int nevents = 0; + /* Array of events to write out to HDF5 file. + * + * Note: Declared static since otherwise it will cause a stack overflow. */ + static HDF5Event hdf5_events[MAX_NEVENTS]; + + int nmcgn = 0; + /* Array of primary seed tracks to write out to HDF5 file. + * + * Note: Declared static since otherwise it will cause a stack overflow. */ + static HDF5MCGN hdf5_mcgn[MAX_NEVENTS]; + + int nfits = 0; + /* Array of fit results to write out to HDF5 file. + * + * Note: Declared static since otherwise it will cause a stack overflow. */ + static HDF5Fit hdf5_fits[MAX_NEVENTS]; + for (i = 1; i < argc; i++) { if (strlen(argv[i]) >= 2 && !strncmp(argv[i], "--", 2)) { if (!strcmp(argv[i]+2,"skip-second-event")) { @@ -5953,22 +5979,8 @@ int main(int argc, char **argv) return 1; } - if (output) { - fout = fopen(output, "w"); - - if (!fout) { - fprintf(stderr, "failed to open '%s': %s\n", output, strerror(errno)); - return 1; - } - - fprintf(fout, "git_sha1: %s\n", GitSHA1()); - fprintf(fout, "git_dirty: %s\n", GitDirty()); - fprintf(fout, "data:\n"); - } - if (load_pmt_info()) { zebra_close(f); - if (output) fclose(fout); return 1; } @@ -6038,30 +6050,29 @@ int main(int argc, char **argv) continue; } - if (fout) fprintf(fout, " -\n"); - if (bmast.links[KMAST_MC-1] == 0) goto skip_mc; - rv = zebra_get_bank(f,&bmc,bmast.links[KMAST_MC-1]); + rv = zebra_get_bank(f,&mc,bmast.links[KMAST_MC-1]); if (rv) { fprintf(stderr, "error getting MC bank: %s\n", zebra_err); goto err; } + unpack_mc(mc.data, &bmc); + if (bmast.links[KMC_MCGN-1] == 0) { fprintf(stderr, "MCGN link is zero!\n"); goto err; } - rv = zebra_get_bank(f,&bmcgn,bmc.links[KMC_MCGN-1]); + rv = zebra_get_bank(f,&bmcgn,mc.links[KMC_MCGN-1]); if (rv) { fprintf(stderr, "error getting MCGN bank: %s\n", zebra_err); goto err; } - if (fout) fprintf(fout, " mcgn:\n"); while (1) { if (bmcgn.links[KMCGN_MCTK-1] == 0) { fprintf(stderr, "MCTK link is zero!\n"); @@ -6110,17 +6121,18 @@ int main(int argc, char **argv) unpack_mcvx(b.data, &bmcvx); - if (fout) { - fprintf(fout, " -\n"); - fprintf(fout, " id: %" PRIu32 "\n", bmctk.idp); - fprintf(fout, " energy: %.2f\n", bmctk.ene); - fprintf(fout, " posx: %.2f\n", bmcvx.x); - fprintf(fout, " posy: %.2f\n", bmcvx.y); - fprintf(fout, " posz: %.2f\n", bmcvx.z); - fprintf(fout, " dirx: %.4f\n", bmctk.drx); - fprintf(fout, " diry: %.4f\n", bmctk.dry); - fprintf(fout, " dirz: %.4f\n", bmctk.drz); - fprintf(fout, " time: %.4f\n", bmcvx.tim); + if (output) { + hdf5_mcgn[nmcgn].evn = bmc.evn; + hdf5_mcgn[nmcgn].id = bmctk.idp; + hdf5_mcgn[nmcgn].energy = bmctk.ene; + hdf5_mcgn[nmcgn].x = bmcvx.x; + hdf5_mcgn[nmcgn].y = bmcvx.y; + hdf5_mcgn[nmcgn].z = bmcvx.z; + hdf5_mcgn[nmcgn].dirx = bmctk.drx; + hdf5_mcgn[nmcgn].diry = bmctk.dry; + hdf5_mcgn[nmcgn].dirz = bmctk.drz; + hdf5_mcgn[nmcgn].time = bmcvx.tim; + nmcgn++; } if (bmcgn.next) { @@ -6156,7 +6168,6 @@ skip_mc: } } - if (fout) fprintf(fout, " ev:\n"); while (1) { unpack_ev(b.data, &bev); ev.run = bev.run; @@ -6185,49 +6196,44 @@ skip_mc: if (get_event(f,&ev,&b)) goto skip_event; - nhit = get_nhit(&ev); + if (output) { + hdf5_events[nevents].run = ev.run; + hdf5_events[nevents].evn = bmc.evn; + hdf5_events[nevents].gtr = ev.trigger_time; + hdf5_events[nevents].nhit = ev.nhit; + hdf5_events[nevents].gtid = ev.gtid; + hdf5_events[nevents].trg_type = ev.trigger_type; + hdf5_events[nevents].dc = get_dc_word(&ev, f, &bmast, &b); - if (fout) { - fprintf(fout, " - run: %i\n", ev.run); - fprintf(fout, " gtr: %.0f\n", ev.trigger_time); - fprintf(fout, " nhit: %zu\n", nhit); - fprintf(fout, " gtid: %i\n", ev.gtid); - fprintf(fout, " trg_type: 0x%08x\n", ev.trigger_type); - fprintf(fout, " dc: 0x%08x\n", get_dc_word(&ev, f, &bmast, &b)); - } - - if (fout) { if (get_ftpv(f,&b,&bftpv)) { fprintf(stderr, "%s\n", zdab_err); + hdf5_events[nevents].ftp_x = NAN; + hdf5_events[nevents].ftp_y = NAN; + hdf5_events[nevents].ftp_z = NAN; } else { - fprintf(fout, " ftp:\n"); - fprintf(fout, " x: %.2f\n", bftpv.x); - fprintf(fout, " y: %.2f\n", bftpv.y); - fprintf(fout, " z: %.2f\n", bftpv.z); + hdf5_events[nevents].ftp_x = bftpv.x; + hdf5_events[nevents].ftp_y = bftpv.y; + hdf5_events[nevents].ftp_z = bftpv.z; } - } - if (fout) { if (get_ftxk(f,&b,&bftxk)) { fprintf(stderr, "%s\n", zdab_err); + hdf5_events[nevents].ftk_energy = NAN; } else { - fprintf(fout, " ftk:\n"); - fprintf(fout, " energy: %.2f\n", bftxk.energy); + hdf5_events[nevents].ftk_energy = bftxk.energy; } - } - if (fout) { if (get_rsp(f,&b,&bftxr)) { fprintf(stderr, "%s\n", zdab_err); + hdf5_events[nevents].rsp_energy = NAN; } else { - fprintf(fout, " rsp:\n"); - fprintf(fout, " energy: %.2f\n", bftxr.ene); + hdf5_events[nevents].rsp_energy = bftxr.ene; } - } - if (nhit < min_nhit) goto skip_event; + nevents++; + } - if (fout) fprintf(fout, " fit:\n"); + if (ev.nhit < min_nhit) goto skip_event; fmin_best = nll_best(&ev); @@ -6251,25 +6257,47 @@ skip_mc: elapsed = (tv_stop.tv_sec - tv_start.tv_sec)*1000 + (tv_stop.tv_usec - tv_start.tv_usec)/1000; - if (fout) { - sprintf_particle_string(id,i,tmp); - fprintf(fout, " %s:\n", tmp); - fprintf(fout, " posx: %.2f\n", xopt[0]); - fprintf(fout, " posy: %.2f\n", xopt[1]); - fprintf(fout, " posz: %.2f\n", xopt[2]); - fprintf(fout, " t0: %.2f\n", xopt[3]); - sprintf_yaml_list(xopt+4,i,3,tmp); - fprintf(fout, " energy: %s\n", tmp); - sprintf_yaml_list(xopt+5,i,3,tmp); - fprintf(fout, " theta: %s\n", tmp); - sprintf_yaml_list(xopt+6,i,3,tmp); - fprintf(fout, " phi: %s\n", tmp); - //fprintf(fout, " z1: %.2f\n", xopt[7]); - //fprintf(fout, " z2: %.2f\n", xopt[8]); - fprintf(fout, " fmin: %.2f\n", fmin); - fprintf(fout, " time: %lld\n", elapsed); - fprintf(fout, " psi: %.2f\n", fmin-fmin_best); - fflush(fout); + if (output) { + hdf5_fits[nfits].run = ev.run; + hdf5_fits[nfits].gtid = ev.gtid; + hdf5_fits[nfits].n = i; + hdf5_fits[nfits].x = xopt[0]; + hdf5_fits[nfits].y = xopt[1]; + hdf5_fits[nfits].z = xopt[2]; + hdf5_fits[nfits].t0 = xopt[3]; + hdf5_fits[nfits].id1 = id[0]; + hdf5_fits[nfits].energy1 = xopt[4]; + hdf5_fits[nfits].theta1 = xopt[5]; + hdf5_fits[nfits].phi1 = xopt[6]; + + if (i > 1) { + hdf5_fits[nfits].id2 = id[1]; + hdf5_fits[nfits].energy2 = xopt[7]; + hdf5_fits[nfits].theta2 = xopt[8]; + hdf5_fits[nfits].phi2 = xopt[9]; + } else { + hdf5_fits[nfits].id2 = 0; + hdf5_fits[nfits].energy2 = NAN; + hdf5_fits[nfits].theta2 = NAN; + hdf5_fits[nfits].phi2 = NAN; + } + + if (i > 2) { + hdf5_fits[nfits].id3 = id[2]; + hdf5_fits[nfits].energy3 = xopt[10]; + hdf5_fits[nfits].theta3 = xopt[11]; + hdf5_fits[nfits].phi3 = xopt[12]; + } else { + hdf5_fits[nfits].id3 = 0; + hdf5_fits[nfits].energy3 = NAN; + hdf5_fits[nfits].theta3 = NAN; + hdf5_fits[nfits].phi3 = NAN; + } + + hdf5_fits[nfits].fmin = fmin; + hdf5_fits[nfits].time = elapsed; + hdf5_fits[nfits].psi = fmin-fmin_best; + nfits++; } } } @@ -6292,13 +6320,13 @@ skip_event: end: + if (output) save_output(output, hdf5_events, nevents, hdf5_mcgn, nmcgn, hdf5_fits, nfits); + free_interpolation(); pmt_response_free(); db_free(db); - if (fout) fclose(fout); - zebra_close(f); return 0; @@ -6309,8 +6337,6 @@ err: db_free(db); - if (fout) fclose(fout); - zebra_close(f); return 1; diff --git a/src/hdf5_utils.c b/src/hdf5_utils.c new file mode 100644 index 0000000..05d1002 --- /dev/null +++ b/src/hdf5_utils.c @@ -0,0 +1,193 @@ +#include "hdf5_utils.h" +#include "hdf5.h" +#include /* for strlen() */ + +char *GitSHA1(void); +char *GitDirty(void); + +/* Write out data to an HDF5 file. + * + * This function writes out three datasets to an HDF5 file: + * + * 1. ev: the event dataset + * + * This dataset contains event level information similar to the EV bank in + * SNOMAN. It also contains a data cleaning word and the FTP, FTK, and RSP + * fit values for the position and energy. + * + * 2. mcgn: the primary particle tracks + * + * This dataset contains vertex and track information for all the primary + * particle tracks used to simulate the event. + * + * 3. fits: the fit information + * + * This dataset contains the reconstructed fit information for each event. + * Note that this dataset will in general contain multiple rows for a + * single event since we fit for different particle hypotheses. + * + * Currently the ev and mcgn datasets are fixed size, and the fits dataset is + * chunked and extensible to allow the cat-grid-jobs script to work. + * + */ +int save_output(const char *output, HDF5Event *hdf5_events, size_t nevents, HDF5MCGN *hdf5_mcgn, size_t nmcgn, HDF5Fit *hdf5_fits, size_t nfits) +{ + hid_t hdf5_events_tid; /* File datatype identifier */ + hid_t file, dataset, space; /* Handles */ + herr_t status; + hsize_t dim[1]; /* Dataspace dimensions */ + hid_t hdf5_mcgn_tid; /* File datatype identifier */ + hid_t dataset_mcgn, space_mcgn; /* Handles */ + hsize_t dim_mcgn[1]; /* Dataspace dimensions */ + hid_t hdf5_fits_tid; /* File datatype identifier */ + hid_t dataset_fits, space_fits; /* Handles */ + hsize_t dim_fits[1]; /* Dataspace dimensions */ + hsize_t chunk_dims[1]; /* Dataspace dimensions */ + hid_t dcpl_id; /* Handles */ + hsize_t maxdims[1] = {H5S_UNLIMITED}; + + /* + * Create the data space. + */ + dim[0] = nevents; + space = H5Screate_simple(1, dim, NULL); + dim_mcgn[0] = nmcgn; + space_mcgn = H5Screate_simple(1, dim_mcgn, NULL); + dim_fits[0] = nfits; + space_fits = H5Screate_simple(1, dim_fits, maxdims); + chunk_dims[0] = 10; + dcpl_id = H5Pcreate(H5P_DATASET_CREATE); + H5Pset_chunk(dcpl_id, 1, chunk_dims); + + /* + * Create the file. + */ + file = H5Fcreate(output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + + hid_t attr1, attr2; /* Attribute identifiers */ + hid_t aid1, aid2; /* Attribute dataspace identifiers */ + hid_t atype1, atype2; /* Attribute type */ + + char *git_sha1 = GitSHA1(); + char *git_dirty = GitDirty(); + + /* + * Create string attribute. + */ + aid1 = H5Screate(H5S_SCALAR); + atype1 = H5Tcopy(H5T_C_S1); + H5Tset_size(atype1, strlen(git_sha1)); + H5Tset_strpad(atype1,H5T_STR_NULLTERM); + attr1 = H5Acreate2(file, "git_sha1", atype1, aid1, H5P_DEFAULT, H5P_DEFAULT); + + /* + * Create string attribute. + */ + aid2 = H5Screate(H5S_SCALAR); + atype2 = H5Tcopy(H5T_C_S1); + H5Tset_size(atype2, strlen(git_dirty)); + H5Tset_strpad(atype2,H5T_STR_NULLTERM); + attr2 = H5Acreate2(file, "git_dirty", atype2, aid2, H5P_DEFAULT, H5P_DEFAULT); + + herr_t ret; /* Return value */ + + /* + * Write string attribute. + */ + ret = H5Awrite(attr1, atype1, git_sha1); + ret = H5Awrite(attr2, atype2, git_dirty); + + /* + * Create the memory data type. + */ + hdf5_events_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5Event)); + H5Tinsert(hdf5_events_tid, "run", HOFFSET(HDF5Event, run), H5T_NATIVE_INT); + H5Tinsert(hdf5_events_tid, "evn", HOFFSET(HDF5Event, evn), H5T_NATIVE_INT); + H5Tinsert(hdf5_events_tid, "gtr", HOFFSET(HDF5Event, gtr), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_events_tid, "nhit", HOFFSET(HDF5Event, nhit), H5T_NATIVE_INT); + H5Tinsert(hdf5_events_tid, "gtid", HOFFSET(HDF5Event, gtid), H5T_NATIVE_UINT32); + H5Tinsert(hdf5_events_tid, "trg_type", HOFFSET(HDF5Event, trg_type), H5T_NATIVE_UINT32); + H5Tinsert(hdf5_events_tid, "dc", HOFFSET(HDF5Event, dc), H5T_NATIVE_UINT32); + H5Tinsert(hdf5_events_tid, "ftp_x", HOFFSET(HDF5Event, ftp_x), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_events_tid, "ftp_y", HOFFSET(HDF5Event, ftp_y), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_events_tid, "ftp_z", HOFFSET(HDF5Event, ftp_z), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_events_tid, "ftk_energy", HOFFSET(HDF5Event, ftk_energy), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_events_tid, "rsp_energy", HOFFSET(HDF5Event, rsp_energy), H5T_NATIVE_DOUBLE); + + hdf5_mcgn_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5MCGN)); + H5Tinsert(hdf5_mcgn_tid, "run", HOFFSET(HDF5MCGN, run), H5T_NATIVE_INT); + H5Tinsert(hdf5_mcgn_tid, "evn", HOFFSET(HDF5MCGN, evn), H5T_NATIVE_INT); + H5Tinsert(hdf5_mcgn_tid, "id", HOFFSET(HDF5MCGN, id), H5T_NATIVE_INT); + H5Tinsert(hdf5_mcgn_tid, "energy", HOFFSET(HDF5MCGN, energy), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_mcgn_tid, "x", HOFFSET(HDF5MCGN, x), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_mcgn_tid, "y", HOFFSET(HDF5MCGN, y), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_mcgn_tid, "z", HOFFSET(HDF5MCGN, z), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_mcgn_tid, "dirx", HOFFSET(HDF5MCGN, dirx), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_mcgn_tid, "diry", HOFFSET(HDF5MCGN, diry), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_mcgn_tid, "dirz", HOFFSET(HDF5MCGN, dirz), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_mcgn_tid, "time", HOFFSET(HDF5MCGN, time), H5T_NATIVE_DOUBLE); + + hdf5_fits_tid = H5Tcreate(H5T_COMPOUND, sizeof(HDF5Fit)); + H5Tinsert(hdf5_fits_tid, "run", HOFFSET(HDF5Fit, run), H5T_NATIVE_INT); + H5Tinsert(hdf5_fits_tid, "gtid", HOFFSET(HDF5Fit, gtid), H5T_NATIVE_UINT32); + H5Tinsert(hdf5_fits_tid, "n", HOFFSET(HDF5Fit, n), H5T_NATIVE_INT); + H5Tinsert(hdf5_fits_tid, "x", HOFFSET(HDF5Fit, x), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "y", HOFFSET(HDF5Fit, y), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "z", HOFFSET(HDF5Fit, z), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "t0", HOFFSET(HDF5Fit, t0), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "id1", HOFFSET(HDF5Fit, id1), H5T_NATIVE_INT); + H5Tinsert(hdf5_fits_tid, "energy1", HOFFSET(HDF5Fit, energy1), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "theta1", HOFFSET(HDF5Fit, theta1), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "phi1", HOFFSET(HDF5Fit, phi1), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "id2", HOFFSET(HDF5Fit, id2), H5T_NATIVE_INT); + H5Tinsert(hdf5_fits_tid, "energy2", HOFFSET(HDF5Fit, energy2), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "theta2", HOFFSET(HDF5Fit, theta2), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "phi2", HOFFSET(HDF5Fit, phi2), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "id3", HOFFSET(HDF5Fit, id3), H5T_NATIVE_INT); + H5Tinsert(hdf5_fits_tid, "energy3", HOFFSET(HDF5Fit, energy3), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "theta3", HOFFSET(HDF5Fit, theta3), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "phi3", HOFFSET(HDF5Fit, phi3), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "fmin", HOFFSET(HDF5Fit, fmin), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "time", HOFFSET(HDF5Fit, time), H5T_NATIVE_DOUBLE); + H5Tinsert(hdf5_fits_tid, "psi", HOFFSET(HDF5Fit, psi), H5T_NATIVE_DOUBLE); + + /* + * Create the dataset. + */ + dataset = H5Dcreate2(file, "ev", hdf5_events_tid, space, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + dataset_mcgn = H5Dcreate2(file, "mcgn", hdf5_mcgn_tid, space_mcgn, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + dataset_fits = H5Dcreate2(file, "fits", hdf5_fits_tid, space_fits, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + + /* + * Wtite data to the dataset; + */ + if (nevents) + status = H5Dwrite(dataset, hdf5_events_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_events); + if (nmcgn) + status = H5Dwrite(dataset_mcgn, hdf5_mcgn_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_mcgn); + if (nfits) + status = H5Dwrite(dataset_fits, hdf5_fits_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, hdf5_fits); + + /* + * Release resources + */ + H5Sclose(aid1); + H5Sclose(aid2); + H5Tclose(atype1); + H5Tclose(atype2); + H5Aclose(attr1); + H5Aclose(attr2); + H5Tclose(hdf5_events_tid); + H5Tclose(hdf5_mcgn_tid); + H5Tclose(hdf5_fits_tid); + H5Sclose(space); + H5Sclose(space_mcgn); + H5Sclose(space_fits); + H5Pclose(dcpl_id); + H5Dclose(dataset); + H5Dclose(dataset_mcgn); + H5Dclose(dataset_fits); + H5Fclose(file); + + return 0; +} diff --git a/src/hdf5_utils.h b/src/hdf5_utils.h new file mode 100644 index 0000000..4f6ad32 --- /dev/null +++ b/src/hdf5_utils.h @@ -0,0 +1,85 @@ +#ifndef HDF5_UTILS +#define HDF5_UTILS + +#include +#include /* for size_t */ + +/* Struct to hold information for each triggered event (similar to the EV bank, + * but with some reconstructed information). + * + * This is the struct that is written out to the 'ev' dataset in the HDF5 file. + * + * Note: If there is no FTP, FTK, or RSP info in the zdab file these values + * will be set to nan. */ +typedef struct HDF5Event { + int run; + int evn; + double gtr; + int nhit; + uint32_t gtid; + uint32_t trg_type; + uint32_t dc; + double ftp_x; + double ftp_y; + double ftp_z; + double ftk_energy; + double rsp_energy; +} HDF5Event; + +/* Struct to hold information for each primary seed tracks that is written out + * as the `mcgn` dataset in the HDF5 file. To figure out which track + * corresponds to a given GTID you can match them up based on the evn column, + * which is filled with the MC event number in both datasets. */ +typedef struct HDF5MCGN { + int run; + int evn; + int id; + double energy; + double x; + double y; + double z; + double dirx; + double diry; + double dirz; + double time; +} HDF5MCGN; + +/* Struct to hold information for each fit that is written out + * as the `fits` dataset in the HDF5 file. + * + * Note that the energy2, theta2, phi2 fields will only be filled in for fits + * with 2 tracks and similarly for energy3, theta3, and phi3 for 3 track fits. */ +typedef struct HDF5Fit { + int run; + uint32_t gtid; + double x; + double y; + double z; + double t0; + + /* Number of tracks. */ + int n; + + int id1; + double energy1; + double theta1; + double phi1; + + int id2; + double energy2; + double theta2; + double phi2; + + int id3; + double energy3; + double theta3; + double phi3; + + double fmin; + double time; + double psi; +} HDF5Fit; + +int save_output(const char *output, HDF5Event *hdf5_events, size_t nevents, HDF5MCGN *hdf5_mcgn, size_t nmcgn, HDF5Fit *hdf5_fits, size_t nfits); + +#endif diff --git a/src/zdab-cat.c b/src/zdab-cat.c index 5b9bb3d..278236c 100644 --- a/src/zdab-cat.c +++ b/src/zdab-cat.c @@ -26,6 +26,15 @@ #include /* for errno */ #include "release.h" #include "dc.h" +#include "hdf5.h" +#include +#include "hdf5_utils.h" + +/* Maximum number of events, primary seed particle tracks, and fits. + * + * FIXME: I should write out the events, tracks, and fits as they come up + * instead of storing them until the end. */ +#define MAX_NEVENTS 1000000 #define EV_RECORD 0x45562020 // 'EV ' #define MCTK_RECORD 0x4d43544b // 'MCTK' @@ -47,22 +56,25 @@ int main(int argc, char **argv) { int i; zebraFile *f; - zebraBank bmast, bmc, bmcgn, mctk, b; + zebraBank bmast, mc, bmcgn, mctk, b; int rv; EVBank bev; FTPVBank bftpv; FTXKBank bftxk; RSPBank bftxr; + MCBank bmc; MCTKBank bmctk; MCVXBank bmcvx; event ev = {0}; char *filename = NULL; char *output = NULL; - FILE *fout = stdout; int skip_second_event = 0; - size_t nhit; int last_run; char dqxx_file[256]; + int nevents = 0; + static HDF5Event hdf5_events[MAX_NEVENTS]; + int nmcgn = 0; + static HDF5MCGN hdf5_mcgn[MAX_NEVENTS]; for (i = 1; i < argc; i++) { if (strlen(argv[i]) >= 2 && !strncmp(argv[i], "--", 2)) { @@ -96,23 +108,8 @@ int main(int argc, char **argv) return 1; } - if (output) { - fout = fopen(output, "w"); - - if (!fout) { - fprintf(stderr, "failed to open '%s': %s\n", output, strerror(errno)); - return 1; - } - } - - if (fout) { - fprintf(fout, "git_sha1: %s\n", GitSHA1()); - fprintf(fout, "git_dirty: %s\n", GitDirty()); - } - if (load_pmt_info()) { zebra_close(f); - if (output) fclose(fout); return 1; } @@ -160,11 +157,9 @@ int main(int argc, char **argv) continue; } - if (fout) fprintf(fout, "---\n"); - if (bmast.links[KMAST_MC-1] == 0) goto skip_mc; - rv = zebra_get_bank(f,&bmc,bmast.links[KMAST_MC-1]); + rv = zebra_get_bank(f,&mc,bmast.links[KMAST_MC-1]); if (rv) { fprintf(stderr, "error getting MC bank: %s\n", zebra_err); @@ -176,14 +171,15 @@ int main(int argc, char **argv) goto err; } - rv = zebra_get_bank(f,&bmcgn,bmc.links[KMC_MCGN-1]); + unpack_mc(mc.data, &bmc); + + rv = zebra_get_bank(f,&bmcgn,mc.links[KMC_MCGN-1]); if (rv) { fprintf(stderr, "error getting MCGN bank: %s\n", zebra_err); goto err; } - if (fout) fprintf(fout, " mcgn:\n"); while (1) { if (bmcgn.links[KMCGN_MCTK-1] == 0) { fprintf(stderr, "MCTK link is zero!\n"); @@ -232,17 +228,18 @@ int main(int argc, char **argv) unpack_mcvx(b.data, &bmcvx); - if (fout) { - fprintf(fout, " -\n"); - fprintf(fout, " id: %" PRIu32 "\n", bmctk.idp); - fprintf(fout, " energy: %.2f\n", bmctk.ene); - fprintf(fout, " posx: %.2f\n", bmcvx.x); - fprintf(fout, " posy: %.2f\n", bmcvx.y); - fprintf(fout, " posz: %.2f\n", bmcvx.z); - fprintf(fout, " dirx: %.4f\n", bmctk.drx); - fprintf(fout, " diry: %.4f\n", bmctk.dry); - fprintf(fout, " dirz: %.4f\n", bmctk.drz); - fprintf(fout, " time: %.4f\n", bmcvx.tim); + if (output) { + hdf5_mcgn[nmcgn].evn = bmc.evn; + hdf5_mcgn[nmcgn].id = bmctk.idp; + hdf5_mcgn[nmcgn].energy = bmctk.ene; + hdf5_mcgn[nmcgn].x = bmcvx.x; + hdf5_mcgn[nmcgn].y = bmcvx.y; + hdf5_mcgn[nmcgn].z = bmcvx.z; + hdf5_mcgn[nmcgn].dirx = bmctk.drx; + hdf5_mcgn[nmcgn].diry = bmctk.dry; + hdf5_mcgn[nmcgn].dirz = bmctk.drz; + hdf5_mcgn[nmcgn].time = bmcvx.tim; + nmcgn++; } if (bmcgn.next) { @@ -278,7 +275,6 @@ skip_mc: } } - if (fout) fprintf(fout, " ev:\n"); while (1) { unpack_ev(b.data, &bev); ev.run = bev.run; @@ -305,44 +301,41 @@ skip_mc: rv = get_event(f,&ev,&b); - nhit = get_nhit(&ev); + if (output) { + hdf5_events[nevents].run = ev.run; + hdf5_events[nevents].evn = bmc.evn; + hdf5_events[nevents].gtr = ev.trigger_time; + hdf5_events[nevents].nhit = ev.nhit; + hdf5_events[nevents].gtid = ev.gtid; + hdf5_events[nevents].trg_type = ev.trigger_type; + hdf5_events[nevents].dc = get_dc_word(&ev, f, &bmast, &b); - if (fout) { - fprintf(fout, " - run: %i\n", ev.run); - fprintf(fout, " gtr: %.0f\n", ev.trigger_time); - fprintf(fout, " nhit: %zu\n", nhit); - fprintf(fout, " gtid: %i\n", ev.gtid); - fprintf(fout, " trg_type: 0x%08x\n", ev.trigger_type); - fprintf(fout, " dc: 0x%08x\n", get_dc_word(&ev, f, &bmast, &b)); - } - - if (fout) { if (get_ftpv(f,&b,&bftpv)) { fprintf(stderr, "%s\n", zdab_err); + hdf5_events[nevents].ftp_x = NAN; + hdf5_events[nevents].ftp_y = NAN; + hdf5_events[nevents].ftp_z = NAN; } else { - fprintf(fout, " ftp:\n"); - fprintf(fout, " x: %.2f\n", bftpv.x); - fprintf(fout, " y: %.2f\n", bftpv.y); - fprintf(fout, " z: %.2f\n", bftpv.z); + hdf5_events[nevents].ftp_x = bftpv.x; + hdf5_events[nevents].ftp_y = bftpv.y; + hdf5_events[nevents].ftp_z = bftpv.z; } - } - if (fout) { if (get_ftxk(f,&b,&bftxk)) { fprintf(stderr, "%s\n", zdab_err); + hdf5_events[nevents].ftk_energy = NAN; } else { - fprintf(fout, " ftk:\n"); - fprintf(fout, " energy: %.2f\n", bftxk.energy); + hdf5_events[nevents].ftk_energy = bftxk.energy; } - } - if (fout) { if (get_rsp(f,&b,&bftxr)) { fprintf(stderr, "%s\n", zdab_err); + hdf5_events[nevents].rsp_energy = NAN; } else { - fprintf(fout, " rsp:\n"); - fprintf(fout, " energy: %.2f\n", bftxr.ene); + hdf5_events[nevents].rsp_energy = bftxr.ene; } + + nevents++; } /* Note the origin link for the first EV bank points back to the @@ -359,9 +352,11 @@ skip_mc: } } - db_free(db); + if (output) { + save_output(output, hdf5_events, nevents, hdf5_mcgn, nmcgn, NULL, 0); + } - if (fout) fclose(fout); + db_free(db); zebra_close(f); @@ -370,8 +365,6 @@ skip_mc: err: db_free(db); - if (fout) fclose(fout); - zebra_close(f); return 1; diff --git a/src/zdab_utils.c b/src/zdab_utils.c index 716503c..416bd79 100644 --- a/src/zdab_utils.c +++ b/src/zdab_utils.c @@ -580,6 +580,24 @@ void unpack_ftk(uint32_t *data, FTKBank *b) unpack((uint8_t *) (data+25), "f",&b->spare5); } +void unpack_mc(uint32_t *data, MCBank *b) +{ + unpack((uint8_t *) data, "l",&b->jdy); + unpack((uint8_t *) (data+1), "l",&b->ut1); + unpack((uint8_t *) (data+2), "l",&b->ut2); + unpack((uint8_t *) (data+3), "l",&b->dte); + unpack((uint8_t *) (data+4), "l",&b->hmsc); + unpack((uint8_t *) (data+5), "l",&b->seed1); + unpack((uint8_t *) (data+6), "l",&b->seed2); + unpack((uint8_t *) (data+7), "l",&b->seed_num); + unpack((uint8_t *) (data+8), "l",&b->dtp); + unpack((uint8_t *) (data+9), "f",&b->mcver); + unpack((uint8_t *) (data+10), "l",&b->evn); + unpack((uint8_t *) (data+11), "F",&b->gtr); + unpack((uint8_t *) (data+13), "l",&b->num_ge_err); + unpack((uint8_t *) (data+14), "f",&b->pmt_eff); +} + void unpack_mcgn(uint32_t *data, MCGNBank *b) { unpack((uint8_t *) data, "l",&b->id); diff --git a/src/zdab_utils.h b/src/zdab_utils.h index d4411d9..043e185 100644 --- a/src/zdab_utils.h +++ b/src/zdab_utils.h @@ -378,6 +378,37 @@ typedef struct FTKBank { float spare5; } FTKBank; +typedef struct MCBank { + /* Julian date. */ + uint32_t jdy; + /* Universal time seconds. */ + uint32_t ut1; + /* Universal time nanoseconds. */ + uint32_t ut2; + /* Date (format: yyyymmdd). */ + uint32_t dte; + /* Time (format: hhmmsscc - cc is centisec). */ + uint32_t hmsc; + /* First random number seed. */ + uint32_t seed1; + /* Second random number seed. */ + uint32_t seed2; + /* Random number number. */ + uint32_t seed_num; + /* Data type. */ + uint32_t dtp; + /* SNOMAN MC version number. */ + float mcver; + /* Monte Carlo event number. */ + uint32_t evn; + /* Generation time in nsec (first word). */ + double gtr; + /* Number of suppressed Cerenkov photon tracking errors. */ + uint32_t num_ge_err; + /* The current PMT collection efficiency. */ + float pmt_eff; +} MCBank; + typedef struct MCGNBank { /* Particle id. */ uint32_t id; @@ -638,6 +669,7 @@ void unpack_ftpt(uint32_t *data, FTPTBank *b); void unpack_ftpv(uint32_t *data, FTPVBank *b); void unpack_ftxk(uint32_t *data, FTXKBank *b); void unpack_ftk(uint32_t *data, FTKBank *b); +void unpack_mc(uint32_t *data, MCBank *b); void unpack_mcgn(uint32_t *data, MCGNBank *b); void unpack_mcvx(uint32_t *data, MCVXBank *b); void unpack_mctk(uint32_t *data, MCTKBank *b); diff --git a/utils/Makefile b/utils/Makefile index ef21a09..011a723 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -13,5 +13,6 @@ install: $(INSTALL) plot-likelihood $(INSTALL_BIN) $(INSTALL) submit-grid-jobs $(INSTALL_BIN) $(INSTALL) gen-dark-matter $(INSTALL_BIN) + $(INSTALL) fit-wrapper $(INSTALL_BIN) .PHONY: install diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs index ba06290..83a8004 100755 --- a/utils/cat-grid-jobs +++ b/utils/cat-grid-jobs @@ -23,7 +23,7 @@ as YAML to stdout. Example: - $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt + $ cat-grid-jobs ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt """ @@ -33,80 +33,145 @@ try: from yaml import CLoader as Loader, CDumper as Dumper except ImportError: from yaml import Loader, Dumper +import os +import sys + +# Check that a given file can be accessed with the correct mode. +# Additionally check that `file` is not a directory, as on Windows +# directories pass the os.access check. +def _access_check(fn, mode): + return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn)) + +def which(cmd, mode=os.F_OK | os.X_OK, path=None): + """Given a command, mode, and a PATH string, return the path which + conforms to the given mode on the PATH, or None if there is no such + file. + `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result + of os.environ.get("PATH"), or can be overridden with a custom search + path. + """ + # If we're given a path with a directory part, look it up directly rather + # than referring to PATH directories. This includes checking relative to the + # current directory, e.g. ./script + if os.path.dirname(cmd): + if _access_check(cmd, mode): + return cmd + return None + + if path is None: + path = os.environ.get("PATH", None) + if path is None: + try: + path = os.confstr("CS_PATH") + except (AttributeError, ValueError): + # os.confstr() or CS_PATH is not available + path = os.defpath + # bpo-35755: Don't use os.defpath if the PATH environment variable is + # set to an empty string + + # PATH='' doesn't match, whereas PATH=':' looks in the current directory + if not path: + return None + + path = path.split(os.pathsep) + + if sys.platform == "win32": + # The current directory takes precedence on Windows. + curdir = os.curdir + if curdir not in path: + path.insert(0, curdir) + + # PATHEXT is necessary to check on Windows. + pathext = os.environ.get("PATHEXT", "").split(os.pathsep) + # See if the given file matches any of the expected path extensions. + # This will allow us to short circuit when given "python.exe". + # If it does match, only test that one, otherwise we have to try + # others. + if any(cmd.lower().endswith(ext.lower()) for ext in pathext): + files = [cmd] + else: + files = [cmd + ext for ext in pathext] + else: + # On other platforms you don't have things like PATHEXT to tell you + # what file suffixes are executable, so just pass on cmd as-is. + files = [cmd] + + seen = set() + for dir in path: + normdir = os.path.normcase(dir) + if not normdir in seen: + seen.add(normdir) + for thefile in files: + name = os.path.join(dir, thefile) + if _access_check(name, mode): + return name + return None + +# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +def print_warning(msg): + print(bcolors.FAIL + msg + bcolors.ENDC,file=sys.stderr) if __name__ == '__main__': import argparse import matplotlib.pyplot as plt import numpy as np - import subprocess + from subprocess import check_call from os.path import join import os import sys + import h5py parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file") - parser.add_argument("--dir", type=str, help="fitter directory", required=True) parser.add_argument("zdab", help="zdab input file") parser.add_argument("filenames", nargs='+', help="input files") + parser.add_argument("-o", "--output", type=str, help="output filename", required=True) args = parser.parse_args() - fit_results = {} + zdab_cat = which("zdab-cat") - # First we create a dictionary mapping (run, gtid) -> fit results. - for filename in args.filenames: - with open(filename) as f: - data = yaml.load(f,Loader=Loader) + if zdab_cat is None: + print("couldn't find zdab-cat in path!",file=sys.stderr) + sys.exit(1) - if data is None: - continue - - for event in data['data']: - if event['ev'] is None: - continue - - # if the ev branch is filled in, it means the event was fit - for ev in event['ev']: - # add the git SHA1 hash to the fit results since technically it - # could be different than the version in zdab-cat - ev['fit']['git_sha1'] = data['git_sha1'] - ev['fit']['git_dirty'] = data['git_dirty'] - fit_results[(ev['run'],ev['gtid'])] = ev['fit'] - - # Next we get the full event list along with the data cleaning word, FTP + # First we get the full event list along with the data cleaning word, FTP # position, FTK, and RSP energy from the original zdab and then add the fit # results. # # Note: We send stderr to /dev/null since there can be a lot of warnings # about PMT types and fit results with open(os.devnull, 'w') as f: - popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f) + check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f) total_events = 0 events_with_fit = 0 - - doc = {'data': []} - - for data in yaml.load_all(popen.stdout,Loader=Loader): - if 'ev' not in data: - doc.update(data) - continue - - for ev in data['ev']: - run = ev['run'] - gtid = ev['gtid'] - - if (run,gtid) in fit_results: - ev['fit'] = fit_results[(run,gtid)] - events_with_fit += 1 - - total_events += 1 - - doc['data'].append(data) - - popen.wait() + total_fits = 0 + + with h5py.File(args.output,"a") as fout: + total_events = fout['ev'].shape[0] + for filename in args.filenames: + with h5py.File(filename) as f: + # Check to see if the git sha1 match + if fout.attrs['git_sha1'] != f.attrs['git_sha1']: + print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename)) + # get fits which match up with the events + valid_fits = f['fits'][np.isin(f['fits'][:][['run','gtid']],fout['ev'][:][['run','gtid']])] + # Add the fit results + fout['fits'].resize((fout['fits'].shape[0]+valid_fits.shape[0],)) + fout['fits'][-valid_fits.shape[0]:] = valid_fits + events_with_fit += len(np.unique(valid_fits[['run','gtid']])) + total_fits += len(np.unique(f['fits']['run','gtid'])) # Print out number of fit results that were added. Hopefully, this will # make it easy to catch an error if, for example, this gets run with a # mismatching zdab and fit results - print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr) - - print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper)) + print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr) diff --git a/utils/fit-wrapper b/utils/fit-wrapper new file mode 100644 index 0000000..940b807 --- /dev/null +++ b/utils/fit-wrapper @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Simple wrapper script to run jobs on the grid + +module load hdf5 +module load zlib +./fit "$@" diff --git a/utils/plot b/utils/plot index ac93eda..1bac500 100755 --- a/utils/plot +++ b/utils/plot @@ -74,6 +74,8 @@ if __name__ == '__main__': import argparse import matplotlib.pyplot as plt import numpy as np + import h5py + import pandas as pd parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") @@ -81,121 +83,113 @@ if __name__ == '__main__': for filename in args.filenames: print(filename) - with open(filename) as f: - data = yaml.load(f.read(),Loader=Loader) - - dx = [] - dy = [] - dz = [] - dT = [] - thetas = [] - likelihood_ratio = [] - t_electron = [] - t_muon = [] - psi = [] - for event in data['data']: - # get the particle ID - id = event['mcgn'][0]['id'] - - if 'ev' not in event: - continue - - if 'fit' not in event['ev'][0]: - # if nhit < 100 we don't fit the event - continue - - if id not in event['ev'][0]['fit']: - continue - - fit = event['ev'][0]['fit'] - - mass = SNOMAN_MASS[id] - # for some reason it's the *second* track which seems to contain the - # initial energy - true_energy = event['mcgn'][0]['energy'] - # The MCTK bank has the particle's total energy (except for neutrons) - # so we need to convert it into kinetic energy - ke = true_energy - mass - energy = fit[id]['energy'] - dT.append(energy-ke) - true_posx = event['mcgn'][0]['posx'] - posx = fit[id]['posx'] - dx.append(posx-true_posx) - true_posy = event['mcgn'][0]['posy'] - posy = fit[id]['posy'] - dy.append(posy-true_posy) - true_posz = event['mcgn'][0]['posz'] - posz = fit[id]['posz'] - dz.append(posz-true_posz) - dirx = event['mcgn'][0]['dirx'] - diry = event['mcgn'][0]['diry'] - dirz = event['mcgn'][0]['dirz'] - true_dir = [dirx,diry,dirz] - true_dir = np.array(true_dir)/np.linalg.norm(true_dir) - theta = fit[id]['theta'] - phi = fit[id]['phi'] - dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)] - dir = np.array(dir)/np.linalg.norm(dir) - thetas.append(np.degrees(np.arccos(np.dot(true_dir,dir)))) - if IDP_E_MINUS in fit and IDP_MU_MINUS in fit: - fmin_electron = fit[IDP_E_MINUS]['fmin'] - fmin_muon = fit[IDP_MU_MINUS]['fmin'] - likelihood_ratio.append(fmin_muon-fmin_electron) - if IDP_E_MINUS in fit: - t_electron.append(fit[IDP_E_MINUS]['time']) - if IDP_MU_MINUS in fit: - t_muon.append(fit[IDP_MU_MINUS]['time']) - - if 'nhit' in event['ev'][0]: - nhit = event['ev'][0]['nhit'] - psi.append(fit[id]['psi']/nhit) - - if len(t_electron) < 2 or len(t_muon) < 2: + + with h5py.File(filename) as f: + ev = pd.read_hdf(filename, "ev") + mcgn = pd.read_hdf(filename, "mcgn") + fits = pd.read_hdf(filename, "fits") + + # get rid of 2nd events like Michel electrons + ev = ev.sort_values(['run','gtid']).groupby(['evn'],as_index=False).first() + + # Now, we merge all three datasets together to produce a single + # dataframe. To do so, we join the ev dataframe with the mcgn frame + # on the evn column, and then join with the fits on the run and + # gtid columns. + # + # At the end we will have a single dataframe with one row for each + # fit, i.e. it will look like: + # + # >>> data + # run gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ... + # + # Before merging, we prefix the primary seed track table with mcgn_ + # and the fit table with fit_ just to make things easier. + + # Prefix track and fit frames + mcgn = mcgn.add_prefix("mcgn_") + fits = fits.add_prefix("fit_") + + # merge ev and mcgn on evn + data = ev.merge(mcgn,left_on=['evn'],right_on=['mcgn_evn']) + # merge data and fits on run and gtid + data = data.merge(fits,left_on=['run','gtid'],right_on=['fit_run','fit_gtid']) + + # calculate true kinetic energy + mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values] + data['T'] = data['mcgn_energy'].values - mass + data['dx'] = data['fit_x'].values - data['mcgn_x'].values + data['dy'] = data['fit_y'].values - data['mcgn_y'].values + data['dz'] = data['fit_z'].values - data['mcgn_z'].values + data['dT'] = data['fit_energy1'].values - data['T'].values + + true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze() + dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']), + np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']), + np.cos(data['fit_theta1']))).squeeze() + + data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1))) + + # only select fits which have at least 2 fits + data = data.groupby(['run','gtid']).filter(lambda x: len(x) > 1) + data_true = data[data['fit_id1'] == data['mcgn_id']] + data_e = data[data['fit_id1'] == IDP_E_MINUS] + data_mu = data[data['fit_id1'] == IDP_MU_MINUS] + + data_true = data_true.set_index(['run','gtid']) + data_e = data_e.set_index(['run','gtid']) + data_mu = data_mu.set_index(['run','gtid']) + + data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin'] + data_true['te'] = data_e['fit_time'] + data_true['tm'] = data_mu['fit_time'] + data_true['Te'] = data_e['fit_energy1'] + + if len(data_true) < 2: continue - mean, mean_error, std, std_error = get_stats(dT) + mean, mean_error, std, std_error = get_stats(data_true.dT) print("dT = %.2g +/- %.2g" % (mean, mean_error)) print("std(dT) = %.2g +/- %.2g" % (std, std_error)) - mean, mean_error, std, std_error = get_stats(dx) + mean, mean_error, std, std_error = get_stats(data_true.dx) print("dx = %4.2g +/- %.2g" % (mean, mean_error)) print("std(dx) = %4.2g +/- %.2g" % (std, std_error)) - mean, mean_error, std, std_error = get_stats(dy) + mean, mean_error, std, std_error = get_stats(data_true.dy) print("dy = %4.2g +/- %.2g" % (mean, mean_error)) print("std(dy) = %4.2g +/- %.2g" % (std, std_error)) - mean, mean_error, std, std_error = get_stats(dz) + mean, mean_error, std, std_error = get_stats(data_true.dz) print("dz = %4.2g +/- %.2g" % (mean, mean_error)) print("std(dz) = %4.2g +/- %.2g" % (std, std_error)) - mean, mean_error, std, std_error = get_stats(thetas) + mean, mean_error, std, std_error = get_stats(data_true.theta) print("std(theta) = %4.2g +/- %.2g" % (std, std_error)) plt.figure(1) - plot_hist(dT, label=filename) + plot_hist(data_true.dT, label=filename) plt.xlabel("Kinetic Energy difference (MeV)") plt.figure(2) - plot_hist(dx, label=filename) + plot_hist(data_true.dx, label=filename) plt.xlabel("X Position difference (cm)") plt.figure(3) - plot_hist(dy, label=filename) + plot_hist(data_true.dy, label=filename) plt.xlabel("Y Position difference (cm)") plt.figure(4) - plot_hist(dz, label=filename) + plot_hist(data_true.dz, label=filename) plt.xlabel("Z Position difference (cm)") plt.figure(5) - plot_hist(thetas, label=filename) + plot_hist(data_true.theta, label=filename) plt.xlabel(r"$\theta$ (deg)") plt.figure(6) - plot_hist(likelihood_ratio, label=filename) + plot_hist(data_true.ratio, label=filename) plt.xlabel(r"Log Likelihood Ratio ($e/\mu$)") plt.figure(7) - plot_hist(np.array(t_electron)/1e3/60.0, label=filename) + plot_hist(data_true.te/1e3/60.0, label=filename) plt.xlabel(r"Electron Fit time (minutes)") plt.figure(8) - plot_hist(np.array(t_muon)/1e3/60.0, label=filename) + plot_hist(data_true.tm/1e3/60.0, label=filename) plt.xlabel(r"Muon Fit time (minutes)") - if len(psi): - plt.figure(9) - plot_hist(psi, label=filename) - plt.xlabel(r"$\Psi$/Nhit") + plt.figure(9) + plot_hist(data_true.fit_psi/data_true.nhit, label=filename) + plt.xlabel(r"$\Psi$/Nhit") plot_legend(1) plot_legend(2) @@ -205,6 +199,5 @@ if __name__ == '__main__': plot_legend(6) plot_legend(7) plot_legend(8) - if len(psi): - plot_legend(9) + plot_legend(9) plt.show() diff --git a/utils/plot-energy b/utils/plot-energy index a0274b7..4a8521b 100755 --- a/utils/plot-energy +++ b/utils/plot-energy @@ -107,140 +107,69 @@ if __name__ == '__main__': import numpy as np import pandas as pd import sys + import h5py parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") args = parser.parse_args() - events = [] - fit_results = [] - - for filename in args.filenames: - print(filename) - with open(filename) as f: - data = yaml.load(f,Loader=Loader) - - for i, event in enumerate(data['data']): - for ev in event['ev']: - events.append(( - ev['run'], - ev['gtr'], - ev['nhit'], - ev['gtid'], - ev['dc'], - ev['ftp']['x'] if 'ftp' in ev else np.nan, - ev['ftp']['y'] if 'ftp' in ev else np.nan, - ev['ftp']['z'] if 'ftp' in ev else np.nan, - ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan, - )) - - if 'fit' not in ev: - continue - - for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]: - # FIXME: Should I just store the particle ids in the YAML - # output as a list of particle ids instead of a single - # integer? - ids = map(int,chunks(str(id),2)) - energy = 0.0 - skip = False - for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])): - energy += ke + SNOMAN_MASS[i] - - # This is a bit of a hack. It appears that many times - # the fit will actually do much better by including a - # very low energy electron or muon. I believe the - # reason for this is that of course my likelihood - # function is not perfect (for example, I don't include - # the correct angular distribution for Rayleigh - # scattered light), and so the fitter often wants to - # add a very low energy electron or muon to fix things. - # - # Ideally I would fix the likelihood function, but for - # now we just discard any fit results which have a very - # low energy electron or muon. - if len(ids) > 1 and i == 20 and ke < 20.0: - skip = True - - if len(ids) > 1 and i == 22 and ke < 200.0: - skip = True - - if skip: - continue - - # Calculate the approximate Ockham factor. - # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes - # - # Note: This is a really approximate form by assuming that - # the shape of the likelihood space is equal to the average - # uncertainty in the different parameters. - w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi)) - - fit_results.append(( - ev['run'], - ev['gtid'], - id, - fit_result['posx'], - fit_result['posy'], - fit_result['posz'], - fit_result['t0'], - energy, - np.atleast_1d(fit_result['theta'])[0], - np.atleast_1d(fit_result['phi'])[0], - fit_result['fmin'] - w, - fit_result['psi']/ev['nhit'])) - - # create a dataframe - # note: we have to first create a numpy structured array since there is no - # way to pass a list of data types to the DataFrame constructor. See - # https://github.com/pandas-dev/pandas/issues/4464 - array = np.array(fit_results, - dtype=[('run',np.int), # run number - ('gtid',np.int), # gtid - ('id',np.int), # particle id - ('x', np.double), # x - ('y',np.double), # y - ('z',np.double), # z - ('t0',np.double), # t0 - ('ke',np.double), # kinetic energy - ('theta',np.double), # direction of 1st particle - ('phi',np.double), # direction of 2nd particle - ('fmin',np.double), # negative log likelihood - ('psi',np.double)] # goodness of fit parameter - ) - df = pd.DataFrame.from_records(array) - - array = np.array(events, - dtype=[('run',np.int), # run number - ('gtr',np.double), # 50 MHz clock in ns - ('nhit',np.int), # number of PMTs hit - ('gtid',np.int), # gtid - ('dc',np.int), # data cleaning word - ('ftpx',np.double), # FTP fitter X position - ('ftpy',np.double), # FTP fitter Y position - ('ftpz',np.double), # FTP fitter Z position - ('rsp_energy',np.double)] # RSP energy - ) - df_ev = pd.DataFrame.from_records(array) + ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames]) + fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames]) + + # This is a bit of a hack. It appears that many times the fit will + # actually do much better by including a very low energy electron or + # muon. I believe the reason for this is that of course my likelihood + # function is not perfect (for example, I don't include the correct + # angular distribution for Rayleigh scattered light), and so the fitter + # often wants to add a very low energy electron or muon to fix things. + # + # Ideally I would fix the likelihood function, but for now we just + # discard any fit results which have a very low energy electron or + # muon. + # + # FIXME: Test this since query() is new to pandas + fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))') + fits = fits.query('not (n > 1 and ((id2 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))') + + # Calculate the approximate Ockham factor. + # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes + # + # Note: This is a really approximate form by assuming that the shape of + # the likelihood space is equal to the average uncertainty in the + # different parameters. + fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi)) + + # Note: we index on the left hand site with loc to avoid a copy error + # + # See https://www.dataquest.io/blog/settingwithcopywarning/ + fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2']) + fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3']) + + fits['fmin'] = fits['fmin'] - fits['w'] + + fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit'] + fits['ke'] = fits['energy1'] + fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000 + fits['theta'] = fits['theta1'] # Make sure events are in order. We use run number and GTID here which # should be in time order as well except for bit flips in the GTID # This is important for later when we look at time differences in the 50 # MHz clock. We should perhaps do a check here based on the 10 MHz clock to # make sure things are in order - df_ev = df_ev.sort_values(by=['run','gtid']) + ev = ev.sort_values(by=['run','gtid']) - print("number of events = %i" % len(df_ev)) + print("number of events = %i" % len(ev)) # First, do basic data cleaning which is done for all events. - df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0] + ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0] - print("number of events after data cleaning = %i" % len(df_ev)) + print("number of events after data cleaning = %i" % len(ev)) # Now, we select events tagged by the muon tag which should tag only # external muons. We keep the sample of muons since it's needed later to # identify Michel electrons and to apply the muon follower cut - muons = df_ev[(df_ev.dc & DC_MUON) != 0] + muons = ev[(ev.dc & DC_MUON) != 0] print("number of muons = %i" % len(muons)) @@ -251,12 +180,12 @@ if __name__ == '__main__': # # Should I use 10 MHz clock here since the time isn't critical and then I # don't have to worry about 50 MHz clock rollover? - prompt = df_ev[df_ev.nhit >= 100] + prompt = ev[ev.nhit >= 100] prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6)) # Michel electrons and neutrons can be any event which is not a prompt # event - follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]]) + follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]]) # Apply the prompt mask prompt = prompt[prompt_mask] @@ -275,39 +204,78 @@ if __name__ == '__main__': if prompt_plus_muons.size and follower.size: # require Michel events to pass more of the SNO data cleaning cuts michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS] + michel = michel[michel.nhit >= 100] - # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them - # The 800 nanoseconds cut comes from Richie's thesis. He also mentions - # that the In Time Channel Spread Cut is very effective at cutting - # electrons events caused by muons, so I should implement that - michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \ - (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \ - (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)] + + # Accept events which had a muon more than 800 nanoseconds but less + # than 20 microseconds before them. The 800 nanoseconds cut comes from + # Richie's thesis. He also mentions that the In Time Channel Spread Cut + # is very effective at cutting electrons events caused by muons, so I + # should implement that. + # + # Note: We currently don't look across run boundaries. This should be a + # *very* small effect, and the logic to do so would be very complicated + # since I would have to deal with 50 MHz clock rollovers, etc. + # + # Do a simple python for loop here over the runs since we create + # temporary arrays with shape (michel.size,prompt_plus_muons.size) + # which could be too big for the full dataset. + # + # There might be some clever way to do this without the loop in Pandas, + # but I don't know how. + michel_sum = None + for run, michel_run in michel.groupby(['run']): + prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run] + michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \ + (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)] + + if michel_sum is None: + michel_sum = michel_run + else: + michel_sum = michel_sum.append(michel_run) + + michel = michel_sum else: michel = pd.DataFrame(columns=follower.columns) if prompt.size and follower.size: # neutron followers have to obey stricter set of data cleaning cuts neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS] - neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)] - r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2) + neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)] + r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2) neutron = neutron[r < AV_RADIUS] neutron = neutron[neutron.rsp_energy > 4.0] + # neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt) # FIXME: need to deal with 50 MHz clock rollover - neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \ - (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \ - (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1) - df_atm = prompt[neutron_mask] - prompt = prompt[~neutron_mask] + prompt_sum = None + atm_sum = None + for run, prompt_run in prompt.groupby(['run']): + neutron_run = neutron[neutron['run'] == run] + + neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \ + (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1) + + if prompt_sum is None: + prompt_sum = prompt_run[~neutron_mask] + else: + prompt_sum = prompt_sum.append(prompt_run[~neutron_mask]) + + if atm_sum is None: + atm_sum = prompt_run[neutron_mask] + else: + atm_sum = atm_sum.append(prompt_run[neutron_mask]) + + atm = atm_sum + prompt = prompt_sum else: - df_atm = pd.DataFrame(columns=prompt.columns) + atm = pd.DataFrame(columns=prompt.columns) print("number of events after neutron follower cut = %i" % len(prompt)) # Get rid of muon events in our main event sample prompt = prompt[(prompt.dc & DC_MUON) == 0] - df_atm = df_atm[(df_atm.dc & DC_MUON) == 0] + atm = atm[(atm.dc & DC_MUON) == 0] print("number of events after muon cut = %i" % len(prompt)) @@ -317,52 +285,52 @@ if __name__ == '__main__': prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \ (prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \ (prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)] - df_atm = df_atm[~np.any((df_atm.run.values == muons.run.values[:,np.newaxis]) & \ - (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \ - (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)] + atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \ + (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \ + (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)] # Check to see if there are any events with missing fit information - df_atm_ra = df_atm[['run','gtid']].to_records(index=False) + atm_ra = atm[['run','gtid']].to_records(index=False) muons_ra = muons[['run','gtid']].to_records(index=False) prompt_ra = prompt[['run','gtid']].to_records(index=False) michel_ra = michel[['run','gtid']].to_records(index=False) - df_ra = df[['run','gtid']].to_records(index=False) + fits_ra = fits[['run','gtid']].to_records(index=False) - if np.count_nonzero(~np.isin(df_atm_ra,df_ra)): - print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra))) + if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)): + print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra))) - if np.count_nonzero(~np.isin(muons_ra,df_ra)): - print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra))) + if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)): + print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra))) - if np.count_nonzero(~np.isin(prompt_ra,df_ra)): - print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra))) + if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)): + print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra))) - if np.count_nonzero(~np.isin(michel_ra,df_ra)): - print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra))) + if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)): + print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra))) # Now, we merge the event info with the fitter info. # # Note: This means that the dataframe now contains multiple rows for each # event, one for each fit hypothesis. - df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid']) - muons = pd.merge(df,muons,how='inner',on=['run','gtid']) - michel = pd.merge(df,michel,how='inner',on=['run','gtid']) - df = pd.merge(df,prompt,how='inner',on=['run','gtid']) + atm = pd.merge(fits,atm,how='inner',on=['run','gtid']) + muons = pd.merge(fits,muons,how='inner',on=['run','gtid']) + michel = pd.merge(fits,michel,how='inner',on=['run','gtid']) + prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid']) # get rid of events which don't have a fit - nan = np.isnan(df.fmin.values) + nan = np.isnan(prompt.fmin.values) if np.count_nonzero(nan): - print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid']))) + print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid']))) - df = df[~nan] + prompt = prompt[~nan] - nan_atm = np.isnan(df_atm.fmin.values) + nan_atm = np.isnan(atm.fmin.values) if np.count_nonzero(nan_atm): - print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid']))) + print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid']))) - df_atm = df_atm[~nan_atm] + atm = atm[~nan_atm] nan_muon = np.isnan(muons.fmin.values) @@ -379,24 +347,24 @@ if __name__ == '__main__': michel = michel[~nan_michel] # get the best fit - df = df.sort_values('fmin').groupby(['run','gtid']).first() - df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first() + prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first() + atm = atm.sort_values('fmin').groupby(['run','gtid']).first() michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first() muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first() muons = muons[muons.id == 22] # require r < 6 meters - df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS] - df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS] + prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS] + atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS] - print("number of events after radius cut = %i" % len(df)) + print("number of events after radius cut = %i" % len(prompt)) # Note: Need to design and apply a psi based cut here plt.figure() - plot_hist(df,"Without Neutron Follower") + plot_hist(prompt,"Without Neutron Follower") plt.figure() - plot_hist(df_atm,"With Neutron Follower") + plot_hist(atm,"With Neutron Follower") plt.figure() plot_hist(michel_best_fit,"Michel Electrons") plt.figure() diff --git a/utils/plot-fit-results b/utils/plot-fit-results index 773a0dc..7115b81 100755 --- a/utils/plot-fit-results +++ b/utils/plot-fit-results @@ -15,11 +15,6 @@ # this program. If not, see . from __future__ import print_function, division -import yaml -try: - from yaml import CLoader as Loader -except ImportError: - from yaml.loader import SafeLoader as Loader import numpy as np from scipy.stats import iqr from matplotlib.lines import Line2D @@ -71,228 +66,137 @@ def get_stats(x): error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std) return mean, std/np.sqrt(n), std, error +def std_err(x): + x = x.dropna() + mean = np.mean(x) + std = np.std(x) + n = len(x) + if n == 0: + return np.nan + u4 = np.mean((x-mean)**4) + error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std) + return error + if __name__ == '__main__': import argparse import matplotlib.pyplot as plt import numpy as np + import h5py + import pandas as pd parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") args = parser.parse_args() - events = [] - - for filename in args.filenames: - print(filename) - with open(filename) as f: - data = yaml.load(f.read(),Loader=Loader) - - a = np.ma.empty(len(data['data']), - dtype=[('id',np.int), # particle id - ('T', np.double), # true energy - ('dx',np.double), # dx - ('dy',np.double), # dy - ('dz',np.double), # dz - ('dT',np.double), # dT - ('theta',np.double), # theta - ('ratio',np.double), # likelihood ratio - ('te',np.double), # time electron - ('tm',np.double), # time muon - ('Te',np.double)] # electron energy - ) - - for i, event in enumerate(data['data']): - # get the particle ID - id = event['mcgn'][0]['id'] - - a[i]['id'] = id - - if 'fit' not in event['ev'][0]: - # if nhit < 100 we don't fit the event - continue - - if id not in event['ev'][0]['fit']: - a[i] = np.ma.masked - continue - - mass = SNOMAN_MASS[id] - # for some reason it's the *second* track which seems to contain the - # initial energy - true_energy = event['mcgn'][0]['energy'] - # The MCTK bank has the particle's total energy (except for neutrons) - # so we need to convert it into kinetic energy - ke = true_energy - mass - - fit = event['ev'][0]['fit'] - - a[i]['T'] = ke - energy = fit[id]['energy'] - a[i]['dT'] = energy-ke - - # store the fit position residuals - true_posx = event['mcgn'][0]['posx'] - posx = fit[id]['posx'] - a[i]['dx'] = posx-true_posx - true_posy = event['mcgn'][0]['posy'] - posy = fit[id]['posy'] - a[i]['dy'] = posy-true_posy - true_posz = event['mcgn'][0]['posz'] - posz = fit[id]['posz'] - a[i]['dz'] = posz-true_posz - - # compute the angle between the fit direction and the true - # direction - dirx = event['mcgn'][0]['dirx'] - diry = event['mcgn'][0]['diry'] - dirz = event['mcgn'][0]['dirz'] - true_dir = [dirx,diry,dirz] - true_dir = np.array(true_dir)/np.linalg.norm(true_dir) - theta = fit[id]['theta'] - phi = fit[id]['phi'] - dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)] - dir = np.array(dir)/np.linalg.norm(dir) - a[i]['theta'] = np.degrees(np.arccos(np.dot(true_dir,dir))) - - # compute the log likelihood ratio - if IDP_E_MINUS in fit and IDP_MU_MINUS in fit: - fmin_electron = fit[IDP_E_MINUS]['fmin'] - fmin_muon = fit[IDP_MU_MINUS]['fmin'] - a[i]['ratio'] = fmin_muon-fmin_electron - else: - a[i]['ratio'] = np.ma.masked - - # store the time taken for electron and muon fits - if IDP_E_MINUS in fit: - a[i]['te'] = fit[IDP_E_MINUS]['time'] - a[i]['Te'] = fit[IDP_E_MINUS]['energy'] - else: - a[i]['te'] = np.ma.masked - a[i]['Te'] = np.ma.masked - if IDP_MU_MINUS in fit: - a[i]['tm'] = fit[IDP_MU_MINUS]['time'] - else: - a[i]['tm'] = np.ma.masked - - events.append(a) - - a = np.ma.concatenate(events) - + # Read in all the data. + # + # Note: We have to add the filename as a column to each dataset since this + # script is used to analyze MC data which all has the same run number. + ev = pd.concat([pd.read_hdf(filename, "ev").assign(filename=filename) for filename in args.filenames]) + fits = pd.concat([pd.read_hdf(filename, "fits").assign(filename=filename) for filename in args.filenames]) + mcgn = pd.concat([pd.read_hdf(filename, "mcgn").assign(filename=filename) for filename in args.filenames]) + + # get rid of 2nd events like Michel electrons + ev = ev.sort_values(['run','gtid']).groupby(['filename','evn'],as_index=False).first() + + # Now, we merge all three datasets together to produce a single + # dataframe. To do so, we join the ev dataframe with the mcgn frame + # on the evn column, and then join with the fits on the run and + # gtid columns. + # + # At the end we will have a single dataframe with one row for each + # fit, i.e. it will look like: + # + # >>> data + # run gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ... + # + # Before merging, we prefix the primary seed track table with mcgn_ + # and the fit table with fit_ just to make things easier. + + # Prefix track and fit frames + mcgn = mcgn.add_prefix("mcgn_") + fits = fits.add_prefix("fit_") + + # merge ev and mcgn on evn + data = ev.merge(mcgn,left_on=['filename','evn'],right_on=['mcgn_filename','mcgn_evn']) + # merge data and fits on run and gtid + data = data.merge(fits,left_on=['filename','run','gtid'],right_on=['fit_filename','fit_run','fit_gtid']) + + # calculate true kinetic energy + mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values] + data['T'] = data['mcgn_energy'].values - mass + data['dx'] = data['fit_x'].values - data['mcgn_x'].values + data['dy'] = data['fit_y'].values - data['mcgn_y'].values + data['dz'] = data['fit_z'].values - data['mcgn_z'].values + data['dT'] = data['fit_energy1'].values - data['T'].values + + true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze() + dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']), + np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']), + np.cos(data['fit_theta1']))).squeeze() + + data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1))) + + # only select fits which have at least 2 fits + data = data.groupby(['filename','run','gtid']).filter(lambda x: len(x) > 1) + data_true = data[data['fit_id1'] == data['mcgn_id']] + data_e = data[data['fit_id1'] == IDP_E_MINUS] + data_mu = data[data['fit_id1'] == IDP_MU_MINUS] + + data_true = data_true.set_index(['filename','run','gtid']) + data_e = data_e.set_index(['filename','run','gtid']) + data_mu = data_mu.set_index(['filename','run','gtid']) + + data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin'] + data_true['te'] = data_e['fit_time'] + data_true['tm'] = data_mu['fit_time'] + data_true['Te'] = data_e['fit_energy1'] + + # 100 bins between 50 MeV and 1 GeV bins = np.arange(50,1000,100) - stats_array = np.ma.empty(len(bins)-1, - dtype=[('T', np.double), - ('dT', np.double), - ('dT_err', np.double), - ('dT_std', np.double), - ('dT_std_err', np.double), - ('dx', np.double), - ('dx_err', np.double), - ('dx_std', np.double), - ('dx_std_err', np.double), - ('dy', np.double), - ('dy_err', np.double), - ('dy_std', np.double), - ('dy_std_err', np.double), - ('dz', np.double), - ('dz_err', np.double), - ('dz_std', np.double), - ('dz_std_err', np.double), - ('theta', np.double), - ('theta_err', np.double), - ('theta_std', np.double), - ('theta_std_err', np.double)]) - - stats = {IDP_E_MINUS: stats_array, IDP_MU_MINUS: stats_array.copy()} - - for id in stats: - electron_events = a[a['id'] == id] - - for i, (ablah, b) in enumerate(zip(bins[:-1], bins[1:])): - events = electron_events[(electron_events['T'] >= ablah) & (electron_events['T'] < b)] - - if len(events) < 2: - stats[id][i] = np.ma.masked - continue - - stats[id][i]['T'] = (ablah+b)/2 - mean, mean_error, std, std_error = get_stats(events['dT'].compressed()) - stats[id][i]['dT'] = mean - stats[id][i]['dT_err'] = mean_error - stats[id][i]['dT_std'] = std - stats[id][i]['dT_std_err'] = std_error - mean, mean_error, std, std_error = get_stats(events['dx'].compressed()) - stats[id][i]['dx'] = mean - stats[id][i]['dx_err'] = mean_error - stats[id][i]['dx_std'] = std - stats[id][i]['dx_std_err'] = std_error - mean, mean_error, std, std_error = get_stats(events['dy'].compressed()) - stats[id][i]['dy'] = mean - stats[id][i]['dy_err'] = mean_error - stats[id][i]['dy_std'] = std - stats[id][i]['dy_std_err'] = std_error - mean, mean_error, std, std_error = get_stats(events['dz'].compressed()) - stats[id][i]['dz'] = mean - stats[id][i]['dz_err'] = mean_error - stats[id][i]['dz_std'] = std - stats[id][i]['dz_std_err'] = std_error - mean, mean_error, std, std_error = get_stats(events['theta'].compressed()) - stats[id][i]['theta'] = mean - stats[id][i]['theta_err'] = mean_error - stats[id][i]['theta_std'] = std - stats[id][i]['theta_std_err'] = std_error - - for id in stats: + for id in [IDP_E_MINUS, IDP_MU_MINUS]: + events = data_true[data_true['mcgn_id'] == id] + + pd_bins = pd.cut(events['T'],bins) + + dT = events.groupby(pd_bins)['dT'].agg(['mean','sem','std',std_err]) + dx = events.groupby(pd_bins)['dx'].agg(['mean','sem','std',std_err]) + dy = events.groupby(pd_bins)['dy'].agg(['mean','sem','std',std_err]) + dz = events.groupby(pd_bins)['dz'].agg(['mean','sem','std',std_err]) + theta = events.groupby(pd_bins)['theta'].agg(['mean','sem','std',std_err]) + label = 'Muon' if id == IDP_MU_MINUS else 'Electron' - T = stats[id]['T'] - dT = stats[id]['dT'] - dT_err = stats[id]['dT_err'] - std_dT = stats[id]['dT_std'] - std_dT_err = stats[id]['dT_std_err'] - dx = stats[id]['dx'] - dx_err = stats[id]['dx_err'] - std_dx = stats[id]['dx_std'] - std_dx_err = stats[id]['dx_std_err'] - dy = stats[id]['dy'] - dy_err = stats[id]['dy_err'] - std_dy = stats[id]['dy_std'] - std_dy_err = stats[id]['dy_std_err'] - dz = stats[id]['dz'] - dz_err = stats[id]['dz_err'] - std_dz = stats[id]['dz_std'] - std_dz_err = stats[id]['dz_std_err'] - theta = stats[id]['theta'] - theta_err = stats[id]['theta_err'] - std_theta = stats[id]['theta_std'] - std_theta_err = stats[id]['theta_std_err'] + T = (bins[1:] + bins[:-1])/2 plt.figure(1) - plt.errorbar(T,dT*100/T,yerr=dT_err*100/T,fmt='o',label=label) + plt.errorbar(T,dT['mean']*100/T,yerr=dT['sem']*100/T,fmt='o',label=label) plt.xlabel("Kinetic Energy (MeV)") plt.ylabel("Energy bias (%)") plt.title("Energy Bias") plt.legend() plt.figure(2) - plt.errorbar(T,std_dT*100/T,yerr=std_dT_err*100/T,fmt='o',label=label) + plt.errorbar(T,dT['std']*100/T,yerr=dT['std_err']*100/T,fmt='o',label=label) plt.xlabel("Kinetic Energy (MeV)") plt.ylabel("Energy resolution (%)") plt.title("Energy Resolution") plt.legend() plt.figure(3) - plt.errorbar(T,dx,yerr=dx_err,fmt='o',label='%s (x)' % label) - plt.errorbar(T,dy,yerr=dy_err,fmt='o',label='%s (y)' % label) - plt.errorbar(T,dz,yerr=dz_err,fmt='o',label='%s (z)' % label) + plt.errorbar(T,dx['mean'],yerr=dx['sem'],fmt='o',label='%s (x)' % label) + plt.errorbar(T,dy['mean'],yerr=dy['sem'],fmt='o',label='%s (y)' % label) + plt.errorbar(T,dz['mean'],yerr=dz['sem'],fmt='o',label='%s (z)' % label) plt.xlabel("Kinetic Energy (MeV)") plt.ylabel("Position bias (cm)") plt.title("Position Bias") plt.legend() plt.figure(4) - plt.errorbar(T,std_dx,yerr=std_dx_err,fmt='o',label='%s (x)' % label) - plt.errorbar(T,std_dy,yerr=std_dy_err,fmt='o',label='%s (y)' % label) - plt.errorbar(T,std_dz,yerr=std_dz_err,fmt='o',label='%s (z)' % label) + plt.errorbar(T,dx['std'],yerr=dx['std_err'],fmt='o',label='%s (x)' % label) + plt.errorbar(T,dy['std'],yerr=dy['std_err'],fmt='o',label='%s (y)' % label) + plt.errorbar(T,dz['std'],yerr=dz['std_err'],fmt='o',label='%s (z)' % label) plt.xlabel("Kinetic Energy (MeV)") plt.ylabel("Position resolution (cm)") plt.title("Position Resolution") @@ -300,7 +204,7 @@ if __name__ == '__main__': plt.legend() plt.figure(5) - plt.errorbar(T,std_theta,yerr=std_theta_err,fmt='o',label=label) + plt.errorbar(T,theta['std'],yerr=theta['std_err'],fmt='o',label=label) plt.xlabel("Kinetic Energy (MeV)") plt.ylabel("Angular resolution (deg)") plt.title("Angular Resolution") @@ -308,9 +212,10 @@ if __name__ == '__main__': plt.legend() plt.figure(6) - plt.scatter(a[a['id'] == id]['Te'],a[a['id'] == id]['ratio'],label=label) + plt.scatter(events['Te'],events['ratio'],label=label) plt.xlabel("Reconstructed Electron Energy (MeV)") plt.ylabel(r"Log Likelihood Ratio (e/$\mu$)") plt.title("Log Likelihood Ratio vs Reconstructed Electron Energy") plt.legend() + plt.show() diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs index 670172d..63cbcb7 100755 --- a/utils/submit-grid-jobs +++ b/utils/submit-grid-jobs @@ -24,6 +24,83 @@ import string from os.path import split, splitext, join, abspath import uuid from subprocess import check_call +import os +import sys + +# Next two functions are a backport of the shutil.which() function from Python +# 3.3 from Lib/shutil.py in the CPython code See +# https://github.com/python/cpython/blob/master/Lib/shutil.py. + +# Check that a given file can be accessed with the correct mode. +# Additionally check that `file` is not a directory, as on Windows +# directories pass the os.access check. +def _access_check(fn, mode): + return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn)) + +def which(cmd, mode=os.F_OK | os.X_OK, path=None): + """Given a command, mode, and a PATH string, return the path which + conforms to the given mode on the PATH, or None if there is no such + file. + `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result + of os.environ.get("PATH"), or can be overridden with a custom search + path. + """ + # If we're given a path with a directory part, look it up directly rather + # than referring to PATH directories. This includes checking relative to the + # current directory, e.g. ./script + if os.path.dirname(cmd): + if _access_check(cmd, mode): + return cmd + return None + + if path is None: + path = os.environ.get("PATH", None) + if path is None: + try: + path = os.confstr("CS_PATH") + except (AttributeError, ValueError): + # os.confstr() or CS_PATH is not available + path = os.defpath + # bpo-35755: Don't use os.defpath if the PATH environment variable is + # set to an empty string + + # PATH='' doesn't match, whereas PATH=':' looks in the current directory + if not path: + return None + + path = path.split(os.pathsep) + + if sys.platform == "win32": + # The current directory takes precedence on Windows. + curdir = os.curdir + if curdir not in path: + path.insert(0, curdir) + + # PATHEXT is necessary to check on Windows. + pathext = os.environ.get("PATHEXT", "").split(os.pathsep) + # See if the given file matches any of the expected path extensions. + # This will allow us to short circuit when given "python.exe". + # If it does match, only test that one, otherwise we have to try + # others. + if any(cmd.lower().endswith(ext.lower()) for ext in pathext): + files = [cmd] + else: + files = [cmd + ext for ext in pathext] + else: + # On other platforms you don't have things like PATHEXT to tell you + # what file suffixes are executable, so just pass on cmd as-is. + files = [cmd] + + seen = set() + for dir in path: + normdir = os.path.normcase(dir) + if not normdir in seen: + seen.add(normdir) + for thefile in files: + name = os.path.join(dir, thefile) + if _access_check(name, mode): + return name + return None CONDOR_TEMPLATE = \ """ @@ -42,7 +119,7 @@ log = @log # The below are good base requirements for first testing jobs on OSG, # if you don't have a good idea of memory and disk usage. -requirements = (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX") +requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX") request_cpus = 1 request_memory = 1 GB request_disk = 1 GB @@ -72,14 +149,24 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles): prefix = "%s_%08i_%s" % (root,gtid,ID.hex) # fit output filename - output = "%s.txt" % prefix + output = "%s.hdf5" % prefix # condor submit filename condor_submit = "%s.submit" % prefix # set up the arguments for the template - executable = join(dir,"fit") + executable = which("fit") + wrapper = which("fit-wrapper") + + if executable is None: + print("couldn't find fit in path!",file=sys.stderr) + sys.exit(1) + + if wrapper is None: + print("couldn't find fit-wrapper in path!",file=sys.stderr) + sys.exit(1) + args = [tail,"-o",output,"--gtid",gtid,"--min-nhit",min_nhit,"--max-particles",max_particles] - transfer_input_files = ",".join([filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES]) + transfer_input_files = ",".join([executable,filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES]) transfer_output_files = ",".join([output]) condor_error = "%s.error" % prefix @@ -89,7 +176,7 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles): template = MyTemplate(CONDOR_TEMPLATE) submit_string = template.safe_substitute( - executable=executable, + executable=wrapper, args=" ".join(map(str,args)), transfer_input_files=transfer_input_files, transfer_output_files=transfer_output_files, @@ -106,13 +193,13 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles): if __name__ == '__main__': import argparse - import subprocess + from subprocess import check_call import os + import tempfile + import h5py parser = argparse.ArgumentParser("submit grid jobs") parser.add_argument("filenames", nargs='+', help="input files") - parser.add_argument("--dir", type=str, help="fitter directory", required=True) - parser.add_argument("--dqxx-dir", type=str, help="dqxx directory", required=True) parser.add_argument("--min-nhit", type=int, help="minimum nhit to fit an event", default=100) parser.add_argument("--max-particles", type=int, help="maximum number of particles to fit for", default=3) parser.add_argument("--skip-second-event", action='store_true', help="only fit the first event after a MAST bank", default=False) @@ -123,14 +210,32 @@ if __name__ == '__main__': home = os.path.expanduser("~") args.state = join(home,'state.txt') + if 'SDDM_DATA' not in os.environ: + print("Please set the SDDM_DATA environment variable to point to the fitter source code location", file=sys.stderr) + sys.exit(1) + + dir = os.environ['SDDM_DATA'] + + if 'DQXX_DIR' not in os.environ: + print("Please set the DQXX_DIR environment variable to point to the directory with the DQXX files", file=sys.stderr) + sys.exit(1) + + dqxx_dir = os.environ['DQXX_DIR'] + args.state = abspath(args.state) # get the current working directory home_dir = os.getcwd() # get absolute paths since we are going to create a new directory - args.dir = abspath(args.dir) - args.dqxx_dir = abspath(args.dqxx_dir) + dir = abspath(dir) + dqxx_dir = abspath(dqxx_dir) + + zdab_cat = which("zdab-cat") + + if zdab_cat is None: + print("couldn't find zdab-cat in path!",file=sys.stderr) + sys.exit(1) for filename in args.filenames: filename = abspath(filename) @@ -152,29 +257,28 @@ if __name__ == '__main__': continue with open(os.devnull, 'w') as f: + # Create a temporary file to store the events. Docs recommended + # this method instead of mkstemp(), but I think they're the same. + output = tempfile.NamedTemporaryFile(suffix='.hdf5',delete=False) + output.close() + if args.skip_second_event: - popen = subprocess.Popen([join(args.dir,"zdab-cat"),"--skip-second-event",filename],stdout=subprocess.PIPE,stderr=f) + check_call([zdab_cat,"--skip-second-event",filename,"-o",output.name],stderr=f) else: - popen = subprocess.Popen([join(args.dir,"zdab-cat"),filename],stdout=subprocess.PIPE,stderr=f) + check_call([zdab_cat,filename,"-o",output.name],stderr=f) new_dir = "%s_%s" % (root,ID.hex) os.mkdir(new_dir) os.chdir(new_dir) - for data in yaml.load_all(popen.stdout,Loader=Loader): - if 'ev' not in data: - continue - - for ev in data['ev']: - run = ev['run'] - gtid = ev['gtid'] - nhit = ev['nhit'] - - if nhit >= args.min_nhit: - submit_job(filename, run, gtid, args.dir, args.dqxx_dir, args.min_nhit, args.max_particles) + with h5py.File(output.name) as f: + for ev in f['ev']: + if ev['nhit'] >= args.min_nhit: + submit_job(filename, ev['run'], ev['gtid'], dir, dqxx_dir, args.min_nhit, args.max_particles) - popen.wait() + # Delete temporary HDF5 file + os.unlink(output.name) state.append(tail) -- cgit