Diffstat (limited to 'utils')
-rw-r--r--  utils/Makefile         |   1
-rwxr-xr-x  utils/cat-grid-jobs    | 163
-rw-r--r--  utils/fit-wrapper      |   7
-rwxr-xr-x  utils/plot             | 171
-rwxr-xr-x  utils/plot-energy      | 294
-rwxr-xr-x  utils/plot-fit-results | 297
-rwxr-xr-x  utils/submit-grid-jobs | 152

7 files changed, 564 insertions, 521 deletions
diff --git a/utils/Makefile b/utils/Makefile
index ef21a09..011a723 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -13,5 +13,6 @@ install:
 	$(INSTALL) plot-likelihood $(INSTALL_BIN)
 	$(INSTALL) submit-grid-jobs $(INSTALL_BIN)
 	$(INSTALL) gen-dark-matter $(INSTALL_BIN)
+	$(INSTALL) fit-wrapper $(INSTALL_BIN)
 
 .PHONY: install
diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs
index ba06290..83a8004 100755
--- a/utils/cat-grid-jobs
+++ b/utils/cat-grid-jobs
@@ -23,7 +23,7 @@ as YAML to stdout.
 
 Example:
 
-    $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
+    $ cat-grid-jobs ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
 
 """
@@ -33,80 +33,145 @@ try:
     from yaml import CLoader as Loader, CDumper as Dumper
 except ImportError:
     from yaml import Loader, Dumper
+import os
+import sys
+
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `file` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+    """Given a command, mode, and a PATH string, return the path which
+    conforms to the given mode on the PATH, or None if there is no such
+    file.
+    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result
+    of os.environ.get("PATH"), or can be overridden with a custom search
+    path.
+    """
+    # If we're given a path with a directory part, look it up directly rather
+    # than referring to PATH directories. This includes checking relative to the
+    # current directory, e.g. ./script
+    if os.path.dirname(cmd):
+        if _access_check(cmd, mode):
+            return cmd
+        return None
+
+    if path is None:
+        path = os.environ.get("PATH", None)
+        if path is None:
+            try:
+                path = os.confstr("CS_PATH")
+            except (AttributeError, ValueError):
+                # os.confstr() or CS_PATH is not available
+                path = os.defpath
+        # bpo-35755: Don't use os.defpath if the PATH environment variable is
+        # set to an empty string
+
+    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+    if not path:
+        return None
+
+    path = path.split(os.pathsep)
+
+    if sys.platform == "win32":
+        # The current directory takes precedence on Windows.
+        curdir = os.curdir
+        if curdir not in path:
+            path.insert(0, curdir)
+
+        # PATHEXT is necessary to check on Windows.
+        pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+        # See if the given file matches any of the expected path extensions.
+        # This will allow us to short circuit when given "python.exe".
+        # If it does match, only test that one, otherwise we have to try
+        # others.
+        if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+            files = [cmd]
+        else:
+            files = [cmd + ext for ext in pathext]
+    else:
+        # On other platforms you don't have things like PATHEXT to tell you
+        # what file suffixes are executable, so just pass on cmd as-is.
+        files = [cmd]
+
+    seen = set()
+    for dir in path:
+        normdir = os.path.normcase(dir)
+        if not normdir in seen:
+            seen.add(normdir)
+            for thefile in files:
+                name = os.path.join(dir, thefile)
+                if _access_check(name, mode):
+                    return name
+    return None
+
+# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+def print_warning(msg):
+    print(bcolors.FAIL + msg + bcolors.ENDC,file=sys.stderr)
 
 if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
-    import subprocess
+    from subprocess import check_call
     from os.path import join
     import os
     import sys
+    import h5py
 
     parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file")
-    parser.add_argument("--dir", type=str, help="fitter directory", required=True)
     parser.add_argument("zdab", help="zdab input file")
     parser.add_argument("filenames", nargs='+', help="input files")
+    parser.add_argument("-o", "--output", type=str, help="output filename", required=True)
     args = parser.parse_args()
 
-    fit_results = {}
+    zdab_cat = which("zdab-cat")
 
-    # First we create a dictionary mapping (run, gtid) -> fit results.
-    for filename in args.filenames:
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)
 
-        if data is None:
-            continue
-
-        for event in data['data']:
-            if event['ev'] is None:
-                continue
-
-            # if the ev branch is filled in, it means the event was fit
-            for ev in event['ev']:
-                # add the git SHA1 hash to the fit results since technically it
-                # could be different than the version in zdab-cat
-                ev['fit']['git_sha1'] = data['git_sha1']
-                ev['fit']['git_dirty'] = data['git_dirty']
-                fit_results[(ev['run'],ev['gtid'])] = ev['fit']
-
-    # Next we get the full event list along with the data cleaning word, FTP
+    # First we get the full event list along with the data cleaning word, FTP
     # position, FTK, and RSP energy from the original zdab and then add the fit
     # results.
     #
     # Note: We send stderr to /dev/null since there can be a lot of warnings
     # about PMT types and fit results
     with open(os.devnull, 'w') as f:
-        popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f)
+        check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f)
 
     total_events = 0
     events_with_fit = 0
-
-    doc = {'data': []}
-
-    for data in yaml.load_all(popen.stdout,Loader=Loader):
-        if 'ev' not in data:
-            doc.update(data)
-            continue
-
-        for ev in data['ev']:
-            run = ev['run']
-            gtid = ev['gtid']
-
-            if (run,gtid) in fit_results:
-                ev['fit'] = fit_results[(run,gtid)]
-                events_with_fit += 1
-
-            total_events += 1
-
-        doc['data'].append(data)
-
-    popen.wait()
+    total_fits = 0
+
+    with h5py.File(args.output,"a") as fout:
+        total_events = fout['ev'].shape[0]
+        for filename in args.filenames:
+            with h5py.File(filename) as f:
+                # Check to see if the git sha1 match
+                if fout.attrs['git_sha1'] != f.attrs['git_sha1']:
+                    print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename))
+                # get fits which match up with the events
+                valid_fits = f['fits'][np.isin(f['fits'][:][['run','gtid']],fout['ev'][:][['run','gtid']])]
+                # Add the fit results
+                fout['fits'].resize((fout['fits'].shape[0]+valid_fits.shape[0],))
+                fout['fits'][-valid_fits.shape[0]:] = valid_fits
+                events_with_fit += len(np.unique(valid_fits[['run','gtid']]))
+                total_fits += len(np.unique(f['fits']['run','gtid']))
 
     # Print out number of fit results that were added. Hopefully, this will
     # make it easy to catch an error if, for example, this gets run with a
     # mismatching zdab and fit results
-    print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr)
-
-    print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper))
+    print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr)
diff --git a/utils/fit-wrapper b/utils/fit-wrapper
new file mode 100644
index 0000000..940b807
--- /dev/null
+++ b/utils/fit-wrapper
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Simple wrapper script to run jobs on the grid
+
+module load hdf5
+module load zlib
+./fit "$@"
diff --git a/utils/plot b/utils/plot
--- a/utils/plot
+++ b/utils/plot
@@ -74,6 +74,8 @@ if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
+    import h5py
+    import pandas as pd
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
@@ -81,121 +83,113 @@ if __name__ == '__main__':
 
     for filename in args.filenames:
         print(filename)
-        with open(filename) as f:
-            data = yaml.load(f.read(),Loader=Loader)
-
-        dx = []
-        dy = []
-        dz = []
-        dT = []
-        thetas = []
-        likelihood_ratio = []
-        t_electron = []
-        t_muon = []
-        psi = []
-        for event in data['data']:
-            # get the particle ID
-            id = event['mcgn'][0]['id']
-
-            if 'ev' not in event:
-                continue
-
-            if 'fit' not in event['ev'][0]:
-                # if nhit < 100 we don't fit the event
-                continue
-
-            if id not in event['ev'][0]['fit']:
-                continue
-
-            fit = event['ev'][0]['fit']
-
-            mass = SNOMAN_MASS[id]
-            # for some reason it's the *second* track which seems to contain the
-            # initial energy
-            true_energy = event['mcgn'][0]['energy']
-            # The MCTK bank has the particle's total energy (except for neutrons)
-            # so we need to convert it into kinetic energy
-            ke = true_energy - mass
-            energy = fit[id]['energy']
-            dT.append(energy-ke)
-            true_posx = event['mcgn'][0]['posx']
-            posx = fit[id]['posx']
-            dx.append(posx-true_posx)
-            true_posy = event['mcgn'][0]['posy']
-            posy = fit[id]['posy']
-            dy.append(posy-true_posy)
-            true_posz = event['mcgn'][0]['posz']
-            posz = fit[id]['posz']
-            dz.append(posz-true_posz)
-            dirx = event['mcgn'][0]['dirx']
-            diry = event['mcgn'][0]['diry']
-            dirz = event['mcgn'][0]['dirz']
-            true_dir = [dirx,diry,dirz]
-            true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
-            theta = fit[id]['theta']
-            phi = fit[id]['phi']
-            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
-            dir = np.array(dir)/np.linalg.norm(dir)
-            thetas.append(np.degrees(np.arccos(np.dot(true_dir,dir))))
-            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
-                fmin_electron = fit[IDP_E_MINUS]['fmin']
-                fmin_muon = fit[IDP_MU_MINUS]['fmin']
-                likelihood_ratio.append(fmin_muon-fmin_electron)
-            if IDP_E_MINUS in fit:
-                t_electron.append(fit[IDP_E_MINUS]['time'])
-            if IDP_MU_MINUS in fit:
-                t_muon.append(fit[IDP_MU_MINUS]['time'])
-
-            if 'nhit' in event['ev'][0]:
-                nhit = event['ev'][0]['nhit']
-                psi.append(fit[id]['psi']/nhit)
-
-        if len(t_electron) < 2 or len(t_muon) < 2:
+
+        with h5py.File(filename) as f:
+            ev = pd.read_hdf(filename, "ev")
+            mcgn = pd.read_hdf(filename, "mcgn")
+            fits = pd.read_hdf(filename, "fits")
+
+        # get rid of 2nd events like Michel electrons
+        ev = ev.sort_values(['run','gtid']).groupby(['evn'],as_index=False).first()
+
+        # Now, we merge all three datasets together to produce a single
+        # dataframe. To do so, we join the ev dataframe with the mcgn frame
+        # on the evn column, and then join with the fits on the run and
+        # gtid columns.
+        #
+        # At the end we will have a single dataframe with one row for each
+        # fit, i.e. it will look like:
+        #
+        # >>> data
+        #   run gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+        #
+        # Before merging, we prefix the primary seed track table with mcgn_
+        # and the fit table with fit_ just to make things easier.
+
+        # Prefix track and fit frames
+        mcgn = mcgn.add_prefix("mcgn_")
+        fits = fits.add_prefix("fit_")
+
+        # merge ev and mcgn on evn
+        data = ev.merge(mcgn,left_on=['evn'],right_on=['mcgn_evn'])
+        # merge data and fits on run and gtid
+        data = data.merge(fits,left_on=['run','gtid'],right_on=['fit_run','fit_gtid'])
+
+        # calculate true kinetic energy
+        mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+        data['T'] = data['mcgn_energy'].values - mass
+        data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+        data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+        data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+        data['dT'] = data['fit_energy1'].values - data['T'].values
+
+        true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+        dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+                         np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+                         np.cos(data['fit_theta1']))).squeeze()
+
+        data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+        # only select fits which have at least 2 fits
+        data = data.groupby(['run','gtid']).filter(lambda x: len(x) > 1)
+        data_true = data[data['fit_id1'] == data['mcgn_id']]
+        data_e = data[data['fit_id1'] == IDP_E_MINUS]
+        data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+        data_true = data_true.set_index(['run','gtid'])
+        data_e = data_e.set_index(['run','gtid'])
+        data_mu = data_mu.set_index(['run','gtid'])
+
+        data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+        data_true['te'] = data_e['fit_time']
+        data_true['tm'] = data_mu['fit_time']
+        data_true['Te'] = data_e['fit_energy1']
+
+        if len(data_true) < 2:
             continue
 
-        mean, mean_error, std, std_error = get_stats(dT)
+        mean, mean_error, std, std_error = get_stats(data_true.dT)
 
         print("dT = %.2g +/- %.2g" % (mean, mean_error))
         print("std(dT) = %.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dx)
+        mean, mean_error, std, std_error = get_stats(data_true.dx)
         print("dx = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dx) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dy)
+        mean, mean_error, std, std_error = get_stats(data_true.dy)
         print("dy = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dy) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(dz)
+        mean, mean_error, std, std_error = get_stats(data_true.dz)
         print("dz = %4.2g +/- %.2g" % (mean, mean_error))
         print("std(dz) = %4.2g +/- %.2g" % (std, std_error))
-        mean, mean_error, std, std_error = get_stats(thetas)
+        mean, mean_error, std, std_error = get_stats(data_true.theta)
         print("std(theta) = %4.2g +/- %.2g" % (std, std_error))
 
         plt.figure(1)
-        plot_hist(dT, label=filename)
+        plot_hist(data_true.dT, label=filename)
         plt.xlabel("Kinetic Energy difference (MeV)")
         plt.figure(2)
-        plot_hist(dx, label=filename)
+        plot_hist(data_true.dx, label=filename)
         plt.xlabel("X Position difference (cm)")
         plt.figure(3)
-        plot_hist(dy, label=filename)
+        plot_hist(data_true.dy, label=filename)
         plt.xlabel("Y Position difference (cm)")
         plt.figure(4)
-        plot_hist(dz, label=filename)
+        plot_hist(data_true.dz, label=filename)
         plt.xlabel("Z Position difference (cm)")
         plt.figure(5)
-        plot_hist(thetas, label=filename)
+        plot_hist(data_true.theta, label=filename)
         plt.xlabel(r"$\theta$ (deg)")
         plt.figure(6)
-        plot_hist(likelihood_ratio, label=filename)
+        plot_hist(data_true.ratio, label=filename)
         plt.xlabel(r"Log Likelihood Ratio ($e/\mu$)")
         plt.figure(7)
-        plot_hist(np.array(t_electron)/1e3/60.0, label=filename)
+        plot_hist(data_true.te/1e3/60.0, label=filename)
         plt.xlabel(r"Electron Fit time (minutes)")
         plt.figure(8)
-        plot_hist(np.array(t_muon)/1e3/60.0, label=filename)
+        plot_hist(data_true.tm/1e3/60.0, label=filename)
         plt.xlabel(r"Muon Fit time (minutes)")
-        if len(psi):
-            plt.figure(9)
-            plot_hist(psi, label=filename)
-            plt.xlabel(r"$\Psi$/Nhit")
+        plt.figure(9)
+        plot_hist(data_true.fit_psi/data_true.nhit, label=filename)
+        plt.xlabel(r"$\Psi$/Nhit")
 
     plot_legend(1)
     plot_legend(2)
@@ -205,6 +199,5 @@ if __name__ == '__main__':
     plot_legend(6)
     plot_legend(7)
     plot_legend(8)
-    if len(psi):
-        plot_legend(9)
+    plot_legend(9)
     plt.show()
diff --git a/utils/plot-energy b/utils/plot-energy
index a0274b7..4a8521b 100755
--- a/utils/plot-energy
+++ b/utils/plot-energy
@@ -107,140 +107,69 @@ if __name__ == '__main__':
     import numpy as np
     import pandas as pd
     import sys
+    import h5py
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()
 
-    events = []
-    fit_results = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
-
-        for i, event in enumerate(data['data']):
-            for ev in event['ev']:
-                events.append((
-                    ev['run'],
-                    ev['gtr'],
-                    ev['nhit'],
-                    ev['gtid'],
-                    ev['dc'],
-                    ev['ftp']['x'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['y'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['z'] if 'ftp' in ev else np.nan,
-                    ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan,
-                ))
-
-                if 'fit' not in ev:
-                    continue
-
-                for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]:
-                    # FIXME: Should I just store the particle ids in the YAML
-                    # output as a list of particle ids instead of a single
-                    # integer?
-                    ids = map(int,chunks(str(id),2))
-                    energy = 0.0
-                    skip = False
-                    for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])):
-                        energy += ke + SNOMAN_MASS[i]
-
-                        # This is a bit of a hack. It appears that many times
-                        # the fit will actually do much better by including a
-                        # very low energy electron or muon. I believe the
-                        # reason for this is that of course my likelihood
-                        # function is not perfect (for example, I don't include
-                        # the correct angular distribution for Rayleigh
-                        # scattered light), and so the fitter often wants to
-                        # add a very low energy electron or muon to fix things.
-                        #
-                        # Ideally I would fix the likelihood function, but for
-                        # now we just discard any fit results which have a very
-                        # low energy electron or muon.
-                        if len(ids) > 1 and i == 20 and ke < 20.0:
-                            skip = True
-
-                        if len(ids) > 1 and i == 22 and ke < 200.0:
-                            skip = True
-
-                    if skip:
-                        continue
-
-                    # Calculate the approximate Ockham factor.
-                    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
-                    #
-                    # Note: This is a really approximate form by assuming that
-                    # the shape of the likelihood space is equal to the average
-                    # uncertainty in the different parameters.
-                    w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi))
-
-                    fit_results.append((
-                        ev['run'],
-                        ev['gtid'],
-                        id,
-                        fit_result['posx'],
-                        fit_result['posy'],
-                        fit_result['posz'],
-                        fit_result['t0'],
-                        energy,
-                        np.atleast_1d(fit_result['theta'])[0],
-                        np.atleast_1d(fit_result['phi'])[0],
-                        fit_result['fmin'] - w,
-                        fit_result['psi']/ev['nhit']))
-
-    # create a dataframe
-    # note: we have to first create a numpy structured array since there is no
-    # way to pass a list of data types to the DataFrame constructor. See
-    # https://github.com/pandas-dev/pandas/issues/4464
-    array = np.array(fit_results,
-                     dtype=[('run',np.int),      # run number
-                            ('gtid',np.int),     # gtid
-                            ('id',np.int),       # particle id
-                            ('x', np.double),    # x
-                            ('y',np.double),     # y
-                            ('z',np.double),     # z
-                            ('t0',np.double),    # t0
-                            ('ke',np.double),    # kinetic energy
-                            ('theta',np.double), # direction of 1st particle
-                            ('phi',np.double),   # direction of 2nd particle
-                            ('fmin',np.double),  # negative log likelihood
-                            ('psi',np.double)]   # goodness of fit parameter
-                     )
-    df = pd.DataFrame.from_records(array)
-
-    array = np.array(events,
-                     dtype=[('run',np.int),           # run number
-                            ('gtr',np.double),        # 50 MHz clock in ns
-                            ('nhit',np.int),          # number of PMTs hit
-                            ('gtid',np.int),          # gtid
-                            ('dc',np.int),            # data cleaning word
-                            ('ftpx',np.double),       # FTP fitter X position
-                            ('ftpy',np.double),       # FTP fitter Y position
-                            ('ftpz',np.double),       # FTP fitter Z position
-                            ('rsp_energy',np.double)] # RSP energy
-                     )
-    df_ev = pd.DataFrame.from_records(array)
+    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames])
+
+    # This is a bit of a hack. It appears that many times the fit will
+    # actually do much better by including a very low energy electron or
+    # muon. I believe the reason for this is that of course my likelihood
+    # function is not perfect (for example, I don't include the correct
+    # angular distribution for Rayleigh scattered light), and so the fitter
+    # often wants to add a very low energy electron or muon to fix things.
+    #
+    # Ideally I would fix the likelihood function, but for now we just
+    # discard any fit results which have a very low energy electron or
+    # muon.
+    #
+    # FIXME: Test this since query() is new to pandas
+    fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))')
+    fits = fits.query('not (n > 1 and ((id2 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')
+
+    # Calculate the approximate Ockham factor.
+    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
+    #
+    # Note: This is a really approximate form by assuming that the shape of
+    # the likelihood space is equal to the average uncertainty in the
+    # different parameters.
+    fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi))
+
+    # Note: we index on the left hand site with loc to avoid a copy error
+    #
+    # See https://www.dataquest.io/blog/settingwithcopywarning/
+    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2'])
+    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3'])
+
+    fits['fmin'] = fits['fmin'] - fits['w']
+
+    fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit']
+    fits['ke'] = fits['energy1']
+    fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000
+    fits['theta'] = fits['theta1']
 
     # Make sure events are in order. We use run number and GTID here which
     # should be in time order as well except for bit flips in the GTID
     # This is important for later when we look at time differences in the 50
     # MHz clock. We should perhaps do a check here based on the 10 MHz clock to
     # make sure things are in order
-    df_ev = df_ev.sort_values(by=['run','gtid'])
+    ev = ev.sort_values(by=['run','gtid'])
 
-    print("number of events = %i" % len(df_ev))
+    print("number of events = %i" % len(ev))
 
     # First, do basic data cleaning which is done for all events.
-    df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
+    ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
 
-    print("number of events after data cleaning = %i" % len(df_ev))
+    print("number of events after data cleaning = %i" % len(ev))
 
     # Now, we select events tagged by the muon tag which should tag only
     # external muons. We keep the sample of muons since it's needed later to
     # identify Michel electrons and to apply the muon follower cut
-    muons = df_ev[(df_ev.dc & DC_MUON) != 0]
+    muons = ev[(ev.dc & DC_MUON) != 0]
 
     print("number of muons = %i" % len(muons))
 
@@ -251,12 +180,12 @@ if __name__ == '__main__':
     #
     # Should I use 10 MHz clock here since the time isn't critical and then I
     # don't have to worry about 50 MHz clock rollover?
-    prompt = df_ev[df_ev.nhit >= 100]
+    prompt = ev[ev.nhit >= 100]
     prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6))
 
     # Michel electrons and neutrons can be any event which is not a prompt
     # event
-    follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]])
+    follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]])
 
     # Apply the prompt mask
     prompt = prompt[prompt_mask]
 
@@ -275,39 +204,78 @@ if __name__ == '__main__':
     if prompt_plus_muons.size and follower.size:
         # require Michel events to pass more of the SNO data cleaning cuts
         michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
+        michel = michel[michel.nhit >= 100]
 
-        # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them
-        # The 800 nanoseconds cut comes from Richie's thesis. He also mentions
-        # that the In Time Channel Spread Cut is very effective at cutting
-        # electrons events caused by muons, so I should implement that
-        michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \
-                               (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \
-                               (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+        # Accept events which had a muon more than 800 nanoseconds but less
+        # than 20 microseconds before them. The 800 nanoseconds cut comes from
+        # Richie's thesis. He also mentions that the In Time Channel Spread Cut
+        # is very effective at cutting electrons events caused by muons, so I
+        # should implement that.
+        #
+        # Note: We currently don't look across run boundaries. This should be a
+        # *very* small effect, and the logic to do so would be very complicated
+        # since I would have to deal with 50 MHz clock rollovers, etc.
+        #
+        # Do a simple python for loop here over the runs since we create
+        # temporary arrays with shape (michel.size,prompt_plus_muons.size)
+        # which could be too big for the full dataset.
+        #
+        # There might be some clever way to do this without the loop in Pandas,
+        # but I don't know how.
+        michel_sum = None
+        for run, michel_run in michel.groupby(['run']):
+            prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run]
+            michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \
+                                           (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+            if michel_sum is None:
+                michel_sum = michel_run
+            else:
+                michel_sum = michel_sum.append(michel_run)
+
+        michel = michel_sum
     else:
         michel = pd.DataFrame(columns=follower.columns)
 
     if prompt.size and follower.size:
         # neutron followers have to obey stricter set of data cleaning cuts
         neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
-        neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)]
-        r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2)
+        neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)]
+        r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2)
         neutron = neutron[r < AV_RADIUS]
         neutron = neutron[neutron.rsp_energy > 4.0]
+
         # neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)
         # FIXME: need to deal with 50 MHz clock rollover
-        neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \
-                              (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \
-                              (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1)
-        df_atm = prompt[neutron_mask]
-        prompt = prompt[~neutron_mask]
+        prompt_sum = None
+        atm_sum = None
+        for run, prompt_run in prompt.groupby(['run']):
+            neutron_run = neutron[neutron['run'] == run]
+
+            neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \
+                                  (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1)
+
+            if prompt_sum is None:
+                prompt_sum = prompt_run[~neutron_mask]
+            else:
+                prompt_sum = prompt_sum.append(prompt_run[~neutron_mask])
+
+            if atm_sum is None:
+                atm_sum = prompt_run[neutron_mask]
+            else:
+                atm_sum = atm_sum.append(prompt_run[neutron_mask])
+
+        atm = atm_sum
+        prompt = prompt_sum
     else:
-        df_atm = pd.DataFrame(columns=prompt.columns)
+        atm = pd.DataFrame(columns=prompt.columns)
 
     print("number of events after neutron follower cut = %i" % len(prompt))
 
     # Get rid of muon events in our main event sample
     prompt = prompt[(prompt.dc & DC_MUON) == 0]
-    df_atm = df_atm[(df_atm.dc & DC_MUON) == 0]
+    atm = atm[(atm.dc & DC_MUON) == 0]
 
     print("number of events after muon cut = %i" % len(prompt))
 
@@ -317,52 +285,52 @@ if __name__ == '__main__':
     prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \
                             (prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \
                             (prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
-    df_atm = df_atm[~np.any((df_atm.run.values == muons.run.values[:,np.newaxis]) & \
-                            (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
-                            (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
+    atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \
+                      (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
+                      (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
 
     # Check to see if there are any events with missing fit information
-    df_atm_ra = df_atm[['run','gtid']].to_records(index=False)
+    atm_ra = atm[['run','gtid']].to_records(index=False)
     muons_ra = muons[['run','gtid']].to_records(index=False)
     prompt_ra = prompt[['run','gtid']].to_records(index=False)
     michel_ra = michel[['run','gtid']].to_records(index=False)
-    df_ra = df[['run','gtid']].to_records(index=False)
+    fits_ra = fits[['run','gtid']].to_records(index=False)
 
-    if np.count_nonzero(~np.isin(df_atm_ra,df_ra)):
-        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra)))
+    if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)):
+        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(muons_ra,df_ra)):
-        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra)))
+    if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)):
+        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(prompt_ra,df_ra)):
-        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra)))
+    if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)):
+        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra)))
 
-    if np.count_nonzero(~np.isin(michel_ra,df_ra)):
-        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra)))
+    if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)):
+        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra)))
 
     # Now, we merge the event info with the fitter info.
     #
     # Note: This means that the dataframe now contains multiple rows for each
     # event, one for each fit hypothesis.
-    df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid'])
-    muons = pd.merge(df,muons,how='inner',on=['run','gtid'])
-    michel = pd.merge(df,michel,how='inner',on=['run','gtid'])
-    df = pd.merge(df,prompt,how='inner',on=['run','gtid'])
+    atm = pd.merge(fits,atm,how='inner',on=['run','gtid'])
+    muons = pd.merge(fits,muons,how='inner',on=['run','gtid'])
+    michel = pd.merge(fits,michel,how='inner',on=['run','gtid'])
+    prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid'])
 
     # get rid of events which don't have a fit
-    nan = np.isnan(df.fmin.values)
+    nan = np.isnan(prompt.fmin.values)
 
     if np.count_nonzero(nan):
-        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid'])))
+        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid'])))
 
-    df = df[~nan]
+    prompt = prompt[~nan]
 
-    nan_atm = np.isnan(df_atm.fmin.values)
+    nan_atm = np.isnan(atm.fmin.values)
 
     if np.count_nonzero(nan_atm):
-        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid'])))
+        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid'])))
 
-    df_atm = df_atm[~nan_atm]
+    atm = atm[~nan_atm]
 
     nan_muon = np.isnan(muons.fmin.values)
 
@@ -379,24 +347,24 @@ if __name__ == '__main__':
     michel = michel[~nan_michel]
 
     # get the best fit
-    df = df.sort_values('fmin').groupby(['run','gtid']).first()
-    df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first()
+    prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()
+    atm = atm.sort_values('fmin').groupby(['run','gtid']).first()
     michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first()
     muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first()
     muons = muons[muons.id == 22]
 
     # require r < 6 meters
-    df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS]
-    df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS]
+    prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS]
+    atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS]
 
-    print("number of events after radius cut = %i" % len(df))
+    print("number of events after radius cut = %i" % len(prompt))
 
     # Note: Need to design and apply a psi based cut here
 
     plt.figure()
-    plot_hist(df,"Without Neutron Follower")
+    plot_hist(prompt,"Without Neutron Follower")
     plt.figure()
-    plot_hist(df_atm,"With Neutron Follower")
+    plot_hist(atm,"With Neutron Follower")
     plt.figure()
     plot_hist(michel_best_fit,"Michel Electrons")
     plt.figure()
diff --git a/utils/plot-fit-results b/utils/plot-fit-results
index 773a0dc..7115b81 100755
--- a/utils/plot-fit-results
+++ b/utils/plot-fit-results
@@ -15,11 +15,6 @@
 # this program. If not, see <https://www.gnu.org/licenses/>.
 
 from __future__ import print_function, division
 
-import yaml
-try:
-    from yaml import CLoader as Loader
-except ImportError:
-    from yaml.loader import SafeLoader as Loader
 import numpy as np
 from scipy.stats import iqr
 from matplotlib.lines import Line2D
@@ -71,228 +66,137 @@ def get_stats(x):
     error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
     return mean, std/np.sqrt(n), std, error
 
+def std_err(x):
+    x = x.dropna()
+    mean = np.mean(x)
+    std = np.std(x)
+    n = len(x)
+    if n == 0:
+        return np.nan
+    u4 = np.mean((x-mean)**4)
+    error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
+    return error
+
 if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
+    import h5py
+    import pandas as pd
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()
 
-    events = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f.read(),Loader=Loader)
-
-        a = np.ma.empty(len(data['data']),
-                        dtype=[('id',np.int),       # particle id
-                               ('T', np.double),    # true energy
-                               ('dx',np.double),    # dx
-                               ('dy',np.double),    # dy
-                               ('dz',np.double),    # dz
-                               ('dT',np.double),    # dT
-                               ('theta',np.double), # theta
-                               ('ratio',np.double), # likelihood ratio
-                               ('te',np.double),    # time electron
-                               ('tm',np.double),    # time muon
-                               ('Te',np.double)]    # electron energy
-                        )
-
-        for i, event in enumerate(data['data']):
-            # get the particle ID
-            id = event['mcgn'][0]['id']
-
-            a[i]['id'] = id
-
-            if 'fit' not in event['ev'][0]:
-                # if nhit < 100 we don't fit the event
-                continue
-
-            if id not in event['ev'][0]['fit']:
-                a[i] = np.ma.masked
-                continue
-
-            mass = SNOMAN_MASS[id]
-            # for some reason it's the *second* track which seems to contain the
-            # initial energy
-            true_energy = event['mcgn'][0]['energy']
-            # The MCTK bank has the particle's total energy (except for neutrons)
-            # so we need to convert it into kinetic energy
-            ke = true_energy - mass
-
-            fit = event['ev'][0]['fit']
-
-            a[i]['T'] = ke
-            energy = fit[id]['energy']
-            a[i]['dT'] = energy-ke
-
-            # store the fit position residuals
-            true_posx = event['mcgn'][0]['posx']
-            posx = fit[id]['posx']
-            a[i]['dx'] = posx-true_posx
-            true_posy = event['mcgn'][0]['posy']
-            posy = fit[id]['posy']
-            a[i]['dy'] = posy-true_posy
-            true_posz = event['mcgn'][0]['posz']
-            posz = fit[id]['posz']
-            a[i]['dz'] = posz-true_posz
-
-            # compute the angle between the fit direction and the true
-            # direction
-            dirx = event['mcgn'][0]['dirx']
-            diry = event['mcgn'][0]['diry']
-            dirz = event['mcgn'][0]['dirz']
-            true_dir = [dirx,diry,dirz]
-            true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
-            theta = fit[id]['theta']
-            phi = fit[id]['phi']
-            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
-            dir = np.array(dir)/np.linalg.norm(dir)
-            a[i]['theta'] = np.degrees(np.arccos(np.dot(true_dir,dir)))
-
-            # compute the log likelihood ratio
-            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
-                fmin_electron = fit[IDP_E_MINUS]['fmin']
-                fmin_muon = fit[IDP_MU_MINUS]['fmin']
-                a[i]['ratio'] = fmin_muon-fmin_electron
-            else:
-                a[i]['ratio'] = np.ma.masked
-
-            # store the time taken for electron and muon fits
-            if IDP_E_MINUS in fit:
-                a[i]['te'] = fit[IDP_E_MINUS]['time']
-                a[i]['Te'] = fit[IDP_E_MINUS]['energy']
-            else:
-                a[i]['te'] = np.ma.masked
-                a[i]['Te'] = np.ma.masked
-            if IDP_MU_MINUS in fit:
-                a[i]['tm'] = fit[IDP_MU_MINUS]['time']
-            else:
-                a[i]['tm'] = np.ma.masked
-
-        events.append(a)
-
-    a = np.ma.concatenate(events)
-
+    # Read in all the data.
+    #
+    # Note: We have to add the filename as a column to each dataset since this
+    # script is used to analyze MC data which all has the same run number.
+    ev = pd.concat([pd.read_hdf(filename, "ev").assign(filename=filename) for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits").assign(filename=filename) for filename in args.filenames])
+    mcgn = pd.concat([pd.read_hdf(filename, "mcgn").assign(filename=filename) for filename in args.filenames])
+
+    # get rid of 2nd events like Michel electrons
+    ev = ev.sort_values(['run','gtid']).groupby(['filename','evn'],as_index=False).first()
+
+    # Now, we merge all three datasets together to produce a single
+    # dataframe. To do so, we join the ev dataframe with the mcgn frame
+    # on the evn column, and then join with the fits on the run and
+    # gtid columns.
+    #
+    # At the end we will have a single dataframe with one row for each
+    # fit, i.e. it will look like:
+    #
+    # >>> data
+    #   run gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+    #
+    # Before merging, we prefix the primary seed track table with mcgn_
+    # and the fit table with fit_ just to make things easier.
+
+    # Prefix track and fit frames
+    mcgn = mcgn.add_prefix("mcgn_")
+    fits = fits.add_prefix("fit_")
+
+    # merge ev and mcgn on evn
+    data = ev.merge(mcgn,left_on=['filename','evn'],right_on=['mcgn_filename','mcgn_evn'])
+    # merge data and fits on run and gtid
+    data = data.merge(fits,left_on=['filename','run','gtid'],right_on=['fit_filename','fit_run','fit_gtid'])
+
+    # calculate true kinetic energy
+    mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+    data['T'] = data['mcgn_energy'].values - mass
+    data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+    data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+    data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+    data['dT'] = data['fit_energy1'].values - data['T'].values
+
+    true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+    dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+                     np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+                     np.cos(data['fit_theta1']))).squeeze()
+
+    data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+    # only select fits which have at least 2 fits
+    data = data.groupby(['filename','run','gtid']).filter(lambda x: len(x) > 1)
+    data_true = data[data['fit_id1'] == data['mcgn_id']]
+    data_e = data[data['fit_id1'] == IDP_E_MINUS]
+    data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+    data_true = data_true.set_index(['filename','run','gtid'])
+    data_e = data_e.set_index(['filename','run','gtid'])
+    data_mu = data_mu.set_index(['filename','run','gtid'])
+
+    data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+    data_true['te'] = data_e['fit_time']
+    data_true['tm'] = data_mu['fit_time']
+    data_true['Te'] = data_e['fit_energy1']
+
+    # 100 MeV bins between 50 MeV and 1 GeV
     bins = np.arange(50,1000,100)
 
-    stats_array = np.ma.empty(len(bins)-1,
-                              dtype=[('T', np.double),
-                                     ('dT', np.double),
-                                     ('dT_err', np.double),
-                                     ('dT_std', np.double),
-                                     ('dT_std_err', np.double),
-                                     ('dx', np.double),
-                                     ('dx_err', np.double),
-                                     ('dx_std', np.double),
-                                     ('dx_std_err', np.double),
-                                     ('dy', np.double),
-                                     ('dy_err', np.double),
-                                     ('dy_std', np.double),
-                                     ('dy_std_err', np.double),
-                                     ('dz', np.double),
-                                     ('dz_err', np.double),
-                                     ('dz_std', np.double),
-                                     ('dz_std_err', np.double),
-                                     ('theta', np.double),
-                                     ('theta_err', np.double),
-                                     ('theta_std', np.double),
-                                     ('theta_std_err', np.double)])
-
-    stats = {IDP_E_MINUS: stats_array, IDP_MU_MINUS: stats_array.copy()}
-
-    for id in stats:
-        electron_events = a[a['id'] == id]
-
-        for i, (ablah, b) in enumerate(zip(bins[:-1], bins[1:])):
-            events = electron_events[(electron_events['T'] >= ablah) & (electron_events['T'] < b)]
-
-            if len(events) < 2:
-                stats[id][i] = np.ma.masked
-                continue
-
-            stats[id][i]['T'] = (ablah+b)/2
-            mean, mean_error, std, std_error = get_stats(events['dT'].compressed())
-            stats[id][i]['dT'] = mean
-            stats[id][i]['dT_err'] = mean_error
-            stats[id][i]['dT_std'] = std
-            stats[id][i]['dT_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dx'].compressed())
-            stats[id][i]['dx'] = mean
-            stats[id][i]['dx_err'] = mean_error
-            stats[id][i]['dx_std'] = std
-            stats[id][i]['dx_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dy'].compressed())
-            stats[id][i]['dy'] = mean
-            stats[id][i]['dy_err'] = mean_error
-            stats[id][i]['dy_std'] = std
-            stats[id][i]['dy_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dz'].compressed())
-            stats[id][i]['dz'] = mean
-            stats[id][i]['dz_err'] = mean_error
-            stats[id][i]['dz_std'] = std
-            stats[id][i]['dz_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['theta'].compressed())
-            stats[id][i]['theta'] = mean
-            stats[id][i]['theta_err'] = mean_error
-            stats[id][i]['theta_std'] = std
-            stats[id][i]['theta_std_err'] = std_error
-
-    for id in stats:
+    for id in [IDP_E_MINUS, IDP_MU_MINUS]:
+        events = data_true[data_true['mcgn_id'] == id]
+
+        pd_bins = pd.cut(events['T'],bins)
+
+        dT = events.groupby(pd_bins)['dT'].agg(['mean','sem','std',std_err])
+        dx = events.groupby(pd_bins)['dx'].agg(['mean','sem','std',std_err])
+        dy = events.groupby(pd_bins)['dy'].agg(['mean','sem','std',std_err])
+        dz = events.groupby(pd_bins)['dz'].agg(['mean','sem','std',std_err])
+        theta = events.groupby(pd_bins)['theta'].agg(['mean','sem','std',std_err])
+
         label = 'Muon' if id == IDP_MU_MINUS else 'Electron'
 
-        T = stats[id]['T']
-        dT = stats[id]['dT']
-        dT_err = stats[id]['dT_err']
-        std_dT = stats[id]['dT_std']
-        std_dT_err = stats[id]['dT_std_err']
-        dx = stats[id]['dx']
-        dx_err = stats[id]['dx_err']
-        std_dx = stats[id]['dx_std']
-        std_dx_err = stats[id]['dx_std_err']
-        dy = stats[id]['dy']
-        dy_err = stats[id]['dy_err']
-        std_dy = stats[id]['dy_std']
-        std_dy_err = stats[id]['dy_std_err']
-        dz = stats[id]['dz']
-        dz_err = stats[id]['dz_err']
-        std_dz = stats[id]['dz_std']
-        std_dz_err = stats[id]['dz_std_err']
-        theta = stats[id]['theta']
-        theta_err = stats[id]['theta_err']
-        std_theta = stats[id]['theta_std']
-        std_theta_err = stats[id]['theta_std_err']
+        T = (bins[1:] + bins[:-1])/2
 
         plt.figure(1)
-        plt.errorbar(T,dT*100/T,yerr=dT_err*100/T,fmt='o',label=label)
+        plt.errorbar(T,dT['mean']*100/T,yerr=dT['sem']*100/T,fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Energy bias (%)")
         plt.title("Energy Bias")
         plt.legend()
 
         plt.figure(2)
-        plt.errorbar(T,std_dT*100/T,yerr=std_dT_err*100/T,fmt='o',label=label)
+        plt.errorbar(T,dT['std']*100/T,yerr=dT['std_err']*100/T,fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Energy resolution (%)")
         plt.title("Energy Resolution")
         plt.legend()
 
         plt.figure(3)
-        plt.errorbar(T,dx,yerr=dx_err,fmt='o',label='%s (x)' % label)
-        plt.errorbar(T,dy,yerr=dy_err,fmt='o',label='%s (y)' % label)
-        plt.errorbar(T,dz,yerr=dz_err,fmt='o',label='%s (z)' % label)
+        plt.errorbar(T,dx['mean'],yerr=dx['sem'],fmt='o',label='%s (x)' % label)
+        plt.errorbar(T,dy['mean'],yerr=dy['sem'],fmt='o',label='%s (y)' % label)
+        plt.errorbar(T,dz['mean'],yerr=dz['sem'],fmt='o',label='%s (z)' % label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Position bias (cm)")
         plt.title("Position Bias")
         plt.legend()
 
         plt.figure(4)
-        plt.errorbar(T,std_dx,yerr=std_dx_err,fmt='o',label='%s (x)' % label)
-        plt.errorbar(T,std_dy,yerr=std_dy_err,fmt='o',label='%s (y)' % label)
-        plt.errorbar(T,std_dz,yerr=std_dz_err,fmt='o',label='%s (z)' % label)
+        plt.errorbar(T,dx['std'],yerr=dx['std_err'],fmt='o',label='%s (x)' % label)
+        plt.errorbar(T,dy['std'],yerr=dy['std_err'],fmt='o',label='%s (y)' % label)
+        plt.errorbar(T,dz['std'],yerr=dz['std_err'],fmt='o',label='%s (z)' % label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Position resolution (cm)")
         plt.title("Position Resolution")
@@ -300,7 +204,7 @@ if __name__ == '__main__':
         plt.legend()
 
         plt.figure(5)
-        plt.errorbar(T,std_theta,yerr=std_theta_err,fmt='o',label=label)
+        plt.errorbar(T,theta['std'],yerr=theta['std_err'],fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Angular resolution (deg)")
         plt.title("Angular Resolution")
@@ -308,9 +212,10 @@ if __name__ == '__main__':
         plt.legend()
 
         plt.figure(6)
-        plt.scatter(a[a['id'] == id]['Te'],a[a['id'] == id]['ratio'],label=label)
+        plt.scatter(events['Te'],events['ratio'],label=label)
         plt.xlabel("Reconstructed Electron Energy (MeV)")
         plt.ylabel(r"Log Likelihood Ratio (e/$\mu$)")
         plt.title("Log Likelihood Ratio vs Reconstructed Electron Energy")
         plt.legend()
+
     plt.show()
diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs
index 670172d..63cbcb7 100755
--- a/utils/submit-grid-jobs
+++ b/utils/submit-grid-jobs
@@ -24,6 +24,83 @@ import string
 from os.path import split, splitext, join, abspath
 import uuid
 from subprocess import check_call
+import os
+import sys
+
+# Next two functions are a backport of the shutil.which() function from Python
+# 3.3 from Lib/shutil.py in the CPython code. See
+# https://github.com/python/cpython/blob/master/Lib/shutil.py.
+
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `file` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+    return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+    """Given a command, mode, and a PATH string, return the path which
+    conforms to the given mode on the PATH, or None if there is no such
+    file.
+    `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result
+    of os.environ.get("PATH"), or can be overridden with a custom search
+    path.
+    """
+    # If we're given a path with a directory part, look it up directly rather
+    # than referring to PATH directories. This includes checking relative to the
+    # current directory, e.g. ./script
+    if os.path.dirname(cmd):
+        if _access_check(cmd, mode):
+            return cmd
+        return None
+
+    if path is None:
+        path = os.environ.get("PATH", None)
+        if path is None:
+            try:
+                path = os.confstr("CS_PATH")
+            except (AttributeError, ValueError):
+                # os.confstr() or CS_PATH is not available
+                path = os.defpath
+        # bpo-35755: Don't use os.defpath if the PATH environment variable is
+        # set to an empty string
+
+    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+    if not path:
+        return None
+
+    path = path.split(os.pathsep)
+
+    if sys.platform == "win32":
+        # The current directory takes precedence on Windows.
+        curdir = os.curdir
+        if curdir not in path:
+            path.insert(0, curdir)
+
+        # PATHEXT is necessary to check on Windows.
+        pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+        # See if the given file matches any of the expected path extensions.
+        # This will allow us to short circuit when given "python.exe".
+        # If it does match, only test that one, otherwise we have to try
+        # others.
+        if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+            files = [cmd]
+        else:
+            files = [cmd + ext for ext in pathext]
+    else:
+        # On other platforms you don't have things like PATHEXT to tell you
+        # what file suffixes are executable, so just pass on cmd as-is.
+        files = [cmd]
+
+    seen = set()
+    for dir in path:
+        normdir = os.path.normcase(dir)
+        if not normdir in seen:
+            seen.add(normdir)
+            for thefile in files:
+                name = os.path.join(dir, thefile)
+                if _access_check(name, mode):
+                    return name
+    return None
 
 CONDOR_TEMPLATE = \
 """
@@ -42,7 +119,7 @@ log = @log
 
 # The below are good base requirements for first testing jobs on OSG,
 # if you don't have a good idea of memory and disk usage.
-requirements = (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")
+requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")
 request_cpus = 1
 request_memory = 1 GB
 request_disk = 1 GB
@@ -72,14 +149,24 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
     prefix = "%s_%08i_%s" % (root,gtid,ID.hex)
 
     # fit output filename
-    output = "%s.txt" % prefix
+    output = "%s.hdf5" % prefix
     # condor submit filename
     condor_submit = "%s.submit" % prefix
 
     # set up the arguments for the template
-    executable = join(dir,"fit")
+    executable = which("fit")
+    wrapper = which("fit-wrapper")
+
+    if executable is None:
+        print("couldn't find fit in path!",file=sys.stderr)
+        sys.exit(1)
+
+    if wrapper is None:
+        print("couldn't find fit-wrapper in path!",file=sys.stderr)
+        sys.exit(1)
+
     args = [tail,"-o",output,"--gtid",gtid,"--min-nhit",min_nhit,"--max-particles",max_particles]
-    transfer_input_files = ",".join([filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])
+    transfer_input_files = ",".join([executable,filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])
     transfer_output_files = ",".join([output])
 
     condor_error = "%s.error" % prefix
@@ -89,7 +176,7 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
     template = MyTemplate(CONDOR_TEMPLATE)
 
     submit_string = template.safe_substitute(
-            executable=executable,
+            executable=wrapper,
             args=" ".join(map(str,args)),
             transfer_input_files=transfer_input_files,
             transfer_output_files=transfer_output_files,
@@ -106,13 +193,13 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
 
 if __name__ == '__main__':
     import argparse
-    import subprocess
+    from subprocess import check_call
     import os
+    import tempfile
+    import h5py
 
     parser = argparse.ArgumentParser("submit grid jobs")
     parser.add_argument("filenames", nargs='+', help="input files")
-    parser.add_argument("--dir", type=str, help="fitter directory", required=True)
-    parser.add_argument("--dqxx-dir", type=str, help="dqxx directory", required=True)
     parser.add_argument("--min-nhit", type=int, help="minimum nhit to fit an event", default=100)
     parser.add_argument("--max-particles", type=int, help="maximum number of particles to fit for", default=3)
     parser.add_argument("--skip-second-event", action='store_true', help="only fit the first event after a MAST bank", default=False)
@@ -123,14 +210,32 @@ if __name__ == '__main__':
         home = os.path.expanduser("~")
         args.state = join(home,'state.txt')
 
+    if 'SDDM_DATA' not in os.environ:
+        print("Please set the SDDM_DATA environment variable to point to the fitter source code location", file=sys.stderr)
+        sys.exit(1)
+
+    dir = os.environ['SDDM_DATA']
+
+    if 'DQXX_DIR' not in os.environ:
+        print("Please set the DQXX_DIR environment variable to point to the directory with the DQXX files", file=sys.stderr)
+        sys.exit(1)
+
+    dqxx_dir = os.environ['DQXX_DIR']
+
     args.state = abspath(args.state)
 
     # get the current working directory
    home_dir = os.getcwd()
 
     # get absolute paths since we are going to create a new directory
-    args.dir = abspath(args.dir)
-    args.dqxx_dir = abspath(args.dqxx_dir)
+    dir = abspath(dir)
+    dqxx_dir = abspath(dqxx_dir)
+
+    zdab_cat = which("zdab-cat")
+
+    if zdab_cat is None:
+        print("couldn't find zdab-cat in path!",file=sys.stderr)
+        sys.exit(1)
 
     for filename in args.filenames:
         filename = abspath(filename)
@@ -152,29 +257,28 @@ if __name__ == '__main__':
             continue
 
         with open(os.devnull, 'w') as f:
+            # Create a temporary file to store the events. Docs recommended
+            # this method instead of mkstemp(), but I think they're the same.
+            output = tempfile.NamedTemporaryFile(suffix='.hdf5',delete=False)
+            output.close()
+
             if args.skip_second_event:
-                popen = subprocess.Popen([join(args.dir,"zdab-cat"),"--skip-second-event",filename],stdout=subprocess.PIPE,stderr=f)
+                check_call([zdab_cat,"--skip-second-event",filename,"-o",output.name],stderr=f)
             else:
-                popen = subprocess.Popen([join(args.dir,"zdab-cat"),filename],stdout=subprocess.PIPE,stderr=f)
+                check_call([zdab_cat,filename,"-o",output.name],stderr=f)
 
         new_dir = "%s_%s" % (root,ID.hex)
 
         os.mkdir(new_dir)
        os.chdir(new_dir)
 
-        for data in yaml.load_all(popen.stdout,Loader=Loader):
-            if 'ev' not in data:
-                continue
-
-            for ev in data['ev']:
-                run = ev['run']
-                gtid = ev['gtid']
-                nhit = ev['nhit']
-
-                if nhit >= args.min_nhit:
-                    submit_job(filename, run, gtid, args.dir, args.dqxx_dir, args.min_nhit, args.max_particles)
+        with h5py.File(output.name) as f:
+            for ev in f['ev']:
+                if ev['nhit'] >= args.min_nhit:
+                    submit_job(filename, ev['run'], ev['gtid'], dir, dqxx_dir, args.min_nhit, args.max_particles)
 
-        popen.wait()
+        # Delete temporary HDF5 file
+        os.unlink(output.name)
 
         state.append(tail)
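
The new cat-grid-jobs merge leans on a NumPy trick worth calling out: indexing a structured array with a list of field names yields another structured array, so np.isin can match fits to events on the composite (run, gtid) key in a single vectorized call. Below is a minimal sketch of that idea with made-up toy data; the run/gtid/fmin field names mirror the tables in the diff, everything else is illustrative. One hedge: on NumPy >= 1.16 a multi-field selection is a view that keeps padding bytes from the dropped columns, so repack_fields() is needed for the two key arrays' dtypes to compare equal — the diff itself relies on the copy semantics of the older NumPy releases these scripts were written against.

    import numpy as np
    from numpy.lib import recfunctions as rfn

    # Toy "ev" and "fits" tables keyed by (run, gtid), mirroring the HDF5
    # datasets above. The values are made up for illustration.
    ev = np.array([(10000, 1), (10000, 2), (10001, 7)],
                  dtype=[('run', 'i4'), ('gtid', 'i4')])
    fits = np.array([(10000, 2, 123.0), (10001, 7, 456.0), (10002, 3, 789.0)],
                    dtype=[('run', 'i4'), ('gtid', 'i4'), ('fmin', 'f8')])

    # Multi-field indexing produces a structured array of just the key
    # columns, so np.isin compares whole (run, gtid) pairs rather than
    # individual columns. repack_fields() strips the padding the
    # fits[['run','gtid']] view inherits from the dropped f8 column.
    keys_fits = rfn.repack_fields(fits[['run', 'gtid']])
    keys_ev = rfn.repack_fields(ev[['run', 'gtid']])

    valid_fits = fits[np.isin(keys_fits, keys_ev)]
    print(valid_fits)  # keeps (10000, 2) and (10001, 7); drops (10002, 3)

The same pattern is what lets cat-grid-jobs append only the fit rows whose (run, gtid) pair actually appears in the output file's event list before resizing the fits dataset.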