Diffstat (limited to 'utils')
-rw-r--r--  utils/Makefile          |   1
-rwxr-xr-x  utils/cat-grid-jobs     | 163
-rw-r--r--  utils/fit-wrapper       |   7
-rwxr-xr-x  utils/plot              | 171
-rwxr-xr-x  utils/plot-energy       | 294
-rwxr-xr-x  utils/plot-fit-results  | 297
-rwxr-xr-x  utils/submit-grid-jobs  | 152
7 files changed, 564 insertions(+), 521 deletions(-)
diff --git a/utils/Makefile b/utils/Makefile
index ef21a09..011a723 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -13,5 +13,6 @@ install:
$(INSTALL) plot-likelihood $(INSTALL_BIN)
$(INSTALL) submit-grid-jobs $(INSTALL_BIN)
$(INSTALL) gen-dark-matter $(INSTALL_BIN)
+ $(INSTALL) fit-wrapper $(INSTALL_BIN)
.PHONY: install
diff --git a/utils/cat-grid-jobs b/utils/cat-grid-jobs
index ba06290..83a8004 100755
--- a/utils/cat-grid-jobs
+++ b/utils/cat-grid-jobs
@@ -23,7 +23,7 @@ as YAML to stdout.
Example:
- $ cat-grid-jobs --dir ~/sddm/src/ ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.txt > output.txt
+ $ cat-grid-jobs -o output.hdf5 ~/mc_atm_nu_no_osc_genie_010000_0.mcds ~/grid_job_results/*.hdf5
"""
@@ -33,80 +33,145 @@ try:
from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
from yaml import Loader, Dumper
+import os
+import sys
+
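+# Note: the next two functions are a backport of shutil.which() from
+# Python 3.3 (Lib/shutil.py in CPython); the same copy appears in
+# submit-grid-jobs below.
+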
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `fn` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+ return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+ """Given a command, mode, and a PATH string, return the path which
+ conforms to the given mode on the PATH, or None if there is no such
+ file.
+ `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result
+ of os.environ.get("PATH"), or can be overridden with a custom search
+ path.
+ """
+ # If we're given a path with a directory part, look it up directly rather
+ # than referring to PATH directories. This includes checking relative to the
+ # current directory, e.g. ./script
+ if os.path.dirname(cmd):
+ if _access_check(cmd, mode):
+ return cmd
+ return None
+
+ if path is None:
+ path = os.environ.get("PATH", None)
+ if path is None:
+ try:
+ path = os.confstr("CS_PATH")
+ except (AttributeError, ValueError):
+ # os.confstr() or CS_PATH is not available
+ path = os.defpath
+ # bpo-35755: Don't use os.defpath if the PATH environment variable is
+ # set to an empty string
+
+ # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+ if not path:
+ return None
+
+ path = path.split(os.pathsep)
+
+ if sys.platform == "win32":
+ # The current directory takes precedence on Windows.
+ curdir = os.curdir
+ if curdir not in path:
+ path.insert(0, curdir)
+
+ # PATHEXT is necessary to check on Windows.
+ pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+ # See if the given file matches any of the expected path extensions.
+ # This will allow us to short circuit when given "python.exe".
+ # If it does match, only test that one, otherwise we have to try
+ # others.
+ if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+ files = [cmd]
+ else:
+ files = [cmd + ext for ext in pathext]
+ else:
+ # On other platforms you don't have things like PATHEXT to tell you
+ # what file suffixes are executable, so just pass on cmd as-is.
+ files = [cmd]
+
+ seen = set()
+ for dir in path:
+ normdir = os.path.normcase(dir)
+ if normdir not in seen:
+ seen.add(normdir)
+ for thefile in files:
+ name = os.path.join(dir, thefile)
+ if _access_check(name, mode):
+ return name
+ return None
+
+# from https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python
+class bcolors:
+ HEADER = '\033[95m'
+ OKBLUE = '\033[94m'
+ OKGREEN = '\033[92m'
+ WARNING = '\033[93m'
+ FAIL = '\033[91m'
+ ENDC = '\033[0m'
+ BOLD = '\033[1m'
+ UNDERLINE = '\033[4m'
+
+def print_warning(msg):
+ print(bcolors.FAIL + msg + bcolors.ENDC,file=sys.stderr)
if __name__ == '__main__':
import argparse
import matplotlib.pyplot as plt
import numpy as np
- import subprocess
+ from subprocess import check_call
from os.path import join
import os
import sys
+ import h5py
parser = argparse.ArgumentParser("concatenate fit results from grid jobs into a single file")
- parser.add_argument("--dir", type=str, help="fitter directory", required=True)
parser.add_argument("zdab", help="zdab input file")
parser.add_argument("filenames", nargs='+', help="input files")
+ parser.add_argument("-o", "--output", type=str, help="output filename", required=True)
args = parser.parse_args()
- fit_results = {}
+ zdab_cat = which("zdab-cat")
- # First we create a dictionary mapping (run, gtid) -> fit results.
- for filename in args.filenames:
- with open(filename) as f:
- data = yaml.load(f,Loader=Loader)
+ if zdab_cat is None:
+ print("couldn't find zdab-cat in path!",file=sys.stderr)
+ sys.exit(1)
- if data is None:
- continue
-
- for event in data['data']:
- if event['ev'] is None:
- continue
-
- # if the ev branch is filled in, it means the event was fit
- for ev in event['ev']:
- # add the git SHA1 hash to the fit results since technically it
- # could be different than the version in zdab-cat
- ev['fit']['git_sha1'] = data['git_sha1']
- ev['fit']['git_dirty'] = data['git_dirty']
- fit_results[(ev['run'],ev['gtid'])] = ev['fit']
-
- # Next we get the full event list along with the data cleaning word, FTP
+ # First we get the full event list along with the data cleaning word, FTP
# position, FTK, and RSP energy from the original zdab and then add the fit
# results.
#
# Note: We send stderr to /dev/null since there can be a lot of warnings
# about PMT types and fit results
with open(os.devnull, 'w') as f:
- popen = subprocess.Popen([join(args.dir,"zdab-cat"),args.zdab],stdout=subprocess.PIPE,stderr=f)
+ check_call([zdab_cat,args.zdab,"-o",args.output],stderr=f)
total_events = 0
events_with_fit = 0
-
- doc = {'data': []}
-
- for data in yaml.load_all(popen.stdout,Loader=Loader):
- if 'ev' not in data:
- doc.update(data)
- continue
-
- for ev in data['ev']:
- run = ev['run']
- gtid = ev['gtid']
-
- if (run,gtid) in fit_results:
- ev['fit'] = fit_results[(run,gtid)]
- events_with_fit += 1
-
- total_events += 1
-
- doc['data'].append(data)
-
- popen.wait()
+ total_fits = 0
+
+ with h5py.File(args.output,"a") as fout:
+ total_events = fout['ev'].shape[0]
+ for filename in args.filenames:
+ with h5py.File(filename) as f:
+ # Check that the git_sha1 attributes match
+ if fout.attrs['git_sha1'] != f.attrs['git_sha1']:
+ print_warning("git_sha1 is %s for current version but %s for %s" % (fout.attrs['git_sha1'],f.attrs['git_sha1'],filename))
+ # get fits which match up with the events
+ valid_fits = f['fits'][np.isin(f['fits'][:][['run','gtid']],fout['ev'][:][['run','gtid']])]
+ # Add the fit results
+ fout['fits'].resize((fout['fits'].shape[0]+valid_fits.shape[0],))
+ fout['fits'][-valid_fits.shape[0]:] = valid_fits
+ events_with_fit += len(np.unique(valid_fits[['run','gtid']]))
+ total_fits += len(np.unique(f['fits']['run','gtid']))
# Print out number of fit results that were added. Hopefully, this will
# make it easy to catch an error if, for example, this gets run with a
# mismatching zdab and fit results
- print("added %i/%i fit results to a total of %i events" % (events_with_fit, len(fit_results), total_events),file=sys.stderr)
-
- print(yaml.dump(doc,default_flow_style=False,Dumper=Dumper))
+ print("added %i/%i fit results to a total of %i events" % (events_with_fit, total_fits, total_events),file=sys.stderr)
diff --git a/utils/fit-wrapper b/utils/fit-wrapper
new file mode 100644
index 0000000..940b807
--- /dev/null
+++ b/utils/fit-wrapper
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Simple wrapper script to run jobs on the grid
+
+module load hdf5
+module load zlib
+./fit "$@"
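Both cat-grid-jobs and submit-grid-jobs carry the which() backport above, presumably because the scripts still support Python 2 (note the __future__ imports elsewhere in this diff). On Python 3.3+ the stdlib equivalent behaves the same way:

    import shutil

    # searches PATH for an executable, or checks a path with a directory
    # component directly; returns None if nothing matches
    print(shutil.which('ls'))              # e.g. /bin/ls
    print(shutil.which('no-such-binary'))  # None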
diff --git a/utils/plot b/utils/plot
index ac93eda..1bac500 100755
--- a/utils/plot
+++ b/utils/plot
@@ -74,6 +74,8 @@ if __name__ == '__main__':
import argparse
import matplotlib.pyplot as plt
import numpy as np
+ import h5py
+ import pandas as pd
parser = argparse.ArgumentParser("plot fit results")
parser.add_argument("filenames", nargs='+', help="input files")
@@ -81,121 +83,113 @@ if __name__ == '__main__':
for filename in args.filenames:
print(filename)
- with open(filename) as f:
- data = yaml.load(f.read(),Loader=Loader)
-
- dx = []
- dy = []
- dz = []
- dT = []
- thetas = []
- likelihood_ratio = []
- t_electron = []
- t_muon = []
- psi = []
- for event in data['data']:
- # get the particle ID
- id = event['mcgn'][0]['id']
-
- if 'ev' not in event:
- continue
-
- if 'fit' not in event['ev'][0]:
- # if nhit < 100 we don't fit the event
- continue
-
- if id not in event['ev'][0]['fit']:
- continue
-
- fit = event['ev'][0]['fit']
-
- mass = SNOMAN_MASS[id]
- # for some reason it's the *second* track which seems to contain the
- # initial energy
- true_energy = event['mcgn'][0]['energy']
- # The MCTK bank has the particle's total energy (except for neutrons)
- # so we need to convert it into kinetic energy
- ke = true_energy - mass
- energy = fit[id]['energy']
- dT.append(energy-ke)
- true_posx = event['mcgn'][0]['posx']
- posx = fit[id]['posx']
- dx.append(posx-true_posx)
- true_posy = event['mcgn'][0]['posy']
- posy = fit[id]['posy']
- dy.append(posy-true_posy)
- true_posz = event['mcgn'][0]['posz']
- posz = fit[id]['posz']
- dz.append(posz-true_posz)
- dirx = event['mcgn'][0]['dirx']
- diry = event['mcgn'][0]['diry']
- dirz = event['mcgn'][0]['dirz']
- true_dir = [dirx,diry,dirz]
- true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
- theta = fit[id]['theta']
- phi = fit[id]['phi']
- dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
- dir = np.array(dir)/np.linalg.norm(dir)
- thetas.append(np.degrees(np.arccos(np.dot(true_dir,dir))))
- if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
- fmin_electron = fit[IDP_E_MINUS]['fmin']
- fmin_muon = fit[IDP_MU_MINUS]['fmin']
- likelihood_ratio.append(fmin_muon-fmin_electron)
- if IDP_E_MINUS in fit:
- t_electron.append(fit[IDP_E_MINUS]['time'])
- if IDP_MU_MINUS in fit:
- t_muon.append(fit[IDP_MU_MINUS]['time'])
-
- if 'nhit' in event['ev'][0]:
- nhit = event['ev'][0]['nhit']
- psi.append(fit[id]['psi']/nhit)
-
- if len(t_electron) < 2 or len(t_muon) < 2:
+
+ with h5py.File(filename) as f:
+ ev = pd.read_hdf(filename, "ev")
+ mcgn = pd.read_hdf(filename, "mcgn")
+ fits = pd.read_hdf(filename, "fits")
+
+ # get rid of 2nd events like Michel electrons
+ ev = ev.sort_values(['run','gtid']).groupby(['evn'],as_index=False).first()
+
+ # Now, we merge all three datasets together to produce a single
+ # dataframe. To do so, we join the ev dataframe with the mcgn frame
+ # on the evn column, and then join with the fits on the run and
+ # gtid columns.
+ #
+ # At the end we will have a single dataframe with one row for each
+ # fit, i.e. it will look like:
+ #
+ # >>> data
+ # run gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+ #
+ # Before merging, we prefix the primary seed track table with mcgn_
+ # and the fit table with fit_ just to make things easier.
+
+ # Prefix track and fit frames
+ mcgn = mcgn.add_prefix("mcgn_")
+ fits = fits.add_prefix("fit_")
+
+ # merge ev and mcgn on evn
+ data = ev.merge(mcgn,left_on=['evn'],right_on=['mcgn_evn'])
+ # merge data and fits on run and gtid
+ data = data.merge(fits,left_on=['run','gtid'],right_on=['fit_run','fit_gtid'])
+
+ # calculate true kinetic energy
+ mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+ data['T'] = data['mcgn_energy'].values - mass
+ data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+ data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+ data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+ data['dT'] = data['fit_energy1'].values - data['T'].values
+
+ true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+ dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+ np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+ np.cos(data['fit_theta1']))).squeeze()
+
+ data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+ # only select events which have at least 2 fits
+ data = data.groupby(['run','gtid']).filter(lambda x: len(x) > 1)
+ data_true = data[data['fit_id1'] == data['mcgn_id']]
+ data_e = data[data['fit_id1'] == IDP_E_MINUS]
+ data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+ data_true = data_true.set_index(['run','gtid'])
+ data_e = data_e.set_index(['run','gtid'])
+ data_mu = data_mu.set_index(['run','gtid'])
+
+ data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+ data_true['te'] = data_e['fit_time']
+ data_true['tm'] = data_mu['fit_time']
+ data_true['Te'] = data_e['fit_energy1']
+
+ if len(data_true) < 2:
continue
- mean, mean_error, std, std_error = get_stats(dT)
+ mean, mean_error, std, std_error = get_stats(data_true.dT)
print("dT = %.2g +/- %.2g" % (mean, mean_error))
print("std(dT) = %.2g +/- %.2g" % (std, std_error))
- mean, mean_error, std, std_error = get_stats(dx)
+ mean, mean_error, std, std_error = get_stats(data_true.dx)
print("dx = %4.2g +/- %.2g" % (mean, mean_error))
print("std(dx) = %4.2g +/- %.2g" % (std, std_error))
- mean, mean_error, std, std_error = get_stats(dy)
+ mean, mean_error, std, std_error = get_stats(data_true.dy)
print("dy = %4.2g +/- %.2g" % (mean, mean_error))
print("std(dy) = %4.2g +/- %.2g" % (std, std_error))
- mean, mean_error, std, std_error = get_stats(dz)
+ mean, mean_error, std, std_error = get_stats(data_true.dz)
print("dz = %4.2g +/- %.2g" % (mean, mean_error))
print("std(dz) = %4.2g +/- %.2g" % (std, std_error))
- mean, mean_error, std, std_error = get_stats(thetas)
+ mean, mean_error, std, std_error = get_stats(data_true.theta)
print("std(theta) = %4.2g +/- %.2g" % (std, std_error))
plt.figure(1)
- plot_hist(dT, label=filename)
+ plot_hist(data_true.dT, label=filename)
plt.xlabel("Kinetic Energy difference (MeV)")
plt.figure(2)
- plot_hist(dx, label=filename)
+ plot_hist(data_true.dx, label=filename)
plt.xlabel("X Position difference (cm)")
plt.figure(3)
- plot_hist(dy, label=filename)
+ plot_hist(data_true.dy, label=filename)
plt.xlabel("Y Position difference (cm)")
plt.figure(4)
- plot_hist(dz, label=filename)
+ plot_hist(data_true.dz, label=filename)
plt.xlabel("Z Position difference (cm)")
plt.figure(5)
- plot_hist(thetas, label=filename)
+ plot_hist(data_true.theta, label=filename)
plt.xlabel(r"$\theta$ (deg)")
plt.figure(6)
- plot_hist(likelihood_ratio, label=filename)
+ plot_hist(data_true.ratio, label=filename)
plt.xlabel(r"Log Likelihood Ratio ($e/\mu$)")
plt.figure(7)
- plot_hist(np.array(t_electron)/1e3/60.0, label=filename)
+ plot_hist(data_true.te/1e3/60.0, label=filename)
plt.xlabel(r"Electron Fit time (minutes)")
plt.figure(8)
- plot_hist(np.array(t_muon)/1e3/60.0, label=filename)
+ plot_hist(data_true.tm/1e3/60.0, label=filename)
plt.xlabel(r"Muon Fit time (minutes)")
- if len(psi):
- plt.figure(9)
- plot_hist(psi, label=filename)
- plt.xlabel(r"$\Psi$/Nhit")
+ plt.figure(9)
+ plot_hist(data_true.fit_psi/data_true.nhit, label=filename)
+ plt.xlabel(r"$\Psi$/Nhit")
plot_legend(1)
plot_legend(2)
@@ -205,6 +199,5 @@ if __name__ == '__main__':
plot_legend(6)
plot_legend(7)
plot_legend(8)
- if len(psi):
- plot_legend(9)
+ plot_legend(9)
plt.show()
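A toy illustration of the three-way merge described in the comments above (plot-fit-results below uses the same pattern); the values are hypothetical, and the column names follow the diff:

    import pandas as pd

    ev   = pd.DataFrame({'run': [100], 'gtid': [7], 'evn': [1], 'nhit': [150]})
    mcgn = pd.DataFrame({'evn': [1], 'id': [20]}).add_prefix('mcgn_')
    fits = pd.DataFrame({'run': [100, 100], 'gtid': [7, 7],
                         'id1': [20, 22]}).add_prefix('fit_')

    # join ev with the seed tracks on evn, then with the fits on (run, gtid)
    data = ev.merge(mcgn, left_on=['evn'], right_on=['mcgn_evn'])
    data = data.merge(fits, left_on=['run', 'gtid'],
                      right_on=['fit_run', 'fit_gtid'])
    print(data[['run', 'gtid', 'mcgn_id', 'fit_id1']])  # one row per fit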
diff --git a/utils/plot-energy b/utils/plot-energy
index a0274b7..4a8521b 100755
--- a/utils/plot-energy
+++ b/utils/plot-energy
@@ -107,140 +107,69 @@ if __name__ == '__main__':
import numpy as np
import pandas as pd
import sys
+ import h5py
parser = argparse.ArgumentParser("plot fit results")
parser.add_argument("filenames", nargs='+', help="input files")
args = parser.parse_args()
- events = []
- fit_results = []
-
- for filename in args.filenames:
- print(filename)
- with open(filename) as f:
- data = yaml.load(f,Loader=Loader)
-
- for i, event in enumerate(data['data']):
- for ev in event['ev']:
- events.append((
- ev['run'],
- ev['gtr'],
- ev['nhit'],
- ev['gtid'],
- ev['dc'],
- ev['ftp']['x'] if 'ftp' in ev else np.nan,
- ev['ftp']['y'] if 'ftp' in ev else np.nan,
- ev['ftp']['z'] if 'ftp' in ev else np.nan,
- ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan,
- ))
-
- if 'fit' not in ev:
- continue
-
- for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]:
- # FIXME: Should I just store the particle ids in the YAML
- # output as a list of particle ids instead of a single
- # integer?
- ids = map(int,chunks(str(id),2))
- energy = 0.0
- skip = False
- for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])):
- energy += ke + SNOMAN_MASS[i]
-
- # This is a bit of a hack. It appears that many times
- # the fit will actually do much better by including a
- # very low energy electron or muon. I believe the
- # reason for this is that of course my likelihood
- # function is not perfect (for example, I don't include
- # the correct angular distribution for Rayleigh
- # scattered light), and so the fitter often wants to
- # add a very low energy electron or muon to fix things.
- #
- # Ideally I would fix the likelihood function, but for
- # now we just discard any fit results which have a very
- # low energy electron or muon.
- if len(ids) > 1 and i == 20 and ke < 20.0:
- skip = True
-
- if len(ids) > 1 and i == 22 and ke < 200.0:
- skip = True
-
- if skip:
- continue
-
- # Calculate the approximate Ockham factor.
- # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
- #
- # Note: This is a really approximate form by assuming that
- # the shape of the likelihood space is equal to the average
- # uncertainty in the different parameters.
- w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi))
-
- fit_results.append((
- ev['run'],
- ev['gtid'],
- id,
- fit_result['posx'],
- fit_result['posy'],
- fit_result['posz'],
- fit_result['t0'],
- energy,
- np.atleast_1d(fit_result['theta'])[0],
- np.atleast_1d(fit_result['phi'])[0],
- fit_result['fmin'] - w,
- fit_result['psi']/ev['nhit']))
-
- # create a dataframe
- # note: we have to first create a numpy structured array since there is no
- # way to pass a list of data types to the DataFrame constructor. See
- # https://github.com/pandas-dev/pandas/issues/4464
- array = np.array(fit_results,
- dtype=[('run',np.int), # run number
- ('gtid',np.int), # gtid
- ('id',np.int), # particle id
- ('x', np.double), # x
- ('y',np.double), # y
- ('z',np.double), # z
- ('t0',np.double), # t0
- ('ke',np.double), # kinetic energy
- ('theta',np.double), # direction of 1st particle
- ('phi',np.double), # direction of 2nd particle
- ('fmin',np.double), # negative log likelihood
- ('psi',np.double)] # goodness of fit parameter
- )
- df = pd.DataFrame.from_records(array)
-
- array = np.array(events,
- dtype=[('run',np.int), # run number
- ('gtr',np.double), # 50 MHz clock in ns
- ('nhit',np.int), # number of PMTs hit
- ('gtid',np.int), # gtid
- ('dc',np.int), # data cleaning word
- ('ftpx',np.double), # FTP fitter X position
- ('ftpy',np.double), # FTP fitter Y position
- ('ftpz',np.double), # FTP fitter Z position
- ('rsp_energy',np.double)] # RSP energy
- )
- df_ev = pd.DataFrame.from_records(array)
+ ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames])
+ fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames])
+
+ # This is a bit of a hack. It appears that many times the fit will
+ # actually do much better by including a very low energy electron or
+ # muon. I believe the reason for this is that of course my likelihood
+ # function is not perfect (for example, I don't include the correct
+ # angular distribution for Rayleigh scattered light), and so the fitter
+ # often wants to add a very low energy electron or muon to fix things.
+ #
+ # Ideally I would fix the likelihood function, but for now we just
+ # discard any fit results which have a very low energy electron or
+ # muon.
+ #
+ # FIXME: Test this since query() is new to pandas
+ fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))')
+ fits = fits.query('not (n > 1 and ((id1 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')
+
+ # Calculate the approximate Ockham factor.
+ # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
+ #
+ # Note: This is a really approximate form by assuming that the shape of
+ # the likelihood space is equal to the average uncertainty in the
+ # different parameters.
+ fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi))
+
+ # Note: we index on the left hand side with loc to avoid a SettingWithCopyWarning
+ #
+ # See https://www.dataquest.io/blog/settingwithcopywarning/
+ fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2'])
+ fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3'])
+
+ fits['fmin'] = fits['fmin'] - fits['w']
+
+ fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit']
+ fits['ke'] = fits['energy1']
+ fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000
+ fits['theta'] = fits['theta1']
# Make sure events are in order. We use run number and GTID here which
# should be in time order as well except for bit flips in the GTID
# This is important for later when we look at time differences in the 50
# MHz clock. We should perhaps do a check here based on the 10 MHz clock to
# make sure things are in order
- df_ev = df_ev.sort_values(by=['run','gtid'])
+ ev = ev.sort_values(by=['run','gtid'])
- print("number of events = %i" % len(df_ev))
+ print("number of events = %i" % len(ev))
# First, do basic data cleaning which is done for all events.
- df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
+ ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
- print("number of events after data cleaning = %i" % len(df_ev))
+ print("number of events after data cleaning = %i" % len(ev))
# Now, we select events tagged by the muon tag which should tag only
# external muons. We keep the sample of muons since it's needed later to
# identify Michel electrons and to apply the muon follower cut
- muons = df_ev[(df_ev.dc & DC_MUON) != 0]
+ muons = ev[(ev.dc & DC_MUON) != 0]
print("number of muons = %i" % len(muons))
@@ -251,12 +180,12 @@ if __name__ == '__main__':
#
# Should I use 10 MHz clock here since the time isn't critical and then I
# don't have to worry about 50 MHz clock rollover?
- prompt = df_ev[df_ev.nhit >= 100]
+ prompt = ev[ev.nhit >= 100]
prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6))
# Michel electrons and neutrons can be any event which is not a prompt
# event
- follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]])
+ follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]])
# Apply the prompt mask
prompt = prompt[prompt_mask]
@@ -275,39 +204,78 @@ if __name__ == '__main__':
if prompt_plus_muons.size and follower.size:
# require Michel events to pass more of the SNO data cleaning cuts
michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
+
michel = michel[michel.nhit >= 100]
- # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them
- # The 800 nanoseconds cut comes from Richie's thesis. He also mentions
- # that the In Time Channel Spread Cut is very effective at cutting
- # electrons events caused by muons, so I should implement that
- michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \
- (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \
- (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+ # Accept events which had a muon more than 800 nanoseconds but less
+ # than 20 microseconds before them. The 800 nanoseconds cut comes from
+ # Richie's thesis. He also mentions that the In Time Channel Spread Cut
+ # is very effective at cutting electron events caused by muons, so I
+ # should implement that.
+ #
+ # Note: We currently don't look across run boundaries. This should be a
+ # *very* small effect, and the logic to do so would be very complicated
+ # since I would have to deal with 50 MHz clock rollovers, etc.
+ #
+ # Do a simple python for loop here over the runs since we create
+ # temporary arrays with shape (michel.size,prompt_plus_muons.size)
+ # which could be too big for the full dataset.
+ #
+ # There might be some clever way to do this without the loop in Pandas,
+ # but I don't know how.
+ michel_sum = None
+ for run, michel_run in michel.groupby(['run']):
+ prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run]
+ michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \
+ (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+ if michel_sum is None:
+ michel_sum = michel_run
+ else:
+ michel_sum = michel_sum.append(michel_run)
+
+ michel = michel_sum
else:
michel = pd.DataFrame(columns=follower.columns)
if prompt.size and follower.size:
# neutron followers have to obey stricter set of data cleaning cuts
neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
- neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)]
- r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2)
+ neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)]
+ r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2)
neutron = neutron[r < AV_RADIUS]
neutron = neutron[neutron.rsp_energy > 4.0]
+
# neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)
# FIXME: need to deal with 50 MHz clock rollover
- neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \
- (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \
- (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1)
- df_atm = prompt[neutron_mask]
- prompt = prompt[~neutron_mask]
+ prompt_sum = None
+ atm_sum = None
+ for run, prompt_run in prompt.groupby(['run']):
+ neutron_run = neutron[neutron['run'] == run]
+
+ neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \
+ (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1)
+
+ if prompt_sum is None:
+ prompt_sum = prompt_run[~neutron_mask]
+ else:
+ prompt_sum = prompt_sum.append(prompt_run[~neutron_mask])
+
+ if atm_sum is None:
+ atm_sum = prompt_run[neutron_mask]
+ else:
+ atm_sum = atm_sum.append(prompt_run[neutron_mask])
+
+ atm = atm_sum
+ prompt = prompt_sum
else:
- df_atm = pd.DataFrame(columns=prompt.columns)
+ atm = pd.DataFrame(columns=prompt.columns)
print("number of events after neutron follower cut = %i" % len(prompt))
# Get rid of muon events in our main event sample
prompt = prompt[(prompt.dc & DC_MUON) == 0]
- df_atm = df_atm[(df_atm.dc & DC_MUON) == 0]
+ atm = atm[(atm.dc & DC_MUON) == 0]
print("number of events after muon cut = %i" % len(prompt))
@@ -317,52 +285,52 @@ if __name__ == '__main__':
prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \
(prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \
(prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
- df_atm = df_atm[~np.any((df_atm.run.values == muons.run.values[:,np.newaxis]) & \
- (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
- (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
+ atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \
+ (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
+ (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
# Check to see if there are any events with missing fit information
- df_atm_ra = df_atm[['run','gtid']].to_records(index=False)
+ atm_ra = atm[['run','gtid']].to_records(index=False)
muons_ra = muons[['run','gtid']].to_records(index=False)
prompt_ra = prompt[['run','gtid']].to_records(index=False)
michel_ra = michel[['run','gtid']].to_records(index=False)
- df_ra = df[['run','gtid']].to_records(index=False)
+ fits_ra = fits[['run','gtid']].to_records(index=False)
- if np.count_nonzero(~np.isin(df_atm_ra,df_ra)):
- print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra)))
+ if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)):
+ print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra)))
- if np.count_nonzero(~np.isin(muons_ra,df_ra)):
- print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra)))
+ if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)):
+ print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra)))
- if np.count_nonzero(~np.isin(prompt_ra,df_ra)):
- print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra)))
+ if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)):
+ print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra)))
- if np.count_nonzero(~np.isin(michel_ra,df_ra)):
- print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra)))
+ if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)):
+ print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra)))
# Now, we merge the event info with the fitter info.
#
# Note: This means that the dataframe now contains multiple rows for each
# event, one for each fit hypothesis.
- df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid'])
- muons = pd.merge(df,muons,how='inner',on=['run','gtid'])
- michel = pd.merge(df,michel,how='inner',on=['run','gtid'])
- df = pd.merge(df,prompt,how='inner',on=['run','gtid'])
+ atm = pd.merge(fits,atm,how='inner',on=['run','gtid'])
+ muons = pd.merge(fits,muons,how='inner',on=['run','gtid'])
+ michel = pd.merge(fits,michel,how='inner',on=['run','gtid'])
+ prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid'])
# get rid of events which don't have a fit
- nan = np.isnan(df.fmin.values)
+ nan = np.isnan(prompt.fmin.values)
if np.count_nonzero(nan):
- print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid'])))
+ print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid'])))
- df = df[~nan]
+ prompt = prompt[~nan]
- nan_atm = np.isnan(df_atm.fmin.values)
+ nan_atm = np.isnan(atm.fmin.values)
if np.count_nonzero(nan_atm):
- print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid'])))
+ print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid'])))
- df_atm = df_atm[~nan_atm]
+ atm = atm[~nan_atm]
nan_muon = np.isnan(muons.fmin.values)
@@ -379,24 +347,24 @@ if __name__ == '__main__':
michel = michel[~nan_michel]
# get the best fit
- df = df.sort_values('fmin').groupby(['run','gtid']).first()
- df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first()
+ prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()
+ atm = atm.sort_values('fmin').groupby(['run','gtid']).first()
michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first()
muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first()
muons = muons[muons.id == 22]
# require r < 6 meters
- df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS]
- df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS]
+ prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS]
+ atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS]
- print("number of events after radius cut = %i" % len(df))
+ print("number of events after radius cut = %i" % len(prompt))
# Note: Need to design and apply a psi based cut here
plt.figure()
- plot_hist(df,"Without Neutron Follower")
+ plot_hist(prompt,"Without Neutron Follower")
plt.figure()
- plot_hist(df_atm,"With Neutron Follower")
+ plot_hist(atm,"With Neutron Follower")
plt.figure()
plot_hist(michel_best_fit,"Michel Electrons")
plt.figure()
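The per-run follower matching introduced above keeps the temporary (n_prompt, n_follower) comparison arrays small; the window logic itself is a plain 2-D broadcast. A minimal sketch with hypothetical gtr values in ns:

    import numpy as np

    prompt_gtr = np.array([0.0, 1e9])           # hypothetical prompt times
    follower_gtr = np.array([900.0, 5e3, 2e9])  # hypothetical follower times

    # keep followers between 800 ns and 20 us after any prompt event
    mask = np.any((follower_gtr > prompt_gtr[:, np.newaxis] + 800) &
                  (follower_gtr < prompt_gtr[:, np.newaxis] + 20e3), axis=0)
    print(follower_gtr[mask])  # [ 900. 5000.]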
diff --git a/utils/plot-fit-results b/utils/plot-fit-results
index 773a0dc..7115b81 100755
--- a/utils/plot-fit-results
+++ b/utils/plot-fit-results
@@ -15,11 +15,6 @@
# this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import print_function, division
-import yaml
-try:
- from yaml import CLoader as Loader
-except ImportError:
- from yaml.loader import SafeLoader as Loader
import numpy as np
from scipy.stats import iqr
from matplotlib.lines import Line2D
@@ -71,228 +66,137 @@ def get_stats(x):
error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
return mean, std/np.sqrt(n), std, error
+def std_err(x):
+ x = x.dropna()
+ n = len(x)
+ if n < 2:
+ # too few points for a meaningful estimate (also avoids dividing
+ # by n-1 == 0 below)
+ return np.nan
+ mean = np.mean(x)
+ std = np.std(x)
+ u4 = np.mean((x-mean)**4)
+ error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
+ return error
+
if __name__ == '__main__':
import argparse
import matplotlib.pyplot as plt
import numpy as np
+ import h5py
+ import pandas as pd
parser = argparse.ArgumentParser("plot fit results")
parser.add_argument("filenames", nargs='+', help="input files")
args = parser.parse_args()
- events = []
-
- for filename in args.filenames:
- print(filename)
- with open(filename) as f:
- data = yaml.load(f.read(),Loader=Loader)
-
- a = np.ma.empty(len(data['data']),
- dtype=[('id',np.int), # particle id
- ('T', np.double), # true energy
- ('dx',np.double), # dx
- ('dy',np.double), # dy
- ('dz',np.double), # dz
- ('dT',np.double), # dT
- ('theta',np.double), # theta
- ('ratio',np.double), # likelihood ratio
- ('te',np.double), # time electron
- ('tm',np.double), # time muon
- ('Te',np.double)] # electron energy
- )
-
- for i, event in enumerate(data['data']):
- # get the particle ID
- id = event['mcgn'][0]['id']
-
- a[i]['id'] = id
-
- if 'fit' not in event['ev'][0]:
- # if nhit < 100 we don't fit the event
- continue
-
- if id not in event['ev'][0]['fit']:
- a[i] = np.ma.masked
- continue
-
- mass = SNOMAN_MASS[id]
- # for some reason it's the *second* track which seems to contain the
- # initial energy
- true_energy = event['mcgn'][0]['energy']
- # The MCTK bank has the particle's total energy (except for neutrons)
- # so we need to convert it into kinetic energy
- ke = true_energy - mass
-
- fit = event['ev'][0]['fit']
-
- a[i]['T'] = ke
- energy = fit[id]['energy']
- a[i]['dT'] = energy-ke
-
- # store the fit position residuals
- true_posx = event['mcgn'][0]['posx']
- posx = fit[id]['posx']
- a[i]['dx'] = posx-true_posx
- true_posy = event['mcgn'][0]['posy']
- posy = fit[id]['posy']
- a[i]['dy'] = posy-true_posy
- true_posz = event['mcgn'][0]['posz']
- posz = fit[id]['posz']
- a[i]['dz'] = posz-true_posz
-
- # compute the angle between the fit direction and the true
- # direction
- dirx = event['mcgn'][0]['dirx']
- diry = event['mcgn'][0]['diry']
- dirz = event['mcgn'][0]['dirz']
- true_dir = [dirx,diry,dirz]
- true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
- theta = fit[id]['theta']
- phi = fit[id]['phi']
- dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
- dir = np.array(dir)/np.linalg.norm(dir)
- a[i]['theta'] = np.degrees(np.arccos(np.dot(true_dir,dir)))
-
- # compute the log likelihood ratio
- if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
- fmin_electron = fit[IDP_E_MINUS]['fmin']
- fmin_muon = fit[IDP_MU_MINUS]['fmin']
- a[i]['ratio'] = fmin_muon-fmin_electron
- else:
- a[i]['ratio'] = np.ma.masked
-
- # store the time taken for electron and muon fits
- if IDP_E_MINUS in fit:
- a[i]['te'] = fit[IDP_E_MINUS]['time']
- a[i]['Te'] = fit[IDP_E_MINUS]['energy']
- else:
- a[i]['te'] = np.ma.masked
- a[i]['Te'] = np.ma.masked
- if IDP_MU_MINUS in fit:
- a[i]['tm'] = fit[IDP_MU_MINUS]['time']
- else:
- a[i]['tm'] = np.ma.masked
-
- events.append(a)
-
- a = np.ma.concatenate(events)
-
+ # Read in all the data.
+ #
+ # Note: We have to add the filename as a column to each dataset since this
+ # script is used to analyze MC data which all has the same run number.
+ ev = pd.concat([pd.read_hdf(filename, "ev").assign(filename=filename) for filename in args.filenames])
+ fits = pd.concat([pd.read_hdf(filename, "fits").assign(filename=filename) for filename in args.filenames])
+ mcgn = pd.concat([pd.read_hdf(filename, "mcgn").assign(filename=filename) for filename in args.filenames])
+
+ # get rid of 2nd events like Michel electrons
+ ev = ev.sort_values(['run','gtid']).groupby(['filename','evn'],as_index=False).first()
+
+ # Now, we merge all three datasets together to produce a single
+ # dataframe. To do so, we join the ev dataframe with the mcgn frame
+ # on the evn column, and then join with the fits on the run and
+ # gtid columns.
+ #
+ # At the end we will have a single dataframe with one row for each
+ # fit, i.e. it will look like:
+ #
+ # >>> data
+ # run gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+ #
+ # Before merging, we prefix the primary seed track table with mcgn_
+ # and the fit table with fit_ just to make things easier.
+
+ # Prefix track and fit frames
+ mcgn = mcgn.add_prefix("mcgn_")
+ fits = fits.add_prefix("fit_")
+
+ # merge ev and mcgn on evn
+ data = ev.merge(mcgn,left_on=['filename','evn'],right_on=['mcgn_filename','mcgn_evn'])
+ # merge data and fits on run and gtid
+ data = data.merge(fits,left_on=['filename','run','gtid'],right_on=['fit_filename','fit_run','fit_gtid'])
+
+ # calculate true kinetic energy
+ mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+ data['T'] = data['mcgn_energy'].values - mass
+ data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+ data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+ data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+ data['dT'] = data['fit_energy1'].values - data['T'].values
+
+ true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+ dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+ np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+ np.cos(data['fit_theta1']))).squeeze()
+
+ data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+ # only select events which have at least 2 fits
+ data = data.groupby(['filename','run','gtid']).filter(lambda x: len(x) > 1)
+ data_true = data[data['fit_id1'] == data['mcgn_id']]
+ data_e = data[data['fit_id1'] == IDP_E_MINUS]
+ data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+ data_true = data_true.set_index(['filename','run','gtid'])
+ data_e = data_e.set_index(['filename','run','gtid'])
+ data_mu = data_mu.set_index(['filename','run','gtid'])
+
+ data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+ data_true['te'] = data_e['fit_time']
+ data_true['tm'] = data_mu['fit_time']
+ data_true['Te'] = data_e['fit_energy1']
+
+ # bins of width 100 MeV between 50 MeV and 1 GeV
bins = np.arange(50,1000,100)
- stats_array = np.ma.empty(len(bins)-1,
- dtype=[('T', np.double),
- ('dT', np.double),
- ('dT_err', np.double),
- ('dT_std', np.double),
- ('dT_std_err', np.double),
- ('dx', np.double),
- ('dx_err', np.double),
- ('dx_std', np.double),
- ('dx_std_err', np.double),
- ('dy', np.double),
- ('dy_err', np.double),
- ('dy_std', np.double),
- ('dy_std_err', np.double),
- ('dz', np.double),
- ('dz_err', np.double),
- ('dz_std', np.double),
- ('dz_std_err', np.double),
- ('theta', np.double),
- ('theta_err', np.double),
- ('theta_std', np.double),
- ('theta_std_err', np.double)])
-
- stats = {IDP_E_MINUS: stats_array, IDP_MU_MINUS: stats_array.copy()}
-
- for id in stats:
- electron_events = a[a['id'] == id]
-
- for i, (ablah, b) in enumerate(zip(bins[:-1], bins[1:])):
- events = electron_events[(electron_events['T'] >= ablah) & (electron_events['T'] < b)]
-
- if len(events) < 2:
- stats[id][i] = np.ma.masked
- continue
-
- stats[id][i]['T'] = (ablah+b)/2
- mean, mean_error, std, std_error = get_stats(events['dT'].compressed())
- stats[id][i]['dT'] = mean
- stats[id][i]['dT_err'] = mean_error
- stats[id][i]['dT_std'] = std
- stats[id][i]['dT_std_err'] = std_error
- mean, mean_error, std, std_error = get_stats(events['dx'].compressed())
- stats[id][i]['dx'] = mean
- stats[id][i]['dx_err'] = mean_error
- stats[id][i]['dx_std'] = std
- stats[id][i]['dx_std_err'] = std_error
- mean, mean_error, std, std_error = get_stats(events['dy'].compressed())
- stats[id][i]['dy'] = mean
- stats[id][i]['dy_err'] = mean_error
- stats[id][i]['dy_std'] = std
- stats[id][i]['dy_std_err'] = std_error
- mean, mean_error, std, std_error = get_stats(events['dz'].compressed())
- stats[id][i]['dz'] = mean
- stats[id][i]['dz_err'] = mean_error
- stats[id][i]['dz_std'] = std
- stats[id][i]['dz_std_err'] = std_error
- mean, mean_error, std, std_error = get_stats(events['theta'].compressed())
- stats[id][i]['theta'] = mean
- stats[id][i]['theta_err'] = mean_error
- stats[id][i]['theta_std'] = std
- stats[id][i]['theta_std_err'] = std_error
-
- for id in stats:
+ for id in [IDP_E_MINUS, IDP_MU_MINUS]:
+ events = data_true[data_true['mcgn_id'] == id]
+
+ pd_bins = pd.cut(events['T'],bins)
+
+ dT = events.groupby(pd_bins)['dT'].agg(['mean','sem','std',std_err])
+ dx = events.groupby(pd_bins)['dx'].agg(['mean','sem','std',std_err])
+ dy = events.groupby(pd_bins)['dy'].agg(['mean','sem','std',std_err])
+ dz = events.groupby(pd_bins)['dz'].agg(['mean','sem','std',std_err])
+ theta = events.groupby(pd_bins)['theta'].agg(['mean','sem','std',std_err])
+
label = 'Muon' if id == IDP_MU_MINUS else 'Electron'
- T = stats[id]['T']
- dT = stats[id]['dT']
- dT_err = stats[id]['dT_err']
- std_dT = stats[id]['dT_std']
- std_dT_err = stats[id]['dT_std_err']
- dx = stats[id]['dx']
- dx_err = stats[id]['dx_err']
- std_dx = stats[id]['dx_std']
- std_dx_err = stats[id]['dx_std_err']
- dy = stats[id]['dy']
- dy_err = stats[id]['dy_err']
- std_dy = stats[id]['dy_std']
- std_dy_err = stats[id]['dy_std_err']
- dz = stats[id]['dz']
- dz_err = stats[id]['dz_err']
- std_dz = stats[id]['dz_std']
- std_dz_err = stats[id]['dz_std_err']
- theta = stats[id]['theta']
- theta_err = stats[id]['theta_err']
- std_theta = stats[id]['theta_std']
- std_theta_err = stats[id]['theta_std_err']
+ T = (bins[1:] + bins[:-1])/2
plt.figure(1)
- plt.errorbar(T,dT*100/T,yerr=dT_err*100/T,fmt='o',label=label)
+ plt.errorbar(T,dT['mean']*100/T,yerr=dT['sem']*100/T,fmt='o',label=label)
plt.xlabel("Kinetic Energy (MeV)")
plt.ylabel("Energy bias (%)")
plt.title("Energy Bias")
plt.legend()
plt.figure(2)
- plt.errorbar(T,std_dT*100/T,yerr=std_dT_err*100/T,fmt='o',label=label)
+ plt.errorbar(T,dT['std']*100/T,yerr=dT['std_err']*100/T,fmt='o',label=label)
plt.xlabel("Kinetic Energy (MeV)")
plt.ylabel("Energy resolution (%)")
plt.title("Energy Resolution")
plt.legend()
plt.figure(3)
- plt.errorbar(T,dx,yerr=dx_err,fmt='o',label='%s (x)' % label)
- plt.errorbar(T,dy,yerr=dy_err,fmt='o',label='%s (y)' % label)
- plt.errorbar(T,dz,yerr=dz_err,fmt='o',label='%s (z)' % label)
+ plt.errorbar(T,dx['mean'],yerr=dx['sem'],fmt='o',label='%s (x)' % label)
+ plt.errorbar(T,dy['mean'],yerr=dy['sem'],fmt='o',label='%s (y)' % label)
+ plt.errorbar(T,dz['mean'],yerr=dz['sem'],fmt='o',label='%s (z)' % label)
plt.xlabel("Kinetic Energy (MeV)")
plt.ylabel("Position bias (cm)")
plt.title("Position Bias")
plt.legend()
plt.figure(4)
- plt.errorbar(T,std_dx,yerr=std_dx_err,fmt='o',label='%s (x)' % label)
- plt.errorbar(T,std_dy,yerr=std_dy_err,fmt='o',label='%s (y)' % label)
- plt.errorbar(T,std_dz,yerr=std_dz_err,fmt='o',label='%s (z)' % label)
+ plt.errorbar(T,dx['std'],yerr=dx['std_err'],fmt='o',label='%s (x)' % label)
+ plt.errorbar(T,dy['std'],yerr=dy['std_err'],fmt='o',label='%s (y)' % label)
+ plt.errorbar(T,dz['std'],yerr=dz['std_err'],fmt='o',label='%s (z)' % label)
plt.xlabel("Kinetic Energy (MeV)")
plt.ylabel("Position resolution (cm)")
plt.title("Position Resolution")
@@ -300,7 +204,7 @@ if __name__ == '__main__':
plt.legend()
plt.figure(5)
- plt.errorbar(T,std_theta,yerr=std_theta_err,fmt='o',label=label)
+ plt.errorbar(T,theta['std'],yerr=theta['std_err'],fmt='o',label=label)
plt.xlabel("Kinetic Energy (MeV)")
plt.ylabel("Angular resolution (deg)")
plt.title("Angular Resolution")
@@ -308,9 +212,10 @@ if __name__ == '__main__':
plt.legend()
plt.figure(6)
- plt.scatter(a[a['id'] == id]['Te'],a[a['id'] == id]['ratio'],label=label)
+ plt.scatter(events['Te'],events['ratio'],label=label)
plt.xlabel("Reconstructed Electron Energy (MeV)")
plt.ylabel(r"Log Likelihood Ratio (e/$\mu$)")
plt.title("Log Likelihood Ratio vs Reconstructed Electron Energy")
plt.legend()
+
plt.show()
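The binned-statistics pattern above (pd.cut() plus groupby().agg() with the custom std_err reducer) can be sketched in isolation; the numbers here are made up:

    import numpy as np
    import pandas as pd

    def std_err(x):
        # error on the sample standard deviation via the fourth moment
        x = x.dropna()
        n = len(x)
        if n < 2:
            return np.nan
        mean, std = np.mean(x), np.std(x)
        u4 = np.mean((x - mean)**4)
        return np.sqrt((u4 - (n - 3)*std**4/(n - 1))/n)/(2*std)

    events = pd.DataFrame({'T':  [60, 80, 160, 170, 190],
                           'dT': [1.0, -2.0, 0.5, 1.5, -0.5]})
    bins = np.arange(50, 1000, 100)
    dT = events.groupby(pd.cut(events['T'], bins))['dT'].agg(
            ['mean', 'sem', 'std', std_err])
    print(dT.head(2))  # one row of statistics per energy bin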
diff --git a/utils/submit-grid-jobs b/utils/submit-grid-jobs
index 670172d..63cbcb7 100755
--- a/utils/submit-grid-jobs
+++ b/utils/submit-grid-jobs
@@ -24,6 +24,83 @@ import string
from os.path import split, splitext, join, abspath
import uuid
from subprocess import check_call
+import os
+import sys
+
+# The next two functions are a backport of the shutil.which() function from
+# Python 3.3 (Lib/shutil.py in the CPython code). See
+# https://github.com/python/cpython/blob/master/Lib/shutil.py.
+
+# Check that a given file can be accessed with the correct mode.
+# Additionally check that `fn` is not a directory, as on Windows
+# directories pass the os.access check.
+def _access_check(fn, mode):
+ return (os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn))
+
+def which(cmd, mode=os.F_OK | os.X_OK, path=None):
+ """Given a command, mode, and a PATH string, return the path which
+ conforms to the given mode on the PATH, or None if there is no such
+ file.
+ `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result
+ of os.environ.get("PATH"), or can be overridden with a custom search
+ path.
+ """
+ # If we're given a path with a directory part, look it up directly rather
+ # than referring to PATH directories. This includes checking relative to the
+ # current directory, e.g. ./script
+ if os.path.dirname(cmd):
+ if _access_check(cmd, mode):
+ return cmd
+ return None
+
+ if path is None:
+ path = os.environ.get("PATH", None)
+ if path is None:
+ try:
+ path = os.confstr("CS_PATH")
+ except (AttributeError, ValueError):
+ # os.confstr() or CS_PATH is not available
+ path = os.defpath
+ # bpo-35755: Don't use os.defpath if the PATH environment variable is
+ # set to an empty string
+
+ # PATH='' doesn't match, whereas PATH=':' looks in the current directory
+ if not path:
+ return None
+
+ path = path.split(os.pathsep)
+
+ if sys.platform == "win32":
+ # The current directory takes precedence on Windows.
+ curdir = os.curdir
+ if curdir not in path:
+ path.insert(0, curdir)
+
+ # PATHEXT is necessary to check on Windows.
+ pathext = os.environ.get("PATHEXT", "").split(os.pathsep)
+ # See if the given file matches any of the expected path extensions.
+ # This will allow us to short circuit when given "python.exe".
+ # If it does match, only test that one, otherwise we have to try
+ # others.
+ if any(cmd.lower().endswith(ext.lower()) for ext in pathext):
+ files = [cmd]
+ else:
+ files = [cmd + ext for ext in pathext]
+ else:
+ # On other platforms you don't have things like PATHEXT to tell you
+ # what file suffixes are executable, so just pass on cmd as-is.
+ files = [cmd]
+
+ seen = set()
+ for dir in path:
+ normdir = os.path.normcase(dir)
+ if normdir not in seen:
+ seen.add(normdir)
+ for thefile in files:
+ name = os.path.join(dir, thefile)
+ if _access_check(name, mode):
+ return name
+ return None
CONDOR_TEMPLATE = \
"""
@@ -42,7 +119,7 @@ log = @log
# The below are good base requirements for first testing jobs on OSG,
# if you don't have a good idea of memory and disk usage.
-requirements = (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")
+requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == "RHEL 7") && (OpSys == "LINUX")
request_cpus = 1
request_memory = 1 GB
request_disk = 1 GB
@@ -72,14 +149,24 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
prefix = "%s_%08i_%s" % (root,gtid,ID.hex)
# fit output filename
- output = "%s.txt" % prefix
+ output = "%s.hdf5" % prefix
# condor submit filename
condor_submit = "%s.submit" % prefix
# set up the arguments for the template
- executable = join(dir,"fit")
+ executable = which("fit")
+ wrapper = which("fit-wrapper")
+
+ if executable is None:
+ print("couldn't find fit in path!",file=sys.stderr)
+ sys.exit(1)
+
+ if wrapper is None:
+ print("couldn't find fit-wrapper in path!",file=sys.stderr)
+ sys.exit(1)
+
args = [tail,"-o",output,"--gtid",gtid,"--min-nhit",min_nhit,"--max-particles",max_particles]
- transfer_input_files = ",".join([filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])
+ transfer_input_files = ",".join([executable,filename,join(dqxx_dir,"DQXX_%010i.dat" % run)] + [join(dir,filename) for filename in INPUT_FILES])
transfer_output_files = ",".join([output])
condor_error = "%s.error" % prefix
@@ -89,7 +176,7 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
template = MyTemplate(CONDOR_TEMPLATE)
submit_string = template.safe_substitute(
- executable=executable,
+ executable=wrapper,
args=" ".join(map(str,args)),
transfer_input_files=transfer_input_files,
transfer_output_files=transfer_output_files,
@@ -106,13 +193,13 @@ def submit_job(filename, run, gtid, dir, dqxx_dir, min_nhit, max_particles):
if __name__ == '__main__':
import argparse
- import subprocess
+ from subprocess import check_call
import os
+ import tempfile
+ import h5py
parser = argparse.ArgumentParser("submit grid jobs")
parser.add_argument("filenames", nargs='+', help="input files")
- parser.add_argument("--dir", type=str, help="fitter directory", required=True)
- parser.add_argument("--dqxx-dir", type=str, help="dqxx directory", required=True)
parser.add_argument("--min-nhit", type=int, help="minimum nhit to fit an event", default=100)
parser.add_argument("--max-particles", type=int, help="maximum number of particles to fit for", default=3)
parser.add_argument("--skip-second-event", action='store_true', help="only fit the first event after a MAST bank", default=False)
@@ -123,14 +210,32 @@ if __name__ == '__main__':
home = os.path.expanduser("~")
args.state = join(home,'state.txt')
+ if 'SDDM_DATA' not in os.environ:
+ print("Please set the SDDM_DATA environment variable to point to the fitter source code location", file=sys.stderr)
+ sys.exit(1)
+
+ dir = os.environ['SDDM_DATA']
+
+ if 'DQXX_DIR' not in os.environ:
+ print("Please set the DQXX_DIR environment variable to point to the directory with the DQXX files", file=sys.stderr)
+ sys.exit(1)
+
+ dqxx_dir = os.environ['DQXX_DIR']
+
args.state = abspath(args.state)
# get the current working directory
home_dir = os.getcwd()
# get absolute paths since we are going to create a new directory
- args.dir = abspath(args.dir)
- args.dqxx_dir = abspath(args.dqxx_dir)
+ dir = abspath(dir)
+ dqxx_dir = abspath(dqxx_dir)
+
+ zdab_cat = which("zdab-cat")
+
+ if zdab_cat is None:
+ print("couldn't find zdab-cat in path!",file=sys.stderr)
+ sys.exit(1)
for filename in args.filenames:
filename = abspath(filename)
@@ -152,29 +257,28 @@ if __name__ == '__main__':
continue
with open(os.devnull, 'w') as f:
+ # Create a temporary file to store the events. Docs recommended
+ # this method instead of mkstemp(), but I think they're the same.
+ output = tempfile.NamedTemporaryFile(suffix='.hdf5',delete=False)
+ output.close()
+
if args.skip_second_event:
- popen = subprocess.Popen([join(args.dir,"zdab-cat"),"--skip-second-event",filename],stdout=subprocess.PIPE,stderr=f)
+ check_call([zdab_cat,"--skip-second-event",filename,"-o",output.name],stderr=f)
else:
- popen = subprocess.Popen([join(args.dir,"zdab-cat"),filename],stdout=subprocess.PIPE,stderr=f)
+ check_call([zdab_cat,filename,"-o",output.name],stderr=f)
new_dir = "%s_%s" % (root,ID.hex)
os.mkdir(new_dir)
os.chdir(new_dir)
- for data in yaml.load_all(popen.stdout,Loader=Loader):
- if 'ev' not in data:
- continue
-
- for ev in data['ev']:
- run = ev['run']
- gtid = ev['gtid']
- nhit = ev['nhit']
-
- if nhit >= args.min_nhit:
- submit_job(filename, run, gtid, args.dir, args.dqxx_dir, args.min_nhit, args.max_particles)
+ with h5py.File(output.name) as f:
+ for ev in f['ev']:
+ if ev['nhit'] >= args.min_nhit:
+ submit_job(filename, ev['run'], ev['gtid'], dir, dqxx_dir, args.min_nhit, args.max_particles)
- popen.wait()
+ # Delete temporary HDF5 file
+ os.unlink(output.name)
state.append(tail)
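Finally, a hedged end-to-end sketch of the temporary-file handoff in submit-grid-jobs. zdab-cat is not reproduced here, so a tiny structured 'ev' dataset stands in for its output; the dataset and field names follow the diff:

    import os
    import tempfile
    import h5py
    import numpy as np

    output = tempfile.NamedTemporaryFile(suffix='.hdf5', delete=False)
    output.close()

    # stand-in for zdab-cat writing the event list
    ev = np.array([(10000, 7, 150), (10000, 8, 40)],
                  dtype=[('run', 'i4'), ('gtid', 'i4'), ('nhit', 'i4')])
    with h5py.File(output.name, 'w') as f:
        f.create_dataset('ev', data=ev)

    # iterate the rows as numpy records, as the submit loop does
    with h5py.File(output.name, 'r') as f:
        for row in f['ev']:
            if row['nhit'] >= 100:
                print("would submit run %i gtid %i" % (row['run'], row['gtid']))

    os.unlink(output.name)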