Diffstat (limited to 'utils/plot-energy')
-rwxr-xr-x  utils/plot-energy  294
1 file changed, 131 insertions(+), 163 deletions(-)
diff --git a/utils/plot-energy b/utils/plot-energy
index a0274b7..4a8521b 100755
--- a/utils/plot-energy
+++ b/utils/plot-energy
@@ -107,140 +107,69 @@ if __name__ == '__main__':
import numpy as np
import pandas as pd
import sys
+ import h5py
parser = argparse.ArgumentParser("plot fit results")
parser.add_argument("filenames", nargs='+', help="input files")
args = parser.parse_args()
- events = []
- fit_results = []
-
- for filename in args.filenames:
- print(filename)
- with open(filename) as f:
- data = yaml.load(f,Loader=Loader)
-
- for i, event in enumerate(data['data']):
- for ev in event['ev']:
- events.append((
- ev['run'],
- ev['gtr'],
- ev['nhit'],
- ev['gtid'],
- ev['dc'],
- ev['ftp']['x'] if 'ftp' in ev else np.nan,
- ev['ftp']['y'] if 'ftp' in ev else np.nan,
- ev['ftp']['z'] if 'ftp' in ev else np.nan,
- ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan,
- ))
-
- if 'fit' not in ev:
- continue
-
- for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]:
- # FIXME: Should I just store the particle ids in the YAML
- # output as a list of particle ids instead of a single
- # integer?
- ids = map(int,chunks(str(id),2))
- energy = 0.0
- skip = False
- for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])):
- energy += ke + SNOMAN_MASS[i]
-
- # This is a bit of a hack. It appears that many times
- # the fit will actually do much better by including a
- # very low energy electron or muon. I believe the
- # reason for this is that of course my likelihood
- # function is not perfect (for example, I don't include
- # the correct angular distribution for Rayleigh
- # scattered light), and so the fitter often wants to
- # add a very low energy electron or muon to fix things.
- #
- # Ideally I would fix the likelihood function, but for
- # now we just discard any fit results which have a very
- # low energy electron or muon.
- if len(ids) > 1 and i == 20 and ke < 20.0:
- skip = True
-
- if len(ids) > 1 and i == 22 and ke < 200.0:
- skip = True
-
- if skip:
- continue
-
- # Calculate the approximate Ockham factor.
- # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
- #
- # Note: This is a really approximate form by assuming that
- # the shape of the likelihood space is equal to the average
- # uncertainty in the different parameters.
- w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi))
-
- fit_results.append((
- ev['run'],
- ev['gtid'],
- id,
- fit_result['posx'],
- fit_result['posy'],
- fit_result['posz'],
- fit_result['t0'],
- energy,
- np.atleast_1d(fit_result['theta'])[0],
- np.atleast_1d(fit_result['phi'])[0],
- fit_result['fmin'] - w,
- fit_result['psi']/ev['nhit']))
-
- # create a dataframe
- # note: we have to first create a numpy structured array since there is no
- # way to pass a list of data types to the DataFrame constructor. See
- # https://github.com/pandas-dev/pandas/issues/4464
- array = np.array(fit_results,
- dtype=[('run',np.int), # run number
- ('gtid',np.int), # gtid
- ('id',np.int), # particle id
- ('x', np.double), # x
- ('y',np.double), # y
- ('z',np.double), # z
- ('t0',np.double), # t0
- ('ke',np.double), # kinetic energy
- ('theta',np.double), # direction of 1st particle
- ('phi',np.double), # direction of 2nd particle
- ('fmin',np.double), # negative log likelihood
- ('psi',np.double)] # goodness of fit parameter
- )
- df = pd.DataFrame.from_records(array)
-
- array = np.array(events,
- dtype=[('run',np.int), # run number
- ('gtr',np.double), # 50 MHz clock in ns
- ('nhit',np.int), # number of PMTs hit
- ('gtid',np.int), # gtid
- ('dc',np.int), # data cleaning word
- ('ftpx',np.double), # FTP fitter X position
- ('ftpy',np.double), # FTP fitter Y position
- ('ftpz',np.double), # FTP fitter Z position
- ('rsp_energy',np.double)] # RSP energy
- )
- df_ev = pd.DataFrame.from_records(array)
+ ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames])
+ fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames])
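Each input file is now an HDF5 file holding two pandas tables, "ev" and "fits". A minimal sketch of writing a compatible file, with the column names inferred from how they are used below (the exact schema the fitter writes out is an assumption):

    import numpy as np
    import pandas as pd

    # One event plus a single-electron fit hypothesis for it. Column names
    # are inferred from this script; the real schema may differ.
    ev = pd.DataFrame({'run': [10000], 'gtid': [1], 'nhit': [150],
                       'gtr': [1.0e6], 'dc': [0], 'ftp_x': [0.0],
                       'ftp_y': [0.0], 'ftp_z': [0.0], 'rsp_energy': [5.0]})
    fits = pd.DataFrame({'run': [10000], 'gtid': [1], 'n': [1],
                         'id1': [20], 'id2': [0], 'id3': [0],
                         'energy1': [100.0], 'energy2': [np.nan],
                         'energy3': [np.nan], 'x': [0.0], 'y': [0.0],
                         'z': [0.0], 't0': [100.0], 'theta1': [0.5],
                         'fmin': [1000.0], 'psi': [150.0]})
    ev.to_hdf("fit_results.hdf5", key="ev")
    fits.to_hdf("fit_results.hdf5", key="fits")  # mode='a' appends to the same file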
+
+ # This is a bit of a hack. It appears that many times the fit will
+ # actually do much better by including a very low energy electron or
+ # muon. I believe the reason for this is that of course my likelihood
+ # function is not perfect (for example, I don't include the correct
+ # angular distribution for Rayleigh scattered light), and so the fitter
+ # often wants to add a very low energy electron or muon to fix things.
+ #
+ # Ideally I would fix the likelihood function, but for now we just
+ # discard any fit results which have a very low energy electron or
+ # muon.
+ #
+ # FIXME: Test this since query() is new to pandas
+ fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))')
+ fits = fits.query('not (n > 1 and ((id1 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')
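query() evaluates its expression against the columns, and NaN ids in single-particle rows simply compare False. A toy check of the electron cut (column layout assumed as above):

    import numpy as np
    import pandas as pd

    toy = pd.DataFrame({'n': [1, 2],
                        'id1': [20, 22], 'id2': [np.nan, 20], 'id3': [np.nan, np.nan],
                        'energy1': [5.0, 500.0], 'energy2': [np.nan, 5.0],
                        'energy3': [np.nan, np.nan]})
    # The lone 5 MeV electron survives since n == 1; the muon + 5 MeV
    # electron pair is dropped by the id2/energy2 clause.
    print(toy.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or '
                    '(id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))'))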
+
+ # Calculate the approximate Ockham factor.
+ # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
+ #
+ # Note: This is a really approximate form, assuming the width of the
+ # likelihood peak in each parameter is equal to the average uncertainty
+ # in the different parameters.
+ fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi))
+
+ # Note: we index on the left-hand side with loc to avoid a SettingWithCopyWarning
+ #
+ # See https://www.dataquest.io/blog/settingwithcopywarning/
+ fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2'])
+ fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3'])
+
+ fits['fmin'] = fits['fmin'] - fits['w']
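Since w is a sum of logs of small numbers, it is negative and grows more negative with every additional particle, so fmin - w penalizes the more complex fit hypotheses. The same sum as a standalone function:

    import numpy as np

    def ockham_w(energies):
        """Approximate log Ockham factor, one kinetic energy per particle."""
        n = len(energies)
        return (n*np.log(0.1*0.001) + np.sum(np.log(energies))
                + n*np.log(1e-4/(4*np.pi)))

    print(ockham_w([100.0]))        # ~ -16.3: single 100 MeV particle
    print(ockham_w([100.0, 10.0]))  # ~ -35.0: the extra 10 MeV particle costs ~19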
+
+ fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit']
+ fits['ke'] = fits['energy1']
+ fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000
+ fits['theta'] = fits['theta1']
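The combined id packs up to three two-digit SNOMAN particle ids, first particle in the lowest digits, so an electron (20) plus muon (22) hypothesis becomes 2220. This assumes absent particles are stored as id 0 rather than NaN:

    def pack_ids(id1, id2=0, id3=0):
        """Pack up to three two-digit SNOMAN particle ids into one integer."""
        return id1 + id2*100 + id3*10000

    print(pack_ids(20))      # 20   -- single electron
    print(pack_ids(20, 22))  # 2220 -- electron + muon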
# Make sure events are in order. We use run number and GTID here, which
# should be in time order as well, except for bit flips in the GTID.
# This is important for later when we look at time differences in the 50
# MHz clock. We should perhaps do a check here based on the 10 MHz clock
# to make sure things are in order.
- df_ev = df_ev.sort_values(by=['run','gtid'])
+ ev = ev.sort_values(by=['run','gtid'])
- print("number of events = %i" % len(df_ev))
+ print("number of events = %i" % len(ev))
# First, do basic data cleaning which is done for all events.
- df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
+ ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
- print("number of events after data cleaning = %i" % len(df_ev))
+ print("number of events after data cleaning = %i" % len(ev))
# Now, we select events tagged by the muon tag which should tag only
# external muons. We keep the sample of muons since it's needed later to
# identify Michel electrons and to apply the muon follower cut
- muons = df_ev[(df_ev.dc & DC_MUON) != 0]
+ muons = ev[(ev.dc & DC_MUON) != 0]
print("number of muons = %i" % len(muons))
@@ -251,12 +180,12 @@ if __name__ == '__main__':
#
# Should I use 10 MHz clock here since the time isn't critical and then I
# don't have to worry about 50 MHz clock rollover?
- prompt = df_ev[df_ev.nhit >= 100]
+ prompt = ev[ev.nhit >= 100]
prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6))
# Michel electrons and neutrons can be any event which is not a prompt
# event
- follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]])
+ follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]])
# Apply the prompt mask
prompt = prompt[prompt_mask]
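The prompt mask keeps a high-nhit event only if more than 250 ms of 50 MHz clock time passed since the previous one; the leading True always keeps the first event. For example:

    import numpy as np

    # Trigger times in ns; the second event comes only 1 ms after the first.
    gtr = np.array([0.0, 1e6, 500e6])
    prompt_mask = np.concatenate(([True], np.diff(gtr) > 250e6))
    print(prompt_mask)  # [ True False  True] -- the middle event is a follower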
@@ -275,39 +204,78 @@ if __name__ == '__main__':
if prompt_plus_muons.size and follower.size:
# require Michel events to pass more of the SNO data cleaning cuts
michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
+
michel = michel[michel.nhit >= 100]
- # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them
- # The 800 nanoseconds cut comes from Richie's thesis. He also mentions
- # that the In Time Channel Spread Cut is very effective at cutting
- # electrons events caused by muons, so I should implement that
- michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \
- (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \
- (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+ # Accept events which had a muon more than 800 nanoseconds but less
+ # than 20 microseconds before them. The 800 nanoseconds cut comes from
+ # Richie's thesis. He also mentions that the In Time Channel Spread Cut
+ # is very effective at cutting electron events caused by muons, so I
+ # should implement that.
+ #
+ # Note: We currently don't look across run boundaries. This should be a
+ # *very* small effect, and the logic to do so would be very complicated
+ # since I would have to deal with 50 MHz clock rollovers, etc.
+ #
+ # Do a simple python for loop here over the runs since we create
+ # temporary arrays with shape (michel.size,prompt_plus_muons.size)
+ # which could be too big for the full dataset.
+ #
+ # There might be some clever way to do this without the loop in Pandas,
+ # but I don't know how.
+ michel_sum = None
+ for run, michel_run in michel.groupby(['run']):
+ prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run]
+ michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \
+ (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+ if michel_sum is None:
+ michel_sum = michel_run
+ else:
+ michel_sum = michel_sum.append(michel_run)
+
+ michel = michel_sum
else:
michel = pd.DataFrame(columns=follower.columns)
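Within one run the comparison broadcasts to an (n_prompt, n_follower) boolean array, and np.any(..., axis=0) keeps a follower if any prompt or muon event precedes it by between 800 ns and 20 microseconds. A toy run:

    import numpy as np

    prompt_gtr = np.array([0.0, 100e3])           # ns
    follower_gtr = np.array([5e3, 101e3, 300e3])  # ns

    in_window = (follower_gtr > prompt_gtr[:, np.newaxis] + 800) & \
                (follower_gtr < prompt_gtr[:, np.newaxis] + 20e3)
    print(np.any(in_window, axis=0))  # [ True  True False]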
if prompt.size and follower.size:
# neutron followers have to obey stricter set of data cleaning cuts
neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
- neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)]
- r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2)
+ neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)]
+ r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2)
neutron = neutron[r < AV_RADIUS]
neutron = neutron[neutron.rsp_energy > 4.0]
+
# neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)
# FIXME: need to deal with 50 MHz clock rollover
- neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \
- (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \
- (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1)
- df_atm = prompt[neutron_mask]
- prompt = prompt[~neutron_mask]
+ prompt_sum = None
+ atm_sum = None
+ for run, prompt_run in prompt.groupby(['run']):
+ neutron_run = neutron[neutron['run'] == run]
+
+ neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \
+ (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1)
+
+ if prompt_sum is None:
+ prompt_sum = prompt_run[~neutron_mask]
+ else:
+ prompt_sum = prompt_sum.append(prompt_run[~neutron_mask])
+
+ if atm_sum is None:
+ atm_sum = prompt_run[neutron_mask]
+ else:
+ atm_sum = atm_sum.append(prompt_run[neutron_mask])
+
+ atm = atm_sum
+ prompt = prompt_sum
else:
- df_atm = pd.DataFrame(columns=prompt.columns)
+ atm = pd.DataFrame(columns=prompt.columns)
print("number of events after neutron follower cut = %i" % len(prompt))
# Get rid of muon events in our main event sample
prompt = prompt[(prompt.dc & DC_MUON) == 0]
- df_atm = df_atm[(df_atm.dc & DC_MUON) == 0]
+ atm = atm[(atm.dc & DC_MUON) == 0]
print("number of events after muon cut = %i" % len(prompt))
@@ -317,52 +285,52 @@ if __name__ == '__main__':
prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \
(prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \
(prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
- df_atm = df_atm[~np.any((df_atm.run.values == muons.run.values[:,np.newaxis]) & \
- (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
- (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
+ atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \
+ (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
+ (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
# Check to see if there are any events with missing fit information
- df_atm_ra = df_atm[['run','gtid']].to_records(index=False)
+ atm_ra = atm[['run','gtid']].to_records(index=False)
muons_ra = muons[['run','gtid']].to_records(index=False)
prompt_ra = prompt[['run','gtid']].to_records(index=False)
michel_ra = michel[['run','gtid']].to_records(index=False)
- df_ra = df[['run','gtid']].to_records(index=False)
+ fits_ra = fits[['run','gtid']].to_records(index=False)
- if np.count_nonzero(~np.isin(df_atm_ra,df_ra)):
- print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra)))
+ if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)):
+ print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra)))
- if np.count_nonzero(~np.isin(muons_ra,df_ra)):
- print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra)))
+ if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)):
+ print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra)))
- if np.count_nonzero(~np.isin(prompt_ra,df_ra)):
- print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra)))
+ if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)):
+ print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra)))
- if np.count_nonzero(~np.isin(michel_ra,df_ra)):
- print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra)))
+ if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)):
+ print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra)))
# Now, we merge the event info with the fitter info.
#
# Note: This means that the dataframe now contains multiple rows for each
# event, one for each fit hypothesis.
- df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid'])
- muons = pd.merge(df,muons,how='inner',on=['run','gtid'])
- michel = pd.merge(df,michel,how='inner',on=['run','gtid'])
- df = pd.merge(df,prompt,how='inner',on=['run','gtid'])
+ atm = pd.merge(fits,atm,how='inner',on=['run','gtid'])
+ muons = pd.merge(fits,muons,how='inner',on=['run','gtid'])
+ michel = pd.merge(fits,michel,how='inner',on=['run','gtid'])
+ prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid'])
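After the inner merge every event appears once per fit hypothesis; the best-fit selection further down collapses this back to one row per event. For example:

    import pandas as pd

    fit_rows = pd.DataFrame({'run': [1, 1], 'gtid': [10, 10],
                             'id': [20, 2020], 'fmin': [105.0, 102.0]})
    one_event = pd.DataFrame({'run': [1], 'gtid': [10], 'nhit': [150]})
    # Two rows for the same event, one per hypothesis.
    print(pd.merge(fit_rows, one_event, how='inner', on=['run','gtid']))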
# get rid of events which don't have a fit
- nan = np.isnan(df.fmin.values)
+ nan = np.isnan(prompt.fmin.values)
if np.count_nonzero(nan):
- print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid'])))
+ print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid'])))
- df = df[~nan]
+ prompt = prompt[~nan]
- nan_atm = np.isnan(df_atm.fmin.values)
+ nan_atm = np.isnan(atm.fmin.values)
if np.count_nonzero(nan_atm):
- print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid'])))
+ print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid'])))
- df_atm = df_atm[~nan_atm]
+ atm = atm[~nan_atm]
nan_muon = np.isnan(muons.fmin.values)
@@ -379,24 +347,24 @@ if __name__ == '__main__':
michel = michel[~nan_michel]
# get the best fit
- df = df.sort_values('fmin').groupby(['run','gtid']).first()
- df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first()
+ prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()
+ atm = atm.sort_values('fmin').groupby(['run','gtid']).first()
michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first()
muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first()
muons = muons[muons.id == 22]
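Sorting by fmin and taking first() within each (run, gtid) group keeps the hypothesis with the smallest Ockham-corrected negative log likelihood. Continuing the toy example above:

    import pandas as pd

    fit_rows = pd.DataFrame({'run': [1, 1], 'gtid': [10, 10],
                             'id': [20, 2020], 'fmin': [105.0, 102.0]})
    best = fit_rows.sort_values('fmin').groupby(['run','gtid']).first()
    print(best)  # keeps the id == 2020 row (fmin == 102.0)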
# require r < 6 meters
- df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS]
- df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS]
+ prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS]
+ atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS]
- print("number of events after radius cut = %i" % len(df))
+ print("number of events after radius cut = %i" % len(prompt))
# Note: Need to design and apply a psi based cut here
plt.figure()
- plot_hist(df,"Without Neutron Follower")
+ plot_hist(prompt,"Without Neutron Follower")
plt.figure()
- plot_hist(df_atm,"With Neutron Follower")
+ plot_hist(atm,"With Neutron Follower")
plt.figure()
plot_hist(michel_best_fit,"Michel Electrons")
plt.figure()