Diffstat (limited to 'utils/plot-energy')
-rwxr-xr-x | utils/plot-energy | 294
1 files changed, 131 insertions, 163 deletions
diff --git a/utils/plot-energy b/utils/plot-energy
index a0274b7..4a8521b 100755
--- a/utils/plot-energy
+++ b/utils/plot-energy
@@ -107,140 +107,69 @@ if __name__ == '__main__':
     import numpy as np
     import pandas as pd
     import sys
+    import h5py

     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()

-    events = []
-    fit_results = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
-
-        for i, event in enumerate(data['data']):
-            for ev in event['ev']:
-                events.append((
-                    ev['run'],
-                    ev['gtr'],
-                    ev['nhit'],
-                    ev['gtid'],
-                    ev['dc'],
-                    ev['ftp']['x'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['y'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['z'] if 'ftp' in ev else np.nan,
-                    ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan,
-                    ))
-
-                if 'fit' not in ev:
-                    continue
-
-                for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]:
-                    # FIXME: Should I just store the particle ids in the YAML
-                    # output as a list of particle ids instead of a single
-                    # integer?
-                    ids = map(int,chunks(str(id),2))
-                    energy = 0.0
-                    skip = False
-                    for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])):
-                        energy += ke + SNOMAN_MASS[i]
-
-                        # This is a bit of a hack. It appears that many times
-                        # the fit will actually do much better by including a
-                        # very low energy electron or muon. I believe the
-                        # reason for this is that of course my likelihood
-                        # function is not perfect (for example, I don't include
-                        # the correct angular distribution for Rayleigh
-                        # scattered light), and so the fitter often wants to
-                        # add a very low energy electron or muon to fix things.
-                        #
-                        # Ideally I would fix the likelihood function, but for
-                        # now we just discard any fit results which have a very
-                        # low energy electron or muon.
-                        if len(ids) > 1 and i == 20 and ke < 20.0:
-                            skip = True
-
-                        if len(ids) > 1 and i == 22 and ke < 200.0:
-                            skip = True
-
-                    if skip:
-                        continue
-
-                    # Calculate the approximate Ockham factor.
-                    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
-                    #
-                    # Note: This is a really approximate form by assuming that
-                    # the shape of the likelihood space is equal to the average
-                    # uncertainty in the different parameters.
-                    w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi))
-
-                    fit_results.append((
-                        ev['run'],
-                        ev['gtid'],
-                        id,
-                        fit_result['posx'],
-                        fit_result['posy'],
-                        fit_result['posz'],
-                        fit_result['t0'],
-                        energy,
-                        np.atleast_1d(fit_result['theta'])[0],
-                        np.atleast_1d(fit_result['phi'])[0],
-                        fit_result['fmin'] - w,
-                        fit_result['psi']/ev['nhit']))
-
-    # create a dataframe
-    # note: we have to first create a numpy structured array since there is no
-    # way to pass a list of data types to the DataFrame constructor. See
-    # https://github.com/pandas-dev/pandas/issues/4464
-    array = np.array(fit_results,
-                     dtype=[('run',np.int),      # run number
-                            ('gtid',np.int),     # gtid
-                            ('id',np.int),       # particle id
-                            ('x', np.double),    # x
-                            ('y',np.double),     # y
-                            ('z',np.double),     # z
-                            ('t0',np.double),    # t0
-                            ('ke',np.double),    # kinetic energy
-                            ('theta',np.double), # direction of 1st particle
-                            ('phi',np.double),   # direction of 2nd particle
-                            ('fmin',np.double),  # negative log likelihood
-                            ('psi',np.double)]   # goodness of fit parameter
-                    )
-    df = pd.DataFrame.from_records(array)
-
-    array = np.array(events,
-                     dtype=[('run',np.int),           # run number
                            ('gtr',np.double),        # 50 MHz clock in ns
-                            ('nhit',np.int),          # number of PMTs hit
-                            ('gtid',np.int),          # gtid
-                            ('dc',np.int),            # data cleaning word
-                            ('ftpx',np.double),       # FTP fitter X position
-                            ('ftpy',np.double),       # FTP fitter Y position
-                            ('ftpz',np.double),       # FTP fitter Z position
-                            ('rsp_energy',np.double)] # RSP energy
-                    )
-    df_ev = pd.DataFrame.from_records(array)
+    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames])
+
+    # This is a bit of a hack. It appears that many times the fit will
+    # actually do much better by including a very low energy electron or
+    # muon. I believe the reason for this is that of course my likelihood
+    # function is not perfect (for example, I don't include the correct
+    # angular distribution for Rayleigh scattered light), and so the fitter
+    # often wants to add a very low energy electron or muon to fix things.
+    #
+    # Ideally I would fix the likelihood function, but for now we just
+    # discard any fit results which have a very low energy electron or
+    # muon.
+    #
+    # FIXME: Test this since query() is new to pandas
+    fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))')
+    fits = fits.query('not (n > 1 and ((id1 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')
+
+    # Calculate the approximate Ockham factor.
+    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
+    #
+    # Note: This is a really approximate form by assuming that the shape of
+    # the likelihood space is equal to the average uncertainty in the
+    # different parameters.
+    fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi))
+
+    # Note: we index on the left hand side with loc to avoid a copy error
+    #
+    # See https://www.dataquest.io/blog/settingwithcopywarning/
+    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2'])
+    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3'])
+
+    fits['fmin'] = fits['fmin'] - fits['w']
+
+    fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit']
+    fits['ke'] = fits['energy1']
+    fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000
+    fits['theta'] = fits['theta1']

     # Make sure events are in order. We use run number and GTID here which
     # should be in time order as well except for bit flips in the GTID
     # This is important for later when we look at time differences in the 50
     # MHz clock. We should perhaps do a check here based on the 10 MHz clock to
     # make sure things are in order
-    df_ev = df_ev.sort_values(by=['run','gtid'])
+    ev = ev.sort_values(by=['run','gtid'])

-    print("number of events = %i" % len(df_ev))
+    print("number of events = %i" % len(ev))

     # First, do basic data cleaning which is done for all events.
-    df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
+    ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]

-    print("number of events after data cleaning = %i" % len(df_ev))
+    print("number of events after data cleaning = %i" % len(ev))

     # Now, we select events tagged by the muon tag which should tag only
     # external muons. We keep the sample of muons since it's needed later to
     # identify Michel electrons and to apply the muon follower cut
-    muons = df_ev[(df_ev.dc & DC_MUON) != 0]
+    muons = ev[(ev.dc & DC_MUON) != 0]

     print("number of muons = %i" % len(muons))

@@ -251,12 +180,12 @@ if __name__ == '__main__':
     #
     # Should I use 10 MHz clock here since the time isn't critical and then I
     # don't have to worry about 50 MHz clock rollover?
-    prompt = df_ev[df_ev.nhit >= 100]
+    prompt = ev[ev.nhit >= 100]
     prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6))

     # Michel electrons and neutrons can be any event which is not a prompt
     # event
-    follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]])
+    follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]])

     # Apply the prompt mask
     prompt = prompt[prompt_mask]

@@ -275,39 +204,78 @@ if __name__ == '__main__':

     if prompt_plus_muons.size and follower.size:
         # require Michel events to pass more of the SNO data cleaning cuts
         michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
+        michel = michel[michel.nhit >= 100]

-        # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them
-        # The 800 nanoseconds cut comes from Richie's thesis. He also mentions
-        # that the In Time Channel Spread Cut is very effective at cutting
-        # electrons events caused by muons, so I should implement that
-        michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \
-                               (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \
-                               (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+        # Accept events which had a muon more than 800 nanoseconds but less
+        # than 20 microseconds before them. The 800 nanoseconds cut comes from
+        # Richie's thesis. He also mentions that the In Time Channel Spread Cut
+        # is very effective at cutting electron events caused by muons, so I
+        # should implement that.
+        #
+        # Note: We currently don't look across run boundaries. This should be a
+        # *very* small effect, and the logic to do so would be very complicated
+        # since I would have to deal with 50 MHz clock rollovers, etc.
+        #
+        # Do a simple python for loop here over the runs since we create
+        # temporary arrays with shape (michel.size,prompt_plus_muons.size)
+        # which could be too big for the full dataset.
+        #
+        # There might be some clever way to do this without the loop in Pandas,
+        # but I don't know how.
+        michel_sum = None
+        for run, michel_run in michel.groupby(['run']):
+            prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run]
+            michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \
+                                           (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+            if michel_sum is None:
+                michel_sum = michel_run
+            else:
+                michel_sum = michel_sum.append(michel_run)
+
+        michel = michel_sum
     else:
         michel = pd.DataFrame(columns=follower.columns)

     if prompt.size and follower.size:
         # neutron followers have to obey stricter set of data cleaning cuts
         neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
-        neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)]
-        r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2)
+        neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)]
+        r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2)
         neutron = neutron[r < AV_RADIUS]
         neutron = neutron[neutron.rsp_energy > 4.0]
+
         # neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)
         # FIXME: need to deal with 50 MHz clock rollover
-        neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \
-                              (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \
-                              (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1)
-        df_atm = prompt[neutron_mask]
-        prompt = prompt[~neutron_mask]
+        prompt_sum = None
+        atm_sum = None
+        for run, prompt_run in prompt.groupby(['run']):
+            neutron_run = neutron[neutron['run'] == run]
+
+            neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \
+                                  (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1)
+
+            if prompt_sum is None:
+                prompt_sum = prompt_run[~neutron_mask]
+            else:
+                prompt_sum = prompt_sum.append(prompt_run[~neutron_mask])
+
+            if atm_sum is None:
+                atm_sum = prompt_run[neutron_mask]
+            else:
+                atm_sum = atm_sum.append(prompt_run[neutron_mask])
+
+        atm = atm_sum
+        prompt = prompt_sum
     else:
-        df_atm = pd.DataFrame(columns=prompt.columns)
+        atm = pd.DataFrame(columns=prompt.columns)

     print("number of events after neutron follower cut = %i" % len(prompt))

     # Get rid of muon events in our main event sample
     prompt = prompt[(prompt.dc & DC_MUON) == 0]
-    df_atm = df_atm[(df_atm.dc & DC_MUON) == 0]
+    atm = atm[(atm.dc & DC_MUON) == 0]

     print("number of events after muon cut = %i" % len(prompt))

@@ -317,52 +285,52 @@ if __name__ == '__main__':
     prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \
                             (prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \
                             (prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
-    df_atm = df_atm[~np.any((df_atm.run.values == muons.run.values[:,np.newaxis]) & \
-                            (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
-                            (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
+    atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \
+                      (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
+                      (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]

     # Check to see if there are any events with missing fit information
-    df_atm_ra = df_atm[['run','gtid']].to_records(index=False)
+    atm_ra = atm[['run','gtid']].to_records(index=False)
     muons_ra = muons[['run','gtid']].to_records(index=False)
     prompt_ra = prompt[['run','gtid']].to_records(index=False)
     michel_ra = michel[['run','gtid']].to_records(index=False)
-    df_ra = df[['run','gtid']].to_records(index=False)
+    fits_ra = fits[['run','gtid']].to_records(index=False)

-    if np.count_nonzero(~np.isin(df_atm_ra,df_ra)):
-        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra)))
+    if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)):
+        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra)))

-    if np.count_nonzero(~np.isin(muons_ra,df_ra)):
-        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra)))
+    if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)):
+        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra)))

-    if np.count_nonzero(~np.isin(prompt_ra,df_ra)):
-        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra)))
+    if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)):
+        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra)))

-    if np.count_nonzero(~np.isin(michel_ra,df_ra)):
-        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra)))
+    if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)):
+        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra)))

     # Now, we merge the event info with the fitter info.
     #
     # Note: This means that the dataframe now contains multiple rows for each
     # event, one for each fit hypothesis.
-    df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid'])
-    muons = pd.merge(df,muons,how='inner',on=['run','gtid'])
-    michel = pd.merge(df,michel,how='inner',on=['run','gtid'])
-    df = pd.merge(df,prompt,how='inner',on=['run','gtid'])
+    atm = pd.merge(fits,atm,how='inner',on=['run','gtid'])
+    muons = pd.merge(fits,muons,how='inner',on=['run','gtid'])
+    michel = pd.merge(fits,michel,how='inner',on=['run','gtid'])
+    prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid'])

     # get rid of events which don't have a fit
-    nan = np.isnan(df.fmin.values)
+    nan = np.isnan(prompt.fmin.values)

     if np.count_nonzero(nan):
-        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid'])))
+        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid'])))

-    df = df[~nan]
+    prompt = prompt[~nan]

-    nan_atm = np.isnan(df_atm.fmin.values)
+    nan_atm = np.isnan(atm.fmin.values)

     if np.count_nonzero(nan_atm):
-        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid'])))
+        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid'])))

-    df_atm = df_atm[~nan_atm]
+    atm = atm[~nan_atm]

     nan_muon = np.isnan(muons.fmin.values)

@@ -379,24 +347,24 @@ if __name__ == '__main__':
     michel = michel[~nan_michel]

     # get the best fit
-    df = df.sort_values('fmin').groupby(['run','gtid']).first()
-    df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first()
+    prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()
+    atm = atm.sort_values('fmin').groupby(['run','gtid']).first()
     michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first()
     muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first()
     muons = muons[muons.id == 22]

     # require r < 6 meters
-    df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS]
-    df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS]
+    prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS]
+    atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS]

-    print("number of events after radius cut = %i" % len(df))
+    print("number of events after radius cut = %i" % len(prompt))

     # Note: Need to design and apply a psi based cut here

     plt.figure()
-    plot_hist(df,"Without Neutron Follower")
+    plot_hist(prompt,"Without Neutron Follower")
     plt.figure()
-    plot_hist(df_atm,"With Neutron Follower")
+    plot_hist(atm,"With Neutron Follower")
     plt.figure()
     plot_hist(michel_best_fit,"Michel Electrons")
     plt.figure()
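For reference, a minimal, self-contained sketch of the query()-based cut introduced above, using a toy fits DataFrame with the same column names (n, id1-id3, energy1-energy3); the rows are invented for illustration:

    import numpy as np
    import pandas as pd

    # Toy fit results: one single-particle fit and two two-particle fits.
    fits = pd.DataFrame({
        'n':       [1,      2,      2],
        'id1':     [20,     20,     20],
        'id2':     [0,      20,     22],
        'id3':     [0,      0,      0],
        'energy1': [500.0,  500.0,  500.0],
        'energy2': [np.nan, 5.0,    300.0],
        'energy3': [np.nan, np.nan, np.nan]})

    # Drop multi-particle fits with an electron (id 20) below 20 MeV or a
    # muon (id 22) below 200 MeV. NaN comparisons evaluate to False, so a
    # missing second or third particle never triggers the cut.
    fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))')
    fits = fits.query('not (n > 1 and ((id1 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')

    print(fits)  # the 5 MeV electron fit is gone; the 300 MeV muon fit survives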
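The conditional Ockham-factor accumulation depends on indexing the left-hand side with .loc, so the addition writes into the original frame rather than a temporary copy (the SettingWithCopyWarning issue linked in the comment above). A sketch with invented energies:

    import numpy as np
    import pandas as pd

    fits = pd.DataFrame({'n':       [1,      2,      3],
                         'energy1': [100.0,  200.0,  300.0],
                         'energy2': [np.nan, 50.0,   60.0],
                         'energy3': [np.nan, np.nan, 70.0]})

    # One penalty term per fitted particle, mirroring the expression above.
    fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) \
        + fits['n']*np.log(1e-4/(4*np.pi))

    # Add the 2nd and 3rd particle energy terms only where those particles
    # exist; .loc on both sides keeps index alignment and avoids the copy.
    fits.loc[fits['n'] > 1, 'w'] += np.log(fits.loc[fits['n'] > 1, 'energy2'])
    fits.loc[fits['n'] > 2, 'w'] += np.log(fits.loc[fits['n'] > 2, 'energy3'])

    print(fits['w'])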
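The per-run loops exist because broadcasting follower times against prompt times builds a temporary (n_prompt, n_follower) boolean array, which stays small if it is built one run at a time. A sketch of the same windowing with invented clock values; it uses groupby('run') and pd.concat rather than the script's groupby(['run']) and append, but the logic is the same:

    import numpy as np
    import pandas as pd

    prompt   = pd.DataFrame({'run': [1, 1, 2], 'gtr': [0.0, 1e9, 0.0]})
    follower = pd.DataFrame({'run': [1, 1, 2], 'gtr': [900.0, 5e8, 25e3]})

    matched = []
    for run, follower_run in follower.groupby('run'):
        prompt_run = prompt[prompt['run'] == run]
        # Keep followers 800 ns to 20 us after any prompt event in this run.
        mask = np.any((follower_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 800) &
                      (follower_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 20e3), axis=0)
        matched.append(follower_run[mask])

    michel = pd.concat(matched)
    print(michel)  # only the follower at 900 ns in run 1 survives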
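Finally, a sketch of the merge-and-best-fit pattern at the end of the diff: an inner merge on (run, gtid) leaves one row per fit hypothesis per event, and sorting on fmin before groupby().first() keeps the hypothesis with the smallest negative log likelihood. The values are invented:

    import pandas as pd

    ev   = pd.DataFrame({'run': [1, 1], 'gtid': [10, 11], 'nhit': [120, 150]})
    fits = pd.DataFrame({'run':  [1, 1, 1],
                         'gtid': [10, 10, 11],
                         'id':   [20, 22, 20],
                         'fmin': [105.0, 98.5, 210.0]})

    # One row per fit hypothesis after the merge.
    prompt = pd.merge(fits, ev, how='inner', on=['run','gtid'])

    # For each event, keep the hypothesis with the lowest fmin.
    prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()

    print(prompt)  # gtid 10 keeps the id == 22 hypothesis (fmin = 98.5)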