Diffstat (limited to 'utils/plot-energy')
-rwxr-xr-x | utils/plot-energy | 294
1 files changed, 131 insertions, 163 deletions
diff --git a/utils/plot-energy b/utils/plot-energy
index a0274b7..4a8521b 100755
--- a/utils/plot-energy
+++ b/utils/plot-energy
@@ -107,140 +107,69 @@ if __name__ == '__main__':
     import numpy as np
     import pandas as pd
     import sys
+    import h5py

     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()

-    events = []
-    fit_results = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f,Loader=Loader)
-
-        for i, event in enumerate(data['data']):
-            for ev in event['ev']:
-                events.append((
-                    ev['run'],
-                    ev['gtr'],
-                    ev['nhit'],
-                    ev['gtid'],
-                    ev['dc'],
-                    ev['ftp']['x'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['y'] if 'ftp' in ev else np.nan,
-                    ev['ftp']['z'] if 'ftp' in ev else np.nan,
-                    ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan,
-                    ))
-
-                if 'fit' not in ev:
-                    continue
-
-                for id, fit_result in [x for x in ev['fit'].iteritems() if isinstance(x[0],int)]:
-                    # FIXME: Should I just store the particle ids in the YAML
-                    # output as a list of particle ids instead of a single
-                    # integer?
-                    ids = map(int,chunks(str(id),2))
-                    energy = 0.0
-                    skip = False
-                    for i, ke in zip(ids,np.atleast_1d(fit_result['energy'])):
-                        energy += ke + SNOMAN_MASS[i]
-
-                        # This is a bit of a hack. It appears that many times
-                        # the fit will actually do much better by including a
-                        # very low energy electron or muon. I believe the
-                        # reason for this is that of course my likelihood
-                        # function is not perfect (for example, I don't include
-                        # the correct angular distribution for Rayleigh
-                        # scattered light), and so the fitter often wants to
-                        # add a very low energy electron or muon to fix things.
-                        #
-                        # Ideally I would fix the likelihood function, but for
-                        # now we just discard any fit results which have a very
-                        # low energy electron or muon.
-                        if len(ids) > 1 and i == 20 and ke < 20.0:
-                            skip = True
-
-                        if len(ids) > 1 and i == 22 and ke < 200.0:
-                            skip = True
-
-                    if skip:
-                        continue
-
-                    # Calculate the approximate Ockham factor.
-                    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
-                    #
-                    # Note: This is a really approximate form by assuming that
-                    # the shape of the likelihood space is equal to the average
-                    # uncertainty in the different parameters.
-                    w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi))
-
-                    fit_results.append((
-                        ev['run'],
-                        ev['gtid'],
-                        id,
-                        fit_result['posx'],
-                        fit_result['posy'],
-                        fit_result['posz'],
-                        fit_result['t0'],
-                        energy,
-                        np.atleast_1d(fit_result['theta'])[0],
-                        np.atleast_1d(fit_result['phi'])[0],
-                        fit_result['fmin'] - w,
-                        fit_result['psi']/ev['nhit']))
-
-    # create a dataframe
-    # note: we have to first create a numpy structured array since there is no
-    # way to pass a list of data types to the DataFrame constructor. See
-    # https://github.com/pandas-dev/pandas/issues/4464
-    array = np.array(fit_results,
-                     dtype=[('run',np.int),      # run number
-                            ('gtid',np.int),     # gtid
-                            ('id',np.int),       # particle id
-                            ('x', np.double),    # x
-                            ('y',np.double),     # y
-                            ('z',np.double),     # z
-                            ('t0',np.double),    # t0
-                            ('ke',np.double),    # kinetic energy
-                            ('theta',np.double), # direction of 1st particle
-                            ('phi',np.double),   # direction of 2nd particle
-                            ('fmin',np.double),  # negative log likelihood
-                            ('psi',np.double)]   # goodness of fit parameter
-                    )
-    df = pd.DataFrame.from_records(array)
-
-    array = np.array(events,
-                     dtype=[('run',np.int),           # run number
                            ('gtr',np.double),        # 50 MHz clock in ns
-                            ('nhit',np.int),          # number of PMTs hit
-                            ('gtid',np.int),          # gtid
-                            ('dc',np.int),            # data cleaning word
-                            ('ftpx',np.double),       # FTP fitter X position
-                            ('ftpy',np.double),       # FTP fitter Y position
-                            ('ftpz',np.double),       # FTP fitter Z position
-                            ('rsp_energy',np.double)] # RSP energy
-                    )
-    df_ev = pd.DataFrame.from_records(array)
+    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames])
+
+    # This is a bit of a hack. It appears that many times the fit will
+    # actually do much better by including a very low energy electron or
+    # muon. I believe the reason for this is that of course my likelihood
+    # function is not perfect (for example, I don't include the correct
+    # angular distribution for Rayleigh scattered light), and so the fitter
+    # often wants to add a very low energy electron or muon to fix things.
+    #
+    # Ideally I would fix the likelihood function, but for now we just
+    # discard any fit results which have a very low energy electron or
+    # muon.
+    #
+    # FIXME: Test this since query() is new to pandas
+    fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))')
+    fits = fits.query('not (n > 1 and ((id1 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')
+
+    # Calculate the approximate Ockham factor.
+    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
+    #
+    # Note: This is a really approximate form by assuming that the shape of
+    # the likelihood space is equal to the average uncertainty in the
+    # different parameters.
+    fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi))
+
+    # Note: we index on the left hand side with loc to avoid a copy error
+    #
+    # See https://www.dataquest.io/blog/settingwithcopywarning/
+    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2'])
+    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3'])
+
+    fits['fmin'] = fits['fmin'] - fits['w']
+
+    fits['psi'] /= fits.merge(ev,on=['run','gtid'])['nhit']
+    fits['ke'] = fits['energy1']
+    fits['id'] = fits['id1'] + fits['id2']*100 + fits['id3']*10000
+    fits['theta'] = fits['theta1']

     # Make sure events are in order. We use run number and GTID here which
     # should be in time order as well except for bit flips in the GTID
     # This is important for later when we look at time differences in the 50
     # MHz clock. We should perhaps do a check here based on the 10 MHz clock to
     # make sure things are in order
-    df_ev = df_ev.sort_values(by=['run','gtid'])
+    ev = ev.sort_values(by=['run','gtid'])

-    print("number of events = %i" % len(df_ev))
+    print("number of events = %i" % len(ev))

     # First, do basic data cleaning which is done for all events.
-    df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]
+    ev = ev[ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]

-    print("number of events after data cleaning = %i" % len(df_ev))
+    print("number of events after data cleaning = %i" % len(ev))

     # Now, we select events tagged by the muon tag which should tag only
     # external muons. We keep the sample of muons since it's needed later to
     # identify Michel electrons and to apply the muon follower cut
-    muons = df_ev[(df_ev.dc & DC_MUON) != 0]
+    muons = ev[(ev.dc & DC_MUON) != 0]

     print("number of muons = %i" % len(muons))

@@ -251,12 +180,12 @@ if __name__ == '__main__':
     #
     # Should I use 10 MHz clock here since the time isn't critical and then I
     # don't have to worry about 50 MHz clock rollover?
-    prompt = df_ev[df_ev.nhit >= 100]
+    prompt = ev[ev.nhit >= 100]
     prompt_mask = np.concatenate(([True],np.diff(prompt.gtr.values) > 250e6))

     # Michel electrons and neutrons can be any event which is not a prompt
     # event
-    follower = pd.concat([df_ev[df_ev.nhit < 100],prompt[~prompt_mask]])
+    follower = pd.concat([ev[ev.nhit < 100],prompt[~prompt_mask]])

     # Apply the prompt mask
     prompt = prompt[prompt_mask]

@@ -275,39 +204,78 @@ if __name__ == '__main__':

     if prompt_plus_muons.size and follower.size:
         # require Michel events to pass more of the SNO data cleaning cuts
         michel = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
+        michel = michel[michel.nhit >= 100]

-        # accept events which had a muon more than 800 nanoseconds but less than 20 microseconds before them
-        # The 800 nanoseconds cut comes from Richie's thesis. He also mentions
-        # that the In Time Channel Spread Cut is very effective at cutting
-        # electrons events caused by muons, so I should implement that
-        michel = michel[np.any((michel.run.values == prompt_plus_muons.run.values[:,np.newaxis]) & \
-                               (michel.gtr.values > prompt_plus_muons.gtr.values[:,np.newaxis] + 800) & \
-                               (michel.gtr.values < prompt_plus_muons.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+        # Accept events which had a muon more than 800 nanoseconds but less
+        # than 20 microseconds before them. The 800 nanoseconds cut comes from
+        # Richie's thesis. He also mentions that the In Time Channel Spread Cut
+        # is very effective at cutting electron events caused by muons, so I
+        # should implement that.
+        #
+        # Note: We currently don't look across run boundaries. This should be a
+        # *very* small effect, and the logic to do so would be very complicated
+        # since I would have to deal with 50 MHz clock rollovers, etc.
+        #
+        # Do a simple python for loop here over the runs since we create
+        # temporary arrays with shape (michel.size,prompt_plus_muons.size)
+        # which could be too big for the full dataset.
+        #
+        # There might be some clever way to do this without the loop in Pandas,
+        # but I don't know how.
+        michel_sum = None
+        for run, michel_run in michel.groupby(['run']):
+            prompt_plus_muons_run = prompt_plus_muons[prompt_plus_muons['run'] == run]
+            michel_run = michel_run[np.any((michel_run.gtr.values > prompt_plus_muons_run.gtr.values[:,np.newaxis] + 800) & \
+                                           (michel_run.gtr.values < prompt_plus_muons_run.gtr.values[:,np.newaxis] + 20e3),axis=0)]
+
+            if michel_sum is None:
+                michel_sum = michel_run
+            else:
+                michel_sum = michel_sum.append(michel_run)
+
+        michel = michel_sum
     else:
         michel = pd.DataFrame(columns=follower.columns)

     if prompt.size and follower.size:
         # neutron followers have to obey stricter set of data cleaning cuts
         neutron = follower[follower.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
-        neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)]
-        r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2)
+        neutron = neutron[~np.isnan(neutron.ftp_x) & ~np.isnan(neutron.rsp_energy)]
+        r = np.sqrt(neutron.ftp_x**2 + neutron.ftp_y**2 + neutron.ftp_z**2)
         neutron = neutron[r < AV_RADIUS]
         neutron = neutron[neutron.rsp_energy > 4.0]
+
         # neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)
         # FIXME: need to deal with 50 MHz clock rollover
-        neutron_mask = np.any((neutron.run.values == prompt.run.values[:,np.newaxis]) & \
-                              (neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & \
-                              (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1)
-        df_atm = prompt[neutron_mask]
-        prompt = prompt[~neutron_mask]
+        prompt_sum = None
+        atm_sum = None
+        for run, prompt_run in prompt.groupby(['run']):
+            neutron_run = neutron[neutron['run'] == run]
+
+            neutron_mask = np.any((neutron_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 20e3) & \
+                                  (neutron_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 250e6),axis=1)
+
+            if prompt_sum is None:
+                prompt_sum = prompt_run[~neutron_mask]
+            else:
+                prompt_sum = prompt_sum.append(prompt_run[~neutron_mask])
+
+            if atm_sum is None:
+                atm_sum = prompt_run[neutron_mask]
+            else:
+                atm_sum = atm_sum.append(prompt_run[neutron_mask])
+
+        atm = atm_sum
+        prompt = prompt_sum
     else:
-        df_atm = pd.DataFrame(columns=prompt.columns)
+        atm = pd.DataFrame(columns=prompt.columns)

     print("number of events after neutron follower cut = %i" % len(prompt))

     # Get rid of muon events in our main event sample
     prompt = prompt[(prompt.dc & DC_MUON) == 0]
-    df_atm = df_atm[(df_atm.dc & DC_MUON) == 0]
+    atm = atm[(atm.dc & DC_MUON) == 0]

     print("number of events after muon cut = %i" % len(prompt))

@@ -317,52 +285,52 @@ if __name__ == '__main__':
     prompt = prompt[~np.any((prompt.run.values == muons.run.values[:,np.newaxis]) & \
                             (prompt.gtr.values > muons.gtr.values[:,np.newaxis]) & \
                             (prompt.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
-    df_atm = df_atm[~np.any((df_atm.run.values == muons.run.values[:,np.newaxis]) & \
-                            (df_atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
-                            (df_atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]
+    atm = atm[~np.any((atm.run.values == muons.run.values[:,np.newaxis]) & \
+                      (atm.gtr.values > muons.gtr.values[:,np.newaxis]) & \
+                      (atm.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]

     # Check to see if there are any events with missing fit information
-    df_atm_ra = df_atm[['run','gtid']].to_records(index=False)
+    atm_ra = atm[['run','gtid']].to_records(index=False)
     muons_ra = muons[['run','gtid']].to_records(index=False)
     prompt_ra = prompt[['run','gtid']].to_records(index=False)
     michel_ra = michel[['run','gtid']].to_records(index=False)
-    df_ra = df[['run','gtid']].to_records(index=False)
+    fits_ra = fits[['run','gtid']].to_records(index=False)

-    if np.count_nonzero(~np.isin(df_atm_ra,df_ra)):
-        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(df_atm_ra,df_ra)))
+    if len(atm_ra) and np.count_nonzero(~np.isin(atm_ra,fits_ra)):
+        print_warning("skipping %i atmospheric events because they are missing fit information!" % np.count_nonzero(~np.isin(atm_ra,fits_ra)))

-    if np.count_nonzero(~np.isin(muons_ra,df_ra)):
-        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,df_ra)))
+    if len(muons_ra) and np.count_nonzero(~np.isin(muons_ra,fits_ra)):
+        print_warning("skipping %i muon events because they are missing fit information!" % np.count_nonzero(~np.isin(muons_ra,fits_ra)))

-    if np.count_nonzero(~np.isin(prompt_ra,df_ra)):
-        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,df_ra)))
+    if len(prompt_ra) and np.count_nonzero(~np.isin(prompt_ra,fits_ra)):
+        print_warning("skipping %i signal events because they are missing fit information!" % np.count_nonzero(~np.isin(prompt_ra,fits_ra)))

-    if np.count_nonzero(~np.isin(michel_ra,df_ra)):
-        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,df_ra)))
+    if len(michel_ra) and np.count_nonzero(~np.isin(michel_ra,fits_ra)):
+        print_warning("skipping %i Michel events because they are missing fit information!" % np.count_nonzero(~np.isin(michel_ra,fits_ra)))

     # Now, we merge the event info with the fitter info.
     #
     # Note: This means that the dataframe now contains multiple rows for each
     # event, one for each fit hypothesis.
-    df_atm = pd.merge(df,df_atm,how='inner',on=['run','gtid'])
-    muons = pd.merge(df,muons,how='inner',on=['run','gtid'])
-    michel = pd.merge(df,michel,how='inner',on=['run','gtid'])
-    df = pd.merge(df,prompt,how='inner',on=['run','gtid'])
+    atm = pd.merge(fits,atm,how='inner',on=['run','gtid'])
+    muons = pd.merge(fits,muons,how='inner',on=['run','gtid'])
+    michel = pd.merge(fits,michel,how='inner',on=['run','gtid'])
+    prompt = pd.merge(fits,prompt,how='inner',on=['run','gtid'])

     # get rid of events which don't have a fit
-    nan = np.isnan(df.fmin.values)
+    nan = np.isnan(prompt.fmin.values)

     if np.count_nonzero(nan):
-        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(df[nan].groupby(['run','gtid'])))
+        print_warning("skipping %i signal events because the negative log likelihood is nan!" % len(prompt[nan].groupby(['run','gtid'])))

-    df = df[~nan]
+    prompt = prompt[~nan]

-    nan_atm = np.isnan(df_atm.fmin.values)
+    nan_atm = np.isnan(atm.fmin.values)

     if np.count_nonzero(nan_atm):
-        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(df_atm[nan_atm].groupby(['run','gtid'])))
+        print_warning("skipping %i atmospheric events because the negative log likelihood is nan!" % len(atm[nan_atm].groupby(['run','gtid'])))

-    df_atm = df_atm[~nan_atm]
+    atm = atm[~nan_atm]

     nan_muon = np.isnan(muons.fmin.values)

@@ -379,24 +347,24 @@ if __name__ == '__main__':
     michel = michel[~nan_michel]

     # get the best fit
-    df = df.sort_values('fmin').groupby(['run','gtid']).first()
-    df_atm = df_atm.sort_values('fmin').groupby(['run','gtid']).first()
+    prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()
+    atm = atm.sort_values('fmin').groupby(['run','gtid']).first()
     michel_best_fit = michel.sort_values('fmin').groupby(['run','gtid']).first()
     muon_best_fit = muons.sort_values('fmin').groupby(['run','gtid']).first()
     muons = muons[muons.id == 22]

     # require r < 6 meters
-    df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS]
-    df_atm = df_atm[np.sqrt(df_atm.x.values**2 + df_atm.y.values**2 + df_atm.z.values**2) < AV_RADIUS]
+    prompt = prompt[np.sqrt(prompt.x.values**2 + prompt.y.values**2 + prompt.z.values**2) < AV_RADIUS]
+    atm = atm[np.sqrt(atm.x.values**2 + atm.y.values**2 + atm.z.values**2) < AV_RADIUS]

-    print("number of events after radius cut = %i" % len(df))
+    print("number of events after radius cut = %i" % len(prompt))

     # Note: Need to design and apply a psi based cut here

     plt.figure()
-    plot_hist(df,"Without Neutron Follower")
+    plot_hist(prompt,"Without Neutron Follower")
     plt.figure()
-    plot_hist(df_atm,"With Neutron Follower")
+    plot_hist(atm,"With Neutron Follower")
     plt.figure()
     plot_hist(michel_best_fit,"Michel Electrons")
     plt.figure()
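For reference, a minimal, self-contained sketch of the query()-based cut introduced above, using a toy fits DataFrame with the same column names (n, id1-id3, energy1-energy3); the rows are invented for illustration:

    import numpy as np
    import pandas as pd

    # Toy fit results: one single-particle fit and two two-particle fits.
    fits = pd.DataFrame({
        'n':       [1,      2,      2],
        'id1':     [20,     20,     20],
        'id2':     [0,      20,     22],
        'id3':     [0,      0,      0],
        'energy1': [500.0,  500.0,  500.0],
        'energy2': [np.nan, 5.0,    300.0],
        'energy3': [np.nan, np.nan, np.nan]})

    # Drop multi-particle fits with an electron (id 20) below 20 MeV or a
    # muon (id 22) below 200 MeV. NaN comparisons evaluate to False, so a
    # missing second or third particle never triggers the cut.
    fits = fits.query('not (n > 1 and ((id1 == 20 and energy1 < 20) or (id2 == 20 and energy2 < 20) or (id3 == 20 and energy3 < 20)))')
    fits = fits.query('not (n > 1 and ((id1 == 22 and energy1 < 200) or (id2 == 22 and energy2 < 200) or (id3 == 22 and energy3 < 200)))')

    print(fits)  # the 5 MeV electron fit is gone; the 300 MeV muon fit survives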
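The conditional Ockham-factor accumulation depends on indexing the left-hand side with .loc, so the addition writes into the original frame rather than a temporary copy (the SettingWithCopyWarning issue linked in the comment above). A sketch with invented energies:

    import numpy as np
    import pandas as pd

    fits = pd.DataFrame({'n':       [1,      2,      3],
                         'energy1': [100.0,  200.0,  300.0],
                         'energy2': [np.nan, 50.0,   60.0],
                         'energy3': [np.nan, np.nan, 70.0]})

    # One penalty term per fitted particle, mirroring the expression above.
    fits['w'] = fits['n']*np.log(0.1*0.001) + np.log(fits['energy1']) \
        + fits['n']*np.log(1e-4/(4*np.pi))

    # Add the 2nd and 3rd particle energy terms only where those particles
    # exist; .loc on both sides keeps index alignment and avoids the copy.
    fits.loc[fits['n'] > 1, 'w'] += np.log(fits.loc[fits['n'] > 1, 'energy2'])
    fits.loc[fits['n'] > 2, 'w'] += np.log(fits.loc[fits['n'] > 2, 'energy3'])

    print(fits['w'])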
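The per-run loops exist because broadcasting follower times against prompt times builds a temporary (n_prompt, n_follower) boolean array, which stays small if it is built one run at a time. A sketch of the same windowing with invented clock values; it uses groupby('run') and pd.concat rather than the script's groupby(['run']) and append, but the logic is the same:

    import numpy as np
    import pandas as pd

    prompt   = pd.DataFrame({'run': [1, 1, 2], 'gtr': [0.0, 1e9, 0.0]})
    follower = pd.DataFrame({'run': [1, 1, 2], 'gtr': [900.0, 5e8, 25e3]})

    matched = []
    for run, follower_run in follower.groupby('run'):
        prompt_run = prompt[prompt['run'] == run]
        # Keep followers 800 ns to 20 us after any prompt event in this run.
        mask = np.any((follower_run.gtr.values > prompt_run.gtr.values[:,np.newaxis] + 800) &
                      (follower_run.gtr.values < prompt_run.gtr.values[:,np.newaxis] + 20e3), axis=0)
        matched.append(follower_run[mask])

    michel = pd.concat(matched)
    print(michel)  # only the follower at 900 ns in run 1 survives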
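Finally, a sketch of the merge-and-best-fit pattern at the end of the diff: an inner merge on (run, gtid) leaves one row per fit hypothesis per event, and sorting on fmin before groupby().first() keeps the hypothesis with the smallest negative log likelihood. The values are invented:

    import pandas as pd

    ev   = pd.DataFrame({'run': [1, 1], 'gtid': [10, 11], 'nhit': [120, 150]})
    fits = pd.DataFrame({'run':  [1, 1, 1],
                         'gtid': [10, 10, 11],
                         'id':   [20, 22, 20],
                         'fmin': [105.0, 98.5, 210.0]})

    # One row per fit hypothesis after the merge.
    prompt = pd.merge(fits, ev, how='inner', on=['run','gtid'])

    # For each event, keep the hypothesis with the lowest fmin.
    prompt = prompt.sort_values('fmin').groupby(['run','gtid']).first()

    print(prompt)  # gtid 10 keeps the id == 22 hypothesis (fmin = 98.5)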