switch from YAML output to HDF5 to speed things up

author: tlatorre <tlatorre@uchicago.edu> 2019-07-11 09:42:23 -0500
committer: tlatorre <tlatorre@uchicago.edu> 2019-07-11 09:42:23 -0500
commit: 21491ca1ca2afd6951e9b5b1e74b1c919c602b36 (patch)
tree: b21b772612125c574928e4fb37221077d6a012d3 /utils/plot-fit-results
parent: 034253ab63f1029291fa046ce15760aae72ae5c5 (diff)
download: sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.gz
sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.bz2
sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.zip
1 files changed, 101 insertions, 196 deletions
diff --git a/utils/plot-fit-results b/utils/plot-fit-results
index 773a0dc..7115b81 100755
--- a/utils/plot-fit-results
+++ b/utils/plot-fit-results
@@ -15,11 +15,6 @@
 # this program. If not, see <https://www.gnu.org/licenses/>.
 
 from __future__ import print_function, division
-import yaml
-try:
-    from yaml import CLoader as Loader
-except ImportError:
-    from yaml.loader import SafeLoader as Loader
 import numpy as np
 from scipy.stats import iqr
 from matplotlib.lines import Line2D
@@ -71,228 +66,137 @@ def get_stats(x):
     error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
     return mean, std/np.sqrt(n), std, error
 
+def std_err(x):
+    x = x.dropna()
+    mean = np.mean(x)
+    std = np.std(x)
+    n = len(x)
+    if n == 0:
+        return np.nan
+    u4 = np.mean((x-mean)**4)
+    error = np.sqrt((u4-(n-3)*std**4/(n-1))/n)/(2*std)
+    return error
+
 if __name__ == '__main__':
     import argparse
     import matplotlib.pyplot as plt
     import numpy as np
+    import h5py
+    import pandas as pd
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     args = parser.parse_args()
 
-    events = []
-
-    for filename in args.filenames:
-        print(filename)
-        with open(filename) as f:
-            data = yaml.load(f.read(),Loader=Loader)
-
-        a = np.ma.empty(len(data['data']),
-                        dtype=[('id',np.int),       # particle id
-                               ('T', np.double),    # true energy
-                               ('dx',np.double),    # dx
-                               ('dy',np.double),    # dy
-                               ('dz',np.double),    # dz
-                               ('dT',np.double),    # dT
-                               ('theta',np.double), # theta
-                               ('ratio',np.double), # likelihood ratio
-                               ('te',np.double),    # time electron
-                               ('tm',np.double),    # time muon
-                               ('Te',np.double)]    # electron energy
-                    )
-
-        for i, event in enumerate(data['data']):
-            # get the particle ID
-            id = event['mcgn'][0]['id']
-
-            a[i]['id'] = id
-
-            if 'fit' not in event['ev'][0]:
-                # if nhit < 100 we don't fit the event
-                continue
-
-            if id not in event['ev'][0]['fit']:
-                a[i] = np.ma.masked
-                continue
-
-            mass = SNOMAN_MASS[id]
-            # for some reason it's the *second* track which seems to contain the
-            # initial energy
-            true_energy = event['mcgn'][0]['energy']
-            # The MCTK bank has the particle's total energy (except for neutrons)
-            # so we need to convert it into kinetic energy
-            ke = true_energy - mass
-
-            fit = event['ev'][0]['fit']
-
-            a[i]['T'] = ke
-            energy = fit[id]['energy']
-            a[i]['dT'] = energy-ke
-
-            # store the fit position residuals
-            true_posx = event['mcgn'][0]['posx']
-            posx = fit[id]['posx']
-            a[i]['dx'] = posx-true_posx
-            true_posy = event['mcgn'][0]['posy']
-            posy = fit[id]['posy']
-            a[i]['dy'] = posy-true_posy
-            true_posz = event['mcgn'][0]['posz']
-            posz = fit[id]['posz']
-            a[i]['dz'] = posz-true_posz
-
-            # compute the angle between the fit direction and the true
-            # direction
-            dirx = event['mcgn'][0]['dirx']
-            diry = event['mcgn'][0]['diry']
-            dirz = event['mcgn'][0]['dirz']
-            true_dir = [dirx,diry,dirz]
-            true_dir = np.array(true_dir)/np.linalg.norm(true_dir)
-            theta = fit[id]['theta']
-            phi = fit[id]['phi']
-            dir = [np.sin(theta)*np.cos(phi),np.sin(theta)*np.sin(phi),np.cos(theta)]
-            dir = np.array(dir)/np.linalg.norm(dir)
-            a[i]['theta'] = np.degrees(np.arccos(np.dot(true_dir,dir)))
-
-            # compute the log likelihood ratio
-            if IDP_E_MINUS in fit and IDP_MU_MINUS in fit:
-                fmin_electron = fit[IDP_E_MINUS]['fmin']
-                fmin_muon = fit[IDP_MU_MINUS]['fmin']
-                a[i]['ratio'] = fmin_muon-fmin_electron
-            else:
-                a[i]['ratio'] = np.ma.masked
-
-            # store the time taken for electron and muon fits
-            if IDP_E_MINUS in fit:
-                a[i]['te'] = fit[IDP_E_MINUS]['time']
-                a[i]['Te'] = fit[IDP_E_MINUS]['energy']
-            else:
-                a[i]['te'] = np.ma.masked
-                a[i]['Te'] = np.ma.masked
-            if IDP_MU_MINUS in fit:
-                a[i]['tm'] = fit[IDP_MU_MINUS]['time']
-            else:
-                a[i]['tm'] = np.ma.masked
-
-        events.append(a)
-
-    a = np.ma.concatenate(events)
-
+    # Read in all the data.
+    #
+    # Note: We have to add the filename as a column to each dataset since this
+    # script is used to analyze MC data which all has the same run number.
+    ev = pd.concat([pd.read_hdf(filename, "ev").assign(filename=filename) for filename in args.filenames])
+    fits = pd.concat([pd.read_hdf(filename, "fits").assign(filename=filename) for filename in args.filenames])
+    mcgn = pd.concat([pd.read_hdf(filename, "mcgn").assign(filename=filename) for filename in args.filenames])
+
+    # get rid of 2nd events like Michel electrons
+    ev = ev.sort_values(['run','gtid']).groupby(['filename','evn'],as_index=False).first()
+
+    # Now, we merge all three datasets together to produce a single
+    # dataframe. To do so, we join the ev dataframe with the mcgn frame
+    # on the evn column, and then join with the fits on the run and
+    # gtid columns.
+    #
+    # At the end we will have a single dataframe with one row for each
+    # fit, i.e. it will look like:
+    #
+    # >>> data
+    #   run   gtid nhit, ... mcgn_x, mcgn_y, mcgn_z, ..., fit_id1, fit_x, fit_y, fit_z, ...
+    #
+    # Before merging, we prefix the primary seed track table with mcgn_
+    # and the fit table with fit_ just to make things easier.
+
+    # Prefix track and fit frames
+    mcgn = mcgn.add_prefix("mcgn_")
+    fits = fits.add_prefix("fit_")
+
+    # merge ev and mcgn on evn
+    data = ev.merge(mcgn,left_on=['filename','evn'],right_on=['mcgn_filename','mcgn_evn'])
+    # merge data and fits on run and gtid
+    data = data.merge(fits,left_on=['filename','run','gtid'],right_on=['fit_filename','fit_run','fit_gtid'])
+
+    # calculate true kinetic energy
+    mass = [SNOMAN_MASS[id] for id in data['mcgn_id'].values]
+    data['T'] = data['mcgn_energy'].values - mass
+    data['dx'] = data['fit_x'].values - data['mcgn_x'].values
+    data['dy'] = data['fit_y'].values - data['mcgn_y'].values
+    data['dz'] = data['fit_z'].values - data['mcgn_z'].values
+    data['dT'] = data['fit_energy1'].values - data['T'].values
+
+    true_dir = np.dstack((data['mcgn_dirx'],data['mcgn_diry'],data['mcgn_dirz'])).squeeze()
+    dir = np.dstack((np.sin(data['fit_theta1'])*np.cos(data['fit_phi1']),
+                     np.sin(data['fit_theta1'])*np.sin(data['fit_phi1']),
+                     np.cos(data['fit_theta1']))).squeeze()
+
+    data['theta'] = np.degrees(np.arccos((true_dir*dir).sum(axis=-1)))
+
+    # only select fits which have at least 2 fits
+    data = data.groupby(['filename','run','gtid']).filter(lambda x: len(x) > 1)
+    data_true = data[data['fit_id1'] == data['mcgn_id']]
+    data_e = data[data['fit_id1'] == IDP_E_MINUS]
+    data_mu = data[data['fit_id1'] == IDP_MU_MINUS]
+
+    data_true = data_true.set_index(['filename','run','gtid'])
+    data_e = data_e.set_index(['filename','run','gtid'])
+    data_mu = data_mu.set_index(['filename','run','gtid'])
+
+    data_true['ratio'] = data_mu['fit_fmin']-data_e['fit_fmin']
+    data_true['te'] = data_e['fit_time']
+    data_true['tm'] = data_mu['fit_time']
+    data_true['Te'] = data_e['fit_energy1']
+
+    # 100 bins between 50 MeV and 1 GeV
     bins = np.arange(50,1000,100)
 
-    stats_array = np.ma.empty(len(bins)-1,
-                     dtype=[('T',             np.double),
-                            ('dT',            np.double),
-                            ('dT_err',        np.double),
-                            ('dT_std',        np.double),
-                            ('dT_std_err',    np.double),
-                            ('dx',            np.double),
-                            ('dx_err',        np.double),
-                            ('dx_std',        np.double),
-                            ('dx_std_err',    np.double),
-                            ('dy',            np.double),
-                            ('dy_err',        np.double),
-                            ('dy_std',        np.double),
-                            ('dy_std_err',    np.double),
-                            ('dz',            np.double),
-                            ('dz_err',        np.double),
-                            ('dz_std',        np.double),
-                            ('dz_std_err',    np.double),
-                            ('theta',         np.double),
-                            ('theta_err',     np.double),
-                            ('theta_std',     np.double),
-                            ('theta_std_err', np.double)])
-
-    stats = {IDP_E_MINUS: stats_array, IDP_MU_MINUS: stats_array.copy()}
-
-    for id in stats:
-        electron_events = a[a['id'] == id]
-
-        for i, (ablah, b) in enumerate(zip(bins[:-1], bins[1:])):
-            events = electron_events[(electron_events['T'] >= ablah) & (electron_events['T'] < b)]
-
-            if len(events) < 2:
-                stats[id][i] = np.ma.masked
-                continue
-
-            stats[id][i]['T'] = (ablah+b)/2
-            mean, mean_error, std, std_error = get_stats(events['dT'].compressed())
-            stats[id][i]['dT'] = mean
-            stats[id][i]['dT_err'] = mean_error
-            stats[id][i]['dT_std'] = std
-            stats[id][i]['dT_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dx'].compressed())
-            stats[id][i]['dx'] = mean
-            stats[id][i]['dx_err'] = mean_error
-            stats[id][i]['dx_std'] = std
-            stats[id][i]['dx_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dy'].compressed())
-            stats[id][i]['dy'] = mean
-            stats[id][i]['dy_err'] = mean_error
-            stats[id][i]['dy_std'] = std
-            stats[id][i]['dy_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['dz'].compressed())
-            stats[id][i]['dz'] = mean
-            stats[id][i]['dz_err'] = mean_error
-            stats[id][i]['dz_std'] = std
-            stats[id][i]['dz_std_err'] = std_error
-            mean, mean_error, std, std_error = get_stats(events['theta'].compressed())
-            stats[id][i]['theta'] = mean
-            stats[id][i]['theta_err'] = mean_error
-            stats[id][i]['theta_std'] = std
-            stats[id][i]['theta_std_err'] = std_error
-
-    for id in stats:
+    for id in [IDP_E_MINUS, IDP_MU_MINUS]:
+        events = data_true[data_true['mcgn_id'] == id]
+
+        pd_bins = pd.cut(events['T'],bins)
+
+        dT = events.groupby(pd_bins)['dT'].agg(['mean','sem','std',std_err])
+        dx = events.groupby(pd_bins)['dx'].agg(['mean','sem','std',std_err])
+        dy = events.groupby(pd_bins)['dy'].agg(['mean','sem','std',std_err])
+        dz = events.groupby(pd_bins)['dz'].agg(['mean','sem','std',std_err])
+        theta = events.groupby(pd_bins)['theta'].agg(['mean','sem','std',std_err])
+
         label = 'Muon' if id == IDP_MU_MINUS else 'Electron'
 
-        T = stats[id]['T']
-        dT = stats[id]['dT']
-        dT_err = stats[id]['dT_err']
-        std_dT = stats[id]['dT_std']
-        std_dT_err = stats[id]['dT_std_err']
-        dx = stats[id]['dx']
-        dx_err = stats[id]['dx_err']
-        std_dx = stats[id]['dx_std']
-        std_dx_err = stats[id]['dx_std_err']
-        dy = stats[id]['dy']
-        dy_err = stats[id]['dy_err']
-        std_dy = stats[id]['dy_std']
-        std_dy_err = stats[id]['dy_std_err']
-        dz = stats[id]['dz']
-        dz_err = stats[id]['dz_err']
-        std_dz = stats[id]['dz_std']
-        std_dz_err = stats[id]['dz_std_err']
-        theta = stats[id]['theta']
-        theta_err = stats[id]['theta_err']
-        std_theta = stats[id]['theta_std']
-        std_theta_err = stats[id]['theta_std_err']
+        T = (bins[1:] + bins[:-1])/2
 
         plt.figure(1)
-        plt.errorbar(T,dT*100/T,yerr=dT_err*100/T,fmt='o',label=label)
+        plt.errorbar(T,dT['mean']*100/T,yerr=dT['sem']*100/T,fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Energy bias (%)")
         plt.title("Energy Bias")
         plt.legend()
 
         plt.figure(2)
-        plt.errorbar(T,std_dT*100/T,yerr=std_dT_err*100/T,fmt='o',label=label)
+        plt.errorbar(T,dT['std']*100/T,yerr=dT['std_err']*100/T,fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Energy resolution (%)")
         plt.title("Energy Resolution")
         plt.legend()
 
         plt.figure(3)
-        plt.errorbar(T,dx,yerr=dx_err,fmt='o',label='%s (x)' % label)
-        plt.errorbar(T,dy,yerr=dy_err,fmt='o',label='%s (y)' % label)
-        plt.errorbar(T,dz,yerr=dz_err,fmt='o',label='%s (z)' % label)
+        plt.errorbar(T,dx['mean'],yerr=dx['sem'],fmt='o',label='%s (x)' % label)
+        plt.errorbar(T,dy['mean'],yerr=dy['sem'],fmt='o',label='%s (y)' % label)
+        plt.errorbar(T,dz['mean'],yerr=dz['sem'],fmt='o',label='%s (z)' % label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Position bias (cm)")
         plt.title("Position Bias")
         plt.legend()
 
         plt.figure(4)
-        plt.errorbar(T,std_dx,yerr=std_dx_err,fmt='o',label='%s (x)' % label)
-        plt.errorbar(T,std_dy,yerr=std_dy_err,fmt='o',label='%s (y)' % label)
-        plt.errorbar(T,std_dz,yerr=std_dz_err,fmt='o',label='%s (z)' % label)
+        plt.errorbar(T,dx['std'],yerr=dx['std_err'],fmt='o',label='%s (x)' % label)
+        plt.errorbar(T,dy['std'],yerr=dy['std_err'],fmt='o',label='%s (y)' % label)
+        plt.errorbar(T,dz['std'],yerr=dz['std_err'],fmt='o',label='%s (z)' % label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Position resolution (cm)")
         plt.title("Position Resolution")
@@ -300,7 +204,7 @@ if __name__ == '__main__':
         plt.legend()
 
         plt.figure(5)
-        plt.errorbar(T,std_theta,yerr=std_theta_err,fmt='o',label=label)
+        plt.errorbar(T,theta['std'],yerr=theta['std_err'],fmt='o',label=label)
         plt.xlabel("Kinetic Energy (MeV)")
         plt.ylabel("Angular resolution (deg)")
         plt.title("Angular Resolution")
@@ -308,9 +212,10 @@ if __name__ == '__main__':
         plt.legend()
 
         plt.figure(6)
-        plt.scatter(a[a['id'] == id]['Te'],a[a['id'] == id]['ratio'],label=label)
+        plt.scatter(events['Te'],events['ratio'],label=label)
         plt.xlabel("Reconstructed Electron Energy (MeV)")
         plt.ylabel(r"Log Likelihood Ratio (e/$\mu$)")
         plt.title("Log Likelihood Ratio vs Reconstructed Electron Energy")
         plt.legend()
+
     plt.show()
author	tlatorre <tlatorre@uchicago.edu>	2019-07-11 09:42:23 -0500
committer	tlatorre <tlatorre@uchicago.edu>	2019-07-11 09:42:23 -0500
commit	21491ca1ca2afd6951e9b5b1e74b1c919c602b36 (patch)
tree	b21b772612125c574928e4fb37221077d6a012d3 /utils/plot-fit-results
parent	034253ab63f1029291fa046ce15760aae72ae5c5 (diff)
download	sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.gz sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.tar.bz2 sddm-21491ca1ca2afd6951e9b5b1e74b1c919c602b36.zip