author    tlatorre <tlatorre@uchicago.edu>  2020-12-16 13:43:25 -0600
committer tlatorre <tlatorre@uchicago.edu>  2020-12-16 13:43:25 -0600
commit    834df6cfc5737db2b2f4c0a1432c97960ed65e76 (patch)
tree      88eca1deb4909ff5c966f0707dc96d3bcfd96318
parent    61c0e06952f368db54fe6cf86eac6b4fb2a95c44 (diff)
use a hash to merge weights with MC data
-rwxr-xr-x  utils/chi2                 | 21
-rwxr-xr-x  utils/dm-search            | 15
-rw-r--r--  utils/sddm/plot_energy.py  | 27
-rw-r--r--  utils/sddm/renormalize.py  | 28
4 files changed, 73 insertions(+), 18 deletions(-)
diff --git a/utils/chi2 b/utils/chi2
index e30a058..6888158 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -409,7 +409,7 @@ def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,st
nlls = []
for universe in range(nuniverses):
- data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
nll = make_nll(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll)
@@ -418,7 +418,7 @@ def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,st
universe = np.argmin(nlls)
if refit:
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
# Create a new negative log likelihood function with the weighted Monte Carlo.
@@ -530,6 +530,11 @@ if __name__ == '__main__':
mcpl = load_mcpl_files(args.mcpl)
ev_mc = renormalize_data(ev_mc.reset_index(),mcpl)
+ # Merge weights with MCPL dataframe to get the unique id column in the
+ # weights dataframe since that is what we use to merge with the Monte
+ # Carlo.
+ weights = pd.merge(weights,mcpl[['run','evn','unique_id']],on=['run','evn'],how='left')
+
# There are a handful of weights which turn out to be slightly negative for
# some reason. For example:
#
@@ -625,8 +630,8 @@ if __name__ == '__main__':
xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
- data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
+ data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
for i in range(args.coverage):
# Calculate expected number of events
@@ -660,10 +665,10 @@ if __name__ == '__main__':
xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
- data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_atm_mc_with_weights.weight = data_atm_mc_with_weights.weight.fillna(1.0)
prob = get_prob(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
@@ -792,10 +797,10 @@ if __name__ == '__main__':
xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
- data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_atm_mc_with_weights.weight = data_atm_mc_with_weights.weight.fillna(1.0)
prob = get_prob(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
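The same change is applied to utils/dm-search below. For reference, here is a minimal sketch (toy dataframes with hypothetical values, not part of the patch) of the two-step merge pattern this commit adopts: first attach unique_id to the GENIE weights via the MCPL dataframe, then left-merge the weights onto the Monte Carlo on ['run', 'unique_id'], filling events that have no systematic weight with 1.0.

import pandas as pd

# Hypothetical minimal inputs
weights = pd.DataFrame({'run': [1, 1], 'evn': [10, 11],
                        'universe': [0, 0], 'weight': [1.2, 0.8]})
mcpl = pd.DataFrame({'run': [1, 1], 'evn': [10, 11],
                     'unique_id': ['a1b2', 'c3d4']})
data_mc = pd.DataFrame({'run': [1, 1, 1],
                        'unique_id': ['a1b2', 'c3d4', 'e5f6'],
                        'ke': [100.0, 200.0, 300.0]})

# Step 1: pick up unique_id, the key shared with the Monte Carlo
weights = pd.merge(weights, mcpl[['run', 'evn', 'unique_id']],
                   on=['run', 'evn'], how='left')

# Step 2: merge onto the MC; events with no GENIE weight keep weight = 1.0
data_mc_with_weights = pd.merge(data_mc, weights[weights.universe == 0],
                                how='left', on=['run', 'unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
print(data_mc_with_weights[['run', 'unique_id', 'weight']])

The left merge keeps every Monte Carlo event, so the fillna(1.0) leaves events without a GENIE reweighting entry at their nominal weight.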
diff --git a/utils/dm-search b/utils/dm-search
index 91d9b87..dd4946b 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -350,7 +350,7 @@ def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale
nlls = []
for universe in range(nuniverses):
- data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll,dm_sample=dm_sample)
@@ -359,7 +359,7 @@ def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale
universe = np.argmin(nlls)
if refit:
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
# Create a new negative log likelihood function with the weighted Monte Carlo.
@@ -467,7 +467,7 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
dm_energy = dm_mass
xopt, universe, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
limit = np.percentile(samples[:,6],90)
@@ -609,6 +609,11 @@ if __name__ == '__main__':
mcpl = load_mcpl_files(args.mcpl)
ev_mc = renormalize_data(ev_mc.reset_index(),mcpl)
+ # Merge weights with MCPL dataframe to get the unique id column in the
+ # weights dataframe since that is what we use to merge with the Monte
+ # Carlo.
+ weights = pd.merge(weights,mcpl[['run','evn','unique_id']],on=['run','evn'],how='left')
+
# There are a handful of weights which turn out to be slightly negative for
# some reason. For example:
#
@@ -776,8 +781,8 @@ if __name__ == '__main__':
# Set the random seed so we get reproducible results here
np.random.seed(0)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
- data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
+ data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
discoveries = 0
diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py
index 0533cd5..b1c9314 100644
--- a/utils/sddm/plot_energy.py
+++ b/utils/sddm/plot_energy.py
@@ -24,6 +24,7 @@ import pandas as pd
from scipy.special import logsumexp
from os.path import exists, join
from os import environ
+import hashlib
EPSILON = 1e-10
@@ -44,6 +45,26 @@ TRIG_NHIT_20 = 0x00000008
TRIG_NHIT_20_LB = 0x00000010
TRIG_PULSE_GT = 0x00000400
+def get_unique_id(df):
+ """
+ Create a unique ID field here by hashing the position of the event.
+ This field is later used to merge the GENIE systematics with the
+ Monte Carlo. Ideally we would have just used the run number and GENIE
+ event number but we can't do so because:
+
+ 1. A significant fraction of the Monte Carlo failed to run the first
+ time and so I had to rerun it through SNOMAN which means the SNOMAN
+ event numbers no longer correspond to the GENIE event number for
+ these events.
+ 2. Even so, the GENIE event number is still stored as the interaction
+ code in the MCPL file which is copied over to a pre-source MCVX
+ vertex object. However, unfortunately the pruner was set up to
+ delete these objects.
+
+ Note: This has to match up with the calculation in read_mcpl_df().
+ """
+ return hashlib.sha1(np.array([df['x'],df['y'],df['z']]).astype(int).view(np.uint8)).hexdigest()
+
def unwrap(p, delta, axis=-1):
"""
A modified version of np.unwrap() useful for unwrapping the 50 MHz clock.
@@ -463,6 +484,12 @@ def get_events(filenames, merge_fits=False, nhit_thresh=None, mc=False):
ev = pd.concat([read_hdf(filename, "ev").assign(filename=filename) for filename in filenames],ignore_index=True)
fits = pd.concat([read_hdf(filename, "fits") for filename in filenames],ignore_index=True)
rhdr = pd.concat([read_hdf(filename, "rhdr") for filename in filenames],ignore_index=True)
+ if mc:
+ mcgn = pd.concat([read_hdf(filename, "mcgn") for filename in filenames],ignore_index=True)
+ if len(mcgn):
+ mcgn = mcgn.groupby("evn").first().reset_index()
+ mcgn['unique_id'] = mcgn.apply(get_unique_id,axis=1)
+ ev = pd.merge(ev,mcgn[['evn','unique_id']],on=['evn'],how='left')
if nhit_thresh is not None:
ev = ev[ev.nhit_cal >= nhit_thresh]
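The unique ID is a SHA-1 digest of the integer-truncated event position. A minimal sketch (hypothetical coordinate values) of why the calculation in get_unique_id() has to match the one in read_mcpl_df(): both sides must reduce the same position to the same digest, since that digest is the merge key.

import hashlib
import numpy as np
import pandas as pd

pos = np.array([123.4, -56.7, 89.0])  # hypothetical event position

# get_unique_id() style: a dataframe row with x, y, z columns
row = pd.Series({'x': pos[0], 'y': pos[1], 'z': pos[2]})
id_ev = hashlib.sha1(np.array([row['x'], row['y'], row['z']])
                     .astype(int).view(np.uint8)).hexdigest()

# read_mcpl_df() style: the position slice of the MCPL track array
id_mcpl = hashlib.sha1(pos.astype(int).view(np.uint8)).hexdigest()

assert id_ev == id_mcpl  # both sides produce the same merge key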
diff --git a/utils/sddm/renormalize.py b/utils/sddm/renormalize.py
index 2ace984..b501a26 100644
--- a/utils/sddm/renormalize.py
+++ b/utils/sddm/renormalize.py
@@ -6,6 +6,7 @@ from . import *
import pandas as pd
from .flux_data import OSCILLATED_FLUX_DATA
from io import StringIO
+import hashlib
def read_mcpl(filename):
"""
@@ -94,10 +95,27 @@ def read_mcpl_df(filename):
r = np.linalg.norm(ev[0,16:19])
# Is the event a CC event?
cc = nu not in ev[:,1]
- data.append((run,evn,nu,energy,r,cc))
-
- run, evn, nu, energy, r, cc = zip(*data)
- return add_flux_weight(pd.DataFrame({'run':run,'evn':evn,'nu_id':nu,'nu_energy':energy,'nu_r':r,'cc':cc}))
+ # Create a unique ID field here by hashing the position of the event.
+ # This field is later used to merge the GENIE systematics with the
+ # Monte Carlo. Ideally we would have just used the run number and GENIE
+ # event number but we can't do so because:
+ #
+ # 1. A significant fraction of the Monte Carlo failed to run the
+ # first time and so I had to rerun it through SNOMAN which means
+ # the SNOMAN event numbers no longer correspond to the GENIE
+ # event number for these events.
+ # 2. Even so, the GENIE event number is still stored as the
+ # interaction code in the MCPL file which is copied over to a
+ # pre-source MCVX vertex object. However, unfortunately the
+ # pruner was set up to delete these objects.
+ #
+ # Note: This has to match up with the calculation in get_unique_id() in
+ # plot_energy.py.
+ unique_id = hashlib.sha1(ev[0,16:19].astype(int).view(np.uint8)).hexdigest()
+ data.append((run,evn,nu,energy,r,cc,unique_id))
+
+ run, evn, nu, energy, r, cc, unique_id = zip(*data)
+ return add_flux_weight(pd.DataFrame({'run':run,'evn':evn,'nu_id':nu,'nu_energy':energy,'nu_r':r,'cc':cc,'unique_id':unique_id}))
def load_mcpl_files(filenames):
@@ -124,6 +142,6 @@ def renormalize_data(data, mcpl):
MC dataframe so that the "flux_weight" column is available to renormalize
the tau neutrino data.
"""
- data = pd.merge(data,mcpl,on=['run','evn'])
+ data = pd.merge(data,mcpl[['run','unique_id','flux_weight']],on=['run','unique_id'])
data['flux_weight'] = data['flux_weight'].fillna(1.0)
return data
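Finally, a minimal sketch (toy dataframes with hypothetical values) of the renormalize_data() merge after this change: only the key columns and flux_weight are taken from the MCPL dataframe, so the merge runs on the new hash key and columns such as evn that exist on both sides are not duplicated with _x/_y suffixes.

import pandas as pd

data = pd.DataFrame({'run': [1], 'evn': [10],
                     'unique_id': ['a1b2'], 'ke': [100.0]})
mcpl = pd.DataFrame({'run': [1], 'evn': [10], 'unique_id': ['a1b2'],
                     'nu_energy': [500.0], 'flux_weight': [0.9]})

# Only run, unique_id, and flux_weight come from the MCPL side
data = pd.merge(data, mcpl[['run', 'unique_id', 'flux_weight']],
                on=['run', 'unique_id'])
data['flux_weight'] = data['flux_weight'].fillna(1.0)
print(data)  # run, evn, unique_id, ke, flux_weight -- no duplicated columns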