author    tlatorre <tlatorre@uchicago.edu>  2020-12-16 13:43:25 -0600
committer tlatorre <tlatorre@uchicago.edu>  2020-12-16 13:43:25 -0600
commit    834df6cfc5737db2b2f4c0a1432c97960ed65e76 (patch)
tree      88eca1deb4909ff5c966f0707dc96d3bcfd96318
parent    61c0e06952f368db54fe6cf86eac6b4fb2a95c44 (diff)
use a hash to merge weights with MC data
-rwxr-xr-x  utils/chi2                 | 21
-rwxr-xr-x  utils/dm-search            | 15
-rw-r--r--  utils/sddm/plot_energy.py  | 27
-rw-r--r--  utils/sddm/renormalize.py  | 28
4 files changed, 73 insertions(+), 18 deletions(-)
diff --git a/utils/chi2 b/utils/chi2
index e30a058..6888158 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -409,7 +409,7 @@ def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,st
nlls = []
for universe in range(nuniverses):
- data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
nll = make_nll(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll)
@@ -418,7 +418,7 @@ def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,st
universe = np.argmin(nlls)
if refit:
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
# Create a new negative log likelihood function with the weighted Monte Carlo.
@@ -530,6 +530,11 @@ if __name__ == '__main__':
mcpl = load_mcpl_files(args.mcpl)
ev_mc = renormalize_data(ev_mc.reset_index(),mcpl)
+ # Merge weights with MCPL dataframe to get the unique id column in the
+ # weights dataframe since that is what we use to merge with the Monte
+ # Carlo.
+ weights = pd.merge(weights,mcpl[['run','evn','unique_id']],on=['run','evn'],how='left')
+
# There are a handful of weights which turn out to be slightly negative for
# some reason. For example:
#
@@ -625,8 +630,8 @@ if __name__ == '__main__':
xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
- data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
+ data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
for i in range(args.coverage):
# Calculate expected number of events
@@ -660,10 +665,10 @@ if __name__ == '__main__':
xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
- data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_atm_mc_with_weights.weight = data_atm_mc_with_weights.weight.fillna(1.0)
prob = get_prob(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
@@ -792,10 +797,10 @@ if __name__ == '__main__':
xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
- data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_atm_mc_with_weights.weight = data_atm_mc_with_weights.weight.fillna(1.0)
prob = get_prob(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
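The same change is applied to utils/dm-search below. For reference, here is a minimal sketch (toy dataframes with hypothetical values, not part of the patch) of the two-step merge pattern this commit adopts: first attach unique_id to the GENIE weights via the MCPL dataframe, then left-merge the weights onto the Monte Carlo on ['run', 'unique_id'], filling events that have no systematic weight with 1.0.

import pandas as pd

# Hypothetical minimal inputs
weights = pd.DataFrame({'run': [1, 1], 'evn': [10, 11],
                        'universe': [0, 0], 'weight': [1.2, 0.8]})
mcpl = pd.DataFrame({'run': [1, 1], 'evn': [10, 11],
                     'unique_id': ['a1b2', 'c3d4']})
data_mc = pd.DataFrame({'run': [1, 1, 1],
                        'unique_id': ['a1b2', 'c3d4', 'e5f6'],
                        'ke': [100.0, 200.0, 300.0]})

# Step 1: pick up unique_id, the key shared with the Monte Carlo
weights = pd.merge(weights, mcpl[['run', 'evn', 'unique_id']],
                   on=['run', 'evn'], how='left')

# Step 2: merge onto the MC; events with no GENIE weight keep weight = 1.0
data_mc_with_weights = pd.merge(data_mc, weights[weights.universe == 0],
                                how='left', on=['run', 'unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
print(data_mc_with_weights[['run', 'unique_id', 'weight']])

The left merge keeps every Monte Carlo event, so the fillna(1.0) leaves events without a GENIE reweighting entry at their nominal weight.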
diff --git a/utils/dm-search b/utils/dm-search
index 91d9b87..dd4946b 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -350,7 +350,7 @@ def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale
nlls = []
for universe in range(nuniverses):
- data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll,dm_sample=dm_sample)
@@ -359,7 +359,7 @@ def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale
universe = np.argmin(nlls)
if refit:
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
# Create a new negative log likelihood function with the weighted Monte Carlo.
@@ -467,7 +467,7 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
dm_energy = dm_mass
xopt, universe, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
limit = np.percentile(samples[:,6],90)
@@ -609,6 +609,11 @@ if __name__ == '__main__':
mcpl = load_mcpl_files(args.mcpl)
ev_mc = renormalize_data(ev_mc.reset_index(),mcpl)
+ # Merge weights with MCPL dataframe to get the unique id column in the
+ # weights dataframe since that is what we use to merge with the Monte
+ # Carlo.
+ weights = pd.merge(weights,mcpl[['run','evn','unique_id']],on=['run','evn'],how='left')
+
# There are a handful of weights which turn out to be slightly negative for
# some reason. For example:
#
@@ -776,8 +781,8 @@ if __name__ == '__main__':
# Set the random seed so we get reproducible results here
np.random.seed(0)
- data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
- data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
+ data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
discoveries = 0
diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py
index 0533cd5..b1c9314 100644
--- a/utils/sddm/plot_energy.py
+++ b/utils/sddm/plot_energy.py
@@ -24,6 +24,7 @@ import pandas as pd
from scipy.special import logsumexp
from os.path import exists, join
from os import environ
+import hashlib
EPSILON = 1e-10
@@ -44,6 +45,26 @@ TRIG_NHIT_20 = 0x00000008
TRIG_NHIT_20_LB = 0x00000010
TRIG_PULSE_GT = 0x00000400
+def get_unique_id(df):
+ """
+ Create a unique ID field here by hashing the position of the event.
+ This field is later used to merge the GENIE systematics with the
+ Monte Carlo. Ideally we would have just used the run number and GENIE
+ event number but we can't do so because:
+
+ 1. A significant fraction of the Monte Carlo failed to run the first
+ time and so I had to rerun it through SNOMAN which means the SNOMAN
+ event numbers no longer correspond to the GENIE event number for
+ these events.
+ 2. Even so, the GENIE event number is still stored as the interaction
+ code in the MCPL file which is copied over to a pre-source MCVX
+ vertex object. However, unfortunately the pruner was set up to
+ delete these objects.
+
+ Note: This has to match up with the calculation in read_mcpl_df().
+ """
+ return hashlib.sha1(np.array([df['x'],df['y'],df['z']]).astype(int).view(np.uint8)).hexdigest()
+
def unwrap(p, delta, axis=-1):
"""
A modified version of np.unwrap() useful for unwrapping the 50 MHz clock.
@@ -463,6 +484,12 @@ def get_events(filenames, merge_fits=False, nhit_thresh=None, mc=False):
ev = pd.concat([read_hdf(filename, "ev").assign(filename=filename) for filename in filenames],ignore_index=True)
fits = pd.concat([read_hdf(filename, "fits") for filename in filenames],ignore_index=True)
rhdr = pd.concat([read_hdf(filename, "rhdr") for filename in filenames],ignore_index=True)
+ if mc:
+ mcgn = pd.concat([read_hdf(filename, "mcgn") for filename in filenames],ignore_index=True)
+ if len(mcgn):
+ mcgn = mcgn.groupby("evn").first().reset_index()
+ mcgn['unique_id'] = mcgn.apply(get_unique_id,axis=1)
+ ev = pd.merge(ev,mcgn[['evn','unique_id']],on=['evn'],how='left')
if nhit_thresh is not None:
ev = ev[ev.nhit_cal >= nhit_thresh]
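The unique ID is a SHA-1 digest of the integer-truncated event position. A minimal sketch (hypothetical coordinate values) of why the calculation in get_unique_id() has to match the one in read_mcpl_df(): both sides must reduce the same position to the same digest, since that digest is the merge key.

import hashlib
import numpy as np
import pandas as pd

pos = np.array([123.4, -56.7, 89.0])  # hypothetical event position

# get_unique_id() style: a dataframe row with x, y, z columns
row = pd.Series({'x': pos[0], 'y': pos[1], 'z': pos[2]})
id_ev = hashlib.sha1(np.array([row['x'], row['y'], row['z']])
                     .astype(int).view(np.uint8)).hexdigest()

# read_mcpl_df() style: the position slice of the MCPL track array
id_mcpl = hashlib.sha1(pos.astype(int).view(np.uint8)).hexdigest()

assert id_ev == id_mcpl  # both sides produce the same merge key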
diff --git a/utils/sddm/renormalize.py b/utils/sddm/renormalize.py
index 2ace984..b501a26 100644
--- a/utils/sddm/renormalize.py
+++ b/utils/sddm/renormalize.py
@@ -6,6 +6,7 @@ from . import *
import pandas as pd
from .flux_data import OSCILLATED_FLUX_DATA
from io import StringIO
+import hashlib
def read_mcpl(filename):
"""
@@ -94,10 +95,27 @@ def read_mcpl_df(filename):
r = np.linalg.norm(ev[0,16:19])
# Is the event a CC event?
cc = nu not in ev[:,1]
- data.append((run,evn,nu,energy,r,cc))
-
- run, evn, nu, energy, r, cc = zip(*data)
- return add_flux_weight(pd.DataFrame({'run':run,'evn':evn,'nu_id':nu,'nu_energy':energy,'nu_r':r,'cc':cc}))
+ # Create a unique ID field here by hashing the position of the event.
+ # This field is later used to merge the GENIE systematics with the
+ # Monte Carlo. Ideally we would have just used the run number and GENIE
+ # event number but we can't do so because:
+ #
+ # 1. A significant fraction of the Monte Carlo failed to run the
+ # first time and so I had to rerun it through SNOMAN which means
+ # the SNOMAN event numbers no longer correspond to the GENIE
+ # event number for these events.
+ # 2. Even so, the GENIE event number is still stored as the
+ # interaction code in the MCPL file which is copied over to a
+ # pre-source MCVX vertex object. However, unfortunately the
+ # pruner was set up to delete these objects.
+ #
+ # Note: This has to match up with the calculation in get_unique_id() in
+ # plot_energy.py.
+ unique_id = hashlib.sha1(ev[0,16:19].astype(int).view(np.uint8)).hexdigest()
+ data.append((run,evn,nu,energy,r,cc,unique_id))
+
+ run, evn, nu, energy, r, cc, unique_id = zip(*data)
+ return add_flux_weight(pd.DataFrame({'run':run,'evn':evn,'nu_id':nu,'nu_energy':energy,'nu_r':r,'cc':cc,'unique_id':unique_id}))
def load_mcpl_files(filenames):
@@ -124,6 +142,6 @@ def renormalize_data(data, mcpl):
MC dataframe so that the "flux_weight" column is available to renormalize
the tau neutrino data.
"""
- data = pd.merge(data,mcpl,on=['run','evn'])
+ data = pd.merge(data,mcpl[['run','unique_id','flux_weight']],on=['run','unique_id'])
data['flux_weight'] = data['flux_weight'].fillna(1.0)
return data
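Finally, a minimal sketch (toy dataframes with hypothetical values) of the renormalize_data() merge after this change: only the key columns and flux_weight are taken from the MCPL dataframe, so the merge runs on the new hash key and columns such as evn that exist on both sides are not duplicated with _x/_y suffixes.

import pandas as pd

data = pd.DataFrame({'run': [1], 'evn': [10],
                     'unique_id': ['a1b2'], 'ke': [100.0]})
mcpl = pd.DataFrame({'run': [1], 'evn': [10], 'unique_id': ['a1b2'],
                     'nu_energy': [500.0], 'flux_weight': [0.9]})

# Only run, unique_id, and flux_weight come from the MCPL side
data = pd.merge(data, mcpl[['run', 'unique_id', 'flux_weight']],
                on=['run', 'unique_id'])
data['flux_weight'] = data['flux_weight'].fillna(1.0)
print(data)  # run, evn, unique_id, ke, flux_weight -- no duplicated columns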