-rwxr-xr-x  utils/chi2                | 21
-rwxr-xr-x  utils/dm-search           | 15
-rw-r--r--  utils/sddm/plot_energy.py | 27
-rw-r--r--  utils/sddm/renormalize.py | 28

4 files changed, 73 insertions, 18 deletions
diff --git a/utils/chi2 b/utils/chi2
@@ -409,7 +409,7 @@ def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,st
     nlls = []
     for universe in range(nuniverses):
-        data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','evn'])
+        data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','unique_id'])
         data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)

         nll = make_nll(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll)
@@ -418,7 +418,7 @@
     universe = np.argmin(nlls)

     if refit:
-        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
         data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)

         # Create a new negative log likelihood function with the weighted Monte Carlo.
@@ -530,6 +530,11 @@ if __name__ == '__main__':
     mcpl = load_mcpl_files(args.mcpl)
     ev_mc = renormalize_data(ev_mc.reset_index(),mcpl)

+    # Merge weights with MCPL dataframe to get the unique id column in the
+    # weights dataframe since that is what we use to merge with the Monte
+    # Carlo.
+    weights = pd.merge(weights,mcpl[['run','evn','unique_id']],on=['run','evn'],how='left')
+
     # There are a handful of weights which turn out to be slightly negative for
     # some reason. For example:
     #
@@ -625,8 +630,8 @@ if __name__ == '__main__':
         xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)

-        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
-        data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
+        data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])

         for i in range(args.coverage):
             # Calculate expected number of events
@@ -660,10 +665,10 @@ if __name__ == '__main__':
             xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)

-            data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+            data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
             data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)

-            data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+            data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
             data_atm_mc_with_weights.weight = data_atm_mc_with_weights.weight.fillna(1.0)

             prob = get_prob(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
@@ -792,10 +797,10 @@ if __name__ == '__main__':
     xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)

-    data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+    data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
     data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)

-    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
     data_atm_mc_with_weights.weight = data_atm_mc_with_weights.weight.fillna(1.0)

     prob = get_prob(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
diff --git a/utils/dm-search b/utils/dm-search
index 91d9b87..dd4946b 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -350,7 +350,7 @@ def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale
     nlls = []
     for universe in range(nuniverses):
-        data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','evn'])
+        data_mc_with_weights = pd.merge(data_mc,weights_dict[universe],how='left',on=['run','unique_id'])
         data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)

         nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll,dm_sample=dm_sample)
@@ -359,7 +359,7 @@
     universe = np.argmin(nlls)

     if refit:
-        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
         data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)

         # Create a new negative log likelihood function with the weighted Monte Carlo.
@@ -467,7 +467,7 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
         dm_energy = dm_mass
         xopt, universe, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)

-        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','unique_id'])
         data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)

         limit = np.percentile(samples[:,6],90)
@@ -609,6 +609,11 @@ if __name__ == '__main__':
     mcpl = load_mcpl_files(args.mcpl)
     ev_mc = renormalize_data(ev_mc.reset_index(),mcpl)

+    # Merge weights with MCPL dataframe to get the unique id column in the
+    # weights dataframe since that is what we use to merge with the Monte
+    # Carlo.
+    weights = pd.merge(weights,mcpl[['run','evn','unique_id']],on=['run','evn'],how='left')
+
     # There are a handful of weights which turn out to be slightly negative for
     # some reason. For example:
     #
@@ -776,8 +781,8 @@ if __name__ == '__main__':
     # Set the random seed so we get reproducible results here
     np.random.seed(0)

-    data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
-    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+    data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
+    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])

     discoveries = 0
diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py
index 0533cd5..b1c9314 100644
--- a/utils/sddm/plot_energy.py
+++ b/utils/sddm/plot_energy.py
@@ -24,6 +24,7 @@ import pandas as pd
 from scipy.special import logsumexp
 from os.path import exists, join
 from os import environ
+import hashlib

 EPSILON = 1e-10

@@ -44,6 +45,26 @@ TRIG_NHIT_20 = 0x00000008
 TRIG_NHIT_20_LB = 0x00000010
 TRIG_PULSE_GT = 0x00000400

+def get_unique_id(df):
+    """
+    Create a unique ID field here by hashing the position of the event.
+    This field is later used to merge the GENIE systematics with the
+    Monte Carlo. Ideally we would have just used the run number and GENIE
+    event number but we can't do so because:
+
+    1. A significant fraction of the Monte Carlo failed to run the first
+       time and so I had to rerun it through SNOMAN which means the SNOMAN
+       event numbers no longer correspond to the GENIE event number for
+       these events.
+    2. Even so, the GENIE event number is still stored as the interaction
+       code in the MCPL file which is copied over to a pre-source MCVX
+       vertex object. However, unfortunately the pruner was set up to
+       delete these objects
+
+    Note: This has to match up with the calculation in read_mcpl_df().
+    """
+    return hashlib.sha1(np.array([df['x'],df['y'],df['z']]).astype(int).view(np.uint8)).hexdigest()
+
 def unwrap(p, delta, axis=-1):
     """
     A modified version of np.unwrap() useful for unwrapping the 50 MHz clock.
@@ -463,6 +484,12 @@ def get_events(filenames, merge_fits=False, nhit_thresh=None, mc=False):
     ev = pd.concat([read_hdf(filename, "ev").assign(filename=filename) for filename in filenames],ignore_index=True)
     fits = pd.concat([read_hdf(filename, "fits") for filename in filenames],ignore_index=True)
     rhdr = pd.concat([read_hdf(filename, "rhdr") for filename in filenames],ignore_index=True)
+    if mc:
+        mcgn = pd.concat([read_hdf(filename, "mcgn") for filename in filenames],ignore_index=True)
+        if len(mcgn):
+            mcgn = mcgn.groupby("evn").first().reset_index()
+            mcgn['unique_id'] = mcgn.apply(get_unique_id,axis=1)
+            ev = pd.merge(ev,mcgn[['evn','unique_id']],on=['evn'],how='left')

     if nhit_thresh is not None:
         ev = ev[ev.nhit_cal >= nhit_thresh]
diff --git a/utils/sddm/renormalize.py b/utils/sddm/renormalize.py
index 2ace984..b501a26 100644
--- a/utils/sddm/renormalize.py
+++ b/utils/sddm/renormalize.py
@@ -6,6 +6,7 @@ from . import *
 import pandas as pd
 from .flux_data import OSCILLATED_FLUX_DATA
 from io import StringIO
+import hashlib

 def read_mcpl(filename):
     """
@@ -94,10 +95,27 @@ def read_mcpl_df(filename):
             r = np.linalg.norm(ev[0,16:19])
             # Is the event a CC event?
             cc = nu not in ev[:,1]
-            data.append((run,evn,nu,energy,r,cc))
-
-    run, evn, nu, energy, r, cc = zip(*data)
-    return add_flux_weight(pd.DataFrame({'run':run,'evn':evn,'nu_id':nu,'nu_energy':energy,'nu_r':r,'cc':cc}))
+            # Create a unique ID field here by hashing the position of the event.
+            # This field is later used to merge the GENIE systematics with the
+            # Monte Carlo. Ideally we would have just used the run number and GENIE
+            # event number but we can't do so because:
+            #
+            # 1. A significant fraction of the Monte Carlo failed to run the
+            #    first time and so I had to rerun it through SNOMAN which means
+            #    the SNOMAN event numbers no longer correspond to the GENIE
+            #    event number for these events.
+            # 2. Even so, the GENIE event number is still stored as the
+            #    interaction code in the MCPL file which is copied over to a
+            #    pre-source MCVX vertex object. However, unfortunately the
+            #    pruner was set up to delete these objects
+            #
+            # Note: This has to match up with the calculation in get_unique_id() in
+            # plot_energy.py.
+            unique_id = hashlib.sha1(ev[0,16:19].astype(int).view(np.uint8)).hexdigest()
+            data.append((run,evn,nu,energy,r,cc,unique_id))
+
+    run, evn, nu, energy, r, cc, unique_id = zip(*data)
+    return add_flux_weight(pd.DataFrame({'run':run,'evn':evn,'nu_id':nu,'nu_energy':energy,'nu_r':r,'cc':cc,'unique_id':unique_id}))

 def load_mcpl_files(filenames):
     """
@@ -124,6 +142,6 @@ def renormalize_data(data, mcpl):
     MC dataframe so that the "flux_weight" column is available to renormalize
     the tau neutrino data.
     """
-    data = pd.merge(data,mcpl,on=['run','evn'])
+    data = pd.merge(data,mcpl[['run','unique_id','flux_weight']],on=['run','unique_id'])
     data['flux_weight'] = data['flux_weight'].fillna(1.0)
     return data
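The core idea of the patch, sketched as a small standalone example below: both read_mcpl_df() and get_unique_id() derive a key by SHA-1 hashing the integer-truncated event position, so the GENIE systematic weights can be attached to the Monte Carlo with a left merge on ['run','unique_id'], with any unmatched event falling back to a weight of 1.0. The dataframes, their values, and the position_hash() helper here are invented for illustration; only the hashing convention and the merge/fillna pattern follow the patch.

import hashlib

import numpy as np
import pandas as pd

def position_hash(x, y, z):
    # Same convention as get_unique_id()/read_mcpl_df(): truncate the event
    # position to integers and SHA-1 the raw bytes, so both sides produce
    # identical keys for the same event.
    return hashlib.sha1(np.array([x, y, z]).astype(int).view(np.uint8)).hexdigest()

# Toy MCPL (GENIE) side: run, GENIE event number and vertex position.
mcpl = pd.DataFrame({'run': [10000, 10000], 'evn': [1, 2],
                     'x': [123.4, -56.7], 'y': [0.1, 89.2], 'z': [-300.0, 12.5]})
mcpl['unique_id'] = mcpl.apply(lambda row: position_hash(row.x, row.y, row.z), axis=1)

# Toy GENIE systematic weights, keyed by run and GENIE event number.
weights = pd.DataFrame({'run': [10000], 'evn': [1], 'universe': [0], 'weight': [0.9]})
# Pick up unique_id from the MCPL table, as the patch does in chi2/dm-search.
weights = pd.merge(weights, mcpl[['run', 'evn', 'unique_id']], on=['run', 'evn'], how='left')

# Toy Monte Carlo side: the SNOMAN event numbers are unreliable, but the
# vertex positions are unchanged, so the hash reproduces the same key.
data_mc = pd.DataFrame({'run': [10000, 10000],
                        'x': [123.4, -56.7], 'y': [0.1, 89.2], 'z': [-300.0, 12.5]})
data_mc['unique_id'] = data_mc.apply(lambda row: position_hash(row.x, row.y, row.z), axis=1)

# Attach the weights by position hash; events without a weight default to 1.0.
data_mc = pd.merge(data_mc, weights[weights.universe == 0], how='left', on=['run', 'unique_id'])
data_mc.weight = data_mc.weight.fillna(1.0)
print(data_mc[['run', 'unique_id', 'weight']])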