diff options
author | tlatorre <tlatorre@uchicago.edu> | 2020-12-08 08:59:15 -0600 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2020-12-08 08:59:15 -0600 |
commit | ed6302abb7bf4c3304f2e8f7cb6c7462c7caaf4f (patch) | |
tree | e4636affbac86adcf746a500cbc704708f9b202f /utils | |
parent | 3716b099fe18ff458012084d344fce5f439d3532 (diff) | |
download | sddm-ed6302abb7bf4c3304f2e8f7cb6c7462c7caaf4f.tar.gz sddm-ed6302abb7bf4c3304f2e8f7cb6c7462c7caaf4f.tar.bz2 sddm-ed6302abb7bf4c3304f2e8f7cb6c7462c7caaf4f.zip |
update chi2 script to pick the best universe
Diffstat (limited to 'utils')
-rwxr-xr-x | utils/chi2 | 79 |
1 file changed, 46 insertions, 33 deletions
@@ -205,7 +205,7 @@ def get_data_hists(data,bins,scale=1.0): data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale return data_hists -def make_nll(data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_nll=False): +def make_nll(data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, reweight=False, print_nll=False): df_dict = {} for id in (20,22,2020,2022,2222): df_dict[id] = mc[mc.id == id] @@ -223,7 +223,7 @@ def make_nll(data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_ # Get the Monte Carlo histograms. We need to do this within the # likelihood function since we apply the energy resolution parameters # to the Monte Carlo. - mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor) + mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor,reweight=reweight) muon_hists = get_mc_hists_fast(df_dict_muon,x,bins,scale=1/muon_scale_factor) # Calculate the negative log of the likelihood of observing the data @@ -277,7 +277,7 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,atmo_scale_factor,muon_ mc_hists[id] += muon_hists[id]*x[5]/muon_scale_factor return mc_hists -def get_multinomial_prob(data, data_muon, data_mc, weights, atmo_scale_factor, muon_scale_factor, id, x_samples, bins, percentile=50.0, size=10000): +def get_multinomial_prob(data, data_muon, data_mc, atmo_scale_factor, muon_scale_factor, id, x_samples, bins, percentile=50.0, size=10000): """ Returns the p-value that the histogram of the data is drawn from the MC histogram. 
@@ -303,26 +303,13 @@ def get_multinomial_prob(data, data_muon, data_mc, weights, atmo_scale_factor, m data_hists = get_data_hists(data,bins) - # Get the total number of "universes" simulated in the GENIE reweight tool - if len(data_mc): - nuniverses = weights['universe'].max()+1 - else: - nuniverses = 0 - ps = [] for i in range(size): x = x_samples[np.random.randint(x_samples.shape[0])] muon_hists = get_mc_hists_fast(df_dict_muon,x,bins) - if nuniverses > 0: - universe = np.random.randint(nuniverses) - data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn']) - data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0) - else: - data_mc_with_weights = data_mc.copy() - data_mc['weight'] = 1.0 - mc = get_mc_hists_posterior(data_mc_with_weights,muon_hists,data_hists,atmo_scale_factor,muon_scale_factor,x,bins)[id] + mc = get_mc_hists_posterior(data_mc,muon_hists,data_hists,atmo_scale_factor,muon_scale_factor,x,bins)[id] N = mc.sum() # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think). 
mc = mc + 1e-10 @@ -344,14 +331,14 @@ def get_multinomial_prob(data, data_muon, data_mc, weights, atmo_scale_factor, m ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)) return np.percentile(ps,percentile) -def get_prob(data,muon,mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size): +def get_prob(data,muon,mc,atmo_scale_factor,muon_scale_factor,samples,bins,size): prob = {} for id in (20,22,2020,2022,2222): - prob[id] = get_multinomial_prob(data,muon,mc,weights,atmo_scale_factor,muon_scale_factor,id,samples,bins,size=size) + prob[id] = get_multinomial_prob(data,muon,mc,atmo_scale_factor,muon_scale_factor,id,samples,bins,size=size) print(id, prob[id]) return prob -def do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10): +def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10): """ Run the fit and return the minimum along with samples from running an MCMC starting near the minimum. @@ -362,13 +349,14 @@ def do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,prin external muons - data_mc: pandas dataframe representing the expected background from atmospheric neutrino events + - weights: pandas dataframe with the GENIE weights - bins: an array of bins to use for the fit - steps: the number of MCMC steps to run - Returns a tuple (xopt, samples) where samples is an array of shape (steps, - number of parameters). + Returns a tuple (xopt, universe, samples) where samples is an array of + shape (steps, number of parameters). 
""" - nll = make_nll(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll) + nll = make_nll(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll=print_nll) pos = np.empty((walkers, len(PRIORS)),dtype=np.double) for i in range(pos.shape[0]): @@ -409,7 +397,20 @@ def do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,prin opt.set_initial_step([0.01]*len(x0)) xopt = opt.optimize(x0) - return xopt, samples + # Get the total number of "universes" simulated in the GENIE reweight tool + nuniverses = weights['universe'].max()+1 + + nlls = [] + for universe in range(nuniverses): + data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn']) + data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0) + + nll = make_nll(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll) + nlls.append(nll(xopt)) + + universe = np.argmin(nlls) + + return xopt, universe, samples if __name__ == '__main__': import argparse @@ -570,10 +571,16 @@ if __name__ == '__main__': data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4]) data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0) - xopt, samples = do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin) + xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin) + + data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn']) + data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0) + + data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','evn']) + data_atm_mc_with_weights.weight = 
data_atm_mc_with_weights.weight.fillna(1.0) - prob = get_prob(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size) - prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size) + prob = get_prob(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size) + prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size) for id in (20,22,2020,2022,2222): p_values[id].append(prob[id]) @@ -665,7 +672,7 @@ if __name__ == '__main__': data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4]) data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0) - xopt, samples = do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin) + xopt, universe, samples = do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin) for i in range(len(FIT_PARS)): # The "pull plots" we make here are actually produced via a @@ -696,10 +703,16 @@ if __name__ == '__main__': sys.exit(0) - xopt, samples = do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin) + xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin) + + data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn']) + data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0) + + data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == universe],how='left',on=['run','evn']) + data_atm_mc_with_weights.weight = 
data_atm_mc_with_weights.weight.fillna(1.0) - prob = get_prob(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size) - prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size) + prob = get_prob(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size) + prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size) plt.figure() for i in range(len(FIT_PARS)): @@ -724,7 +737,7 @@ if __name__ == '__main__': fig = plt.figure() hists = get_data_hists(data,bins) hists_muon = get_mc_hists(muon,xopt,bins,scale=xopt[5]/muon_scale_factor) - hists_mc = get_mc_hists(data_mc,xopt,bins,scale=xopt[0]/atmo_scale_factor) + hists_mc = get_mc_hists(data_mc_with_weights,xopt,bins,scale=xopt[0]/atmo_scale_factor,reweight=True) plot_hist2(hists,bins=bins,color='C0') plot_hist2(hists_mc,bins=bins,color='C1') plot_hist2(hists_muon,bins=bins,color='C2') @@ -751,7 +764,7 @@ if __name__ == '__main__': fig = plt.figure() hists = get_data_hists(data_atm,bins) hists_muon = get_mc_hists(muon_atm,xopt,bins,scale=xopt[5]/muon_scale_factor) - hists_mc = get_mc_hists(data_atm_mc,xopt,bins,scale=xopt[0]/atmo_scale_factor) + hists_mc = get_mc_hists(data_atm_mc_with_weights,xopt,bins,scale=xopt[0]/atmo_scale_factor,reweight=True) plot_hist2(hists,bins=bins,color='C0') plot_hist2(hists_mc,bins=bins,color='C1') plot_hist2(hists_muon,bins=bins,color='C2') |