From 6b9de6e4c01e71fd74494eb6782b99527adffe26 Mon Sep 17 00:00:00 2001
From: tlatorre
Date: Fri, 9 Oct 2020 11:33:36 -0500
Subject: update chi2 with scale factors for the MC

This commit updates the chi2 script to add scale factors for the
atmospheric neutrino scale and muon scale parameters. The reason for
this is that these parameters should now have a much easier
interpretation than before. Now, the atmospheric neutrino scale
parameter is relative to the expected atmospheric neutrino flux (i.e.
we expect the fit to return something close to 1), and the muon scale
parameter is the total number of expected muons in our sample, which is
exactly what the data cleaning analysis gives us.
---
 utils/chi2 | 73 ++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 38 insertions(+), 35 deletions(-)

diff --git a/utils/chi2 b/utils/chi2
index 30a0fbe..be99626 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -71,21 +71,21 @@ FIT_PARS = [
 # - For the electron + muon energy resolution, I don't have any real good way
 #   to estimate this, so we are conservative and set the error to 10%.
 PRIORS = [
-    0.01, # Atmospheric Neutrino Scale
+    1.0, # Atmospheric Neutrino Scale
     0.0, # Electron energy scale
     0.0, # Electron energy resolution
     0.066, # Muon energy scale
     0.0, # Muon energy resolution
-    0.01, # Muon scale
+    0.0, # Muon scale
 ]
 
 PRIOR_UNCERTAINTIES = [
-    0.002, # Atmospheric Neutrino Scale
+    0.2 , # Atmospheric Neutrino Scale
     0.1, # Electron energy scale
     0.1, # Electron energy resolution
     0.011, # Muon energy scale
     0.014, # Muon energy resolution
-    0.001, # Muon scale
+    10.0, # Muon scale
 ]
 
 # Lower bounds for the fit parameters
@@ -201,7 +201,7 @@ def get_data_hists(data,bins,scale=1.0):
         data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale
     return data_hists
 
-def make_nll(data, muons, mc, bins, print_nll=False):
+def make_nll(data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_nll=False):
     df_dict = {}
     for id in (20,22,2020,2022,2222):
         df_dict[id] = mc[mc.id == id]
@@ -219,8 +219,8 @@ def make_nll(data, muons, mc, bins, print_nll=False):
         # Get the Monte Carlo histograms. We need to do this within the
         # likelihood function since we apply the energy resolution parameters
         # to the Monte Carlo.
-        mc_hists = get_mc_hists_fast(df_dict,x,bins)
-        muon_hists = get_mc_hists_fast(df_dict_muon,x,bins)
+        mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor)
+        muon_hists = get_mc_hists_fast(df_dict_muon,x,bins,scale=1/muon_scale_factor)
 
         # Calculate the negative log of the likelihood of observing the data
         # given the fit parameters
@@ -242,7 +242,7 @@ def make_nll(data, muons, mc, bins, print_nll=False):
             return nll
     return nll
 
-def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
+def get_mc_hists_posterior(data_mc,muon_hists,data_hists,atmo_scale_factor,muon_scale_factor,x,bins):
     """
     Returns the posterior on the Monte Carlo histograms.
 
@@ -268,12 +268,12 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
     """
     mc_hists = get_mc_hists(data_mc,x,bins,reweight=True)
     for id in (20,22,2020,2022,2222):
-        mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0])
+        mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0]/atmo_scale_factor)
         # FIXME: does the orering of when we add the muons matter here?
-        mc_hists[id] += muon_hists[id]*x[5]
+        mc_hists[id] += muon_hists[id]*x[5]/muon_scale_factor
     return mc_hists
 
-def get_multinomial_prob(data, data_muon, data_mc, weights, id, x_samples, bins, percentile=50.0, size=10000):
+def get_multinomial_prob(data, data_muon, data_mc, weights, atmo_scale_factor, muon_scale_factor, id, x_samples, bins, percentile=50.0, size=10000):
     """
     Returns the p-value that the histogram of the data is drawn from the MC
     histogram.
@@ -318,7 +318,7 @@ def get_multinomial_prob(data, data_muon, data_mc, weights, id, x_samples, bins,
         else:
             data_mc_with_weights = data_mc.copy()
             data_mc['weight'] = 1.0
-        mc = get_mc_hists_posterior(data_mc_with_weights,muon_hists,data_hists,x,bins)[id]
+        mc = get_mc_hists_posterior(data_mc_with_weights,muon_hists,data_hists,atmo_scale_factor,muon_scale_factor,x,bins)[id]
         N = mc.sum()
         # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
         mc = mc + 1e-10
@@ -340,14 +340,14 @@ def get_multinomial_prob(data, data_muon, data_mc, weights, id, x_samples, bins,
         ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples))
     return np.percentile(ps,percentile)
 
-def get_prob(data,muon,mc,weights,samples,bins,size):
+def get_prob(data,muon,mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size):
     prob = {}
     for id in (20,22,2020,2022,2222):
-        prob[id] = get_multinomial_prob(data,muon,mc,weights,id,samples,bins,size=size)
+        prob[id] = get_multinomial_prob(data,muon,mc,weights,atmo_scale_factor,muon_scale_factor,id,samples,bins,size=size)
         print(id, prob[id])
     return prob
 
-def do_fit(data,muon,data_mc,bins,steps,print_nll=False,walkers=100,thin=10):
+def do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10):
     """
     Run the fit and return the minimum along with samples from running an MCMC
     starting near the minimum.
@@ -364,7 +364,7 @@ def do_fit(data,muon,data_mc,bins,steps,print_nll=False,walkers=100,thin=10):
     Returns a tuple (xopt, samples) where samples is an array of shape (steps,
     number of parameters).
     """
-    nll = make_nll(data,muon,data_mc,bins,print_nll)
+    nll = make_nll(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll)
 
     pos = np.empty((walkers, len(PRIORS)),dtype=np.double)
     for i in range(pos.shape[0]):
@@ -487,6 +487,9 @@ if __name__ == '__main__':
 
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
 
+    atmo_scale_factor = 100.0
+    muon_scale_factor = len(muon) + len(muon_atm)
+
     if args.coverage:
         p_values = {id: [] for id in (20,22,2020,2022,2222)}
         p_values_atm = {id: [] for id in (20,22,2020,2022,2222)}
@@ -501,10 +504,10 @@ if __name__ == '__main__':
 
         for i in range(args.coverage):
             # Calculate expected number of events
-            N = len(data_mc)*xtrue[0]
-            N_atm = len(data_atm_mc)*xtrue[0]
-            N_muon = len(muon)*xtrue[5]
-            N_muon_atm = len(muon_atm)*xtrue[5]
+            N = len(data_mc)*xtrue[0]/atmo_scale_factor
+            N_atm = len(data_atm_mc)*xtrue[0]/atmo_scale_factor
+            N_muon = len(muon)*xtrue[5]/muon_scale_factor
+            N_muon_atm = len(muon_atm)*xtrue[5]/muon_scale_factor
 
             # Calculate observed number of events
             n = np.random.poisson(N)
@@ -529,10 +532,10 @@ if __name__ == '__main__':
             data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
             data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
 
-            xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
+            xopt, samples = do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
-            prob = get_prob(data,muon,data_mc,weights,samples,bins,size=args.multinomial_prob_size)
-            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,samples,bins,size=args.multinomial_prob_size)
+            prob = get_prob(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
+            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
 
             for id in (20,22,2020,2022,2222):
                 p_values[id].append(prob[id])
@@ -596,10 +599,10 @@ if __name__ == '__main__':
             xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
 
             # Calculate expected number of events
-            N = len(data_mc)*xtrue[0]
-            N_atm = len(data_atm_mc)*xtrue[0]
-            N_muon = len(muon)*xtrue[5]
-            N_muon_atm = len(muon_atm)*xtrue[5]
+            N = len(data_mc)*xtrue[0]/atmo_scale_factor
+            N_atm = len(data_atm_mc)*xtrue[0]/atmo_scale_factor
+            N_muon = len(muon)*xtrue[5]/muon_scale_factor
+            N_muon_atm = len(muon_atm)*xtrue[5]/muon_scale_factor
 
             # Calculate observed number of events
             n = np.random.poisson(N)
@@ -624,7 +627,7 @@ if __name__ == '__main__':
             data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
             data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
 
-            xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
+            xopt, samples = do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
             for i in range(len(FIT_PARS)):
                 # The "pull plots" we make here are actually produced via a
@@ -652,10 +655,10 @@ if __name__ == '__main__':
 
         sys.exit(0)
 
-    xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
+    xopt, samples = do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
-    prob = get_prob(data,muon,data_mc,weights,samples,bins,size=args.multinomial_prob_size)
-    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,samples,bins,size=args.multinomial_prob_size)
+    prob = get_prob(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
+    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,atmo_scale_factor,muon_scale_factor,samples,bins,size=args.multinomial_prob_size)
 
     plt.figure()
     for i in range(len(FIT_PARS)):
@@ -679,8 +682,8 @@ if __name__ == '__main__':
 
     fig = plt.figure()
    hists = get_data_hists(data,bins)
-    hists_muon = get_mc_hists(muon,xopt,bins,scale=xopt[5])
-    hists_mc = get_mc_hists(data_mc,xopt,bins,scale=xopt[0])
+    hists_muon = get_mc_hists(muon,xopt,bins,scale=xopt[5]/muon_scale_factor)
+    hists_mc = get_mc_hists(data_mc,xopt,bins,scale=xopt[0]/atmo_scale_factor)
     plot_hist2(hists,bins=bins,color='C0')
     plot_hist2(hists_mc,bins=bins,color='C1')
     plot_hist2(hists_muon,bins=bins,color='C2')
@@ -706,8 +709,8 @@ if __name__ == '__main__':
     plt.suptitle("Without Neutron Follower")
     fig = plt.figure()
     hists = get_data_hists(data_atm,bins)
-    hists_muon = get_mc_hists(muon_atm,xopt,bins,scale=xopt[5])
-    hists_mc = get_mc_hists(data_atm_mc,xopt,bins,scale=xopt[0])
+    hists_muon = get_mc_hists(muon_atm,xopt,bins,scale=xopt[5]/muon_scale_factor)
+    hists_mc = get_mc_hists(data_atm_mc,xopt,bins,scale=xopt[0]/atmo_scale_factor)
     plot_hist2(hists,bins=bins,color='C0')
     plot_hist2(hists_mc,bins=bins,color='C1')
     plot_hist2(hists_muon,bins=bins,color='C2')
-- 
cgit
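
To see what the reparametrization buys in practice, here is a minimal standalone sketch of the expected-event arithmetic introduced by this patch. It is not part of utils/chi2, and the event counts below are hypothetical placeholders (not taken from the analysis); only the scale-factor logic mirrors the patched code.

    # Standalone Python illustration of the new parametrization (hypothetical numbers).
    atmo_scale_factor = 100.0        # fixed constant, as set in the patch
    muon_scale_factor = 1000         # stands in for len(muon) + len(muon_atm)

    n_data_mc = 250000               # hypothetical number of atmospheric-neutrino MC events
    n_muon_mc_total = muon_scale_factor  # every tagged muon MC event, both samples combined

    x0 = 1.0    # atmospheric neutrino scale: ~1 now means "nominal atmospheric flux"
    x5 = 35.0   # muon scale: the total number of expected muons in the sample

    # Mirrors N = len(data_mc)*xtrue[0]/atmo_scale_factor and
    # N_muon = len(muon)*xtrue[5]/muon_scale_factor from the patch.
    expected_atm = n_data_mc * x0 / atmo_scale_factor          # 2500 expected atmospheric events
    expected_muon_total = n_muon_mc_total * x5 / muon_scale_factor  # = 35.0, i.e. x5 itself

    print(expected_atm, expected_muon_total)

In the script itself the tagged-muon MC is split between the with- and without-neutron-follower samples, so each sample gets its share of x5; summed over both, the expected muon count is x5, matching the interpretation given in the commit message.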