From 031c70a265f31fcc73379f5153570cd926138462 Mon Sep 17 00:00:00 2001
From: tlatorre
Date: Mon, 31 Aug 2020 10:43:27 -0500
Subject: add GENIE weights to chi2 analysis

---
 utils/chi2       | 66 +++++++++++++++++++++++++++++++++++++++++---------------
 utils/sddm/dc.py |  7 +++---
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/utils/chi2 b/utils/chi2
index b1c3c17..0ae7cb8 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -84,7 +84,7 @@ def plot_hist2(hists, bins, color=None):
     if len(df):
         plt.tight_layout()
 
-def get_mc_hists(data,x,bins,apply_norm=False):
+def get_mc_hists(data,x,bins,apply_norm=False,reweight=False):
     """
     Returns the expected Monte Carlo histograms for the atmospheric neutrino
     background.
@@ -113,9 +113,17 @@ def get_mc_hists(data,x,bins,apply_norm=False):
     ke_dict = {}
     for id in (20,22,2020,2022,2222):
         ke_dict[id] = data[data.id == id].ke.values
-    return get_mc_hists_fast(ke_dict,x,bins,apply_norm)
 
-def get_mc_hists_fast(ke_dict,x,bins,apply_norm=False):
+    if reweight:
+        ke_weights_dict = {}
+        for id in (20,22,2020,2022,2222):
+            ke_weights_dict[id] = data[data.id == id].weight.values
+    else:
+        ke_weights_dict = None
+
+    return get_mc_hists_fast(ke_dict,x,bins,apply_norm,weights_dict=ke_weights_dict)
+
+def get_mc_hists_fast(ke_dict,x,bins,apply_norm=False,weights_dict=None):
     """
     Same as get_mc_hists() but the first argument is a dictionary mapping
     particle id -> kinetic energy array. This is much faster than selecting the
@@ -140,7 +148,10 @@
             ke = ke*x[5]
         scale = bincenters2*max(EPSILON,x[6])
 
-        hist = np.histogram(ke,bins=bins2)[0]
+        if weights_dict:
+            hist = np.histogram(ke,bins=bins2,weights=weights_dict[id])[0]
+        else:
+            hist = np.histogram(ke,bins=bins2)[0]
         cdf = norm.cdf(bins[:,np.newaxis],bincenters2,scale)*hist
         mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1)
 
@@ -252,7 +263,7 @@ def make_nll(data, muons, mc, bins):
         return nll
     return nll
 
-def get_mc_hists_posterior(ke_dict,muon_hists,data_hists,x,bins):
+def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
     """
     Returns the posterior on the Monte Carlo histograms.
 
@@ -276,7 +287,7 @@
 
     Returns a dictionary mapping particle id combo -> histogram.
     """
-    mc_hists = get_mc_hists_fast(ke_dict,x,bins)
+    mc_hists = get_mc_hists(data_mc,x,bins,reweight=True)
     for id in (20,22,2020,2022,2222):
         mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0])
     # FIXME: does the ordering of when we add the muons matter here?
@@ -306,14 +317,20 @@
     data_hists = get_data_hists(data,bins)
     muon_hists = get_data_hists(data_muon,bins)
 
-    ke_dict = {}
-    for _id in (20,22,2020,2022,2222):
-        ke_dict[_id] = data_mc[data_mc.id == _id].ke.values
+    # Get the total number of "universes" simulated in the GENIE reweight tool
+    if len(data_mc):
+        nuniverses = data_mc['universe'].max()+1
+    else:
+        nuniverses = 0
 
     ps = []
     for i in range(size):
         x = x_samples[np.random.randint(x_samples.shape[0])]
-        mc = get_mc_hists_posterior(ke_dict,muon_hists,data_hists,x,bins)[id]
+        if nuniverses > 0:
+            universe = np.random.randint(nuniverses)
+        else:
+            universe = 0
+        mc = get_mc_hists_posterior(data_mc[data_mc.universe == universe],muon_hists,data_hists,x,bins)[id]
         N = mc.sum()
         # Fix a bug in scipy. See https://github.com/scipy/scipy/issues/8235 (I think).
         mc = mc + 1e-10
@@ -376,7 +393,7 @@ def do_fit(data,muon,data_mc,bins,steps):
     nll_xopt = nll(xopt)
     print("nll(xopt) = ", nll(xopt))
 
-    stepsizes = estimate_errors(nll,xopt,low,high,constraints)
+    stepsizes = estimate_errors(nll,xopt,low,high)
     with printoptions(precision=3, suppress=True):
         print("Errors: ", stepsizes)
 
@@ -424,6 +441,7 @@ if __name__ == '__main__':
     parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain")
     parser.add_argument("--multinomial-prob-size", type=int, default=10000, help="number of p values to compute")
    parser.add_argument("--coverage", type=int, default=0, help="plot p value coverage")
+    parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files")
     args = parser.parse_args()
 
     setup_matplotlib(args.save)
@@ -439,6 +457,7 @@
 
     ev_mc = get_events(args.mc, merge_fits=True, nhit_thresh=args.nhit_thresh)
     muon_mc = get_events(args.muon_mc, merge_fits=True, nhit_thresh=args.nhit_thresh)
+    weights = pd.concat([read_hdf(filename, "weights") for filename in args.weights],ignore_index=True)
 
     # Set all prompt events in the MC to be muons
     muon_mc.loc[muon_mc.prompt & muon_mc.filename.str.contains("cosmic"),'muon'] = True
@@ -484,6 +503,19 @@
     data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
     data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]
 
+    # Merge the reweight scales only after we've done all the data cleaning and
+    # high level cuts
+    data_mc_with_weights = pd.merge(data_mc,weights,how='left',on=['run','evn'])
+    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights,how='left',on=['run','evn'])
+
+    # Make sure we are only using data with weights
+    #
+    # FIXME: There is probably a better way to do this. For example, we could
+    # use the full dataset for the fit and then scale the normalization factor
+    # by the ratio of events with weights to the total number of events.
+    data_mc = data_mc_with_weights[data_mc_with_weights.universe == 0]
+    data_atm_mc = data_atm_mc_with_weights[data_atm_mc_with_weights.universe == 0]
+
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
 
     if args.coverage:
@@ -519,8 +551,8 @@
             n_muon_atm = np.random.poisson(N_muon_atm)
 
             # Sample data from Monte Carlo
-            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True)))
-            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+            data = pd.concat((data_mc.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
 
             # Smear the energies by the additional energy resolution
             data.ke += np.random.randn(len(data.ke))*data.ke*energy_resolution
@@ -533,8 +565,8 @@
                 std = np.std(samples[:,i])
                 pull[i].append((mean - true_values[i])/std)
 
-            prob = get_prob(data,muon,data_mc,samples,bins,size=args.multinomial_prob_size)
-            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,samples,bins,size=args.multinomial_prob_size)
+            prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
 
             for id in (20,22,2020,2022,2222):
                 p_values[id].append(prob[id])
@@ -613,8 +645,8 @@
 
     xopt, samples = do_fit(data,muon,data_mc,bins,args.steps)
 
-    prob = get_prob(data,muon,data_mc,samples,bins,size=args.multinomial_prob_size)
-    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,samples,bins,size=args.multinomial_prob_size)
+    prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
 
     plt.figure()
     plt.subplot(3,3,1)
diff --git a/utils/sddm/dc.py b/utils/sddm/dc.py
index 1146213..461769e 100755
--- a/utils/sddm/dc.py
+++ b/utils/sddm/dc.py
@@ -93,7 +93,7 @@ def get_proposal_func(stepsizes, low, high):
         return x, log_p_x0_given_x - log_p_x_given_x0
     return proposal
 
-def estimate_errors(nll, xopt, low, high, constraints):
+def estimate_errors(nll, xopt, low, high, constraints=None):
     """
     Return approximate 1 sigma errors for each parameter assuming all
     parameters are uncorrelated.
@@ -115,11 +115,12 @@ def estimate_errors(nll, xopt, low, high, constraints):
     errors = np.empty_like(xopt)
 
     for i in range(len(xopt)):
-        if np.sign(root(low[i],xopt,i)) == np.sign(nll_xopt):
+        if np.sign(root(low[i],xopt,i)) == np.sign(root(xopt[i],xopt,i)):
             xlo = xopt[i] - low[i]
         else:
             xlo = brentq(root,low[i],xopt[i],args=(xopt,i))
-        if np.sign(root(high[i],xopt,i)) == np.sign(nll_xopt):
+
+        if np.sign(root(high[i],xopt,i)) == np.sign(root(xopt[i],xopt,i)):
             xhi = high[i] - xopt[i]
         else:
             xhi = brentq(root,xopt[i],high[i],args=(xopt,i))
-- 
cgit
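
A note on the weight merge: with how='left' on ['run','evn'], every MC event is
kept and gains one row per GENIE universe, while events with no matching weight
come through with NaN in the weight columns. The universe == 0 cut therefore
also drops the unmatched events, which is exactly what the FIXME above is
flagging. A toy illustration with made-up values (column names follow the
script):

    import pandas as pd

    data_mc = pd.DataFrame({'run': [100, 100, 101],
                            'evn': [1, 2, 1],
                            'ke':  [50.0, 80.0, 120.0]})
    # Two universes of weights, for run 100 event 1 only
    weights = pd.DataFrame({'run': [100, 100],
                            'evn': [1, 1],
                            'universe': [0, 1],
                            'weight': [0.9, 1.1]})

    merged = pd.merge(data_mc, weights, how='left', on=['run', 'evn'])
    # Unmatched events have universe == NaN, so this keeps only the nominal
    # universe of the events that actually have weights
    nominal = merged[merged.universe == 0]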
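
On the reweighting itself: get_mc_hists_fast() only threads the per-event
weight array into np.histogram() before the usual energy-resolution smearing.
A minimal standalone sketch of that fill-then-smear step; smeared_hist,
fine_bins, and res_scale are stand-ins for the script's internal bins2 binning
and the x[6] resolution parameter:

    import numpy as np
    from scipy.stats import norm

    def smeared_hist(ke, bins, fine_bins, res_scale, weights=None):
        # Fill a fine-grained histogram of the kinetic energies; `weights`
        # are the per-event GENIE reweight factors when provided
        bincenters = (fine_bins[1:] + fine_bins[:-1])/2
        hist = np.histogram(ke, bins=fine_bins, weights=weights)[0]
        # Smear each fine bin with a Gaussian whose width grows linearly
        # with energy, then integrate the result over the output bins
        scale = bincenters*res_scale
        cdf = norm.cdf(bins[:, np.newaxis], bincenters, scale)*hist
        return np.sum(cdf[1:] - cdf[:-1], axis=-1)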
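
The p value loop in get_multinomial_prob() then propagates the GENIE
cross-section systematics by drawing one universe uniformly at random on every
iteration and rebuilding the posterior MC histograms from just that slice. A
sketch of the draw (draw_universe is a hypothetical helper; it assumes
universes are numbered consecutively from 0, which is what the max()+1 count
implies):

    import numpy as np

    def draw_universe(data_mc):
        # Total number of "universes" simulated by the GENIE reweight tool
        nuniverses = int(data_mc['universe'].max()) + 1 if len(data_mc) else 0
        # Fall back to the nominal universe when there are no weights at all
        universe = np.random.randint(nuniverses) if nuniverses > 0 else 0
        return data_mc[data_mc.universe == universe]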
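
Finally, the dc.py fix: root() (defined just above that hunk) is presumably
nll with parameter i set to x, shifted down by nll(xopt) + 0.5, so it is
always negative (-0.5) at the optimum. Comparing the boundary sign against
np.sign(nll_xopt) was therefore wrong whenever the NLL at the optimum happened
to be positive: the signs would differ even with no crossing, and brentq()
would be handed an interval without a sign change. A self-contained sketch of
the corrected search under that assumption (one_sigma_interval is
hypothetical, and unlike the fragment above it returns distances from the
optimum on both branches):

    import numpy as np
    from scipy.optimize import brentq

    def one_sigma_interval(nll, xopt, i, low, high):
        # Approximate 1 sigma interval for parameter i with all other
        # parameters held fixed at the optimum (i.e. assumed uncorrelated)
        nll_xopt = nll(xopt)

        def root(x, xopt, i):
            # Zero where the NLL has risen by 1/2 above its minimum
            xtest = xopt.copy()
            xtest[i] = x
            return nll(xtest) - nll_xopt - 0.5

        # root() is -0.5 at the optimum, so a matching sign at the boundary
        # means the 1/2 rise never happens inside the allowed range; fall
        # back to the distance to the boundary in that case
        if np.sign(root(low, xopt, i)) == np.sign(root(xopt[i], xopt, i)):
            xlo = xopt[i] - low
        else:
            xlo = xopt[i] - brentq(root, low, xopt[i], args=(xopt, i))
        if np.sign(root(high, xopt, i)) == np.sign(root(xopt[i], xopt, i)):
            xhi = high - xopt[i]
        else:
            xhi = brentq(root, xopt[i], high, args=(xopt, i)) - xopt[i]
        return xlo, xhi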