1 files changed, 110 insertions, 103 deletions
diff --git a/utils/chi2 b/utils/chi2
index 3b519e2..444638b 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -36,13 +36,15 @@ from sddm.stats import *
 
 # Uncertainty on the energy scale
 # FIXME: Should get real number from stopping muons
-ENERGY_SCALE_MEAN = 1.0
-ENERGY_SCALE_UNCERTAINTY = 0.1
+ENERGY_SCALE_MEAN = {'e': 1.0, 'u': 1.0, 'eu': 1.0}
+ENERGY_SCALE_UNCERTAINTY = {'e':0.1, 'u': 0.1, 'eu': 0.1}
+ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0}
+ENERGY_RESOLUTION_UNCERTAINTY = {'e':0.1, 'u': 0.1, 'eu': 0.1}
 
 particle_id = {20: 'e', 22: r'\mu'}
 
-def plot_hist2(df, norm=1.0, scale=1.0, color=None):
-    for id, df_id in sorted(df.groupby('id')):
+def plot_hist2(hists, bins, color=None):
+    for id in (20,22,2020,2022,2222):
         if id == 20:
             plt.subplot(2,3,1)
         elif id == 22:
@@ -54,8 +56,8 @@ def plot_hist2(df, norm=1.0, scale=1.0, color=None):
         elif id == 2222:
             plt.subplot(2,3,6)
 
-        bins = np.logspace(np.log10(20),np.log10(10e3),21)
-        plt.hist(df_id.ke.values*scale, bins=bins, histtype='step', weights=np.tile(norm,len(df_id.ke.values)),color=color)
+        bincenters = (bins[1:] + bins[:-1])/2
+        plt.hist(bincenters, bins=bins, histtype='step', weights=hists[id],color=color)
         plt.gca().set_xscale("log")
         plt.xlabel("Energy (MeV)")
         plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
@@ -63,69 +65,85 @@ def plot_hist2(df, norm=1.0, scale=1.0, color=None):
     if len(df):
         plt.tight_layout()
 
-def plot_hist(df, muons=False):
-    for id, df_id in sorted(df.groupby('id')):
-        if id == 20:
-            plt.subplot(3,4,1)
-        elif id == 22:
-            plt.subplot(3,4,2)
-        elif id == 2020:
-            plt.subplot(3,4,5)
+def get_mc_hists(data,x,bins,apply_norm=False):
+    mc_hists = {}
+
+    # FIXME: May need to increase number of bins here
+    bins2 = np.logspace(np.log10(20),np.log10(10e3),1000)
+    bincenters2 = (bins2[1:] + bins2[:-1])/2
+
+    for id in (20,22,2020,2022,2222):
+        ke = data[data.id == id].ke.values
+
+        if id in (20,2020):
+            ke = ke*x[1]
+            scale = bincenters2*x[2]
+        elif id in (22,2222):
+            ke = ke*x[3]
+            scale = bincenters2*x[4]
         elif id == 2022:
-            plt.subplot(3,4,6)
-        elif id == 2222:
-            plt.subplot(3,4,7)
-        elif id == 202020:
-            plt.subplot(3,4,9)
-        elif id == 202022:
-            plt.subplot(3,4,10)
-        elif id == 202222:
-            plt.subplot(3,4,11)
-        elif id == 222222:
-            plt.subplot(3,4,12)
-
-        if muons:
-            plt.hist(np.log10(df_id.ke.values/1000), bins=np.linspace(0,4.5,100), histtype='step')
-            plt.xlabel("log10(Energy (GeV))")
-        else:
-            plt.hist(df_id.ke.values, bins=np.linspace(20,10e3,100), histtype='step')
-            plt.xlabel("Energy (MeV)")
-        plt.title(str(id))
+            ke = ke*x[5]
+            scale = bincenters2*x[6]
 
-    if len(df):
-        plt.tight_layout()
+        hist = np.histogram(ke,bins=bins2)[0]
+
+        cdf = norm.cdf(bins[:,np.newaxis],bincenters2,scale)*hist
+        mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1)
+        if apply_norm:
+            mc_hists[id] *= x[0]
+    return mc_hists
+
+def get_data_hists(data,bins):
+    data_hists = {}
+    for id in (20,22,2020,2022,2222):
+        df_id = data[data.id == id]
+        data_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]
+    return data_hists
+
+# Likelihood Fit Parameters
+# 0 - Atmospheric Neutrino Flux Scale
+# 1 - Electron energy bias
+# 2 - Electron energy resolution
+# 3 - Muon energy bias
+# 4 - Muon energy resolution
+# 5 - Electron + Muon energy bias
+# 6 - Electron + Muon energy resolution
+
+def make_nll(data, mc, bins):
+    data_hists = get_data_hists(data,bins)
 
-def make_nll(data, mc_hists):
     def nll(x, grad=None):
-        if x[0] < 0 or x[1] < 0:
+        if any(x[i] < 0 for i in range(len(x))):
             return np.inf
 
-        data_hists = {}
-        for id in (20,22,2020,2022,2222):
-            df_id = data[data.id == id]
-            if len(df_id):
-                data_hists[id] = np.histogram(df_id.ke.values*x[1],bins=bins)[0]
-            else:
-                data_hists[id] = np.zeros(len(bins)-1,dtype=np.int)
+        mc_hists = get_mc_hists(mc,x,bins,apply_norm=True)
 
         nll = 0
         for id in data_hists:
-            oi = data_hists[id].sum()
+            oi = data_hists[id]
             ei = mc_hists[id] + EPSILON
-            N = oi.sum()
-            n = ei.sum()
+            N = ei.sum()
             nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei))
-        return nll - norm.logpdf(x[1],ENERGY_SCALE_MEAN,ENERGY_SCALE_UNCERTAINTY)
+        nll -= norm.logpdf(x[1],ENERGY_SCALE_MEAN['e'],ENERGY_SCALE_UNCERTAINTY['e'])
+        nll -= norm.logpdf(x[3],ENERGY_SCALE_MEAN['u'],ENERGY_SCALE_UNCERTAINTY['u'])
+        nll -= norm.logpdf(x[5],ENERGY_SCALE_MEAN['eu'],ENERGY_SCALE_UNCERTAINTY['eu'])
+        nll -= norm.logpdf(x[2],ENERGY_RESOLUTION_MEAN['e'],ENERGY_RESOLUTION_UNCERTAINTY['e'])
+        nll -= norm.logpdf(x[4],ENERGY_RESOLUTION_MEAN['u'],ENERGY_RESOLUTION_UNCERTAINTY['u'])
+        nll -= norm.logpdf(x[6],ENERGY_RESOLUTION_MEAN['eu'],ENERGY_RESOLUTION_UNCERTAINTY['eu'])
+        print("nll = %.2f" % nll)
+        return nll
     return nll
 
-def get_mc_hist(data_mc,data_hist,x,bins):
-    hist = np.histogram(data_mc,bins=bins)[0]
-    return get_mc_hist_posterior(hist,data_hist,norm=x[0]/100.0)
+def get_mc_hists_posterior(data,data_hists,x,bins):
+    mc_hists = get_mc_hists(data,x,bins)
+    for id in (20,22,2020,2022,2222):
+        mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0])
+    return mc_hists
 
 def get_data_hist(data,x,bins):
-    return np.histogram(data*x[1],bins=bins)[0]
+    return np.histogram(data,bins=bins)[0]
 
-def get_multinomial_prob(data, data_mc, x_samples, bins, percentile=99.0, size=10000):
+def get_multinomial_prob(data, data_mc, id, x_samples, bins, percentile=99.0, size=10000):
     """
     Returns the p-value that the histogram of the data is drawn from the MC
     histogram.
@@ -145,16 +163,17 @@ def get_multinomial_prob(data, data_mc, x_samples, bins, percentile=99.0, size=1
         bins: bins used to bin the mc histogram
         size: number of values to compute
     """
+    data_hists = get_data_hists(data,bins)
+
     ps = []
     for i in range(size):
         x = x_samples[np.random.randint(x_samples.shape[0])]
-        data_hist = get_data_hist(data,x,bins)
-        mc = get_mc_hist(data_mc,data_hist,x,bins)
+        mc = get_mc_hists_posterior(data_mc,data_hists,x,bins)[id]
         N = mc.sum()
         # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
         mc = mc + 1e-10
         p = mc/mc.sum()
-        chi2_data = nllr(data_hist,mc)
+        chi2_data = nllr(data_hists[id],mc)
         # To draw the multinomial samples we first draw the expected number of
         # events from a Poisson distribution and then loop over the counts and
         # unique values. The reason we do this is that you can't call
@@ -254,59 +273,28 @@ if __name__ == '__main__':
     mc = prompt_mc
 
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
-    data_hists = {}
-    mc_hists = {}
-    for id in (20,22,2020,2022,2222):
-        df_id = data[data.id == id]
-        if len(df_id):
-            data_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]
-        else:
-            data_hists[id] = np.zeros(len(bins)-1,dtype=np.int)
 
-    for id in (20,22,2020,2022,2222):
-        df_id = mc[mc.id == id]
-        if len(df_id):
-            mc_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100
-        else:
-            mc_hists[id] = np.zeros(len(bins)-1,dtype=np.int)
-
-    data_atm_hists = {}
-    mc_atm_hists = {}
-    for id in (20,22,2020,2022,2222):
-        df_id = atm[atm.id == id]
-        if len(df_id):
-            data_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]
-        else:
-            data_atm_hists[id] = np.zeros(len(bins)-1,dtype=np.int)
+    nll = make_nll(data,mc,bins)
 
-    for id in (20,22,2020,2022,2222):
-        df_id = atm_mc[atm_mc.id == id]
-        if len(df_id):
-            mc_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100
-        else:
-            mc_atm_hists[id] = np.zeros(len(bins)-1,dtype=np.int)
-
-    nll = make_nll(data,mc_hists)
-
-    x0 = np.array([1.0,1.0])
+    x0 = np.array([1.0,1.0,EPSILON,1.0,EPSILON,1.0,EPSILON])
     opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
     opt.set_min_objective(nll)
-    low = np.array([1e-10,1e-10])
-    high = np.array([10,10])
+    low = np.array([EPSILON]*len(x0))
+    high = np.array([10]*len(x0))
     opt.set_lower_bounds(low)
     opt.set_upper_bounds(high)
     opt.set_ftol_abs(1e-10)
-    opt.set_initial_step([0.01,0.01])
+    opt.set_initial_step([0.01]*len(x0))
 
     xopt = opt.optimize(x0)
     print("xopt = ", xopt)
     nll_xopt = nll(xopt)
     print("nll(xopt) = ", nll(xopt))
 
-    pos = np.empty((10, len(x0)),dtype=np.double)
+    pos = np.empty((20, len(x0)),dtype=np.double)
     for i in range(pos.shape[0]):
         pos[i] = xopt + np.random.randn(len(x0))*0.1
-        pos[i,:] = np.clip(pos[i,:],1e-10,10)
+        pos[i,:] = np.clip(pos[i,:],low,high)
 
     nwalkers, ndim = pos.shape
 
@@ -324,21 +312,36 @@ if __name__ == '__main__':
     samples = sampler.chain.reshape((-1,len(x0)))
 
     plt.figure()
-    plt.subplot(2,2,1)
+    plt.subplot(3,3,1)
     plt.hist(samples[:,0],bins=100,histtype='step')
     plt.xlabel("Atmospheric Flux Scale")
-    plt.subplot(2,2,2)
+    plt.subplot(3,3,2)
+    plt.hist(samples[:,1],bins=100,histtype='step')
+    plt.xlabel("Electron Energy Scale")
+    plt.subplot(3,3,3)
+    plt.hist(samples[:,1],bins=100,histtype='step')
+    plt.xlabel("Electron Energy Resolution")
+    plt.subplot(3,3,4)
+    plt.hist(samples[:,1],bins=100,histtype='step')
+    plt.xlabel("Muon Energy Scale")
+    plt.subplot(3,3,5)
+    plt.hist(samples[:,1],bins=100,histtype='step')
+    plt.xlabel("Muon Energy Resolution")
+    plt.subplot(3,3,6)
+    plt.hist(samples[:,1],bins=100,histtype='step')
+    plt.xlabel("Electron + Muon Energy Scale")
+    plt.subplot(3,3,7)
     plt.hist(samples[:,1],bins=100,histtype='step')
-    plt.xlabel("Energy Scale")
+    plt.xlabel("Electron + Muon Energy Resolution")
 
     prob = {}
     for id in (20,22,2020,2022,2222):
-        prob[id] = get_multinomial_prob(data[data.id == id].ke.values,mc[mc.id == id].ke.values,samples,bins)
+        prob[id] = get_multinomial_prob(data,mc,id,samples,bins)
         print(id, prob[id])
 
     prob_atm = {}
     for id in (20,22,2020,2022,2222):
-        prob_atm[id] = get_multinomial_prob(atm[atm.id == id].ke.values,atm_mc[atm_mc.id == id].ke.values,samples,bins)
+        prob_atm[id] = get_multinomial_prob(atm,atm_mc,id,samples,bins)
         print(id, prob_atm[id])
 
     handles = [Line2D([0], [0], color='C0'),
@@ -346,8 +349,10 @@ if __name__ == '__main__':
     labels = ('Data','Monte Carlo')
 
     fig = plt.figure()
-    plot_hist2(prompt,scale=xopt[1],color='C0')
-    plot_hist2(prompt_mc,norm=xopt[0]/100,color='C1')
+    hists = get_data_hists(prompt,bins)
+    hists_mc = get_mc_hists(prompt_mc,xopt,bins,apply_norm=True)
+    plot_hist2(hists,bins=bins,color='C0')
+    plot_hist2(hists_mc,bins=bins,color='C1')
     for id in (20,22,2020,2022,2222):
         if id == 20:
             plt.subplot(2,3,1)
@@ -369,8 +374,10 @@ if __name__ == '__main__':
     else:
         plt.suptitle("Without Neutron Follower")
     fig = plt.figure()
-    plot_hist2(atm,scale=xopt[1],color='C0')
-    plot_hist2(atm_mc,norm=xopt[0]/100,color='C1')
+    hists = get_data_hists(atm,bins)
+    hists_mc = get_mc_hists(atm_mc,xopt,bins,apply_norm=True)
+    plot_hist2(hists,bins=bins,color='C0')
+    plot_hist2(hists_mc,bins=bins,color='C1')
     for id in (20,22,2020,2022,2222):
         if id == 20:
             plt.subplot(2,3,1)