add energy scale to chi2 fit

author: tlatorre <tlatorre@uchicago.edu> 2020-06-29 07:52:43 -0500
committer: tlatorre <tlatorre@uchicago.edu> 2020-06-29 07:52:43 -0500
commit: 21420a805f214203819dd19ffc45fca9451a10bd (patch)
tree: 864b651123c9c0347b763fc501992a50f497dbe6
parent: 83a96dec3aec3d794dcf9b2d566d0cfb6547a0f5 (diff)
download: sddm-21420a805f214203819dd19ffc45fca9451a10bd.tar.gz
sddm-21420a805f214203819dd19ffc45fca9451a10bd.tar.bz2
sddm-21420a805f214203819dd19ffc45fca9451a10bd.zip
1 files changed, 102 insertions, 30 deletions
diff --git a/utils/chi2 b/utils/chi2
index 8c9e0e3..0460993 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -33,9 +33,14 @@ from scipy.stats import iqr, norm, beta
 from scipy.special import spence
 from itertools import izip_longest
 
+# Uncertainty on the energy scale
+# FIXME: Should get real number from stopping muons
+ENERGY_SCALE_MEAN = 1.0
+ENERGY_SCALE_UNCERTAINTY = 0.1
+
 particle_id = {20: 'e', 22: r'\mu'}
 
-def plot_hist2(df, muons=False, norm=1.0, color=None):
+def plot_hist2(df, norm=1.0, scale=1.0, color=None):
     for id, df_id in sorted(df.groupby('id')):
         if id == 20:
             plt.subplot(2,3,1)
@@ -48,14 +53,10 @@ def plot_hist2(df, muons=False, norm=1.0, color=None):
         elif id == 2222:
             plt.subplot(2,3,6)
 
-        if muons:
-            plt.hist(np.log10(df_id.ke.values/1000), bins=np.linspace(0,4.5,100), histtype='step')
-            plt.xlabel("log10(Energy (GeV))")
-        else:
-            bins = np.logspace(np.log10(20),np.log10(10e3),21)
-            plt.hist(df_id.ke.values, bins=bins, histtype='step', weights=np.tile(norm,len(df_id.ke.values)),color=color)
-            plt.gca().set_xscale("log")
-            plt.xlabel("Energy (MeV)")
+        bins = np.logspace(np.log10(20),np.log10(10e3),21)
+        plt.hist(df_id.ke.values*scale, bins=bins, histtype='step', weights=np.tile(norm,len(df_id.ke.values)),color=color)
+        plt.gca().set_xscale("log")
+        plt.xlabel("Energy (MeV)")
         plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
 
     if len(df):
@@ -93,8 +94,19 @@ def plot_hist(df, muons=False):
     if len(df):
         plt.tight_layout()
 
-def make_nll(data_hists, mc_hists):
+def make_nll(data, mc_hists):
     def nll(x, grad=None):
+        if x[0] < 0 or x[1] < 0:
+            return np.inf
+
+        data_hists = {}
+        for id in (20,22,2020,2022,2222):
+            df_id = data[data.id == id]
+            if len(df_id):
+                data_hists[id] = np.histogram(df_id.ke.values*x[1],bins=bins)[0]
+            else:
+                data_hists[id] = np.zeros(len(bins)-1,dtype=np.int)
+
         nll = 0
         for id in data_hists:
             N = data_hists[id].sum()
@@ -105,24 +117,53 @@ def make_nll(data_hists, mc_hists):
                 p += 1e-10
                 p /= p.sum()
                 nll -= multinomial.logpmf(data_hists[id],N,p)
-        return nll
+        return nll - norm.logpdf(x[1],ENERGY_SCALE_MEAN,ENERGY_SCALE_UNCERTAINTY)
     return nll
 
 def chi2(samples,expected):
     return np.sum((samples-expected)**2/expected,axis=-1)
 
-def get_multinomial_prob(data, mc, size=1000):
-    N = mc.sum()
-    # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
-    mc = mc + 1e-10
-    p = mc/mc.sum()
-    chi2_data = chi2(data,mc)
+def get_mc_hist(data,x,bins):
+    if len(data):
+        return np.histogram(data,bins=bins)[0]*x[0]/100.0
+    else:
+        return np.zeros(len(bins)-1,dtype=np.int)
+
+def get_data_hist(data,x,bins):
+    if len(data):
+        return np.histogram(data*x[1],bins=bins)[0]
+    else:
+        return np.zeros(len(bins)-1,dtype=np.int)
+
+def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000):
+    """
+    Returns the p-value that the histogram of the data is drawn from the MC
+    histogram.
+
+    Arguments:
+
+        data: 1D array of KE values
+        data_mc: 1D array of MC KE values
+        x_samples: MCMC samples of the floated parameters in the fit
+        bins: bins used to bin the mc histogram
+        size: number of values to compute
+    """
+    chi2_data = []
     chi2_samples = []
     for i in range(size):
+        x = x_samples[np.random.randint(x_samples.shape[0])]
+        mc = get_mc_hist(data_mc,x,bins)
+        N = mc.sum()
+        # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
+        mc = mc + 1e-10
+        p = mc/mc.sum()
+        data_hist = get_data_hist(data,x,bins)
+        chi2_data.append(chi2(data_hist,mc))
         n = np.random.poisson(N)
         samples = multinomial.rvs(n,p)
         chi2_samples.append(chi2(samples,mc))
     chi2_samples = np.array(chi2_samples)
+    chi2_data = np.array(chi2_data)
     return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)
 
 if __name__ == '__main__':
@@ -135,12 +176,14 @@ if __name__ == '__main__':
     from sddm.plot import *
     from sddm import setup_matplotlib
     import nlopt
+    import emcee
 
     parser = argparse.ArgumentParser("plot fit results")
     parser.add_argument("filenames", nargs='+', help="input files")
     parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds")
     parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files")
     parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)")
+    parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain")
     args = parser.parse_args()
 
     setup_matplotlib(args.save)
@@ -210,7 +253,7 @@ if __name__ == '__main__':
     for id in (20,22,2020,2022,2222):
         df_id = mc[mc.id == id]
         if len(df_id):
-            mc_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]
+            mc_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100
         else:
             mc_hists[id] = np.zeros(len(bins)-1,dtype=np.int)
 
@@ -226,34 +269,63 @@ if __name__ == '__main__':
     for id in (20,22,2020,2022,2222):
         df_id = atm_mc[atm_mc.id == id]
         if len(df_id):
-            mc_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]
+            mc_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100
         else:
             mc_atm_hists[id] = np.zeros(len(bins)-1,dtype=np.int)
 
-    nll = make_nll(data_hists,mc_hists)
+    nll = make_nll(data,mc_hists)
 
-    x0 = np.array([1.0])
+    x0 = np.array([1.0,1.0])
     opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
     opt.set_min_objective(nll)
-    low = np.array([1e-10])
-    high = np.array([10])
+    low = np.array([1e-10,1e-10])
+    high = np.array([10,10])
     opt.set_lower_bounds(low)
     opt.set_upper_bounds(high)
     opt.set_ftol_abs(1e-10)
-    opt.set_initial_step([0.01])
+    opt.set_initial_step([0.01,0.01])
 
     xopt = opt.optimize(x0)
+    print("xopt = ", xopt)
     nll_xopt = nll(xopt)
     print("nll(xopt) = ", nll(xopt))
 
+    pos = np.empty((10, len(x0)),dtype=np.double)
+    for i in range(pos.shape[0]):
+        pos[i] = xopt + np.random.randn(len(x0))*0.1
+        pos[i,:] = np.clip(pos[i,:],1e-10,10)
+
+    nwalkers, ndim = pos.shape
+
+    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x))
+    with np.errstate(invalid='ignore'):
+        sampler.run_mcmc(pos, args.steps)
+
+    print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
+
+    try:
+        print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
+    except Exception as e:
+        print(e)
+
+    samples = sampler.chain.reshape((-1,len(x0)))
+
+    plt.figure()
+    plt.subplot(2,2,1)
+    plt.hist(samples[:,0],bins=100,histtype='step')
+    plt.xlabel("Atmospheric Flux Scale")
+    plt.subplot(2,2,2)
+    plt.hist(samples[:,1],bins=100,histtype='step')
+    plt.xlabel("Energy Scale")
+
     prob = {}
     for id in (20,22,2020,2022,2222):
-        prob[id] = get_multinomial_prob(data_hists[id],mc_hists[id]*xopt[0])
+        prob[id] = get_multinomial_prob(data[data.id == id].ke.values,mc[mc.id == id].ke.values,samples,bins)
         print(id, prob[id])
 
     prob_atm = {}
     for id in (20,22,2020,2022,2222):
-        prob_atm[id] = get_multinomial_prob(data_atm_hists[id],mc_atm_hists[id]*xopt[0])
+        prob_atm[id] = get_multinomial_prob(atm[atm.id == id].ke.values,atm_mc[atm_mc.id == id].ke.values,samples,bins)
         print(id, prob_atm[id])
 
     handles = [Line2D([0], [0], color='C0'),
@@ -261,8 +333,8 @@ if __name__ == '__main__':
     labels = ('Data','Monte Carlo')
 
     fig = plt.figure()
-    plot_hist2(prompt,color='C0')
-    plot_hist2(prompt_mc,norm=xopt[0],color='C1')
+    plot_hist2(prompt,scale=xopt[1],color='C0')
+    plot_hist2(prompt_mc,norm=xopt[0]/100,color='C1')
     for id in (20,22,2020,2022,2222):
         if id == 20:
             plt.subplot(2,3,1)
@@ -284,8 +356,8 @@ if __name__ == '__main__':
     else:
         plt.suptitle("Without Neutron Follower")
     fig = plt.figure()
-    plot_hist2(atm,color='C0')
-    plot_hist2(atm_mc,norm=xopt[0],color='C1')
+    plot_hist2(atm,scale=xopt[1],color='C0')
+    plot_hist2(atm_mc,norm=xopt[0]/100,color='C1')
     for id in (20,22,2020,2022,2222):
         if id == 20:
             plt.subplot(2,3,1)
author	tlatorre <tlatorre@uchicago.edu>	2020-06-29 07:52:43 -0500
committer	tlatorre <tlatorre@uchicago.edu>	2020-06-29 07:52:43 -0500
commit	21420a805f214203819dd19ffc45fca9451a10bd (patch)
tree	864b651123c9c0347b763fc501992a50f497dbe6
parent	83a96dec3aec3d794dcf9b2d566d0cfb6547a0f5 (diff)
download	sddm-21420a805f214203819dd19ffc45fca9451a10bd.tar.gz sddm-21420a805f214203819dd19ffc45fca9451a10bd.tar.bz2 sddm-21420a805f214203819dd19ffc45fca9451a10bd.zip