From 51f7458944d0b3a84ab051e7f11437e1a352e638 Mon Sep 17 00:00:00 2001 From: tlatorre Date: Mon, 27 Jul 2020 11:52:41 -0500 Subject: update how we calculate the p-value in chi2 --- utils/chi2 | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'utils') diff --git a/utils/chi2 b/utils/chi2 index f1ae974..4515b91 100755 --- a/utils/chi2 +++ b/utils/chi2 @@ -121,18 +121,25 @@ def make_nll(data, mc_hists): return nll - norm.logpdf(x[1],ENERGY_SCALE_MEAN,ENERGY_SCALE_UNCERTAINTY) return nll -def get_mc_hist(data,x,bins): - hist = np.histogram(data,bins=bins)[0] - return sample_mc_hist(hist,norm=x[0]/100.0) +def get_mc_hist(data_mc,data_hist,x,bins): + hist = np.histogram(data_mc,bins=bins)[0] + return get_mc_hist_posterior(hist,data_hist,norm=x[0]/100.0) def get_data_hist(data,x,bins): return np.histogram(data*x[1],bins=bins)[0] -def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000): +def get_multinomial_prob(data, data_mc, x_samples, bins, percentile=99.0, size=10000): """ Returns the p-value that the histogram of the data is drawn from the MC histogram. + The p-value is calculated by first sampling the posterior of the fit + parameters `size` times. For each iteration we calculate a p-value. We then + return the `percentile` percentile of all the p-values. This approach is + similar to both the supremum and posterior predictive methods of + calculating a p-value. For more information on different methods of + calculating p-values see https://cds.cern.ch/record/1099967/files/p23.pdf. 
+ Arguments: data: 1D array of KE values @@ -141,23 +148,31 @@ def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000): bins: bins used to bin the mc histogram size: number of values to compute """ - chi2_data = [] - chi2_samples = [] + ps = [] for i in range(size): x = x_samples[np.random.randint(x_samples.shape[0])] - mc = get_mc_hist(data_mc,x,bins) + data_hist = get_data_hist(data,x,bins) + mc = get_mc_hist(data_mc,data_hist,x,bins) N = mc.sum() # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think). mc = mc + 1e-10 p = mc/mc.sum() - data_hist = get_data_hist(data,x,bins) - chi2_data.append(chi2(data_hist,mc)) - n = np.random.poisson(N) - samples = multinomial.rvs(n,p) - chi2_samples.append(chi2(samples,mc)) - chi2_samples = np.array(chi2_samples) - chi2_data = np.array(chi2_data) - return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples) + chi2_data = nllr(data_hist,mc) + # To draw the multinomial samples we first draw the expected number of + # events from a Poisson distribution and then loop over the counts and + # unique values. The reason we do this is that you can't call + # multinomial.rvs with a multidimensional `n` array, and looping over every + # single entry takes forever + ns = np.random.poisson(N,size=1000) + samples = [] + for n, count in zip(*np.unique(ns,return_counts=True)): + samples.append(multinomial.rvs(n,p,size=count)) + samples = np.concatenate(samples) + # Calculate the negative log likelihood ratio for the data simulated under + # the null hypothesis + chi2_samples = nllr(samples,mc) + ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)) + return np.percentile(ps,percentile) if __name__ == '__main__': import argparse -- cgit