 utils/chi2 | 45 ++++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 15 deletions(-)
diff --git a/utils/chi2 b/utils/chi2
index f1ae974..4515b91 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -121,18 +121,25 @@ def make_nll(data, mc_hists):
         return nll - norm.logpdf(x[1],ENERGY_SCALE_MEAN,ENERGY_SCALE_UNCERTAINTY)
     return nll
 
-def get_mc_hist(data,x,bins):
-    hist = np.histogram(data,bins=bins)[0]
-    return sample_mc_hist(hist,norm=x[0]/100.0)
+def get_mc_hist(data_mc,data_hist,x,bins):
+    hist = np.histogram(data_mc,bins=bins)[0]
+    return get_mc_hist_posterior(hist,data_hist,norm=x[0]/100.0)
 
 def get_data_hist(data,x,bins):
     return np.histogram(data*x[1],bins=bins)[0]
 
-def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000):
+def get_multinomial_prob(data, data_mc, x_samples, bins, percentile=99.0, size=10000):
     """
     Returns the p-value that the histogram of the data is drawn from the MC
     histogram.
 
+    The p-value is calculated by first sampling the posterior of the fit
+    parameters `size` times. For each iteration we calculate a p-value. We then
+    return the `percentile` percentile of all the p-values. This approach is
+    similar to both the supremum and posterior predictive methods of
+    calculating a p-value. For more information on different methods of
+    calculating p-values see https://cds.cern.ch/record/1099967/files/p23.pdf.
+
     Arguments:
 
         data: 1D array of KE values
@@ -141,23 +148,31 @@ def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000):
         bins: bins used to bin the mc histogram
         size: number of values to compute
     """
-    chi2_data = []
-    chi2_samples = []
+    ps = []
     for i in range(size):
         x = x_samples[np.random.randint(x_samples.shape[0])]
-        mc = get_mc_hist(data_mc,x,bins)
+        data_hist = get_data_hist(data,x,bins)
+        mc = get_mc_hist(data_mc,data_hist,x,bins)
         N = mc.sum()
         # Fix a bug in scipy. See https://github.com/scipy/scipy/issues/8235 (I think).
         mc = mc + 1e-10
         p = mc/mc.sum()
-        data_hist = get_data_hist(data,x,bins)
-        chi2_data.append(chi2(data_hist,mc))
-        n = np.random.poisson(N)
-        samples = multinomial.rvs(n,p)
-        chi2_samples.append(chi2(samples,mc))
-    chi2_samples = np.array(chi2_samples)
-    chi2_data = np.array(chi2_data)
-    return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)
+        chi2_data = nllr(data_hist,mc)
+        # To draw the multinomial samples we first draw the total number of
+        # events for each toy from a Poisson distribution with mean N, and then
+        # loop over the unique totals and their counts. The reason we do this
+        # is that you can't call multinomial.rvs with a multidimensional `n`
+        # array, and looping over every single entry takes forever.
+        ns = np.random.poisson(N,size=1000)
+        samples = []
+        for n, count in zip(*np.unique(ns,return_counts=True)):
+            samples.append(multinomial.rvs(n,p,size=count))
+        samples = np.concatenate(samples)
+        # Calculate the negative log likelihood ratio for the data simulated
+        # under the null hypothesis.
+        chi2_samples = nllr(samples,mc)
+        ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples))
+    return np.percentile(ps,percentile)
 
 if __name__ == '__main__':
     import argparse
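
Note: the percentile-of-p-values scheme described in the new docstring can be illustrated standalone. The sketch below is a toy, not the code in utils/chi2: it uses a plain Pearson chi2 statistic in place of nllr() (whose definition is not shown in this patch), and `posterior_samples` and `make_hists` are hypothetical stand-ins for the fit machinery and for get_mc_hist_posterior().

    import numpy as np
    from scipy.stats import multinomial

    def pearson_chi2(obs, exp):
        # Pearson chi2 statistic, vectorized over rows of `obs`
        return np.sum((obs - exp)**2/exp, axis=-1)

    def percentile_p_value(posterior_samples, make_hists, percentile=99.0,
                           size=100, n_toys=1000):
        # For each posterior draw compute one posterior predictive p-value,
        # then report the requested percentile of all the p-values.
        ps = []
        for i in range(size):
            x = posterior_samples[np.random.randint(len(posterior_samples))]
            data_hist, mc = make_hists(x)   # data and MC histograms for this draw
            mc = mc + 1e-10                 # same scipy pmf workaround as in the patch
            p = mc/mc.sum()
            stat_data = pearson_chi2(data_hist, mc)
            # Toy experiments under the null hypothesis: Poisson total, then
            # a multinomial split across bins
            ns = np.random.poisson(mc.sum(), size=n_toys)
            toys = np.vstack([multinomial.rvs(n, p) for n in ns])
            stat_toys = pearson_chi2(toys, mc)
            ps.append(np.count_nonzero(stat_toys >= stat_data)/n_toys)
        return np.percentile(ps, percentile)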
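
The vectorization trick in the second new comment is worth a standalone example: scipy.stats.multinomial.rvs() does not accept an array of different `n` values, so the patch draws all the Poisson totals up front and then calls rvs() once per unique total, with `size` set to that total's multiplicity. A minimal sketch of the same idea (the function name sample_toy_histograms is made up for illustration):

    import numpy as np
    from scipy.stats import multinomial

    def sample_toy_histograms(N, p, n_toys=1000):
        # Draw the total number of events for each toy from a Poisson with mean N
        ns = np.random.poisson(N, size=n_toys)
        samples = []
        # One rvs() call per unique total instead of one call per toy
        for n, count in zip(*np.unique(ns, return_counts=True)):
            samples.append(multinomial.rvs(n, p, size=count))
        return np.concatenate(samples)

    # Example: 1000 toy histograms with ~100 expected events split over 4 bins
    toys = sample_toy_histograms(100, np.array([0.4, 0.3, 0.2, 0.1]))

One caveat of this grouping: the concatenated toys come out ordered by total, so shuffle the rows first if downstream code assumes a random ordering (the p-value count in the patch does not).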