From 51f7458944d0b3a84ab051e7f11437e1a352e638 Mon Sep 17 00:00:00 2001 From: tlatorre Date: Mon, 27 Jul 2020 11:52:41 -0500 Subject: update how we calculate the p-value in chi2 --- utils/chi2 | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'utils') diff --git a/utils/chi2 b/utils/chi2 index f1ae974..4515b91 100755 --- a/utils/chi2 +++ b/utils/chi2 @@ -121,18 +121,25 @@ def make_nll(data, mc_hists): return nll - norm.logpdf(x[1],ENERGY_SCALE_MEAN,ENERGY_SCALE_UNCERTAINTY) return nll -def get_mc_hist(data,x,bins): - hist = np.histogram(data,bins=bins)[0] - return sample_mc_hist(hist,norm=x[0]/100.0) +def get_mc_hist(data_mc,data_hist,x,bins): + hist = np.histogram(data_mc,bins=bins)[0] + return get_mc_hist_posterior(hist,data_hist,norm=x[0]/100.0) def get_data_hist(data,x,bins): return np.histogram(data*x[1],bins=bins)[0] -def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000): +def get_multinomial_prob(data, data_mc, x_samples, bins, percentile=99.0, size=10000): """ Returns the p-value that the histogram of the data is drawn from the MC histogram. + The p-value is calculated by first sampling the posterior of the fit + parameters `size` times. For each iteration we calculate a p-value. We then + return the `percentile` percentile of all the p-values. This approach is + similar to both the supremum and posterior predictive methods of + calculating a p-value. For more information on different methods of + calculating p-values see https://cds.cern.ch/record/1099967/files/p23.pdf. 
+ Arguments: data: 1D array of KE values @@ -141,23 +148,31 @@ def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000): bins: bins used to bin the mc histogram size: number of values to compute """ - chi2_data = [] - chi2_samples = [] + ps = [] for i in range(size): x = x_samples[np.random.randint(x_samples.shape[0])] - mc = get_mc_hist(data_mc,x,bins) + data_hist = get_data_hist(data,x,bins) + mc = get_mc_hist(data_mc,data_hist,x,bins) N = mc.sum() # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think). mc = mc + 1e-10 p = mc/mc.sum() - data_hist = get_data_hist(data,x,bins) - chi2_data.append(chi2(data_hist,mc)) - n = np.random.poisson(N) - samples = multinomial.rvs(n,p) - chi2_samples.append(chi2(samples,mc)) - chi2_samples = np.array(chi2_samples) - chi2_data = np.array(chi2_data) - return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples) + chi2_data = nllr(data_hist,mc) + # To draw the multinomial samples we first draw the expected number of + # events from a Poisson distribution and then loop over the counts and + # unique values. The reason we do this is that you can't call + # multinomial.rvs with a multidimensional `n` array, and looping over every + # single entry takes forever + ns = np.random.poisson(N,size=1000) + samples = [] + for n, count in zip(*np.unique(ns,return_counts=True)): + samples.append(multinomial.rvs(n,p,size=count)) + samples = np.concatenate(samples) + # Calculate the negative log likelihood ratio for the data simulated under + # the null hypothesis + chi2_samples = nllr(samples,mc) + ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)) + return np.percentile(ps,percentile) if __name__ == '__main__': import argparse -- cgit