-rw-r--r-- utils/sddm/stats.py | 92
1 file changed, 73 insertions(+), 19 deletions(-)
diff --git a/utils/sddm/stats.py b/utils/sddm/stats.py
index 01bb009..593c888 100644
--- a/utils/sddm/stats.py
+++ b/utils/sddm/stats.py
@@ -1,6 +1,6 @@
 from __future__ import print_function, division
 import numpy as np
-from scipy.stats import gamma, dirichlet, multinomial
+from scipy.stats import gamma, dirichlet, multinomial, poisson
 from . import grouper
 
 particle_id = {20: 'e', 22: r'\mu'}
@@ -8,10 +8,55 @@ particle_id = {20: 'e', 22: r'\mu'}
 def chi2(samples,expected):
     return np.sum((samples-expected)**2/expected,axis=-1)
 
-def sample_mc_hist(hist, norm):
-    alpha = np.ones_like(hist) + hist
-    N = gamma.rvs(np.sum(hist)+1e-10,scale=1)
-    return dirichlet.rvs(alpha)[0]*N*norm
+def nllr(samples,expected):
+    """
+    Returns 2 times the negative log likelihood ratio that the `samples`
+    histogram was drawn from the histogram `expected`. The numerator in the
+    likelihood ratio is the Poisson probability of observing sum(samples)
+    given the total number of expected events multiplied by the multinomial
+    probability of drawing those samples. The denominator in the ratio is
+    the "best" hypothesis where we assume the expected number of events was
+    exactly equal to the sample count and the distribution is equal to the
+    sample distribution.
+
+    This likelihood ratio is supposed to give better results as a test
+    statistic when computing p-values for histograms with low statistics.
+    """
+    samples = np.atleast_2d(samples)
+    n = samples.sum(axis=-1)
+    N = expected.sum()
+    p_samples = samples.astype(np.double) + 1e-10
+    p_samples /= p_samples.sum(-1)[:,np.newaxis]
+    p_expected = expected.astype(np.double) + 1e-10
+    p_expected /= p_expected.sum()
+    # For some reason multinomial.logpmf([0,0,...],0,p) returns nan
+    mask = n == 0
+    rv = poisson.logpmf(n,N) - poisson.logpmf(n,n)
+    rv[~mask] += multinomial.logpmf(samples[~mask],n[~mask],p_expected) - multinomial.logpmf(samples[~mask],n[~mask],p_samples[~mask])
+    return -2*rv.squeeze()[()]
+
+def get_mc_hist_posterior(hist, data, norm):
+    """
+    Returns the most likely posterior for the expected number of events in
+    the MC histogram `hist` given that you observed `data`.
+
+    The expected number of events in each bin from MC has some uncertainty
+    just due to statistics. When computing the p-value I use the posterior
+    estimates for all the parameters in the fit, so here we want the most
+    likely number of expected events after observing the data. Since the
+    jitter in the MC histogram is governed by the Dirichlet distribution,
+    the posterior is simply the sum of the events in the MC and the data.
+    If we have lots of MC statistics, the data makes almost no difference.
+
+    Note: In general this tends to produce a more conservative p-value when
+    you don't have a lot of MC statistics.
+    """
+    N = hist.sum()*norm
+    alpha = hist.astype(np.double) + data
+    if alpha.sum() > 0:
+        return alpha*N/alpha.sum()
+    else:
+        return alpha
 
 def get_multinomial_prob(data, mc, norm, size=10000):
     """
@@ -25,20 +70,29 @@ def get_multinomial_prob(data, mc, norm, size=10000):
     norm: Normalization constant for the MC
     size: number of values to compute
     """
-    chi2_data = []
-    chi2_samples = []
-    for i in range(size):
-        mc_hist = sample_mc_hist(mc,norm=norm)
-        N = mc_hist.sum()
-        # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
-        mc_hist = mc_hist + 1e-10
-        p = mc_hist/mc_hist.sum()
-        chi2_data.append(chi2(data,mc_hist))
-        n = np.random.poisson(N)
-        samples = multinomial.rvs(n,p)
-        chi2_samples.append(chi2(samples,mc_hist))
-    chi2_samples = np.array(chi2_samples)
-    chi2_data = np.array(chi2_data)
+    # Get the posterior MC histogram
+    mc_hist = get_mc_hist_posterior(mc,data,norm=norm)
+    # Expected number of events
+    N = mc_hist.sum()
+    # Work around a bug in scipy. See https://github.com/scipy/scipy/issues/8235 (I think).
+    mc_hist = mc_hist + 1e-10
+    # Multinomial probability distribution for the MC
+    p = mc_hist/mc_hist.sum()
+    # Calculate the negative log likelihood ratio for the data
+    chi2_data = nllr(data,mc_hist)
+    # To draw the multinomial samples we first draw the expected number of
+    # events from a Poisson distribution and then loop over the unique
+    # values and their counts. The reason we do this is that you can't call
+    # multinomial.rvs with an array-valued `n`, and looping over every
+    # single entry takes forever
+    ns = np.random.poisson(N,size=size)
+    samples = []
+    for n, count in zip(*np.unique(ns,return_counts=True)):
+        samples.append(multinomial.rvs(n,p,size=count))
+    samples = np.concatenate(samples)
+    # Calculate the negative log likelihood ratio for the data simulated
+    # under the null hypothesis
+    chi2_samples = nllr(samples,mc_hist)
     return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)
 
 def dirichlet_mode(alpha):
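Editor's sketches follow; none of the code below is part of the commit, and the import path `sddm.stats` is assumed from the file location utils/sddm/stats.py. First, a quick sanity check of the new nllr test statistic: it should be near zero when the observed histogram matches the expectation exactly, and grow as the two diverge.

import numpy as np
from sddm.stats import nllr  # import path assumed, not confirmed by the commit

expected = np.array([10., 20., 30.])
# Identical histograms: the Poisson and multinomial terms both cancel, so ~0
print(nllr(np.array([10., 20., 30.]), expected))
# Same total but shuffled shape: only the multinomial term contributes, > 0
print(nllr(np.array([30., 20., 10.]), expected))
# A 2D batch of sample histograms returns one statistic per row
print(nllr(np.array([[10., 20., 30.], [5., 10., 15.]]), expected))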
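The batching trick inside get_multinomial_prob can be shown in isolation (made-up parameters): scipy's multinomial.rvs does not accept an array of `n` values, so the Poisson draws are grouped by value and each distinct total is sampled in one vectorized call. The toys come back sorted by total rather than in draw order, which is harmless since only their distribution matters.

import numpy as np
from scipy.stats import multinomial

p = np.array([0.5, 0.3, 0.2])            # bin probabilities (made up)
ns = np.random.poisson(100, size=10000)  # one fluctuating total per toy

samples = []
for n, count in zip(*np.unique(ns, return_counts=True)):
    # one rvs call per distinct total instead of one call per toy
    samples.append(multinomial.rvs(n, p, size=count))
samples = np.concatenate(samples)
print(samples.shape)  # (10000, 3)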
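Finally, a hypothetical end-to-end use of the updated p-value machinery with toy numbers: get_mc_hist_posterior pulls the MC expectation toward the observed data before the toys are thrown, which matters most when MC statistics are low.

import numpy as np
from sddm.stats import get_mc_hist_posterior, get_multinomial_prob

data = np.array([5, 12, 28, 11, 4])      # observed counts (toy values)
mc = np.array([60, 130, 250, 120, 40])   # raw unnormalized MC counts (toy values)
norm = 0.1                               # MC normalization constant (toy value)

# Posterior expectation: proportional to mc + data, scaled to mc.sum()*norm
print(get_mc_hist_posterior(mc, data, norm))
# Fraction of null-hypothesis toys with nllr >= the observed nllr
print(get_multinomial_prob(data, mc, norm, size=1000))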