 utils/sddm/stats.py | 92 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 73 insertions(+), 19 deletions(-)
diff --git a/utils/sddm/stats.py b/utils/sddm/stats.py
index 01bb009..593c888 100644
--- a/utils/sddm/stats.py
+++ b/utils/sddm/stats.py
@@ -1,6 +1,6 @@
 from __future__ import print_function, division
 import numpy as np
-from scipy.stats import gamma, dirichlet, multinomial
+from scipy.stats import gamma, dirichlet, multinomial, poisson
 from . import grouper

 particle_id = {20: 'e', 22: r'\mu'}
@@ -8,10 +8,55 @@ particle_id = {20: 'e', 22: r'\mu'}
 def chi2(samples,expected):
     return np.sum((samples-expected)**2/expected,axis=-1)

-def sample_mc_hist(hist, norm):
-    alpha = np.ones_like(hist) + hist
-    N = gamma.rvs(np.sum(hist)+1e-10,scale=1)
-    return dirichlet.rvs(alpha)[0]*N*norm
+def nllr(samples,expected):
+    """
+    Returns twice the negative log likelihood ratio that the `samples`
+    histogram was drawn from the histogram `expected`. The numerator in the
+    likelihood ratio is the Poisson probability of observing sum(samples)
+    events given the total number of expected events, times the multinomial
+    probability of drawing those samples. The denominator in the ratio is
+    the "best" hypothesis, in which the expected number of events is assumed
+    to be exactly the sample count and the expected distribution is equal to
+    the sample distribution.
+
+    This likelihood ratio is supposed to give better results as a test
+    statistic when computing p-values for histograms with low statistics.
+    """
+    samples = np.atleast_2d(samples)
+    n = samples.sum(axis=-1)
+    N = expected.sum()
+    p_samples = samples.astype(np.double) + 1e-10
+    p_samples /= p_samples.sum(-1)[:,np.newaxis]
+    p_expected = expected.astype(np.double) + 1e-10
+    p_expected /= p_expected.sum()
+    # multinomial.logpmf([0,0,...],0,p) returns nan even though the
+    # probability of drawing an empty histogram from zero events is 1, so we
+    # skip the multinomial term for empty samples
+    mask = n == 0
+    rv = poisson.logpmf(n,N) - poisson.logpmf(n,n)
+    rv[~mask] += (multinomial.logpmf(samples[~mask],n[~mask],p_expected) -
+                  multinomial.logpmf(samples[~mask],n[~mask],p_samples[~mask]))
+    return -2*rv.squeeze()[()]
+
+def get_mc_hist_posterior(hist, data, norm):
+    """
+    Returns the mode of the posterior for the expected number of events in
+    each bin of the MC histogram `hist` given the observed histogram `data`.
+
+    The expected number of events in each bin of the MC histogram has some
+    uncertainty just due to statistics. When computing the p-value I use the
+    posterior estimates for all the parameters in the fit, so here we want
+    the most likely number of expected events after observing the data. In
+    this case, since the statistical jitter in the MC histogram is governed
+    by a Dirichlet distribution, the posterior mode in each bin is simply
+    proportional to the sum of the MC and data counts in that bin, rescaled
+    to the total number of expected events. If we have lots of MC
+    statistics, the data makes almost no difference.
+
+    Note: In general this tends to produce a more conservative p-value when
+    you don't have a lot of MC statistics.
+    """
+    # Total expected number of events
+    N = hist.sum()*norm
+    alpha = hist.astype(np.double) + data
+    if alpha.sum() > 0:
+        return alpha*N/alpha.sum()
+    else:
+        return alpha

 def get_multinomial_prob(data, mc, norm, size=10000):
     """
@@ -25,20 +70,29 @@ def get_multinomial_prob(data, mc, norm, size=10000):
         norm: Normalization constant for the MC
         size: number of values to compute
     """
-    chi2_data = []
-    chi2_samples = []
-    for i in range(size):
-        mc_hist = sample_mc_hist(mc,norm=norm)
-        N = mc_hist.sum()
-        # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
-        mc_hist = mc_hist + 1e-10
-        p = mc_hist/mc_hist.sum()
-        chi2_data.append(chi2(data,mc_hist))
-        n = np.random.poisson(N)
-        samples = multinomial.rvs(n,p)
-        chi2_samples.append(chi2(samples,mc_hist))
-    chi2_samples = np.array(chi2_samples)
-    chi2_data = np.array(chi2_data)
+    # Get the posterior MC histogram
+    mc_hist = get_mc_hist_posterior(mc,data,norm=norm)
+    # Expected number of events
+    N = mc_hist.sum()
+    # Add a small offset to each bin to fix a bug in scipy. See
+    # https://github.com/scipy/scipy/issues/8235 (I think).
+    mc_hist = mc_hist + 1e-10
+    # Multinomial probability distribution for the MC
+    p = mc_hist/mc_hist.sum()
+    # Calculate the negative log likelihood ratio for the data
+    chi2_data = nllr(data,mc_hist)
+    # To draw the multinomial samples we first draw the total number of
+    # events for each sample from a Poisson distribution and then loop over
+    # the unique totals and their counts. The reason we do this is that you
+    # can't call multinomial.rvs with a multidimensional `n` array, and
+    # looping over every single entry takes forever
+    ns = np.random.poisson(N,size=size)
+    samples = []
+    for n, count in zip(*np.unique(ns,return_counts=True)):
+        samples.append(multinomial.rvs(n,p,size=count))
+    samples = np.concatenate(samples)
+    # Calculate the negative log likelihood ratio for the histograms
+    # simulated under the null hypothesis
+    chi2_samples = nllr(samples,mc_hist)
     return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)

 def dirichlet_mode(alpha):
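The grouped Poisson/multinomial sampling trick used in the new get_multinomial_prob is worth spelling out on its own. A minimal self-contained sketch (the function name sample_histograms is mine, not part of this module):

import numpy as np
from scipy.stats import multinomial

def sample_histograms(N, p, size=10000):
    """Draw `size` histograms, each with a Poisson(N) total split
    multinomially according to `p`."""
    # Draw all of the totals up front...
    ns = np.random.poisson(N, size=size)
    samples = []
    # ...then call multinomial.rvs once per unique total, since it doesn't
    # accept an array of n values and looping over every single draw is slow.
    for n, count in zip(*np.unique(ns, return_counts=True)):
        samples.append(multinomial.rvs(n, p, size=count))
    # The rows come back grouped by total rather than in draw order, which
    # doesn't matter when building up the distribution of a test statistic.
    return np.concatenate(samples)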