-rw-r--r-- utils/sddm/stats.py | 92
1 file changed, 73 insertions(+), 19 deletions(-)
diff --git a/utils/sddm/stats.py b/utils/sddm/stats.py
index 01bb009..593c888 100644
--- a/utils/sddm/stats.py
+++ b/utils/sddm/stats.py
@@ -1,6 +1,6 @@
 from __future__ import print_function, division
 import numpy as np
-from scipy.stats import gamma, dirichlet, multinomial
+from scipy.stats import gamma, dirichlet, multinomial, poisson
 from . import grouper
 
 particle_id = {20: 'e', 22: r'\mu'}
@@ -8,10 +8,55 @@ particle_id = {20: 'e', 22: r'\mu'}
 def chi2(samples,expected):
     return np.sum((samples-expected)**2/expected,axis=-1)
 
-def sample_mc_hist(hist, norm):
-    alpha = np.ones_like(hist) + hist
-    N = gamma.rvs(np.sum(hist)+1e-10,scale=1)
-    return dirichlet.rvs(alpha)[0]*N*norm
+def nllr(samples,expected):
+    """
+    Returns 2 times the negative log likelihood ratio that the `samples`
+    histogram was drawn from the histogram `expected`. The numerator in the
+    likelihood ratio is the Poisson probability of observing sum(samples)
+    given the total number of expected events multiplied by the multinomial
+    probability of drawing those samples. The denominator in the ratio is
+    the "best" hypothesis where we assume the expected number of events was
+    exactly equal to the sample count and the distribution is equal to the
+    sample distribution.
+
+    This likelihood ratio is supposed to give better results as a test
+    statistic when computing p-values for histograms with low statistics.
+    """
+    samples = np.atleast_2d(samples)
+    n = samples.sum(axis=-1)
+    N = expected.sum()
+    p_samples = samples.astype(np.double) + 1e-10
+    p_samples /= p_samples.sum(-1)[:,np.newaxis]
+    p_expected = expected.astype(np.double) + 1e-10
+    p_expected /= p_expected.sum()
+    # For some reason multinomial.logpmf([0,0,...],0,p) returns nan
+    mask = n == 0
+    rv = poisson.logpmf(n,N) - poisson.logpmf(n,n)
+    rv[~mask] += multinomial.logpmf(samples[~mask],n[~mask],p_expected) - multinomial.logpmf(samples[~mask],n[~mask],p_samples[~mask])
+    return -2*rv.squeeze()[()]
+
+def get_mc_hist_posterior(hist, data, norm):
+    """
+    Returns the most likely posterior for the expected number of events in
+    the MC histogram `hist` given that you observed `data`.
+
+    The expected number of events in each bin from MC has some uncertainty
+    just due to statistics. When computing the p-value I use the posterior
+    estimates for all the parameters in the fit, so here we want the most
+    likely number of expected events after observing the data. Since the
+    jitter in the MC histogram is governed by the Dirichlet distribution,
+    the posterior is simply the sum of the events in the MC and the data.
+    If we have lots of MC statistics, the data makes almost no difference.
+
+    Note: In general this tends to produce a more conservative p-value when
+    you don't have a lot of MC statistics.
+    """
+    N = hist.sum()*norm
+    alpha = hist.astype(np.double) + data
+    if alpha.sum() > 0:
+        return alpha*N/alpha.sum()
+    else:
+        return alpha
 
 def get_multinomial_prob(data, mc, norm, size=10000):
     """
@@ -25,20 +70,29 @@ def get_multinomial_prob(data, mc, norm, size=10000):
     norm: Normalization constant for the MC
     size: number of values to compute
     """
-    chi2_data = []
-    chi2_samples = []
-    for i in range(size):
-        mc_hist = sample_mc_hist(mc,norm=norm)
-        N = mc_hist.sum()
-        # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
-        mc_hist = mc_hist + 1e-10
-        p = mc_hist/mc_hist.sum()
-        chi2_data.append(chi2(data,mc_hist))
-        n = np.random.poisson(N)
-        samples = multinomial.rvs(n,p)
-        chi2_samples.append(chi2(samples,mc_hist))
-    chi2_samples = np.array(chi2_samples)
-    chi2_data = np.array(chi2_data)
+    # Get the posterior MC histogram
+    mc_hist = get_mc_hist_posterior(mc,data,norm=norm)
+    # Expected number of events
+    N = mc_hist.sum()
+    # Work around a bug in scipy. See https://github.com/scipy/scipy/issues/8235 (I think).
+    mc_hist = mc_hist + 1e-10
+    # Multinomial probability distribution for the MC
+    p = mc_hist/mc_hist.sum()
+    # Calculate the negative log likelihood ratio for the data
+    chi2_data = nllr(data,mc_hist)
+    # To draw the multinomial samples we first draw the expected number of
+    # events from a Poisson distribution and then loop over the unique
+    # values and their counts. The reason we do this is that you can't call
+    # multinomial.rvs with an array-valued `n`, and looping over every
+    # single entry takes forever
+    ns = np.random.poisson(N,size=size)
+    samples = []
+    for n, count in zip(*np.unique(ns,return_counts=True)):
+        samples.append(multinomial.rvs(n,p,size=count))
+    samples = np.concatenate(samples)
+    # Calculate the negative log likelihood ratio for the data simulated
+    # under the null hypothesis
+    chi2_samples = nllr(samples,mc_hist)
     return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)
 
 def dirichlet_mode(alpha):
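Editor's sketches follow; none of the code below is part of the commit, and the import path `sddm.stats` is assumed from the file location utils/sddm/stats.py. First, a quick sanity check of the new nllr test statistic: it should be near zero when the observed histogram matches the expectation exactly, and grow as the two diverge.

import numpy as np
from sddm.stats import nllr  # import path assumed, not confirmed by the commit

expected = np.array([10., 20., 30.])
# Identical histograms: the Poisson and multinomial terms both cancel, so ~0
print(nllr(np.array([10., 20., 30.]), expected))
# Same total but shuffled shape: only the multinomial term contributes, > 0
print(nllr(np.array([30., 20., 10.]), expected))
# A 2D batch of sample histograms returns one statistic per row
print(nllr(np.array([[10., 20., 30.], [5., 10., 15.]]), expected))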
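The batching trick inside get_multinomial_prob can be shown in isolation (made-up parameters): scipy's multinomial.rvs does not accept an array of `n` values, so the Poisson draws are grouped by value and each distinct total is sampled in one vectorized call. The toys come back sorted by total rather than in draw order, which is harmless since only their distribution matters.

import numpy as np
from scipy.stats import multinomial

p = np.array([0.5, 0.3, 0.2])            # bin probabilities (made up)
ns = np.random.poisson(100, size=10000)  # one fluctuating total per toy

samples = []
for n, count in zip(*np.unique(ns, return_counts=True)):
    # one rvs call per distinct total instead of one call per toy
    samples.append(multinomial.rvs(n, p, size=count))
samples = np.concatenate(samples)
print(samples.shape)  # (10000, 3)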
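Finally, a hypothetical end-to-end use of the updated p-value machinery with toy numbers: get_mc_hist_posterior pulls the MC expectation toward the observed data before the toys are thrown, which matters most when MC statistics are low.

import numpy as np
from sddm.stats import get_mc_hist_posterior, get_multinomial_prob

data = np.array([5, 12, 28, 11, 4])      # observed counts (toy values)
mc = np.array([60, 130, 250, 120, 40])   # raw unnormalized MC counts (toy values)
norm = 0.1                               # MC normalization constant (toy value)

# Posterior expectation: proportional to mc + data, scaled to mc.sum()*norm
print(get_mc_hist_posterior(mc, data, norm))
# Fraction of null-hypothesis toys with nllr >= the observed nllr
print(get_multinomial_prob(data, mc, norm, size=1000))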