fix a bug in the likelihood ratio calculation

author: tlatorre <tlatorre@uchicago.edu> 2020-06-08 11:42:54 -0500
committer: tlatorre <tlatorre@uchicago.edu> 2020-06-08 11:42:54 -0500
commit: 7dbc1b291b76605b12de4c2515c81f1c71462949 (patch)
tree: 6852fb1db79f189eaee9e2ab738e06d3c9db82e2
parent: a6817a5f9d26f62b1df007ad547a9da1dd7953ba (diff)
download: sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.tar.gz
sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.tar.bz2
sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.zip
1 files changed, 92 insertions, 7 deletions
diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py
index 62f6f34..1c1b7e3 100755
--- a/utils/sddm/plot_energy.py
+++ b/utils/sddm/plot_energy.py
@@ -17,12 +17,15 @@
 from __future__ import print_function, division
 import numpy as np
 from scipy.stats import poisson, dirichlet, multinomial, beta
-from scipy.special import spence
+from scipy.special import spence, gammaln
 from .dc import DC_MUON, DC_JUNK, DC_CRATE_ISOTROPY, DC_QVNHIT, DC_NECK, DC_FLASHER, DC_ESUM, DC_OWL, DC_OWL_TRIGGER, DC_FTS, DC_ITC, DC_BREAKDOWN
 from . import grouper, print_warning, AV_RADIUS, PSUP_RADIUS
 import pandas as pd
 from scipy.misc import logsumexp
 
+def lnfact(n):
+    return gammaln(n+1)
+
 # Fermi constant
 GF                      = 1.16637887e-5 # 1/MeV^2
 ELECTRON_MASS           = 0.5109989461  # MeV
@@ -199,6 +202,12 @@ def prompt_event(ev):
     ev.loc[ev.prompt,'prompt'] &= np.concatenate(([True],np.diff(ev[ev.prompt].gtr.values) > 250e6))
     return ev
 
+def log_prior_correlated(p):
+    return dirichlet.logpdf(p,np.ones_like(p))
+
+def log_prior_independent(p):
+    return np.log(1.0)
+
 def sample_posterior_correlated(n,size):
     # Assume prior is a dirichlet with alpha = 1
     return dirichlet.rvs(n+1,size=size)
@@ -208,15 +217,91 @@ def sample_posterior_independent(n,size):
     p2 = beta.rvs(n[0]+n[2]+1,n[1]+n[3]+1,size=size)
     return np.vstack((p1*p2,p1*(1-p2),(1-p1)*p2,(1-p1)*(1-p2))).T
 
+def sample_posterior_independent2(n,size):
+    p1 = beta.rvs(n[0]+n[1]+1,n[2]+n[3]+1,size=size)
+    p2 = beta.rvs(n[0]+n[2]+1,n[1]+n[3]+1,size=size)
+    return np.vstack((p1,p2)).T, np.vstack((p1*p2,p1*(1-p2),(1-p1)*p2,(1-p1)*(1-p2))).T
+
+def log_posterior_independent(p,n):
+    return beta.logpdf(p[0],n[0]+n[1]+1,n[2]+n[3]+1) + beta.logpdf(p[1],n[0]+n[2]+1,n[1]+n[3]+1)
+
+def log_posterior_correlated(p,n):
+    return dirichlet.logpdf(p,n+1)
+
 def nll(p,n):
     return -multinomial.logpmf(n,np.sum(n),p)
 
-def get_likelihood_ratio(n,size=100000):
+def get_log_likelihood_ratio(n):
+    """
+    Returns the log of the likelihood ratio between the independent and
+    correlated hypotheses for two cuts. Suppose you have two cuts, A and B and
+    apply them to a bunch of events and that you observe n1 events pass both A
+    and B, n2 pass A but fail B, n3 fail A but pass B, and n4 fail both. You
+    would then like to know whether the probability of passing cut A is independent
+    of passing cut B. This function will return the log of the likelihood ratio
+    between the model that the two probabilities are independent and the model that
+    they are correlated.
+
+    We calculate this as:
+
+        lambda = log(P(M_I|D,I)/P(M_C|D,I))
+               = log(P(D|M_I,I)P(M_I|I)/(P(D|M_C,I)P(M_C|I)))
+
+    Then, assuming both models are equally likely,
+
+        lambda = log(P(D|M_I,I)/P(D|M_C,I))
+               = log(P(D|M_I,I)) - log(P(D|M_C,I))
+
+    and this is what this function returns.
+
+    `n` should be an array with the counts (n1,n2,n3,n4).
+    """
+    n1, n2, n3, n4 = n
+    N = np.sum(n)
+
+    return lnfact(n1+n2)+lnfact(n3+n4)+lnfact(n1+n3)+lnfact(n2+n4)+np.log(N+3)+np.log(N+2)-lnfact(N+1)-lnfact(n1)-lnfact(n2)-lnfact(n3)-lnfact(n4)-np.log(6)
+
+def sample_log_likelihood_ratio(n,size=100):
+    """
+    Returns an array of the log of the likelihood ratio between the independent and
+    correlated hypotheses for two cuts.
+
+    This function returns the same thing as get_log_likelihood_ratio() but
+    calculates it by drawing samples from the posterior and calculating:
+
+        P(M|D,I) = Likelihood*Prior/Posterior
+
+    instead of using the analytic form for the likelihood ratio directly.
+
+    This is mostly just used to test whether the analytic form is correct.
+    """
     samples_correlated = sample_posterior_correlated(n,size)
-    nll_correlated = nll(samples_correlated,n)
-    samples_independent = sample_posterior_independent(n,size)
-    nll_independent = nll(samples_independent,n)
-    return logsumexp(-nll_independent) - logsumexp(-nll_correlated)
+    log_correlated = -nll(samples_correlated,n)
+    lprior_correlated = np.array([log_prior_correlated(sample) for sample in samples_correlated])
+    lpdf_posterior_correlated = np.array([log_posterior_correlated(sample,n) for sample in samples_correlated])
+    
+    log_data_given_correlated = log_correlated + lprior_correlated - lpdf_posterior_correlated
+
+    samples_independent2, samples_independent = sample_posterior_independent2(n,size)
+    log_independent = -nll(samples_independent,n)
+    lprior_independent = np.array([log_prior_independent(sample) for sample in samples_independent2])
+    lpdf_posterior_independent = np.array([log_posterior_independent(sample,n) for sample in samples_independent2])
+
+    log_data_given_independent = log_independent + lprior_independent - lpdf_posterior_independent
+
+    return log_data_given_independent - log_data_given_correlated
+
+def test_likelihood_ratio(n,size=100):
+    """
+    Test whether we get the same value for the log of the likelihood ratio
+    whether we calculate it using the analytical form or by sampling the
+    posterior.
+    """
+    lambda_practice = sample_log_likelihood_ratio(n,size=size)
+
+    lambda_theory = get_log_likelihood_ratio(n)
+
+    assert np.allclose(lambda_practice,lambda_theory)
 
 def plot_corner_plot(ev, title, save=None):
     import matplotlib.pyplot as plt
@@ -251,7 +336,7 @@ def plot_corner_plot(ev, title, save=None):
                     n_hilo = np.count_nonzero((ev[variables[i]] >= cuts[i]) & (ev[variables[j]] < cuts[j]))
                     n_hihi = np.count_nonzero((ev[variables[i]] >= cuts[i]) & (ev[variables[j]] >= cuts[j]))
                     observed = np.array([n_lolo,n_lohi,n_hilo,n_hihi])
-                    plt.title(r"$\lambda = %.1f$" % get_likelihood_ratio(observed))
+                    plt.title(r"$\lambda = %.1f$" % get_log_likelihood_ratio(observed))
             if i == len(variables) - 1:
                 plt.xlabel(labels[j])
             else:
author	tlatorre <tlatorre@uchicago.edu>	2020-06-08 11:42:54 -0500
committer	tlatorre <tlatorre@uchicago.edu>	2020-06-08 11:42:54 -0500
commit	7dbc1b291b76605b12de4c2515c81f1c71462949 (patch)
tree	6852fb1db79f189eaee9e2ab738e06d3c9db82e2
parent	a6817a5f9d26f62b1df007ad547a9da1dd7953ba (diff)
download	sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.tar.gz sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.tar.bz2 sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.zip