aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortlatorre <tlatorre@uchicago.edu>2020-06-08 11:42:54 -0500
committertlatorre <tlatorre@uchicago.edu>2020-06-08 11:42:54 -0500
commit7dbc1b291b76605b12de4c2515c81f1c71462949 (patch)
tree6852fb1db79f189eaee9e2ab738e06d3c9db82e2
parenta6817a5f9d26f62b1df007ad547a9da1dd7953ba (diff)
downloadsddm-7dbc1b291b76605b12de4c2515c81f1c71462949.tar.gz
sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.tar.bz2
sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.zip
fix a bug in the likelihood ratio calculation
-rwxr-xr-xutils/sddm/plot_energy.py99
1 files changed, 92 insertions, 7 deletions
diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py
index 62f6f34..1c1b7e3 100755
--- a/utils/sddm/plot_energy.py
+++ b/utils/sddm/plot_energy.py
@@ -17,12 +17,15 @@
from __future__ import print_function, division
import numpy as np
from scipy.stats import poisson, dirichlet, multinomial, beta
-from scipy.special import spence
+from scipy.special import spence, gammaln
from .dc import DC_MUON, DC_JUNK, DC_CRATE_ISOTROPY, DC_QVNHIT, DC_NECK, DC_FLASHER, DC_ESUM, DC_OWL, DC_OWL_TRIGGER, DC_FTS, DC_ITC, DC_BREAKDOWN
from . import grouper, print_warning, AV_RADIUS, PSUP_RADIUS
import pandas as pd
from scipy.misc import logsumexp
+def lnfact(n):
+ return gammaln(n+1)
+
# Fermi constant
GF = 1.16637887e-5 # 1/MeV^2
ELECTRON_MASS = 0.5109989461 # MeV
@@ -199,6 +202,12 @@ def prompt_event(ev):
ev.loc[ev.prompt,'prompt'] &= np.concatenate(([True],np.diff(ev[ev.prompt].gtr.values) > 250e6))
return ev
+def log_prior_correlated(p):
+ return dirichlet.logpdf(p,np.ones_like(p))
+
+def log_prior_independent(p):
+ return np.log(1.0)
+
def sample_posterior_correlated(n,size):
# Assume prior is a dirichlet with alpha = 1
return dirichlet.rvs(n+1,size=size)
@@ -208,15 +217,91 @@ def sample_posterior_independent(n,size):
p2 = beta.rvs(n[0]+n[2]+1,n[1]+n[3]+1,size=size)
return np.vstack((p1*p2,p1*(1-p2),(1-p1)*p2,(1-p1)*(1-p2))).T
+def sample_posterior_independent2(n,size):
+ p1 = beta.rvs(n[0]+n[1]+1,n[2]+n[3]+1,size=size)
+ p2 = beta.rvs(n[0]+n[2]+1,n[1]+n[3]+1,size=size)
+ return np.vstack((p1,p2)).T, np.vstack((p1*p2,p1*(1-p2),(1-p1)*p2,(1-p1)*(1-p2))).T
+
+def log_posterior_independent(p,n):
+ return beta.logpdf(p[0],n[0]+n[1]+1,n[2]+n[3]+1) + beta.logpdf(p[1],n[0]+n[2]+1,n[1]+n[3]+1)
+
+def log_posterior_correlated(p,n):
+ return dirichlet.logpdf(p,n+1)
+
def nll(p,n):
return -multinomial.logpmf(n,np.sum(n),p)
-def get_likelihood_ratio(n,size=100000):
+def get_log_likelihood_ratio(n):
+ """
+ Returns the log of the likelihood ratio between the independent and
+ correlated hypotheses for two cuts. Suppose you have two cuts, A and B and
+ apply them to a bunch of events and that you observe n1 events pass both A
+ and B, n2 pass A but fail B, n3 fail A but pass B, and n4 fail both. You
+ would then like to know whether the probability of passing cut A is independent
+ of passing cut B. This function will return the log of the likelihood ratio
+ between the model that the two probabilities are independent and the model that
+ they are correlated.
+
+ We calculate this as:
+
+ lambda = log(P(M_I|D,I)/P(M_C|D,I))
+ = log(P(D|M_I,I)P(M_I|I)/(P(D|M_C,I)P(M_C|I)))
+
+ Then, assuming both models are equally likely,
+
+ lambda = log(P(D|M_I,I)/P(D|M_C,I))
+ = log(P(D|M_I,I)) - log(P(D|M_C,I))
+
+ and this is what this function returns.
+
+ `n` should be an array with the counts (n1,n2,n3,n4).
+ """
+ n1, n2, n3, n4 = n
+ N = np.sum(n)
+
+ return lnfact(n1+n2)+lnfact(n3+n4)+lnfact(n1+n3)+lnfact(n2+n4)+np.log(N+3)+np.log(N+2)-lnfact(N+1)-lnfact(n1)-lnfact(n2)-lnfact(n3)-lnfact(n4)-np.log(6)
+
+def sample_log_likelihood_ratio(n,size=100):
+ """
+ Returns an array of the log of the likelihood ratio between the independent and
+ correlated hypotheses for two cuts.
+
+ This function returns the same thing as get_log_likelihood_ratio() but
+ calculates it by drawing samples from the posterior and calculating:
+
+ P(M|D,I) = Likelihood*Prior/Posterior
+
+ instead of using the analytic form for the likelihood ratio directly.
+
+ This is mostly just used to test whether the analytic form is correct.
+ """
samples_correlated = sample_posterior_correlated(n,size)
- nll_correlated = nll(samples_correlated,n)
- samples_independent = sample_posterior_independent(n,size)
- nll_independent = nll(samples_independent,n)
- return logsumexp(-nll_independent) - logsumexp(-nll_correlated)
+ log_correlated = -nll(samples_correlated,n)
+ lprior_correlated = np.array([log_prior_correlated(sample) for sample in samples_correlated])
+ lpdf_posterior_correlated = np.array([log_posterior_correlated(sample,n) for sample in samples_correlated])
+
+ log_data_given_correlated = log_correlated + lprior_correlated - lpdf_posterior_correlated
+
+ samples_independent2, samples_independent = sample_posterior_independent2(n,size)
+ log_independent = -nll(samples_independent,n)
+ lprior_independent = np.array([log_prior_independent(sample) for sample in samples_independent2])
+ lpdf_posterior_independent = np.array([log_posterior_independent(sample,n) for sample in samples_independent2])
+
+ log_data_given_independent = log_independent + lprior_independent - lpdf_posterior_independent
+
+ return log_data_given_independent - log_data_given_correlated
+
+def test_likelihood_ratio(n,size=100):
+ """
+ Test whether we get the same value for the log of the likelihood ratio
+ whether we calculate it using the analytical form or by sampling the
+ posterior.
+ """
+ lambda_practice = sample_log_likelihood_ratio(n,size=size)
+
+ lambda_theory = get_log_likelihood_ratio(n)
+
+ assert np.allclose(lambda_practice,lambda_theory)
def plot_corner_plot(ev, title, save=None):
import matplotlib.pyplot as plt
@@ -251,7 +336,7 @@ def plot_corner_plot(ev, title, save=None):
n_hilo = np.count_nonzero((ev[variables[i]] >= cuts[i]) & (ev[variables[j]] < cuts[j]))
n_hihi = np.count_nonzero((ev[variables[i]] >= cuts[i]) & (ev[variables[j]] >= cuts[j]))
observed = np.array([n_lolo,n_lohi,n_hilo,n_hihi])
- plt.title(r"$\lambda = %.1f$" % get_likelihood_ratio(observed))
+ plt.title(r"$\lambda = %.1f$" % get_log_likelihood_ratio(observed))
if i == len(variables) - 1:
plt.xlabel(labels[j])
else: