diff options
author | tlatorre <tlatorre@uchicago.edu> | 2020-06-08 11:42:54 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2020-06-08 11:42:54 -0500 |
commit | 7dbc1b291b76605b12de4c2515c81f1c71462949 (patch) | |
tree | 6852fb1db79f189eaee9e2ab738e06d3c9db82e2 | |
parent | a6817a5f9d26f62b1df007ad547a9da1dd7953ba (diff) | |
download | sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.tar.gz sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.tar.bz2 sddm-7dbc1b291b76605b12de4c2515c81f1c71462949.zip |
fix a bug in the likelihood ratio calculation
-rwxr-xr-x | utils/sddm/plot_energy.py | 99 |
1 files changed, 92 insertions, 7 deletions
diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py index 62f6f34..1c1b7e3 100755 --- a/utils/sddm/plot_energy.py +++ b/utils/sddm/plot_energy.py @@ -17,12 +17,15 @@ from __future__ import print_function, division import numpy as np from scipy.stats import poisson, dirichlet, multinomial, beta -from scipy.special import spence +from scipy.special import spence, gammaln from .dc import DC_MUON, DC_JUNK, DC_CRATE_ISOTROPY, DC_QVNHIT, DC_NECK, DC_FLASHER, DC_ESUM, DC_OWL, DC_OWL_TRIGGER, DC_FTS, DC_ITC, DC_BREAKDOWN from . import grouper, print_warning, AV_RADIUS, PSUP_RADIUS import pandas as pd from scipy.misc import logsumexp +def lnfact(n): + return gammaln(n+1) + # Fermi constant GF = 1.16637887e-5 # 1/MeV^2 ELECTRON_MASS = 0.5109989461 # MeV @@ -199,6 +202,12 @@ def prompt_event(ev): ev.loc[ev.prompt,'prompt'] &= np.concatenate(([True],np.diff(ev[ev.prompt].gtr.values) > 250e6)) return ev +def log_prior_correlated(p): + return dirichlet.logpdf(p,np.ones_like(p)) + +def log_prior_independent(p): + return np.log(1.0) + def sample_posterior_correlated(n,size): # Assume prior is a dirichlet with alpha = 1 return dirichlet.rvs(n+1,size=size) @@ -208,15 +217,91 @@ def sample_posterior_independent(n,size): p2 = beta.rvs(n[0]+n[2]+1,n[1]+n[3]+1,size=size) return np.vstack((p1*p2,p1*(1-p2),(1-p1)*p2,(1-p1)*(1-p2))).T +def sample_posterior_independent2(n,size): + p1 = beta.rvs(n[0]+n[1]+1,n[2]+n[3]+1,size=size) + p2 = beta.rvs(n[0]+n[2]+1,n[1]+n[3]+1,size=size) + return np.vstack((p1,p2)).T, np.vstack((p1*p2,p1*(1-p2),(1-p1)*p2,(1-p1)*(1-p2))).T + +def log_posterior_independent(p,n): + return beta.logpdf(p[0],n[0]+n[1]+1,n[2]+n[3]+1) + beta.logpdf(p[1],n[0]+n[2]+1,n[1]+n[3]+1) + +def log_posterior_correlated(p,n): + return dirichlet.logpdf(p,n+1) + def nll(p,n): return -multinomial.logpmf(n,np.sum(n),p) -def get_likelihood_ratio(n,size=100000): +def get_log_likelihood_ratio(n): + """ + Returns the log of the likelihood ratio between the independent and + correlated hypotheses for two cuts. Suppose you have two cuts, A and B and + apply them to a bunch of events and that you observe n1 events pass both A + and B, n2 pass A but fail B, n3 fail A but pass B, and n4 fail both. You + would then like to know whether the probability of passing cut A is independent + of passing cut B. This function will return the log of the likelihood ratio + between the model that the two probabilities are independent and the model that + they are correlated. + + We calculate this as: + + lambda = log(P(M_I|D,I)/P(M_C|D,I)) + = log(P(D|M_I,I)P(M_I|I)/(P(D|M_C,I)P(M_C|I))) + + Then, assuming both models are equally likely, + + lambda = log(P(D|M_I,I)/P(D|M_C,I)) + = log(P(D|M_I,I)) - log(P(D|M_C,I)) + + and this is what this function returns. + + `n` should be an array with the counts (n1,n2,n3,n4). + """ + n1, n2, n3, n4 = n + N = np.sum(n) + + return lnfact(n1+n2)+lnfact(n3+n4)+lnfact(n1+n3)+lnfact(n2+n4)+np.log(N+3)+np.log(N+2)-lnfact(N+1)-lnfact(n1)-lnfact(n2)-lnfact(n3)-lnfact(n4)-np.log(6) + +def sample_log_likelihood_ratio(n,size=100): + """ + Returns an array of the log of the likelihood ratio between the independent and + correlated hypotheses for two cuts. + + This function returns the same thing as get_log_likelihood_ratio() but + calculates it by drawing samples from the posterior and calculating: + + P(M|D,I) = Likelihood*Prior/Posterior + + instead of using the analytic form for the likelihood ratio directly. + + This is mostly just used to test whether the analytic form is correct. + """ samples_correlated = sample_posterior_correlated(n,size) - nll_correlated = nll(samples_correlated,n) - samples_independent = sample_posterior_independent(n,size) - nll_independent = nll(samples_independent,n) - return logsumexp(-nll_independent) - logsumexp(-nll_correlated) + log_correlated = -nll(samples_correlated,n) + lprior_correlated = np.array([log_prior_correlated(sample) for sample in samples_correlated]) + lpdf_posterior_correlated = np.array([log_posterior_correlated(sample,n) for sample in samples_correlated]) + + log_data_given_correlated = log_correlated + lprior_correlated - lpdf_posterior_correlated + + samples_independent2, samples_independent = sample_posterior_independent2(n,size) + log_independent = -nll(samples_independent,n) + lprior_independent = np.array([log_prior_independent(sample) for sample in samples_independent2]) + lpdf_posterior_independent = np.array([log_posterior_independent(sample,n) for sample in samples_independent2]) + + log_data_given_independent = log_independent + lprior_independent - lpdf_posterior_independent + + return log_data_given_independent - log_data_given_correlated + +def test_likelihood_ratio(n,size=100): + """ + Test whether we get the same value for the log of the likelihood ratio + whether we calculate it using the analytical form or by sampling the + posterior. + """ + lambda_practice = sample_log_likelihood_ratio(n,size=size) + + lambda_theory = get_log_likelihood_ratio(n) + + assert np.allclose(lambda_practice,lambda_theory) def plot_corner_plot(ev, title, save=None): import matplotlib.pyplot as plt @@ -251,7 +336,7 @@ def plot_corner_plot(ev, title, save=None): n_hilo = np.count_nonzero((ev[variables[i]] >= cuts[i]) & (ev[variables[j]] < cuts[j])) n_hihi = np.count_nonzero((ev[variables[i]] >= cuts[i]) & (ev[variables[j]] >= cuts[j])) observed = np.array([n_lolo,n_lohi,n_hilo,n_hihi]) - plt.title(r"$\lambda = %.1f$" % get_likelihood_ratio(observed)) + plt.title(r"$\lambda = %.1f$" % get_log_likelihood_ratio(observed)) if i == len(variables) - 1: plt.xlabel(labels[j]) else: |