author | tlatorre <tlatorre@uchicago.edu> | 2020-08-25 02:40:51 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2020-08-25 02:40:51 -0500 |
commit | a26354a837a9135696b18d8e447bfb7734e2ed94 (patch) | |
tree | 87050bdf4fefa83e6a6d802bb823a728d4235e38 | |
parent | aedf61a2bb1ab7479f5d27cebd2c83d0c107260f (diff) | |
download | sddm-a26354a837a9135696b18d8e447bfb7734e2ed94.tar.gz sddm-a26354a837a9135696b18d8e447bfb7734e2ed94.tar.bz2 sddm-a26354a837a9135696b18d8e447bfb7734e2ed94.zip |
update chi2 script to plot p-value coverage
-rwxr-xr-x | utils/chi2 | 351 |
1 files changed, 301 insertions, 50 deletions
@@ -14,16 +14,14 @@ # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>.
 """
-Script to plot final fit results along with sidebands for the dark matter
-analysis. To run it just run:
+Script to do the final null hypothesis test that the events in the 20 MeV - 10 GeV
+range are consistent with atmospheric neutrino events. To run it just run:
 
-    $ ./plot-energy [list of fit results]
+    $ ./chi2 [list of data fit results] --mc [list of atmospheric MC files] --muon-mc [list of muon MC files] --steps [steps]
 
-Currently it will plot energy distributions for external muons, michel
-electrons, atmospheric events with neutron followers, and prompt signal like
-events. Each of these plots will have a different subplot for the particle ID
-of the best fit, i.e. single electron, single muon, double electron, electron +
-muon, or double muon.
+After running you will get a plot showing the best fit results of the MC and
+data along with p-values for each of the possible particle combinations (single
+electron, single muon, double electron, etc.).
 """
 from __future__ import print_function, division
 import numpy as np
@@ -35,12 +33,21 @@ from itertools import izip_longest
 from sddm.stats import *
 
 # Uncertainty on the energy scale
-# FIXME: Should get real number from stopping muons
+# FIXME: These are just placeholders! Should get real numbers from stopping
+# muons.
 ENERGY_SCALE_MEAN = {'e': 1.0, 'u': 1.0, 'eu': 1.0}
 ENERGY_SCALE_UNCERTAINTY = {'e':0.1, 'u': 0.1, 'eu': 0.1}
 ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0}
 ENERGY_RESOLUTION_UNCERTAINTY = {'e':0.1, 'u': 0.1, 'eu': 0.1}
 
+# Absolute tolerance for the minimizer.
+# Since we're minimizing the negative log likelihood, we really only care about
+# the value of the minimum to within ~0.05 (10% of the one sigma shift).
+# However, I have noticed before that setting a higher tolerance can sometimes
+# cause the fit to get stuck in a local minimum, so we set it here to a very
+# small value.
+FTOL_ABS = 1e-10
+
 particle_id = {20: 'e', 22: r'\mu'}
 
 def plot_hist2(hists, bins, color=None):
@@ -66,6 +73,31 @@ def plot_hist2(hists, bins, color=None):
     plt.tight_layout()
 
 def get_mc_hists(data,x,bins,apply_norm=False):
+    """
+    Returns the expected Monte Carlo histograms for the atmospheric neutrino
+    background.
+
+    Args:
+        - data: pandas dataframe of the Monte Carlo events
+        - x: fit parameters
+        - bins: histogram bins
+        - apply_norm: boolean value representing whether we should apply the
+          atmospheric neutrino scale parameter
+
+    This function does two basic things:
+
+        1. apply the energy bias and resolution corrections
+        2. histogram the results
+
+    Normally, to apply the energy resolution correction we would smear out each
+    MC event by the resolution and then bin the results. But since there are
+    thousands and thousands of MC events, this takes way too long. Therefore,
+    we use a trick: we first bin the MC events on a much finer binning scale
+    than the final bins, then smear this histogram, and finally rebin the
+    result into the final bins.
+
+    Returns a dictionary mapping particle id combo -> histogram.
+    """
     mc_hists = {}
 
     # FIXME: May need to increase number of bins here
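The fine-binning trick described in this docstring can be made concrete with a short standalone sketch (illustrative only: the script's real implementation lives in the body of get_mc_hists and in sddm.stats, and the function name and Gaussian kernel here are assumptions):

    import numpy as np

    def smeared_mc_hist(ke, resolution, coarse_bins, subdiv=10):
        # Subdivide each coarse bin into `subdiv` fine bins.
        fine_bins = np.concatenate(
            [np.linspace(a, b, subdiv, endpoint=False)
             for a, b in zip(coarse_bins[:-1], coarse_bins[1:])]
            + [coarse_bins[-1:]])
        hist = np.histogram(ke, bins=fine_bins)[0].astype(float)
        centers = (fine_bins[1:] + fine_bins[:-1])/2

        # Smear the histogram instead of the individual events: spread each
        # fine bin's contents over its neighbours with Gaussian weights of
        # width sigma = resolution*E.
        smeared = np.zeros_like(hist)
        for n, e in zip(hist, centers):
            if n > 0:
                w = np.exp(-0.5*((centers - e)/(resolution*e))**2)
                smeared += n*w/w.sum()

        # Sum the fine bins back up into the original coarse bins.
        return smeared.reshape(len(coarse_bins) - 1, subdiv).sum(axis=-1)

Smearing a histogram of a few hundred fine bins is far cheaper than smearing hundreds of thousands of individual MC events, at the cost of quantizing each event's energy to its fine-bin center. This sketch only covers the resolution part; the script also applies the energy bias parameters before histogramming.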
+ """ data_hists = {} for id in (20,22,2020,2022,2222): df_id = data[data.id == id] @@ -110,6 +145,16 @@ def get_data_hists(data,bins,scale=1.0): # 6 - Electron + Muon energy resolution # 7 - External Muon scale +FIT_PARS = [ + 'Atmospheric Neutrino Flux Scale', + 'Electron energy bias', + 'Electron energy resolution', + 'Muon energy bias', + 'Muon energy resolution', + 'Electron + Muon energy bias', + 'Electron + Muon energy resolution', + 'External Muon scale'] + def make_nll(data, muons, mc, bins): data_hists = get_data_hists(data,bins) muon_hists = get_data_hists(muons,bins) @@ -118,34 +163,95 @@ def make_nll(data, muons, mc, bins): if any(x[i] < 0 for i in range(len(x))): return np.inf + # Get the Monte Carlo histograms. We need to do this within the + # likelihood function since several of the parameters like the energy + # bias and resolution affect the histograms. + # + # Note: I really should be applying the bias term to the data instead + # of the Monte Carlo. The reason being that the reconstruction was all + # tuned based on the data from the Monte Carlo. For example, the single + # PE charge distribution is taken from the Monte Carlo. Therefore, if + # there is some difference in bias between data and Monte Carlo, we + # should apply it to the data. However, the reason I apply it to the + # Monte Carlo instead is because applying an energy bias correction to + # an analysis in which we're using histograms is not really a good + # idea. If the energy scale parameter changes by a very small amount + # and causes a single event to cross a bin boundary you will see a + # discrete jump in the likelihood. This isn't good for minimizers + # (although it may be ok with the MCMC depending on the size of the + # jump). To reduce this issue, I apply the energy bias correction to + # the Monte Carlo, and since there are almost 100 times more events in + # the Monte Carlo than in data, the issue is much smaller. + # + # All that being said, this doesn't completely get rid of the issue, + # but I do two things that should make it OK: + # + # 1. I use the SBPLX minimizer instead of COBYLA which should have a + # better chance of jumping over small discontinuities. + # + # 2. I ultimately run an MCMC and so if we get stuck somewhere close to + # the minimum, the MCMC results should correctly deal with any small + # discontinuities. + # + # Also, it's critical that I first adjust the data energy by whatever + # amount I find with the stopping muons and Michel distributions. 
 
 def get_mc_hists_posterior(data,muon_hists,data_hists,x,bins):
+    """
+    Returns the posterior on the Monte Carlo histograms.
+
+    Basically this function just histograms the Monte Carlo data. However,
+    there is one extra thing it does. In general when doing a fit, the Monte
+    Carlo histograms have some uncertainty since you can never simulate
+    infinite statistics. I don't think I've ever really seen anyone properly
+    treat this. Since the uncertainty on the central value in each bin is just
+    given by the Dirichlet distribution, we treat the problem of finding the
+    best value of the posterior as a problem in which your prior is equal to
+    the expected number of events from the Monte Carlo, and then you actually
+    see the data. Since the likelihood of the true mean in each bin is a
+    multinomial, the posterior is also a Dirichlet whose alpha parameters are
+    given by the sum of the prior and observed counts.
+
+    All that is a long way of saying we calculate the posterior as the sum of
+    the Monte Carlo events (unscaled) and the observed events. In the limit of
+    infinite statistics, this is just equal to the Monte Carlo predicted
+    histogram, but it deals with the fact that we don't have infinite
+    statistics, and so a single outlier event isn't necessarily a problem with
+    the model.
+
+    Returns a dictionary mapping particle id combo -> histogram.
+    """
     mc_hists = get_mc_hists(data,x,bins)
     for id in (20,22,2020,2022,2222):
         mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0])
+        # FIXME: does the ordering of when we add the muons matter here?
         mc_hists[id] += muon_hists[id]*x[7]
     return mc_hists
 
-def get_data_hist(data,x,bins):
-    return np.histogram(data,bins=bins)[0]
-
 def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percentile=99.0, size=10000):
     """
     Returns the p-value that the histogram of the data is drawn from the MC
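The Dirichlet update described in this docstring amounts to a one-line posterior calculation. A sketch of the idea (the actual get_mc_hist_posterior is implemented in sddm.stats; the final rescaling step here is an assumption about its interface):

    import numpy as np

    def mc_hist_posterior_sketch(mc_counts, data_counts, norm):
        # Dirichlet prior alphas = unscaled MC counts; observing the data
        # counts updates them, so the posterior alphas are simply the sum.
        alpha = mc_counts + data_counts
        # Posterior mean of the bin fractions, rescaled back to the fitted
        # Monte Carlo expectation.
        return alpha/alpha.sum()*mc_counts.sum()*norm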
@@ -194,6 +300,69 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
         ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples))
     return np.percentile(ps,percentile)
 
+def get_prob(data,muon,mc,samples,bins,size):
+    prob = {}
+    for id in (20,22,2020,2022,2222):
+        prob[id] = get_multinomial_prob(data,muon,mc,id,samples,bins,size=size)
+        print(id, prob[id])
+    return prob
+
+def do_fit(data,muon,data_mc,bins,steps):
+    """
+    Run the fit and return the minimum along with samples from running an MCMC
+    starting near the minimum.
+
+    Args:
+        - data: pandas dataframe representing the data to fit
+        - muon: pandas dataframe representing the expected background from
+          external muons
+        - data_mc: pandas dataframe representing the expected background from
+          atmospheric neutrino events
+        - bins: an array of bins to use for the fit
+        - steps: the number of MCMC steps to run
+
+    Returns a tuple (xopt, samples) where samples is an array of shape
+    (number of walkers*steps, number of parameters).
+    """
+    nll = make_nll(data,muon,data_mc,bins)
+
+    x0 = np.array([1.0,1.0,EPSILON,1.0,EPSILON,1.0,EPSILON,EPSILON])
+    opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
+    opt.set_min_objective(nll)
+    low = np.array([EPSILON]*len(x0))
+    high = np.array([10]*len(x0))
+    opt.set_lower_bounds(low)
+    opt.set_upper_bounds(high)
+    opt.set_ftol_abs(FTOL_ABS)
+    opt.set_initial_step([0.01]*len(x0))
+
+    xopt = opt.optimize(x0)
+    print("xopt = ", xopt)
+    nll_xopt = nll(xopt)
+    print("nll(xopt) = ", nll_xopt)
+
+    pos = np.empty((20, len(x0)),dtype=np.double)
+    for i in range(pos.shape[0]):
+        pos[i] = xopt + np.random.randn(len(x0))*0.1
+        pos[i,:] = np.clip(pos[i,:],low,high)
+
+    nwalkers, ndim = pos.shape
+
+    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x))
+    with np.errstate(invalid='ignore'):
+        sampler.run_mcmc(pos, steps)
+
+    print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
+
+    try:
+        print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
+    except Exception as e:
+        print(e)
+
+    samples = sampler.chain.reshape((-1,len(x0)))
+
+    return xopt, samples
+
 if __name__ == '__main__':
     import argparse
     import numpy as np
@@ -214,6 +383,7 @@ if __name__ == '__main__':
     parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)")
     parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain")
     parser.add_argument("--multinomial-prob-size", type=int, default=10000, help="number of p values to compute")
+    parser.add_argument("--coverage", type=int, default=0, help="number of fake experiments to run when plotting p-value coverage (0 to disable)")
     args = parser.parse_args()
 
     setup_matplotlib(args.save)
@@ -276,42 +446,133 @@ if __name__ == '__main__':
 
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
 
-    nll = make_nll(data,muon,data_mc,bins)
+    if args.coverage:
+        p_values = {id: [] for id in (20,22,2020,2022,2222)}
+        p_values_atm = {id: [] for id in (20,22,2020,2022,2222)}
+
+        scale = 0.01
+        muon_scale = 0.01
+        energy_resolution = 0.1
+
+        true_values = [scale,1.0,energy_resolution,1.0,energy_resolution,1.0,energy_resolution,muon_scale]
+
+        assert(len(true_values) == len(FIT_PARS))
+
+        pull = [[] for i in range(len(FIT_PARS))]
+
+        # Set the random seed so we get reproducible results here
+        np.random.seed(0)
+
+        for i in range(args.coverage):
+            # Calculate expected number of events
+            N = len(data_mc)*scale
+            N_atm = len(data_atm_mc)*scale
+            N_muon = len(muon)*muon_scale
+            N_muon_atm = len(muon_atm)*muon_scale
+
+            # Calculate observed number of events
+            n = np.random.poisson(N)
+            n_atm = np.random.poisson(N_atm)
+            n_muon = np.random.poisson(N_muon)
+            n_muon_atm = np.random.poisson(N_muon_atm)
+
+            # Sample data from Monte Carlo
+            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+
+            # Smear the energies by the additional energy resolution
+            data.ke += np.random.randn(len(data.ke))*data.ke*energy_resolution
+            data_atm.ke += np.random.randn(len(data_atm.ke))*data_atm.ke*energy_resolution
+
+            xopt, samples = do_fit(data,muon,data_mc,bins,args.steps)
+
+            for j in range(len(FIT_PARS)):
+                mean = np.mean(samples[:,j])
+                std = np.std(samples[:,j])
+                pull[j].append((mean - true_values[j])/std)
+
+            prob = get_prob(data,muon,data_mc,samples,bins,size=args.multinomial_prob_size)
+            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,samples,bins,size=args.multinomial_prob_size)
+
+            for id in (20,22,2020,2022,2222):
+                p_values[id].append(prob[id])
+                p_values_atm[id].append(prob_atm[id])
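If the model, the fit, and the p-value machinery are all self-consistent, the p-values collected in this loop should be roughly uniform on [0, 1]; that is what the coverage histograms below check by eye. A quantitative complement one could bolt on (not part of this commit; kstest is from scipy.stats):

    import numpy as np
    from scipy.stats import kstest

    def coverage_ks(p_values):
        # Kolmogorov-Smirnov test of the collected p-values against U(0,1);
        # a very small return value would indicate the coverage is off.
        return kstest(np.asarray(p_values), 'uniform').pvalue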
+        fig = plt.figure()
+        for id in (20,22,2020,2022,2222):
+            if id == 20:
+                plt.subplot(2,3,1)
+            elif id == 22:
+                plt.subplot(2,3,2)
+            elif id == 2020:
+                plt.subplot(2,3,4)
+            elif id == 2022:
+                plt.subplot(2,3,5)
+            elif id == 2222:
+                plt.subplot(2,3,6)
+            plt.hist(p_values[id],bins=np.linspace(0,1,101),histtype='step')
+            plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
+        despine(fig,trim=True)
+        plt.tight_layout()
 
-    x0 = np.array([1.0,1.0,EPSILON,1.0,EPSILON,1.0,EPSILON,EPSILON])
-    opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
-    opt.set_min_objective(nll)
-    low = np.array([EPSILON]*len(x0))
-    high = np.array([10]*len(x0))
-    opt.set_lower_bounds(low)
-    opt.set_upper_bounds(high)
-    opt.set_ftol_abs(1e-10)
-    opt.set_initial_step([0.01]*len(x0))
+        if args.save:
+            fig.savefig("chi2_p_value_coverage_plot.pdf")
+            fig.savefig("chi2_p_value_coverage_plot.eps")
+        else:
+            plt.suptitle("P-value Coverage without Neutron Follower")
+
+        fig = plt.figure()
+        for id in (20,22,2020,2022,2222):
+            if id == 20:
+                plt.subplot(2,3,1)
+            elif id == 22:
+                plt.subplot(2,3,2)
+            elif id == 2020:
+                plt.subplot(2,3,4)
+            elif id == 2022:
+                plt.subplot(2,3,5)
+            elif id == 2222:
+                plt.subplot(2,3,6)
+            plt.hist(p_values_atm[id],bins=np.linspace(0,1,101),histtype='step')
+            plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
+        despine(fig,trim=True)
+        plt.tight_layout()
 
-    xopt = opt.optimize(x0)
-    print("xopt = ", xopt)
-    nll_xopt = nll(xopt)
-    print("nll(xopt) = ", nll(xopt))
+        if args.save:
+            fig.savefig("chi2_p_value_coverage_plot_atm.pdf")
+            fig.savefig("chi2_p_value_coverage_plot_atm.eps")
+        else:
+            plt.suptitle("P-value Coverage with Neutron Follower")
 
-    pos = np.empty((20, len(x0)),dtype=np.double)
-    for i in range(pos.shape[0]):
-        pos[i] = xopt + np.random.randn(len(x0))*0.1
-        pos[i,:] = np.clip(pos[i,:],low,high)
+        # Bins for the pull plots
+        bins = np.linspace(-10,10,101)
+        bincenters = (bins[1:] + bins[:-1])/2
 
-    nwalkers, ndim = pos.shape
+        fig = plt.figure()
+        axes = []
+        for i, name in enumerate(FIT_PARS):
+            axes.append(plt.subplot(3,3,i+1))
+            plt.hist(pull[i],bins=bins,histtype='step',normed=True)
+            plt.plot(bincenters,norm.pdf(bincenters))
+            plt.title(name)
+        for ax in axes:
+            ax.set_xlim((-10,10))
+            despine(ax=ax,left=True,trim=True)
+            ax.get_yaxis().set_visible(False)
+        plt.tight_layout()
 
-    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x))
-    with np.errstate(invalid='ignore'):
-        sampler.run_mcmc(pos, args.steps)
+        if args.save:
+            fig.savefig("chi2_pull_plot.pdf")
+            fig.savefig("chi2_pull_plot.eps")
+        else:
+            plt.show()
 
-    print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
+        sys.exit(0)
 
-    try:
-        print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
-    except Exception as e:
-        print(e)
+    xopt, samples = do_fit(data,muon,data_mc,bins,args.steps)
 
-    samples = sampler.chain.reshape((-1,len(x0)))
+    prob = get_prob(data,muon,data_mc,samples,bins,size=args.multinomial_prob_size)
+    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,samples,bins,size=args.multinomial_prob_size)
 
     plt.figure()
     plt.subplot(3,3,1)
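The pull histograms above are built from (posterior mean - true value)/(posterior standard deviation) for each fit parameter, so for an unbiased fit with correctly estimated uncertainties they should follow a standard normal, which is why norm.pdf is overlaid. As a standalone restatement of the loop in the coverage section:

    import numpy as np

    def pull(samples_1d, true_value):
        # samples_1d: MCMC samples of one parameter from one pseudo-experiment.
        # Across pseudo-experiments, a correctly calibrated fit gives pulls
        # distributed as N(0,1).
        return (np.mean(samples_1d) - true_value)/np.std(samples_1d)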
@@ -340,16 +601,6 @@ if __name__ == '__main__':
     plt.xlabel("Muon Scale")
     plt.tight_layout()
 
-    prob = {}
-    for id in (20,22,2020,2022,2222):
-        prob[id] = get_multinomial_prob(data,muon,data_mc,id,samples,bins,size=args.multinomial_prob_size)
-        print(id, prob[id])
-
-    prob_atm = {}
-    for id in (20,22,2020,2022,2222):
-        prob_atm[id] = get_multinomial_prob(data_atm,muon_atm,data_atm_mc,id,samples,bins,size=args.multinomial_prob_size)
-        print(id, prob_atm[id])
-
     handles = [Line2D([0], [0], color='C0'),
                Line2D([0], [0], color='C1'),
                Line2D([0], [0], color='C2')]
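With the new flag, a coverage study is run by passing a nonzero number of pseudo-experiments, in the same style as the usage line in the module docstring (the file lists are placeholders):

    $ ./chi2 [list of data fit results] --mc [list of atmospheric MC files] --muon-mc [list of muon MC files] --steps 1000 --coverage 100

Each pseudo-experiment samples fake data from the Monte Carlo, refits it, and records the pulls and p-values; with --coverage 0 (the default) the script instead performs the normal fit on the real data.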