author | tlatorre <tlatorre@uchicago.edu> | 2020-08-25 02:40:51 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2020-08-25 02:40:51 -0500 |
commit | a26354a837a9135696b18d8e447bfb7734e2ed94 (patch) | |
tree | 87050bdf4fefa83e6a6d802bb823a728d4235e38 | |
parent | aedf61a2bb1ab7479f5d27cebd2c83d0c107260f (diff) | |
download | sddm-a26354a837a9135696b18d8e447bfb7734e2ed94.tar.gz sddm-a26354a837a9135696b18d8e447bfb7734e2ed94.tar.bz2 sddm-a26354a837a9135696b18d8e447bfb7734e2ed94.zip |
update chi2 script to plot p-value coverage
-rwxr-xr-x | utils/chi2 | 351 |
1 files changed, 301 insertions, 50 deletions
@@ -14,16 +14,14 @@ # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>.
 """
-Script to plot final fit results along with sidebands for the dark matter
-analysis. To run it just run:
+Script to do the final null hypothesis test that the events in the 20 MeV - 10 GeV
+range are consistent with atmospheric neutrino events. To run it just run:
 
-    $ ./plot-energy [list of fit results]
+    $ ./chi2 [list of data fit results] --mc [list of atmospheric MC files] --muon-mc [list of muon MC files] --steps [steps]
 
-Currently it will plot energy distributions for external muons, michel
-electrons, atmospheric events with neutron followers, and prompt signal like
-events. Each of these plots will have a different subplot for the particle ID
-of the best fit, i.e. single electron, single muon, double electron, electron +
-muon, or double muon.
+After running you will get a plot showing the best fit results of the MC and
+data along with p-values for each of the possible particle combinations (single
+electron, single muon, double electron, etc.).
 """
 from __future__ import print_function, division
 import numpy as np
@@ -35,12 +33,21 @@ from itertools import izip_longest
 from sddm.stats import *
 
 # Uncertainty on the energy scale
-# FIXME: Should get real number from stopping muons
+# FIXME: These are just placeholders! Should get real numbers from stopping
+# muons.
 ENERGY_SCALE_MEAN = {'e': 1.0, 'u': 1.0, 'eu': 1.0}
 ENERGY_SCALE_UNCERTAINTY = {'e':0.1, 'u': 0.1, 'eu': 0.1}
 ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0}
 ENERGY_RESOLUTION_UNCERTAINTY = {'e':0.1, 'u': 0.1, 'eu': 0.1}
 
+# Absolute tolerance for the minimizer.
+# Since we're minimizing the negative log likelihood, we really only care about
+# the value of the minimum to within ~0.05 (10% of the one sigma shift).
+# However, I have noticed before that setting a higher tolerance can sometimes
+# cause the fit to get stuck in a local minimum, so we set it here to a very
+# small value.
+FTOL_ABS = 1e-10
+
 particle_id = {20: 'e', 22: r'\mu'}
 
 def plot_hist2(hists, bins, color=None):
@@ -66,6 +73,31 @@ def plot_hist2(hists, bins, color=None):
     plt.tight_layout()
 
 def get_mc_hists(data,x,bins,apply_norm=False):
+    """
+    Returns the expected Monte Carlo histograms for the atmospheric neutrino
+    background.
+
+    Args:
+        - data: pandas dataframe of the Monte Carlo events
+        - x: fit parameters
+        - bins: histogram bins
+        - apply_norm: boolean value representing whether we should apply the
+          atmospheric neutrino scale parameter
+
+    This function does two basic things:
+
+        1. apply the energy bias and resolution corrections
+        2. histogram the results
+
+    Normally, to apply the energy resolution correction we would smear out each
+    MC event by the resolution and then bin the results. But since there are
+    thousands and thousands of MC events, this takes way too long. Therefore,
+    we use a trick: we first bin the MC events on a much finer binning scale
+    than the final bins, then smear this histogram, and finally rebin the
+    result into the final bins.
+
+    Returns a dictionary mapping particle id combo -> histogram.
+    """
     mc_hists = {}
 
     # FIXME: May need to increase number of bins here
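The fine-binning trick described in this docstring can be made concrete with a short standalone sketch (illustrative only: the script's real implementation lives in the body of get_mc_hists and in sddm.stats, and the function name and Gaussian kernel here are assumptions):

    import numpy as np

    def smeared_mc_hist(ke, resolution, coarse_bins, subdiv=10):
        # Subdivide each coarse bin into `subdiv` fine bins.
        fine_bins = np.concatenate(
            [np.linspace(a, b, subdiv, endpoint=False)
             for a, b in zip(coarse_bins[:-1], coarse_bins[1:])]
            + [coarse_bins[-1:]])
        hist = np.histogram(ke, bins=fine_bins)[0].astype(float)
        centers = (fine_bins[1:] + fine_bins[:-1])/2

        # Smear the histogram instead of the individual events: spread each
        # fine bin's contents over its neighbours with Gaussian weights of
        # width sigma = resolution*E.
        smeared = np.zeros_like(hist)
        for n, e in zip(hist, centers):
            if n > 0:
                w = np.exp(-0.5*((centers - e)/(resolution*e))**2)
                smeared += n*w/w.sum()

        # Sum the fine bins back up into the original coarse bins.
        return smeared.reshape(len(coarse_bins) - 1, subdiv).sum(axis=-1)

Smearing a histogram of a few hundred fine bins is far cheaper than smearing hundreds of thousands of individual MC events, at the cost of quantizing each event's energy to its fine-bin center. This sketch only covers the resolution part; the script also applies the energy bias parameters before histogramming.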
+ """ data_hists = {} for id in (20,22,2020,2022,2222): df_id = data[data.id == id] @@ -110,6 +145,16 @@ def get_data_hists(data,bins,scale=1.0): # 6 - Electron + Muon energy resolution # 7 - External Muon scale +FIT_PARS = [ + 'Atmospheric Neutrino Flux Scale', + 'Electron energy bias', + 'Electron energy resolution', + 'Muon energy bias', + 'Muon energy resolution', + 'Electron + Muon energy bias', + 'Electron + Muon energy resolution', + 'External Muon scale'] + def make_nll(data, muons, mc, bins): data_hists = get_data_hists(data,bins) muon_hists = get_data_hists(muons,bins) @@ -118,34 +163,95 @@ def make_nll(data, muons, mc, bins): if any(x[i] < 0 for i in range(len(x))): return np.inf + # Get the Monte Carlo histograms. We need to do this within the + # likelihood function since several of the parameters like the energy + # bias and resolution affect the histograms. + # + # Note: I really should be applying the bias term to the data instead + # of the Monte Carlo. The reason being that the reconstruction was all + # tuned based on the data from the Monte Carlo. For example, the single + # PE charge distribution is taken from the Monte Carlo. Therefore, if + # there is some difference in bias between data and Monte Carlo, we + # should apply it to the data. However, the reason I apply it to the + # Monte Carlo instead is because applying an energy bias correction to + # an analysis in which we're using histograms is not really a good + # idea. If the energy scale parameter changes by a very small amount + # and causes a single event to cross a bin boundary you will see a + # discrete jump in the likelihood. This isn't good for minimizers + # (although it may be ok with the MCMC depending on the size of the + # jump). To reduce this issue, I apply the energy bias correction to + # the Monte Carlo, and since there are almost 100 times more events in + # the Monte Carlo than in data, the issue is much smaller. + # + # All that being said, this doesn't completely get rid of the issue, + # but I do two things that should make it OK: + # + # 1. I use the SBPLX minimizer instead of COBYLA which should have a + # better chance of jumping over small discontinuities. + # + # 2. I ultimately run an MCMC and so if we get stuck somewhere close to + # the minimum, the MCMC results should correctly deal with any small + # discontinuities. + # + # Also, it's critical that I first adjust the data energy by whatever + # amount I find with the stopping muons and Michel distributions. 
 
 def get_mc_hists_posterior(data,muon_hists,data_hists,x,bins):
+    """
+    Returns the posterior on the Monte Carlo histograms.
+
+    Basically this function just histograms the Monte Carlo data. However,
+    there is one extra thing it does. In general when doing a fit, the Monte
+    Carlo histograms have some uncertainty since you can never simulate
+    infinite statistics. I don't think I've ever really seen anyone properly
+    treat this. Since the uncertainty on the central value in each bin is just
+    given by the Dirichlet distribution, we treat the problem of finding the
+    best value of the posterior as a problem in which your prior is equal to
+    the expected number of events from the Monte Carlo, and then you actually
+    see the data. Since the likelihood of the true mean in each bin is a
+    multinomial, the posterior is also a Dirichlet whose alpha parameters are
+    given by the sum of the prior and observed counts.
+
+    All that is a long way of saying we calculate the posterior as the sum of
+    the Monte Carlo events (unscaled) and the observed events. In the limit of
+    infinite statistics, this is just equal to the Monte Carlo predicted
+    histogram, but it deals with the fact that we don't have infinite
+    statistics, and so a single outlier event isn't necessarily a problem with
+    the model.
+
+    Returns a dictionary mapping particle id combo -> histogram.
+    """
     mc_hists = get_mc_hists(data,x,bins)
     for id in (20,22,2020,2022,2222):
         mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0])
+        # FIXME: does the ordering of when we add the muons matter here?
         mc_hists[id] += muon_hists[id]*x[7]
     return mc_hists
 
-def get_data_hist(data,x,bins):
-    return np.histogram(data,bins=bins)[0]
-
 def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percentile=99.0, size=10000):
     """
     Returns the p-value that the histogram of the data is drawn from the MC
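The Dirichlet update described in this docstring amounts to a one-line posterior calculation. A sketch of the idea (the actual get_mc_hist_posterior is implemented in sddm.stats; the final rescaling step here is an assumption about its interface):

    import numpy as np

    def mc_hist_posterior_sketch(mc_counts, data_counts, norm):
        # Dirichlet prior alphas = unscaled MC counts; observing the data
        # counts updates them, so the posterior alphas are simply the sum.
        alpha = mc_counts + data_counts
        # Posterior mean of the bin fractions, rescaled back to the fitted
        # Monte Carlo expectation.
        return alpha/alpha.sum()*mc_counts.sum()*norm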
@@ -194,6 +300,69 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
         ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples))
     return np.percentile(ps,percentile)
 
+def get_prob(data,muon,mc,samples,bins,size):
+    prob = {}
+    for id in (20,22,2020,2022,2222):
+        prob[id] = get_multinomial_prob(data,muon,mc,id,samples,bins,size=size)
+        print(id, prob[id])
+    return prob
+
+def do_fit(data,muon,data_mc,bins,steps):
+    """
+    Run the fit and return the minimum along with samples from running an MCMC
+    starting near the minimum.
+
+    Args:
+        - data: pandas dataframe representing the data to fit
+        - muon: pandas dataframe representing the expected background from
+          external muons
+        - data_mc: pandas dataframe representing the expected background from
+          atmospheric neutrino events
+        - bins: an array of bins to use for the fit
+        - steps: the number of MCMC steps to run
+
+    Returns a tuple (xopt, samples) where samples is an array of shape
+    (number of walkers*steps, number of parameters).
+    """
+    nll = make_nll(data,muon,data_mc,bins)
+
+    x0 = np.array([1.0,1.0,EPSILON,1.0,EPSILON,1.0,EPSILON,EPSILON])
+    opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
+    opt.set_min_objective(nll)
+    low = np.array([EPSILON]*len(x0))
+    high = np.array([10]*len(x0))
+    opt.set_lower_bounds(low)
+    opt.set_upper_bounds(high)
+    opt.set_ftol_abs(FTOL_ABS)
+    opt.set_initial_step([0.01]*len(x0))
+
+    xopt = opt.optimize(x0)
+    print("xopt = ", xopt)
+    nll_xopt = nll(xopt)
+    print("nll(xopt) = ", nll_xopt)
+
+    pos = np.empty((20, len(x0)),dtype=np.double)
+    for i in range(pos.shape[0]):
+        pos[i] = xopt + np.random.randn(len(x0))*0.1
+        pos[i,:] = np.clip(pos[i,:],low,high)
+
+    nwalkers, ndim = pos.shape
+
+    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x))
+    with np.errstate(invalid='ignore'):
+        sampler.run_mcmc(pos, steps)
+
+    print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
+
+    try:
+        print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
+    except Exception as e:
+        print(e)
+
+    samples = sampler.chain.reshape((-1,len(x0)))
+
+    return xopt, samples
+
 if __name__ == '__main__':
     import argparse
     import numpy as np
@@ -214,6 +383,7 @@ if __name__ == '__main__':
     parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)")
     parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain")
     parser.add_argument("--multinomial-prob-size", type=int, default=10000, help="number of p values to compute")
+    parser.add_argument("--coverage", type=int, default=0, help="number of fake experiments to run when plotting p-value coverage (0 to disable)")
     args = parser.parse_args()
 
     setup_matplotlib(args.save)
@@ -276,42 +446,133 @@ if __name__ == '__main__':
 
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
 
-    nll = make_nll(data,muon,data_mc,bins)
+    if args.coverage:
+        p_values = {id: [] for id in (20,22,2020,2022,2222)}
+        p_values_atm = {id: [] for id in (20,22,2020,2022,2222)}
+
+        scale = 0.01
+        muon_scale = 0.01
+        energy_resolution = 0.1
+
+        true_values = [scale,1.0,energy_resolution,1.0,energy_resolution,1.0,energy_resolution,muon_scale]
+
+        assert(len(true_values) == len(FIT_PARS))
+
+        pull = [[] for i in range(len(FIT_PARS))]
+
+        # Set the random seed so we get reproducible results here
+        np.random.seed(0)
+
+        for i in range(args.coverage):
+            # Calculate expected number of events
+            N = len(data_mc)*scale
+            N_atm = len(data_atm_mc)*scale
+            N_muon = len(muon)*muon_scale
+            N_muon_atm = len(muon_atm)*muon_scale
+
+            # Calculate observed number of events
+            n = np.random.poisson(N)
+            n_atm = np.random.poisson(N_atm)
+            n_muon = np.random.poisson(N_muon)
+            n_muon_atm = np.random.poisson(N_muon_atm)
+
+            # Sample data from Monte Carlo
+            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+
+            # Smear the energies by the additional energy resolution
+            data.ke += np.random.randn(len(data.ke))*data.ke*energy_resolution
+            data_atm.ke += np.random.randn(len(data_atm.ke))*data_atm.ke*energy_resolution
+
+            xopt, samples = do_fit(data,muon,data_mc,bins,args.steps)
+
+            for j in range(len(FIT_PARS)):
+                mean = np.mean(samples[:,j])
+                std = np.std(samples[:,j])
+                pull[j].append((mean - true_values[j])/std)
+
+            prob = get_prob(data,muon,data_mc,samples,bins,size=args.multinomial_prob_size)
+            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,samples,bins,size=args.multinomial_prob_size)
+
+            for id in (20,22,2020,2022,2222):
+                p_values[id].append(prob[id])
+                p_values_atm[id].append(prob_atm[id])
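If the model, the fit, and the p-value machinery are all self-consistent, the p-values collected in this loop should be roughly uniform on [0, 1]; that is what the coverage histograms below check by eye. A quantitative complement one could bolt on (not part of this commit; kstest is from scipy.stats):

    import numpy as np
    from scipy.stats import kstest

    def coverage_ks(p_values):
        # Kolmogorov-Smirnov test of the collected p-values against U(0,1);
        # a very small return value would indicate the coverage is off.
        return kstest(np.asarray(p_values), 'uniform').pvalue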
+        fig = plt.figure()
+        for id in (20,22,2020,2022,2222):
+            if id == 20:
+                plt.subplot(2,3,1)
+            elif id == 22:
+                plt.subplot(2,3,2)
+            elif id == 2020:
+                plt.subplot(2,3,4)
+            elif id == 2022:
+                plt.subplot(2,3,5)
+            elif id == 2222:
+                plt.subplot(2,3,6)
+            plt.hist(p_values[id],bins=np.linspace(0,1,101),histtype='step')
+            plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
+        despine(fig,trim=True)
+        plt.tight_layout()
 
-    x0 = np.array([1.0,1.0,EPSILON,1.0,EPSILON,1.0,EPSILON,EPSILON])
-    opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
-    opt.set_min_objective(nll)
-    low = np.array([EPSILON]*len(x0))
-    high = np.array([10]*len(x0))
-    opt.set_lower_bounds(low)
-    opt.set_upper_bounds(high)
-    opt.set_ftol_abs(1e-10)
-    opt.set_initial_step([0.01]*len(x0))
+        if args.save:
+            fig.savefig("chi2_p_value_coverage_plot.pdf")
+            fig.savefig("chi2_p_value_coverage_plot.eps")
+        else:
+            plt.suptitle("P-value Coverage without Neutron Follower")
+
+        fig = plt.figure()
+        for id in (20,22,2020,2022,2222):
+            if id == 20:
+                plt.subplot(2,3,1)
+            elif id == 22:
+                plt.subplot(2,3,2)
+            elif id == 2020:
+                plt.subplot(2,3,4)
+            elif id == 2022:
+                plt.subplot(2,3,5)
+            elif id == 2222:
+                plt.subplot(2,3,6)
+            plt.hist(p_values_atm[id],bins=np.linspace(0,1,101),histtype='step')
+            plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
+        despine(fig,trim=True)
+        plt.tight_layout()
 
-    xopt = opt.optimize(x0)
-    print("xopt = ", xopt)
-    nll_xopt = nll(xopt)
-    print("nll(xopt) = ", nll(xopt))
+        if args.save:
+            fig.savefig("chi2_p_value_coverage_plot_atm.pdf")
+            fig.savefig("chi2_p_value_coverage_plot_atm.eps")
+        else:
+            plt.suptitle("P-value Coverage with Neutron Follower")
 
-    pos = np.empty((20, len(x0)),dtype=np.double)
-    for i in range(pos.shape[0]):
-        pos[i] = xopt + np.random.randn(len(x0))*0.1
-        pos[i,:] = np.clip(pos[i,:],low,high)
+        # Bins for the pull plots
+        bins = np.linspace(-10,10,101)
+        bincenters = (bins[1:] + bins[:-1])/2
 
-    nwalkers, ndim = pos.shape
+        fig = plt.figure()
+        axes = []
+        for i, name in enumerate(FIT_PARS):
+            axes.append(plt.subplot(3,3,i+1))
+            plt.hist(pull[i],bins=bins,histtype='step',normed=True)
+            plt.plot(bincenters,norm.pdf(bincenters))
+            plt.title(name)
+        for ax in axes:
+            ax.set_xlim((-10,10))
+            despine(ax=ax,left=True,trim=True)
+            ax.get_yaxis().set_visible(False)
+        plt.tight_layout()
 
-    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x))
-    with np.errstate(invalid='ignore'):
-        sampler.run_mcmc(pos, args.steps)
+        if args.save:
+            fig.savefig("chi2_pull_plot.pdf")
+            fig.savefig("chi2_pull_plot.eps")
+        else:
+            plt.show()
 
-    print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
+        sys.exit(0)
 
-    try:
-        print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
-    except Exception as e:
-        print(e)
+    xopt, samples = do_fit(data,muon,data_mc,bins,args.steps)
 
-    samples = sampler.chain.reshape((-1,len(x0)))
+    prob = get_prob(data,muon,data_mc,samples,bins,size=args.multinomial_prob_size)
+    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,samples,bins,size=args.multinomial_prob_size)
 
     plt.figure()
     plt.subplot(3,3,1)
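The pull histograms above are built from (posterior mean - true value)/(posterior standard deviation) for each fit parameter, so for an unbiased fit with correctly estimated uncertainties they should follow a standard normal, which is why norm.pdf is overlaid. As a standalone restatement of the loop in the coverage section:

    import numpy as np

    def pull(samples_1d, true_value):
        # samples_1d: MCMC samples of one parameter from one pseudo-experiment.
        # Across pseudo-experiments, a correctly calibrated fit gives pulls
        # distributed as N(0,1).
        return (np.mean(samples_1d) - true_value)/np.std(samples_1d)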
@@ -340,16 +601,6 @@ if __name__ == '__main__':
     plt.xlabel("Muon Scale")
     plt.tight_layout()
 
-    prob = {}
-    for id in (20,22,2020,2022,2222):
-        prob[id] = get_multinomial_prob(data,muon,data_mc,id,samples,bins,size=args.multinomial_prob_size)
-        print(id, prob[id])
-
-    prob_atm = {}
-    for id in (20,22,2020,2022,2222):
-        prob_atm[id] = get_multinomial_prob(data_atm,muon_atm,data_atm_mc,id,samples,bins,size=args.multinomial_prob_size)
-        print(id, prob_atm[id])
-
     handles = [Line2D([0], [0], color='C0'),
                Line2D([0], [0], color='C1'),
                Line2D([0], [0], color='C2')]
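With the new flag, a coverage study is run by passing a nonzero number of pseudo-experiments, in the same style as the usage line in the module docstring (the file lists are placeholders):

    $ ./chi2 [list of data fit results] --mc [list of atmospheric MC files] --muon-mc [list of muon MC files] --steps 1000 --coverage 100

Each pseudo-experiment samples fake data from the Monte Carlo, refits it, and records the pulls and p-values; with --coverage 0 (the default) the script instead performs the normal fit on the real data.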