Diffstat (limited to 'utils/chi2')
-rwxr-xr-x  utils/chi2  511
1 file changed, 209 insertions, 302 deletions
@@ -27,22 +27,30 @@ from __future__ import print_function, division
 import numpy as np
 from scipy.stats import iqr, poisson
 from matplotlib.lines import Line2D
-from scipy.stats import iqr, norm, beta
+from scipy.stats import iqr, norm, beta, percentileofscore
 from scipy.special import spence
 from itertools import izip_longest
 from sddm.stats import *
-import contextlib
-from sddm.dc import estimate_errors
-
-# from https://stackoverflow.com/questions/2891790/how-to-pretty-print-a-numpy-array-without-scientific-notation-and-with-given-pre
-@contextlib.contextmanager
-def printoptions(*args, **kwargs):
-    original = np.get_printoptions()
-    np.set_printoptions(*args, **kwargs)
-    try:
-        yield
-    finally:
-        np.set_printoptions(**original)
+from sddm.dc import estimate_errors, EPSILON, truncnorm_scaled
+import emcee
+from sddm import printoptions
+from sddm.utils import fast_cdf, correct_energy_bias
+
+# Likelihood Fit Parameters
+# 0 - Atmospheric Neutrino Flux Scale
+# 1 - Electron energy bias
+# 2 - Electron energy resolution
+# 3 - Muon energy bias
+# 4 - Muon energy resolution
+# 5 - External Muon scale
+
+FIT_PARS = [
+    'Atmospheric Neutrino Flux Scale',
+    'Electron energy bias',
+    'Electron energy resolution',
+    'Muon energy bias',
+    'Muon energy resolution',
+    'External Muon scale']
 
 # Uncertainty on the energy scale
 #
@@ -62,18 +70,43 @@ def printoptions(*args, **kwargs):
 # the analysis at all, I'll leave the uncertainty here at 10% anyways.
 # - For the electron + muon energy resolution, I don't have any real good way
 #   to estimate this, so we are conservative and set the error to 10%.
-ENERGY_SCALE_MEAN = {'e': 0.0, 'u': -0.066}
-ENERGY_SCALE_UNCERTAINTY = {'e':0.1, 'u': 0.011}
-ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0}
-ENERGY_RESOLUTION_UNCERTAINTY = {'e':0.1, 'u': 0.014, 'eu': 0.1}
-
-# Absolute tolerance for the minimizer.
-# Since we're minimizing the negative log likelihood, we really only care about
-# the value of the minimum to within ~0.05 (10% of the one sigma shift).
-# However, I have noticed before that setting a higher tolerance can sometimes
-# cause the fit to get stuck in a local minima, so we set it here to a very
-# small value.
-FTOL_ABS = 1e-10
+PRIORS = [
+    0.01,  # Atmospheric Neutrino Scale
+    0.0,   # Electron energy scale
+    0.0,   # Electron energy resolution
+    0.066, # Muon energy scale
+    0.0,   # Muon energy resolution
+    0.01,  # Muon scale
+]
+
+PRIOR_UNCERTAINTIES = [
+    0.002, # Atmospheric Neutrino Scale
+    0.1,   # Electron energy scale
+    0.1,   # Electron energy resolution
+    0.011, # Muon energy scale
+    0.014, # Muon energy resolution
+    0.001, # Muon scale
+]
+
+# Lower bounds for the fit parameters
+PRIORS_LOW = [
+    EPSILON,
+    -10,
+    EPSILON,
+    -10,
+    EPSILON,
+    EPSILON
+]
+
+# Upper bounds for the fit parameters
+PRIORS_HIGH = [
+    10,
+    10,
+    10,
+    10,
+    10,
+    10
+]
 
 particle_id = {20: 'e', 22: r'\mu'}
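Note for readers without the sddm package: `truncnorm_scaled` (used below to seed the walkers and to draw truth values for the coverage and pull studies) presumably draws from a normal truncated to the given bounds. A minimal sketch of the assumed behavior using scipy's standardized-bounds convention; the name `truncnorm_scaled_sketch` is ours, not sddm's:

```python
# Hypothetical stand-in for sddm.dc.truncnorm_scaled (an assumption based
# on how it is called below, not the actual implementation).
import numpy as np
from scipy.stats import truncnorm

def truncnorm_scaled_sketch(low, high, loc, scale):
    low, high = np.asarray(low, dtype=float), np.asarray(high, dtype=float)
    loc, scale = np.asarray(loc, dtype=float), np.asarray(scale, dtype=float)
    # scipy's truncnorm takes its bounds in units of standard deviations
    # from the mean, so rescale the absolute bounds first.
    a = (low - loc)/scale
    b = (high - loc)/scale
    return truncnorm.rvs(a, b, loc=loc, scale=scale)
```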
@@ -96,10 +129,10 @@ def plot_hist2(hists, bins, color=None):
         plt.xlabel("Energy (MeV)")
         plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
 
-    if len(df):
+    if len(hists):
         plt.tight_layout()
 
-def get_mc_hists(data,x,bins,apply_norm=False,reweight=False):
+def get_mc_hists(data,x,bins,scale=1.0,reweight=False):
     """
     Returns the expected Monte Carlo histograms for the atmospheric neutrino
     background.
@@ -108,156 +141,86 @@ def get_mc_hists(data,x,bins,apply_norm=False,reweight=False):
     - data: pandas dataframe of the Monte Carlo events
     - x: fit parameters
     - bins: histogram bins
-    - apply_norm: boolean value representing whether we should apply the
-      atmospheric neutrino scale parameter
+    - scale: multiply histograms by an overall scale factor
 
     This function does two basic things:
 
     1. apply the energy bias and resolution corrections
     2. histogram the results
 
-    Normally to apply the energy resolution correction we should smear out each
-    MC event by the resolution and then bin the results. But since there are
-    thousands and thousands of MC events this takes way too long. Therefore, we
-    use a trick. We first bin the MC events on a much finer binning scale than
-    the final bins. We then smear this histogram and then bin the results in
-    the final bins.
-
     Returns a dictionary mapping particle id combo -> histogram.
     """
-    ke_dict = {}
+    df_dict = {}
     for id in (20,22,2020,2022,2222):
-        ke_dict[id] = data[data.id == id].ke.values
-
-    if reweight:
-        ke_weights_dict = {}
-        for id in (20,22,2020,2022,2222):
-            ke_weights_dict[id] = data[data.id == id].weight.values
-    else:
-        ke_weights_dict = None
+        df_dict[id] = data[data.id == id]
 
-    return get_mc_hists_fast(ke_dict,x,bins,apply_norm,weights_dict=ke_weights_dict)
+    return get_mc_hists_fast(df_dict,x,bins,scale,reweight)
 
-def get_mc_hists_fast(ke_dict,x,bins,apply_norm=False,weights_dict=None):
+def get_mc_hists_fast(df_dict,x,bins,scale=1.0,reweight=False):
     """
     Same as get_mc_hists() but the first argument is a dictionary mapping
-    particle id -> kinetic energy array. This is much faster than selecting the
-    events from the dataframe every time.
+    particle id -> dataframe. This is much faster than selecting the events
+    from the dataframe every time.
""" mc_hists = {} - # FIXME: May need to increase number of bins here - bins2 = np.logspace(np.log10(10),np.log10(20e3),100) - bincenters2 = (bins2[1:] + bins2[:-1])/2 - for id in (20,22,2020,2022,2222): - ke = ke_dict[id] + df = df_dict[id] - if id in (20,2020): - scale = bincenters2*max(EPSILON,x[2]) - elif id in (22,2222): - scale = bincenters2*max(EPSILON,x[4]) + if id == 20: + ke = df.energy1.values*(1+x[1]) + resolution = df.energy1.values*max(EPSILON,x[2]) + elif id == 2020: + ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[1]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[2]))**2) + elif id == 22: + ke = df.energy1.values*(1+x[3]) + resolution = df.energy1.values*max(EPSILON,x[4]) + elif id == 2222: + ke = df.energy1.values*(1+x[3]) + df.energy2.values*(1+x[3]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[4]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2) elif id == 2022: - scale = bincenters2*max(EPSILON,x[5]) + ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[3]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2) - if weights_dict: - hist = np.histogram(ke,bins=bins2,weights=weights_dict[id])[0] + if reweight: + cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)*df.weight.values else: - hist = np.histogram(ke,bins=bins2)[0] + cdf = fast_cdf(bins[:,np.newaxis],ke,resolution) - cdf = norm.cdf(bins[:,np.newaxis],bincenters2,scale)*hist mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1) - if apply_norm: - mc_hists[id] *= x[0] + mc_hists[id] *= scale return mc_hists -def get_data_hists_fast(ke_dict,x,bins,scale=1.0): - data_hists = {} - for id in (20,22,2020,2022,2222): - if id in (20,2020): - data_hists[id] = np.histogram(ke_dict[id]*(1+x[1]),bins=bins)[0]*scale - elif id in (22,2222): - data_hists[id] = np.histogram(ke_dict[id]*(1+x[3]),bins=bins)[0]*scale - elif id == 2022: - data_hists[id] = np.histogram((ke_dict[id]*[1+x[1],1+x[3]]).sum(axis=-1),bins=bins)[0]*scale - return data_hists - -def get_data_hists(data,x,bins,scale=1.0): +def get_data_hists(data,bins,scale=1.0): """ Returns the data histogrammed into `bins`. 
""" - ke_dict = {} + data_hists = {} for id in (20,22,2020,2022,2222): - if id in (20,2020): - ke_dict[id] = data[data.id == id].ke.values - elif id in (22,2222): - ke_dict[id] = data[data.id == id].ke.values - elif id == 2022: - ke_dict[id] = np.array([data.loc[data.id == id,'energy1'].values, - data.loc[data.id == id,'energy2'].values]).T - - return get_data_hists_fast(ke_dict,x,bins,scale) - -# Likelihood Fit Parameters -# 0 - Atmospheric Neutrino Flux Scale -# 1 - Electron energy bias -# 2 - Electron energy resolution -# 3 - Muon energy bias -# 4 - Muon energy resolution -# 5 - Electron + Muon energy resolution -# 6 - External Muon scale - -FIT_PARS = [ - 'Atmospheric Neutrino Flux Scale', - 'Electron energy bias', - 'Electron energy resolution', - 'Muon energy bias', - 'Muon energy resolution', - 'Electron + Muon energy resolution', - 'External Muon scale'] + data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale + return data_hists def make_nll(data, muons, mc, bins, print_nll=False): - ke_dict = {} + df_dict = {} for id in (20,22,2020,2022,2222): - ke_dict[id] = mc[mc.id == id].ke.values + df_dict[id] = mc[mc.id == id] - ke_dict_muon = {} + df_dict_muon = {} for id in (20,22,2020,2022,2222): - ke_dict_muon[id] = muons[muons.id == id].ke.values + df_dict_muon[id] = muons[muons.id == id] - ke_dict_data = {} - for id in (20,22,2020,2022,2222): - if id in (20,2020): - ke_dict_data[id] = data[data.id == id].ke.values - elif id in (22,2222): - ke_dict_data[id] = data[data.id == id].ke.values - elif id == 2022: - ke_dict_data[id] = np.array([data.loc[data.id == id,'energy1'].values, - data.loc[data.id == id,'energy2'].values]).T + data_hists = get_data_hists(data,bins) def nll(x, grad=None): - if any(x[i] < 0 for i in (0,2,4,5,6)): + if any(x[i] < 0 for i in (0,2,4,5)): return np.inf - # Get the data histograms. We need to do this within the - # likelihood function since we apply the energy bias corrections to the - # data. - # - # Note: Applying the energy bias correction to the data has a few - # issues. If the energy scale parameter changes by even a very small - # amount it can cause events to cross a bin boundary you will see a - # discrete jump in the likelihood. This isn't good for minimizers, and - # so when doing the first minimization with nlopt, I fix these - # parameters. Later, when we run the MCMC, these paramters are then - # free to float and the small discontinuities in the likelihood - # shouldn't be a problem. - data_hists = get_data_hists_fast(ke_dict_data,x,bins) - # Get the Monte Carlo histograms. We need to do this within the # likelihood function since we apply the energy resolution parameters # to the Monte Carlo. 
-        mc_hists = get_mc_hists_fast(ke_dict,x,bins,apply_norm=True)
-        muon_hists = get_mc_hists_fast(ke_dict_muon,x,bins,apply_norm=False)
+        mc_hists = get_mc_hists_fast(df_dict,x,bins)
+        muon_hists = get_mc_hists_fast(df_dict_muon,x,bins)
 
         # Calculate the negative log of the likelihood of observing the data
         # given the fit parameters
@@ -265,16 +228,12 @@ def make_nll(data, muons, mc, bins, print_nll=False):
         nll = 0
         for id in data_hists:
             oi = data_hists[id]
-            ei = mc_hists[id] + muon_hists[id]*x[6] + EPSILON
+            ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + EPSILON
             N = ei.sum()
             nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei))
 
         # Add the priors
-        nll -= norm.logpdf(x[1],ENERGY_SCALE_MEAN['e'],ENERGY_SCALE_UNCERTAINTY['e'])
-        nll -= norm.logpdf(x[3],ENERGY_SCALE_MEAN['u'],ENERGY_SCALE_UNCERTAINTY['u'])
-        nll -= norm.logpdf(x[2],ENERGY_RESOLUTION_MEAN['e'],ENERGY_RESOLUTION_UNCERTAINTY['e'])
-        nll -= norm.logpdf(x[4],ENERGY_RESOLUTION_MEAN['u'],ENERGY_RESOLUTION_UNCERTAINTY['u'])
-        nll -= norm.logpdf(x[5],ENERGY_RESOLUTION_MEAN['eu'],ENERGY_RESOLUTION_UNCERTAINTY['eu'])
+        nll -= norm.logpdf(x,PRIORS,PRIOR_UNCERTAINTIES).sum()
 
         if print_nll:
             # Print the result
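For reference, the closed form in `nll` above is just the sum of independent Poisson log-pmfs over bins: with `N = ei.sum()`, sum_i [oi*log(ei) - ei - gammaln(oi+1)] equals -N - sum(gammaln(oi+1)) + sum(oi*log(ei)). A quick numerical check:

```python
# Verify the Poisson log-likelihood identity used in nll(); the counts
# below are made up for the check.
import numpy as np
from scipy.stats import poisson
from scipy.special import gammaln

oi = np.array([3, 0, 7, 2])          # observed counts per bin
ei = np.array([2.5, 0.4, 6.0, 3.1])  # expected counts per bin

closed_form = -ei.sum() - np.sum(gammaln(oi + 1)) + np.sum(oi*np.log(ei))
reference = poisson.logpmf(oi, ei).sum()
assert np.isclose(closed_form, reference)
```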
@@ -297,7 +256,7 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
     is equal to the expected number of events from the Monte Carlo, and then
     you actually see the data. Since the likelihood on the true mean in each
     bin is a multinomial, the posterior is also a dirichlet where the alpha
-    paramters are given by a sum of the prior and observed counts.
+    parameters are given by a sum of the prior and observed counts.
 
     All that is a long way of saying we calculate the posterior as the sum of
     the Monte Carlo events (unscaled) and the observed events. In the limit of
@@ -311,7 +270,7 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
     for id in (20,22,2020,2022,2222):
         mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0])
         # FIXME: does the ordering of when we add the muons matter here?
-        mc_hists[id] += muon_hists[id]*x[6]
+        mc_hists[id] += muon_hists[id]*x[5]
     return mc_hists
 
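The conjugate update described in the docstring, as a standalone toy: with a Dirichlet prior whose alphas are the unscaled MC bin counts and a multinomial likelihood for the observed counts, the posterior is Dirichlet with alpha = prior + observed. The numbers below are made up for illustration:

```python
# One posterior draw of the "true" bin fractions under the
# Dirichlet-multinomial model described above.
import numpy as np

mc_counts = np.array([120., 80., 40., 10.])  # unscaled MC histogram (prior alphas)
obs_counts = np.array([10., 12., 3., 1.])    # observed data histogram

alpha = mc_counts + obs_counts               # posterior Dirichlet parameters
true_fractions = np.random.dirichlet(alpha)  # sums to 1
```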
 def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percentile=50.0, size=10000):
@@ -334,19 +293,11 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
         bins: bins used to bin the mc histogram
         size: number of values to compute
     """
-    ke_dict_muon = {}
+    df_dict_muon = {}
     for _id in (20,22,2020,2022,2222):
-        ke_dict_muon[_id] = data_muon[data_muon.id == _id].ke.values
+        df_dict_muon[_id] = data_muon[data_muon.id == _id]
 
-    ke_dict_data = {}
-    for _id in (20,22,2020,2022,2222):
-        if _id in (20,2020):
-            ke_dict_data[_id] = data[data.id == _id].ke.values
-        elif _id in (22,2222):
-            ke_dict_data[_id] = data[data.id == _id].ke.values
-        elif _id == 2022:
-            ke_dict_data[_id] = np.array([data.loc[data.id == _id,'energy1'].values,
-                                          data.loc[data.id == _id,'energy2'].values]).T
+    data_hists = get_data_hists(data,bins)
 
     # Get the total number of "universes" simulated in the GENIE reweight tool
     if len(data_mc):
@@ -358,8 +309,7 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
     for i in range(size):
         x = x_samples[np.random.randint(x_samples.shape[0])]
 
-        data_hists = get_data_hists_fast(ke_dict_data,x,bins)
-        muon_hists = get_mc_hists_fast(ke_dict_muon,x,bins,apply_norm=False)
+        muon_hists = get_mc_hists_fast(df_dict_muon,x,bins)
 
         if nuniverses > 0:
             universe = np.random.randint(nuniverses)
@@ -394,7 +344,7 @@ def get_prob(data,muon,mc,samples,bins,size):
         print(id, prob[id])
     return prob
 
-def do_fit(data,muon,data_mc,bins,steps,print_nll=False):
+def do_fit(data,muon,data_mc,bins,steps,print_nll=False,walkers=100,thin=10):
     """
     Run the fit and return the minimum along with samples from running an MCMC
     starting near the minimum.
@@ -413,46 +363,34 @@ def do_fit(data,muon,data_mc,bins,steps,print_nll=False):
     """
     nll = make_nll(data,muon,data_mc,bins,print_nll)
 
-    x0 = np.array([1.0,0.0,EPSILON,0.0,EPSILON,EPSILON,EPSILON])
+    x0 = np.array(PRIORS)
 
     opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
     opt.set_min_objective(nll)
-    low = np.array([EPSILON,-1e9,EPSILON,-1e9,EPSILON,EPSILON,EPSILON])
-    high = np.array([1e9]*len(x0))
-    # Fix the energy bias parameters for the minimization, since they will
-    # cause discontinuities in the likelihood function.
-    low[1] = 0.0
-    low[3] = 0.0
-    high[1] = 0.0
-    high[3] = 0.0
+    low = np.array(PRIORS_LOW)
+    high = np.array(PRIORS_HIGH)
     opt.set_lower_bounds(low)
     opt.set_upper_bounds(high)
-    opt.set_ftol_abs(FTOL_ABS)
+    opt.set_ftol_abs(1e-2)
     opt.set_initial_step([0.01]*len(x0))
 
-    xopt = opt.optimize(x0)
-    print("xopt = ", xopt)
-    nll_xopt = nll(xopt)
-    print("nll(xopt) = ", nll(xopt))
-
-    # Now, we allow them to float
-    low[1] = -1e9
-    low[3] = -1e9
-    high[1] = 1e9
-    high[3] = 1e9
-
-    stepsizes = estimate_errors(nll,xopt,low,high)
-
-    with printoptions(precision=3, suppress=True):
-        print("Errors: ", stepsizes)
-
-    pos = np.empty((20, len(x0)),dtype=np.double)
+    pos = np.empty((walkers, len(x0)),dtype=np.double)
     for i in range(pos.shape[0]):
-        pos[i] = xopt + np.random.randn(len(x0))*stepsizes
+        pos[i] = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
         pos[i,:] = np.clip(pos[i,:],low,high)
+        pos[i] = opt.optimize(pos[i])
+        with printoptions(precision=2, suppress=True):
+            print("pos[%i] = %s, nll = %.2f" % (i, pos[i], opt.last_optimum_value()))
 
     nwalkers, ndim = pos.shape
 
-    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x))
+    # We use the KDEMove here because I think it should sample the likelihood
+    # better. Because we have energy scale parameters and we are doing a binned
    # likelihood, the likelihood is discontinuous. There can also be several
+    # local minima. The author of emcee recommends using the KDEMove with a lot
+    # of walkers to try and properly sample a multimodal distribution. In
+    # addition, I've found that the autocorrelation time for the KDEMove is
+    # much better than for the other moves.
+    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove())
 
     with np.errstate(invalid='ignore'):
         sampler.run_mcmc(pos, steps)
 
@@ -463,47 +401,9 @@ def do_fit(data,muon,data_mc,bins,steps,print_nll=False):
     except Exception as e:
         print(e)
 
-    samples = sampler.chain.reshape((-1,len(x0)))
+    samples = sampler.get_chain(flat=True,thin=thin)
 
-    return xopt, samples
-
-# Energy bias of reconstruction relative to Monte Carlo.
-#
-# Note: You can recreate this array using:
-#
-#     ./plot-fit-results -o [filename]
-ENERGY_BIAS = np.array([
-    (100.0, 0.050140, 0.078106),
-    (200.0, 0.058666, 0.078106),
-    (300.0, 0.049239, -0.000318),
-    (400.0, 0.045581, -0.020932),
-    (500.0, 0.050757, -0.028394),
-    (600.0, 0.048310, -0.029017),
-    (700.0, 0.052434, -0.020770),
-    (800.0, 0.032920, -0.019298),
-    (900.0, 0.040963, -0.015354)],
-    dtype=[('T',np.float32),('e_bias',np.float32),('u_bias',np.float32)])
-
-def correct_energy_bias(df):
-    """
-    Corrects for the energy bias of the reconstruction relative to the true
-    Monte Carlo energy.
-    """
-    # Note: We divide here since we define the bias as:
-    #
-    #     bias = (T_reco - T_true)/T_true
-    #
-    # Therefore,
-    #
-    #     T_true = T_reco/(1+bias)
-    df.loc[df['id1'] == 20,'energy1'] /= (1+np.interp(df.loc[df['id1'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias']))
-    df.loc[df['id2'] == 20,'energy2'] /= (1+np.interp(df.loc[df['id2'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias']))
-    df.loc[df['id3'] == 20,'energy3'] /= (1+np.interp(df.loc[df['id3'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias']))
-    df.loc[df['id1'] == 22,'energy1'] /= (1+np.interp(df.loc[df['id1'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias']))
-    df.loc[df['id2'] == 22,'energy2'] /= (1+np.interp(df.loc[df['id2'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias']))
-    df.loc[df['id3'] == 22,'energy3'] /= (1+np.interp(df.loc[df['id3'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias']))
-    df['ke'] = df['energy1'].fillna(0) + df['energy2'].fillna(0) + df['energy3'].fillna(0)
-    return df
+    return sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()], samples
 
 if __name__ == '__main__':
     import argparse
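The emcee 3 calls used in `do_fit` above (`KDEMove`, flat/thinned chains, and taking the highest-log-probability sample as the returned point estimate), shown in a self-contained toy with a Gaussian target standing in for the real likelihood:

```python
import numpy as np
import emcee

def log_prob(x):
    # Toy stand-in for -nll(x): a unit Gaussian.
    return -0.5*np.sum(x**2)

nwalkers, ndim, steps = 20, 2, 500
pos = np.random.randn(nwalkers, ndim)
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_prob,
                                moves=emcee.moves.KDEMove())
sampler.run_mcmc(pos, steps)

samples = sampler.get_chain(flat=True, thin=10)
# Best point visited by the chain, analogous to the first return value above.
xbest = sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()]
```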
- """ - # Note: We divide here since we define the bias as: - # - # bias = (T_reco - T_true)/T_true - # - # Therefore, - # - # T_true = T_reco/(1+bias) - df.loc[df['id1'] == 20,'energy1'] /= (1+np.interp(df.loc[df['id1'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias'])) - df.loc[df['id2'] == 20,'energy2'] /= (1+np.interp(df.loc[df['id2'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias'])) - df.loc[df['id3'] == 20,'energy3'] /= (1+np.interp(df.loc[df['id3'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias'])) - df.loc[df['id1'] == 22,'energy1'] /= (1+np.interp(df.loc[df['id1'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias'])) - df.loc[df['id2'] == 22,'energy2'] /= (1+np.interp(df.loc[df['id2'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias'])) - df.loc[df['id3'] == 22,'energy3'] /= (1+np.interp(df.loc[df['id3'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias'])) - df['ke'] = df['energy1'].fillna(0) + df['energy2'].fillna(0) + df['energy3'].fillna(0) - return df + return sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()], samples if __name__ == '__main__': import argparse @@ -515,7 +415,6 @@ if __name__ == '__main__': from sddm.plot import * from sddm import setup_matplotlib import nlopt - import emcee parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") @@ -526,8 +425,11 @@ if __name__ == '__main__': parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain") parser.add_argument("--multinomial-prob-size", type=int, default=10000, help="number of p values to compute") parser.add_argument("--coverage", type=int, default=0, help="plot p value coverage") + parser.add_argument("--pull", type=int, default=0, help="plot pull plots") parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files") parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values") + parser.add_argument("--walkers", type=int, default=100, help="number of walkers") + parser.add_argument("--thin", type=int, default=10, help="number of steps to thin") args = parser.parse_args() setup_matplotlib(args.save) @@ -613,30 +515,17 @@ if __name__ == '__main__': p_values = {id: [] for id in (20,22,2020,2022,2222)} p_values_atm = {id: [] for id in (20,22,2020,2022,2222)} - ENERGY_SCALE_MEAN = {'e': 0.0, 'u': 0.0} - ENERGY_SCALE_UNCERTAINTY = {'e':1.0, 'u': 1.0} - ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0} - ENERGY_RESOLUTION_UNCERTAINTY = {'e':1.0, 'u': 1.0, 'eu': 1.0} - - scale = 0.01 - muon_scale = 0.01 - energy_resolution = 0.1 - - true_values = [scale,0.0,energy_resolution,0.0,energy_resolution,energy_resolution,muon_scale] - - assert(len(true_values) == len(FIT_PARS)) - - pull = [[] for i in range(len(FIT_PARS))] - # Set the random seed so we get reproducible results here np.random.seed(0) + xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES) + for i in range(args.coverage): # Calculate expected number of events - N = len(data_mc)*scale - N_atm = len(data_atm_mc)*scale - N_muon = len(muon)*muon_scale - N_muon_atm = len(muon_atm)*muon_scale + N = len(data_mc)*xtrue[0] + N_atm = len(data_atm_mc)*xtrue[0] + N_muon = len(muon)*xtrue[5] + N_muon_atm = len(muon_atm)*xtrue[5] # Calculate observed number of events n = np.random.poisson(N) @@ -649,15 +538,19 @@ if __name__ == '__main__': data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True,weights='weight'), 
@@ -613,30 +515,17 @@ if __name__ == '__main__':
         p_values = {id: [] for id in (20,22,2020,2022,2222)}
         p_values_atm = {id: [] for id in (20,22,2020,2022,2222)}
 
-        ENERGY_SCALE_MEAN = {'e': 0.0, 'u': 0.0}
-        ENERGY_SCALE_UNCERTAINTY = {'e':1.0, 'u': 1.0}
-        ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0}
-        ENERGY_RESOLUTION_UNCERTAINTY = {'e':1.0, 'u': 1.0, 'eu': 1.0}
-
-        scale = 0.01
-        muon_scale = 0.01
-        energy_resolution = 0.1
-
-        true_values = [scale,0.0,energy_resolution,0.0,energy_resolution,energy_resolution,muon_scale]
-
-        assert(len(true_values) == len(FIT_PARS))
-
-        pull = [[] for i in range(len(FIT_PARS))]
-
         # Set the random seed so we get reproducible results here
         np.random.seed(0)
 
+        xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
+
         for i in range(args.coverage):
             # Calculate expected number of events
-            N = len(data_mc)*scale
-            N_atm = len(data_atm_mc)*scale
-            N_muon = len(muon)*muon_scale
-            N_muon_atm = len(muon_atm)*muon_scale
+            N = len(data_mc)*xtrue[0]
+            N_atm = len(data_atm_mc)*xtrue[0]
+            N_muon = len(muon)*xtrue[5]
+            N_muon_atm = len(muon_atm)*xtrue[5]
 
             # Calculate observed number of events
             n = np.random.poisson(N)
@@ -649,15 +538,19 @@ if __name__ == '__main__':
             data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True,weights='weight'),
                                   muon_atm.sample(n=n_muon_atm,replace=True)))
 
             # Smear the energies by the additional energy resolution
-            data.ke += np.random.randn(len(data.ke))*data.ke*energy_resolution
-            data_atm.ke += np.random.randn(len(data_atm.ke))*data_atm.ke*energy_resolution
+            data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
+            data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4])
+            data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2])
+            data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4])
+            data['ke'] = data['energy1'].fillna(0) + data['energy2'].fillna(0) + data['energy3'].fillna(0)
 
-            xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll)
+            data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4])
+            data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
+            data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
 
-            for i in range(len(FIT_PARS)):
-                mean = np.mean(samples[:,i])
-                std = np.std(samples[:,i])
-                pull[i].append((mean - true_values[i])/std)
+            xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
             prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
             prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
@@ -712,19 +605,62 @@ if __name__ == '__main__':
         else:
             plt.suptitle("P-value Coverage with Neutron Follower")
 
-        # Bins for the pull plots
-        bins = np.linspace(-10,10,101)
-        bincenters = (bins[1:] + bins[:-1])/2
+        sys.exit(0)
+
+    if args.pull:
+        pull = [[] for i in range(len(FIT_PARS))]
+
+        # Set the random seed so we get reproducible results here
+        np.random.seed(0)
+
+        for i in range(args.pull):
+            xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
+
+            # Calculate expected number of events
+            N = len(data_mc)*xtrue[0]
+            N_atm = len(data_atm_mc)*xtrue[0]
+            N_muon = len(muon)*xtrue[5]
+            N_muon_atm = len(muon_atm)*xtrue[5]
+
+            # Calculate observed number of events
+            n = np.random.poisson(N)
+            n_atm = np.random.poisson(N_atm)
+            n_muon = np.random.poisson(N_muon)
+            n_muon_atm = np.random.poisson(N_muon_atm)
+
+            # Sample data from Monte Carlo
+            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+
+            # Smear the energies by the additional energy resolution
+            data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
+            data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4])
+            data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2])
+            data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4])
+            data['ke'] = data['energy1'].fillna(0) + data['energy2'].fillna(0) + data['energy3'].fillna(0)
+
+            data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4])
+            data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
+            data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
+
+            xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
+
+            for i in range(len(FIT_PARS)):
+                # The "pull plots" we make here are actually produced via a
+                # procedure called "Simulation Based Calibration".
+                #
+                # See https://arxiv.org/abs/1804.06788.
+                pull[i].append(percentileofscore(samples[:,i],xtrue[i]))
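Why a flat histogram is the success criterion here: in Simulation Based Calibration the truth is drawn from the prior, and if the fit is correctly calibrated its percentile among the posterior samples is uniform on [0, 100]. A self-contained toy where the posterior is known exactly:

```python
# Toy SBC: unit-normal prior and likelihood, exact posterior N(x/2, 1/2).
import numpy as np
from scipy.stats import percentileofscore

np.random.seed(42)
ranks = []
for _ in range(1000):
    mu = np.random.randn()            # "true" parameter drawn from the prior
    x = mu + np.random.randn()        # one observation with unit noise
    post = x/2 + np.sqrt(0.5)*np.random.randn(500)  # posterior samples
    ranks.append(percentileofscore(post, mu))
# A histogram of `ranks` is approximately flat on [0, 100].
```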
 
         fig = plt.figure()
         axes = []
         for i, name in enumerate(FIT_PARS):
             axes.append(plt.subplot(3,3,i+1))
-            plt.hist(pull[i],bins=bins,histtype='step',normed=True)
-            plt.plot(bincenters,norm.pdf(bincenters))
+            plt.hist(pull[i],bins=np.linspace(0,100,101),histtype='step',normed=True)
             plt.title(name)
         for ax in axes:
-            ax.set_xlim((-10,10))
             despine(ax=ax,left=True,trim=True)
             ax.get_yaxis().set_visible(False)
         plt.tight_layout()
@@ -737,47 +673,18 @@ if __name__ == '__main__':
 
         sys.exit(0)
 
-    xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll)
+    xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
     prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
     prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
 
     plt.figure()
-    plt.subplot(3,3,1)
-    plt.hist(samples[:,0],bins=100,histtype='step')
-    plt.xlabel("Atmospheric Flux Scale")
-    despine(ax=plt.gca(),left=True,trim=True)
-    plt.gca().get_yaxis().set_visible(False)
-    plt.subplot(3,3,2)
-    plt.hist(samples[:,1],bins=100,histtype='step')
-    plt.xlabel("Electron Energy Scale")
-    despine(ax=plt.gca(),left=True,trim=True)
-    plt.gca().get_yaxis().set_visible(False)
-    plt.subplot(3,3,3)
-    plt.hist(samples[:,2],bins=100,histtype='step')
-    plt.xlabel("Electron Energy Resolution")
-    despine(ax=plt.gca(),left=True,trim=True)
-    plt.gca().get_yaxis().set_visible(False)
-    plt.subplot(3,3,4)
-    plt.hist(samples[:,3],bins=100,histtype='step')
-    plt.xlabel("Muon Energy Scale")
-    despine(ax=plt.gca(),left=True,trim=True)
-    plt.gca().get_yaxis().set_visible(False)
-    plt.subplot(3,3,5)
-    plt.hist(samples[:,4],bins=100,histtype='step')
-    plt.xlabel("Muon Energy Resolution")
-    despine(ax=plt.gca(),left=True,trim=True)
-    plt.gca().get_yaxis().set_visible(False)
-    plt.subplot(3,3,6)
-    plt.hist(samples[:,5],bins=100,histtype='step')
-    plt.xlabel("Electron + Muon Energy Resolution")
-    despine(ax=plt.gca(),left=True,trim=True)
-    plt.gca().get_yaxis().set_visible(False)
-    plt.subplot(3,3,7)
-    plt.hist(samples[:,6],bins=100,histtype='step')
-    plt.xlabel("Muon Scale")
-    despine(ax=plt.gca(),left=True,trim=True)
-    plt.gca().get_yaxis().set_visible(False)
+    for i in range(len(FIT_PARS)):
+        plt.subplot(3,2,i+1)
+        plt.hist(samples[:,i],bins=100,histtype='step')
+        plt.xlabel(FIT_PARS[i].title())
+        despine(ax=plt.gca(),left=True,trim=True)
+        plt.gca().get_yaxis().set_visible(False)
     plt.tight_layout()
 
     if args.save:
@@ -792,9 +699,9 @@ if __name__ == '__main__':
     labels = ('Data','Monte Carlo','External Muons')
 
     fig = plt.figure()
-    hists = get_data_hists(data,xopt,bins)
-    hists_muon = get_data_hists(muon,xopt,bins,scale=xopt[6])
-    hists_mc = get_mc_hists(data_mc,xopt,bins,apply_norm=True)
+    hists = get_data_hists(data,bins)
+    hists_muon = get_mc_hists(muon,xopt,bins,scale=xopt[5])
+    hists_mc = get_mc_hists(data_mc,xopt,bins,scale=xopt[0])
     plot_hist2(hists,bins=bins,color='C0')
     plot_hist2(hists_mc,bins=bins,color='C1')
     plot_hist2(hists_muon,bins=bins,color='C2')
@@ -819,12 +726,12 @@ if __name__ == '__main__':
     else:
         plt.suptitle("Without Neutron Follower")
 
     fig = plt.figure()
-    hists = get_data_hists(data_atm,xopt,bins)
-    hists_muon = get_data_hists(muon_atm,xopt,bins,scale=xopt[6])
-    hists_mc = get_mc_hists(data_atm_mc,xopt,bins,apply_norm=True)
+    hists = get_data_hists(data_atm,bins)
+    hists_muon = get_mc_hists(muon_atm,xopt,bins,scale=xopt[5])
+    hists_mc = get_mc_hists(data_atm_mc,xopt,bins,scale=xopt[0])
     plot_hist2(hists,bins=bins,color='C0')
     plot_hist2(hists_mc,bins=bins,color='C1')
-    plot_hist2(hists_muon,bins=bins,color='C1')
+    plot_hist2(hists_muon,bins=bins,color='C2')
     for id in (20,22,2020,2022,2222):
         if id == 20:
             plt.subplot(2,3,1)