From d48484c29d09a944de6f9251a3c659e76279464e Mon Sep 17 00:00:00 2001 From: tlatorre Date: Mon, 5 Oct 2020 14:36:40 -0500 Subject: major updates to the chi2 analysis This commit fixes the chi2 analysis so that it is no longer biased. Previously, the chi2 analysis pull plots showed a consistent bias. At first, I thought this was due to the fact that the posterior wasn't gaussian, but even after switching to percentile plots based on the algorithm outlined in "Validating Bayesian Inference Algorithms with Simulation-Based Calibration", I was still seeing a bias. I finally tracked it down to the fact that I was applying the energy scale parameters to the data instead of the Monte Carlo. Therefore, in this commit I update the posterior to now apply the energy scale parameters to the Monte Carlo instead of the data. This has the slight disadvantage that the final histograms will be binned in the biased energy, but that's not really a big deal. In addition, this commit contains several other updates: - switch to plotting percentile plots based on the algorithm in "Validating Bayesian Inference Algorithms with Simulation-Based Calibration" instead of pull plots - apply both the energy scale and resolution at the individual particle level, i.e. there is no longer an energy resolution term for electron + muon fits - separate pull plots and coverage plots. Previously I was making both the p-value coverage plots and the pull plots at the same time. However, the pull plots shouldn't have anything to do with the GENIE weights whereas the p-value coverage plots should draw samples weighted by the GENIE weights. In addition, for the pull plots we draw new truth parameters on every iteration whereas for the p-value coverage plots we only draw them once. - switch to using KDEMove() for the MCMC since I think it samples multimodal distributions a lot better than the default emcee move. 
- I now correct for the reconstruction energy bias in plot-michels and plot-muons --- utils/chi2 | 511 +++++++++++++++++++--------------------------- utils/dc | 12 +- utils/plot-michels | 22 +- utils/plot-muons | 3 + utils/sddm/__init__.py | 12 ++ utils/sddm/dc.py | 9 + utils/sddm/plot.py | 0 utils/sddm/plot_energy.py | 0 8 files changed, 243 insertions(+), 326 deletions(-) mode change 100755 => 100644 utils/sddm/plot.py mode change 100755 => 100644 utils/sddm/plot_energy.py diff --git a/utils/chi2 b/utils/chi2 index 205037b..be6b3e6 100755 --- a/utils/chi2 +++ b/utils/chi2 @@ -27,22 +27,30 @@ from __future__ import print_function, division import numpy as np from scipy.stats import iqr, poisson from matplotlib.lines import Line2D -from scipy.stats import iqr, norm, beta +from scipy.stats import iqr, norm, beta, percentileofscore from scipy.special import spence from itertools import izip_longest from sddm.stats import * -import contextlib -from sddm.dc import estimate_errors - -# from https://stackoverflow.com/questions/2891790/how-to-pretty-print-a-numpy-array-without-scientific-notation-and-with-given-pre -@contextlib.contextmanager -def printoptions(*args, **kwargs): - original = np.get_printoptions() - np.set_printoptions(*args, **kwargs) - try: - yield - finally: - np.set_printoptions(**original) +from sddm.dc import estimate_errors, EPSILON, truncnorm_scaled +import emcee +from sddm import printoptions +from sddm.utils import fast_cdf, correct_energy_bias + +# Likelihood Fit Parameters +# 0 - Atmospheric Neutrino Flux Scale +# 1 - Electron energy bias +# 2 - Electron energy resolution +# 3 - Muon energy bias +# 4 - Muon energy resolution +# 5 - External Muon scale + +FIT_PARS = [ + 'Atmospheric Neutrino Flux Scale', + 'Electron energy bias', + 'Electron energy resolution', + 'Muon energy bias', + 'Muon energy resolution', + 'External Muon scale'] # Uncertainty on the energy scale # @@ -62,18 +70,43 @@ def printoptions(*args, **kwargs): # the analysis at all, I'll leave the uncertainty here at 10% anyways. # - For the electron + muon energy resolution, I don't have any real good way # to estimate this, so we are conservative and set the error to 10%. -ENERGY_SCALE_MEAN = {'e': 0.0, 'u': -0.066} -ENERGY_SCALE_UNCERTAINTY = {'e':0.1, 'u': 0.011} -ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0} -ENERGY_RESOLUTION_UNCERTAINTY = {'e':0.1, 'u': 0.014, 'eu': 0.1} - -# Absolute tolerance for the minimizer. -# Since we're minimizing the negative log likelihood, we really only care about -# the value of the minimum to within ~0.05 (10% of the one sigma shift). -# However, I have noticed before that setting a higher tolerance can sometimes -# cause the fit to get stuck in a local minima, so we set it here to a very -# small value.
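For reference, the percentile check from "Validating Bayesian Inference Algorithms with Simulation-Based Calibration" (https://arxiv.org/abs/1804.06788) mentioned in the commit message boils down to the sketch below. Here draw_prior(), simulate(), and fit() are hypothetical stand-ins for the prior draw, fake data generation, and MCMC fit implemented later in this file:

    import numpy as np
    from scipy.stats import percentileofscore

    def sbc_percentiles(draw_prior, simulate, fit, n=100):
        # If the inference is unbiased, the percentile of the true parameter
        # within its own posterior samples is uniform on [0, 100].
        percentiles = []
        for _ in range(n):
            xtrue = draw_prior()       # draw a truth value from the prior
            fake = simulate(xtrue)     # generate a fake data set
            samples = fit(fake)        # 1D array of posterior samples
            percentiles.append(percentileofscore(samples, xtrue))
        return np.array(percentiles)  # histogram of these should be flat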
-FTOL_ABS = 1e-10 +PRIORS = [ + 0.01, # Atmospheric Neutrino Scale + 0.0, # Electron energy scale + 0.0, # Electron energy resolution + 0.066, # Muon energy scale + 0.0, # Muon energy resolution + 0.01, # Muon scale +] + +PRIOR_UNCERTAINTIES = [ + 0.002, # Atmospheric Neutrino Scale + 0.1, # Electron energy scale + 0.1, # Electron energy resolution + 0.011, # Muon energy scale + 0.014, # Muon energy resolution + 0.001, # Muon scale +] + +# Lower bounds for the fit parameters +PRIORS_LOW = [ + EPSILON, + -10, + EPSILON, + -10, + EPSILON, + EPSILON +] + +# Upper bounds for the fit parameters +PRIORS_HIGH = [ + 10, + 10, + 10, + 10, + 10, + 10 +] particle_id = {20: 'e', 22: r'\mu'} @@ -96,10 +129,10 @@ def plot_hist2(hists, bins, color=None): plt.xlabel("Energy (MeV)") plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$') - if len(df): + if len(hists): plt.tight_layout() -def get_mc_hists(data,x,bins,apply_norm=False,reweight=False): +def get_mc_hists(data,x,bins,scale=1.0,reweight=False): """ Returns the expected Monte Carlo histograms for the atmospheric neutrino background. @@ -108,156 +141,86 @@ def get_mc_hists(data,x,bins,apply_norm=False,reweight=False): - data: pandas dataframe of the Monte Carlo events - x: fit parameters - bins: histogram bins - - apply_norm: boolean value representing whether we should apply the - atmospheric neutrino scale parameter + - scale: multiply histograms by an overall scale factor This function does two basic things: 1. apply the energy bias and resolution corrections 2. histogram the results - Normally to apply the energy resolution correction we should smear out each - MC event by the resolution and then bin the results. But since there are - thousands and thousands of MC events this takes way too long. Therefore, we - use a trick. We first bin the MC events on a much finer binning scale than - the final bins. We then smear this histogram and then bin the results in - the final bins. - Returns a dictionary mapping particle id combo -> histogram. """ - ke_dict = {} + df_dict = {} for id in (20,22,2020,2022,2222): - ke_dict[id] = data[data.id == id].ke.values - - if reweight: - ke_weights_dict = {} - for id in (20,22,2020,2022,2222): - ke_weights_dict[id] = data[data.id == id].weight.values - else: - ke_weights_dict = None + df_dict[id] = data[data.id == id] - return get_mc_hists_fast(ke_dict,x,bins,apply_norm,weights_dict=ke_weights_dict) + return get_mc_hists_fast(df_dict,x,bins,scale,reweight) -def get_mc_hists_fast(ke_dict,x,bins,apply_norm=False,weights_dict=None): +def get_mc_hists_fast(df_dict,x,bins,scale=1.0,reweight=False): """ Same as get_mc_hists() but the first argument is a dictionary mapping - particle id -> kinetic energy array. This is much faster than selecting the - events from the dataframe every time. + particle id -> dataframe. This is much faster than selecting the events + from the dataframe every time. 
""" mc_hists = {} - # FIXME: May need to increase number of bins here - bins2 = np.logspace(np.log10(10),np.log10(20e3),100) - bincenters2 = (bins2[1:] + bins2[:-1])/2 - for id in (20,22,2020,2022,2222): - ke = ke_dict[id] + df = df_dict[id] - if id in (20,2020): - scale = bincenters2*max(EPSILON,x[2]) - elif id in (22,2222): - scale = bincenters2*max(EPSILON,x[4]) + if id == 20: + ke = df.energy1.values*(1+x[1]) + resolution = df.energy1.values*max(EPSILON,x[2]) + elif id == 2020: + ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[1]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[2]))**2) + elif id == 22: + ke = df.energy1.values*(1+x[3]) + resolution = df.energy1.values*max(EPSILON,x[4]) + elif id == 2222: + ke = df.energy1.values*(1+x[3]) + df.energy2.values*(1+x[3]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[4]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2) elif id == 2022: - scale = bincenters2*max(EPSILON,x[5]) + ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[3]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2) - if weights_dict: - hist = np.histogram(ke,bins=bins2,weights=weights_dict[id])[0] + if reweight: + cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)*df.weight.values else: - hist = np.histogram(ke,bins=bins2)[0] + cdf = fast_cdf(bins[:,np.newaxis],ke,resolution) - cdf = norm.cdf(bins[:,np.newaxis],bincenters2,scale)*hist mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1) - if apply_norm: - mc_hists[id] *= x[0] + mc_hists[id] *= scale return mc_hists -def get_data_hists_fast(ke_dict,x,bins,scale=1.0): - data_hists = {} - for id in (20,22,2020,2022,2222): - if id in (20,2020): - data_hists[id] = np.histogram(ke_dict[id]*(1+x[1]),bins=bins)[0]*scale - elif id in (22,2222): - data_hists[id] = np.histogram(ke_dict[id]*(1+x[3]),bins=bins)[0]*scale - elif id == 2022: - data_hists[id] = np.histogram((ke_dict[id]*[1+x[1],1+x[3]]).sum(axis=-1),bins=bins)[0]*scale - return data_hists - -def get_data_hists(data,x,bins,scale=1.0): +def get_data_hists(data,bins,scale=1.0): """ Returns the data histogrammed into `bins`. 
""" - ke_dict = {} + data_hists = {} for id in (20,22,2020,2022,2222): - if id in (20,2020): - ke_dict[id] = data[data.id == id].ke.values - elif id in (22,2222): - ke_dict[id] = data[data.id == id].ke.values - elif id == 2022: - ke_dict[id] = np.array([data.loc[data.id == id,'energy1'].values, - data.loc[data.id == id,'energy2'].values]).T - - return get_data_hists_fast(ke_dict,x,bins,scale) - -# Likelihood Fit Parameters -# 0 - Atmospheric Neutrino Flux Scale -# 1 - Electron energy bias -# 2 - Electron energy resolution -# 3 - Muon energy bias -# 4 - Muon energy resolution -# 5 - Electron + Muon energy resolution -# 6 - External Muon scale - -FIT_PARS = [ - 'Atmospheric Neutrino Flux Scale', - 'Electron energy bias', - 'Electron energy resolution', - 'Muon energy bias', - 'Muon energy resolution', - 'Electron + Muon energy resolution', - 'External Muon scale'] + data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale + return data_hists def make_nll(data, muons, mc, bins, print_nll=False): - ke_dict = {} + df_dict = {} for id in (20,22,2020,2022,2222): - ke_dict[id] = mc[mc.id == id].ke.values + df_dict[id] = mc[mc.id == id] - ke_dict_muon = {} + df_dict_muon = {} for id in (20,22,2020,2022,2222): - ke_dict_muon[id] = muons[muons.id == id].ke.values + df_dict_muon[id] = muons[muons.id == id] - ke_dict_data = {} - for id in (20,22,2020,2022,2222): - if id in (20,2020): - ke_dict_data[id] = data[data.id == id].ke.values - elif id in (22,2222): - ke_dict_data[id] = data[data.id == id].ke.values - elif id == 2022: - ke_dict_data[id] = np.array([data.loc[data.id == id,'energy1'].values, - data.loc[data.id == id,'energy2'].values]).T + data_hists = get_data_hists(data,bins) def nll(x, grad=None): - if any(x[i] < 0 for i in (0,2,4,5,6)): + if any(x[i] < 0 for i in (0,2,4,5)): return np.inf - # Get the data histograms. We need to do this within the - # likelihood function since we apply the energy bias corrections to the - # data. - # - # Note: Applying the energy bias correction to the data has a few - # issues. If the energy scale parameter changes by even a very small - # amount it can cause events to cross a bin boundary you will see a - # discrete jump in the likelihood. This isn't good for minimizers, and - # so when doing the first minimization with nlopt, I fix these - # parameters. Later, when we run the MCMC, these paramters are then - # free to float and the small discontinuities in the likelihood - # shouldn't be a problem. - data_hists = get_data_hists_fast(ke_dict_data,x,bins) - # Get the Monte Carlo histograms. We need to do this within the # likelihood function since we apply the energy resolution parameters # to the Monte Carlo. 
- mc_hists = get_mc_hists_fast(ke_dict,x,bins,apply_norm=True) - muon_hists = get_mc_hists_fast(ke_dict_muon,x,bins,apply_norm=False) + mc_hists = get_mc_hists_fast(df_dict,x,bins) + muon_hists = get_mc_hists_fast(df_dict_muon,x,bins) # Calculate the negative log of the likelihood of observing the data # given the fit parameters @@ -265,16 +228,12 @@ def make_nll(data, muons, mc, bins, print_nll=False): nll = 0 for id in data_hists: oi = data_hists[id] - ei = mc_hists[id] + muon_hists[id]*x[6] + EPSILON + ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + EPSILON N = ei.sum() nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei)) # Add the priors - nll -= norm.logpdf(x[1],ENERGY_SCALE_MEAN['e'],ENERGY_SCALE_UNCERTAINTY['e']) - nll -= norm.logpdf(x[3],ENERGY_SCALE_MEAN['u'],ENERGY_SCALE_UNCERTAINTY['u']) - nll -= norm.logpdf(x[2],ENERGY_RESOLUTION_MEAN['e'],ENERGY_RESOLUTION_UNCERTAINTY['e']) - nll -= norm.logpdf(x[4],ENERGY_RESOLUTION_MEAN['u'],ENERGY_RESOLUTION_UNCERTAINTY['u']) - nll -= norm.logpdf(x[5],ENERGY_RESOLUTION_MEAN['eu'],ENERGY_RESOLUTION_UNCERTAINTY['eu']) + nll -= norm.logpdf(x,PRIORS,PRIOR_UNCERTAINTIES).sum() if print_nll: # Print the result @@ -297,7 +256,7 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins): is equal to the expected number of events from the Monte Carlo, and then you actually see the data. Since the likelihood on the true mean in each bin is a multinomial, the posterior is also a dirichlet where the alpha - paramters are given by a sum of the prior and observed counts. + parameters are given by a sum of the prior and observed counts. All that is a long way of saying we calculate the posterior as the sum of the Monte Carlo events (unscaled) and the observed events. In the limit of @@ -311,7 +270,7 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins): for id in (20,22,2020,2022,2222): mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0]) # FIXME: does the ordering of when we add the muons matter here?
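get_mc_hist_posterior() itself isn't part of this diff (it presumably comes in via the `from sddm.stats import *` at the top), but the conjugacy the docstring above relies on is easy to state: a Dirichlet prior updated with multinomial counts is again a Dirichlet whose parameters are the prior counts plus the observed counts. A toy illustration with made-up numbers:

    import numpy as np

    alpha_prior = np.array([30., 12., 4.])   # unscaled MC counts per bin (the prior)
    observed    = np.array([28., 15., 2.])   # observed data counts per bin
    alpha_post  = alpha_prior + observed     # posterior Dirichlet parameters
    probs = np.random.dirichlet(alpha_post)  # one draw of the true bin probabilities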
- mc_hists[id] += muon_hists[id]*x[6] + mc_hists[id] += muon_hists[id]*x[5] return mc_hists def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percentile=50.0, size=10000): @@ -334,19 +293,11 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti bins: bins used to bin the mc histogram size: number of values to compute """ - ke_dict_muon = {} + df_dict_muon = {} for _id in (20,22,2020,2022,2222): - ke_dict_muon[_id] = data_muon[data_muon.id == _id].ke.values + df_dict_muon[_id] = data_muon[data_muon.id == _id] - ke_dict_data = {} - for _id in (20,22,2020,2022,2222): - if _id in (20,2020): - ke_dict_data[_id] = data[data.id == _id].ke.values - elif _id in (22,2222): - ke_dict_data[_id] = data[data.id == _id].ke.values - elif _id == 2022: - ke_dict_data[_id] = np.array([data.loc[data.id == _id,'energy1'].values, - data.loc[data.id == _id,'energy2'].values]).T + data_hists = get_data_hists(data,bins) # Get the total number of "universes" simulated in the GENIE reweight tool if len(data_mc): @@ -358,8 +309,7 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti for i in range(size): x = x_samples[np.random.randint(x_samples.shape[0])] - data_hists = get_data_hists_fast(ke_dict_data,x,bins) - muon_hists = get_mc_hists_fast(ke_dict_muon,x,bins,apply_norm=False) + muon_hists = get_mc_hists_fast(df_dict_muon,x,bins) if nuniverses > 0: universe = np.random.randint(nuniverses) @@ -394,7 +344,7 @@ def get_prob(data,muon,mc,samples,bins,size): print(id, prob[id]) return prob -def do_fit(data,muon,data_mc,bins,steps,print_nll=False): +def do_fit(data,muon,data_mc,bins,steps,print_nll=False,walkers=100,thin=10): """ Run the fit and return the minimum along with samples from running an MCMC starting near the minimum. @@ -413,46 +363,34 @@ def do_fit(data,muon,data_mc,bins,steps,print_nll=False): """ nll = make_nll(data,muon,data_mc,bins,print_nll) - x0 = np.array([1.0,0.0,EPSILON,0.0,EPSILON,EPSILON,EPSILON]) + x0 = np.array(PRIORS) opt = nlopt.opt(nlopt.LN_SBPLX, len(x0)) opt.set_min_objective(nll) - low = np.array([EPSILON,-1e9,EPSILON,-1e9,EPSILON,EPSILON,EPSILON]) - high = np.array([1e9]*len(x0)) - # Fix the energy bias parameters for the minimization, since they will - # cause discontinuities in the likelihood function. 
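The rest of get_multinomial_prob() above isn't shown in this hunk, but the posterior predictive p value it computes follows the usual recipe: draw fake histograms from the expected one and ask how often a goodness-of-fit statistic is at least as extreme as the observed value. A rough sketch, where the chi-square style statistic is only for illustration (the actual statistic lives in sddm.stats):

    import numpy as np

    def multinomial_p_value(observed, expected, size=10000):
        probs = expected/expected.sum()
        fake = np.random.multinomial(int(observed.sum()), probs, size=size)
        stat_obs = np.sum((observed - expected)**2/(expected + 1e-10))
        stat_fake = np.sum((fake - expected)**2/(expected + 1e-10), axis=-1)
        # fraction of fake experiments at least as extreme as the data
        return np.mean(stat_fake >= stat_obs)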
- low[1] = 0.0 - low[3] = 0.0 - high[1] = 0.0 - high[3] = 0.0 + low = np.array(PRIORS_LOW) + high = np.array(PRIORS_HIGH) opt.set_lower_bounds(low) opt.set_upper_bounds(high) - opt.set_ftol_abs(FTOL_ABS) + opt.set_ftol_abs(1e-2) opt.set_initial_step([0.01]*len(x0)) - xopt = opt.optimize(x0) - print("xopt = ", xopt) - nll_xopt = nll(xopt) - print("nll(xopt) = ", nll(xopt)) - - # Now, we allow them to float - low[1] = -1e9 - low[3] = -1e9 - high[1] = 1e9 - high[3] = 1e9 - - stepsizes = estimate_errors(nll,xopt,low,high) - - with printoptions(precision=3, suppress=True): - print("Errors: ", stepsizes) - - pos = np.empty((20, len(x0)),dtype=np.double) + pos = np.empty((walkers, len(x0)),dtype=np.double) for i in range(pos.shape[0]): - pos[i] = xopt + np.random.randn(len(x0))*stepsizes + pos[i] = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES) pos[i,:] = np.clip(pos[i,:],low,high) + pos[i] = opt.optimize(pos[i]) + with printoptions(precision=2, suppress=True): + print("pos[%i] = %s, nll = %.2f" % (i, pos[i], opt.last_optimum_value())) nwalkers, ndim = pos.shape - sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x)) + # We use the KDEMove here because I think it should sample the likelihood + # better. Because we have energy scale parameters and we are doing a binned + # likelihood, the likelihood is discontinuous. There can also be several + # local minima. The author of emcee recommends using the KDEMove with a lot + # of walkers to try and properly sample a multimodal distribution. In + # addition, I've found that the autocorrelation time for the KDEMove is + # much better than the other moves. + sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove()) with np.errstate(invalid='ignore'): sampler.run_mcmc(pos, steps) @@ -463,47 +401,9 @@ def do_fit(data,muon,data_mc,bins,steps,print_nll=False): except Exception as e: print(e) - samples = sampler.chain.reshape((-1,len(x0))) + samples = sampler.get_chain(flat=True,thin=thin) - return xopt, samples - -# Energy bias of reconstruction relative to Monte Carlo. -# -# Note: You can recreate this array using: -# -# ./plot-fit-results -o [filename] -ENERGY_BIAS = np.array([ - (100.0, 0.050140, 0.078106), - (200.0, 0.058666, 0.078106), - (300.0, 0.049239, -0.000318), - (400.0, 0.045581, -0.020932), - (500.0, 0.050757, -0.028394), - (600.0, 0.048310, -0.029017), - (700.0, 0.052434, -0.020770), - (800.0, 0.032920, -0.019298), - (900.0, 0.040963, -0.015354)], - dtype=[('T',np.float32),('e_bias',np.float32),('u_bias',np.float32)]) - -def correct_energy_bias(df): - """ - Corrects for the energy bias of the reconstruction relative to the true - Monte Carlo energy.
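As a standalone illustration of the KDEMove rationale above (a toy, not part of the analysis): with a well-separated two-Gaussian mixture the default stretch move tends to leave walkers stranded in one mode, while the KDEMove mixes between them. The sampler construction mirrors the one used in do_fit():

    import numpy as np
    import emcee

    def log_prob(x):
        # mixture of two well separated unit Gaussians: a simple multimodal target
        return np.logaddexp(-0.5*np.sum((x - 5)**2), -0.5*np.sum((x + 5)**2))

    ndim, nwalkers = 1, 100
    pos = np.random.randn(nwalkers, ndim)
    sampler = emcee.EnsembleSampler(nwalkers, ndim, log_prob, moves=emcee.moves.KDEMove())
    sampler.run_mcmc(pos, 1000)
    samples = sampler.get_chain(flat=True, thin=10)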
- """ - # Note: We divide here since we define the bias as: - # - # bias = (T_reco - T_true)/T_true - # - # Therefore, - # - # T_true = T_reco/(1+bias) - df.loc[df['id1'] == 20,'energy1'] /= (1+np.interp(df.loc[df['id1'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias'])) - df.loc[df['id2'] == 20,'energy2'] /= (1+np.interp(df.loc[df['id2'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias'])) - df.loc[df['id3'] == 20,'energy3'] /= (1+np.interp(df.loc[df['id3'] == 20,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['e_bias'])) - df.loc[df['id1'] == 22,'energy1'] /= (1+np.interp(df.loc[df['id1'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias'])) - df.loc[df['id2'] == 22,'energy2'] /= (1+np.interp(df.loc[df['id2'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias'])) - df.loc[df['id3'] == 22,'energy3'] /= (1+np.interp(df.loc[df['id3'] == 22,'energy1'],ENERGY_BIAS['T'],ENERGY_BIAS['u_bias'])) - df['ke'] = df['energy1'].fillna(0) + df['energy2'].fillna(0) + df['energy3'].fillna(0) - return df + return sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()], samples if __name__ == '__main__': import argparse @@ -515,7 +415,6 @@ if __name__ == '__main__': from sddm.plot import * from sddm import setup_matplotlib import nlopt - import emcee parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") @@ -526,8 +425,11 @@ if __name__ == '__main__': parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain") parser.add_argument("--multinomial-prob-size", type=int, default=10000, help="number of p values to compute") parser.add_argument("--coverage", type=int, default=0, help="plot p value coverage") + parser.add_argument("--pull", type=int, default=0, help="plot pull plots") parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files") parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values") + parser.add_argument("--walkers", type=int, default=100, help="number of walkers") + parser.add_argument("--thin", type=int, default=10, help="number of steps to thin") args = parser.parse_args() setup_matplotlib(args.save) @@ -613,30 +515,17 @@ if __name__ == '__main__': p_values = {id: [] for id in (20,22,2020,2022,2222)} p_values_atm = {id: [] for id in (20,22,2020,2022,2222)} - ENERGY_SCALE_MEAN = {'e': 0.0, 'u': 0.0} - ENERGY_SCALE_UNCERTAINTY = {'e':1.0, 'u': 1.0} - ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0} - ENERGY_RESOLUTION_UNCERTAINTY = {'e':1.0, 'u': 1.0, 'eu': 1.0} - - scale = 0.01 - muon_scale = 0.01 - energy_resolution = 0.1 - - true_values = [scale,0.0,energy_resolution,0.0,energy_resolution,energy_resolution,muon_scale] - - assert(len(true_values) == len(FIT_PARS)) - - pull = [[] for i in range(len(FIT_PARS))] - # Set the random seed so we get reproducible results here np.random.seed(0) + xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES) + for i in range(args.coverage): # Calculate expected number of events - N = len(data_mc)*scale - N_atm = len(data_atm_mc)*scale - N_muon = len(muon)*muon_scale - N_muon_atm = len(muon_atm)*muon_scale + N = len(data_mc)*xtrue[0] + N_atm = len(data_atm_mc)*xtrue[0] + N_muon = len(muon)*xtrue[5] + N_muon_atm = len(muon_atm)*xtrue[5] # Calculate observed number of events n = np.random.poisson(N) @@ -649,15 +538,19 @@ if __name__ == '__main__': data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True,weights='weight'), 
muon_atm.sample(n=n_muon_atm,replace=True))) # Smear the energies by the additional energy resolution - data.ke += np.random.randn(len(data.ke))*data.ke*energy_resolution - data_atm.ke += np.random.randn(len(data_atm.ke))*data_atm.ke*energy_resolution + data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2]) + data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4]) + data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2]) + data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4]) + data['ke'] = data['energy1'].fillna(0) + data['energy2'].fillna(0) + data['energy3'].fillna(0) - xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll) + data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2]) + data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4]) + data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2]) + data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4]) + data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0) - for i in range(len(FIT_PARS)): - mean = np.mean(samples[:,i]) - std = np.std(samples[:,i]) - pull[i].append((mean - true_values[i])/std) + xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin) prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size) prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size) @@ -712,19 +605,62 @@ if __name__ == '__main__': else: plt.suptitle("P-value Coverage with Neutron Follower") - # Bins for the pull plots - bins = np.linspace(-10,10,101) - bincenters = (bins[1:] + bins[:-1])/2 + sys.exit(0) + + if args.pull: + pull = [[] for i in range(len(FIT_PARS))] + + # Set the random seed so we get reproducible results here + np.random.seed(0) + + for i in range(args.pull): + xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES) + + # Calculate expected number of events + N = len(data_mc)*xtrue[0] + N_atm = len(data_atm_mc)*xtrue[0] + N_muon = len(muon)*xtrue[5] + N_muon_atm = len(muon_atm)*xtrue[5] + + # Calculate observed number of events + n = np.random.poisson(N) + n_atm = np.random.poisson(N_atm) + n_muon = np.random.poisson(N_muon) + n_muon_atm = np.random.poisson(N_muon_atm) + + # Sample data from Monte Carlo + data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True))) + data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True))) + + # Smear the energies by the additional energy resolution + data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2]) + data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4]) + data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2]) + data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4]) + data['ke'] = data['energy1'].fillna(0) + 
data['energy2'].fillna(0) + data['energy3'].fillna(0) + + data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2]) + data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4]) + data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2]) + data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4]) + data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0) + + xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin) + + for i in range(len(FIT_PARS)): + # The "pull plots" we make here are actually produced via a + # procedure called "Simulation Based Calibration". + # + # See https://arxiv.org/abs/1804.06788. + pull[i].append(percentileofscore(samples[:,i],xtrue[i])) fig = plt.figure() axes = [] for i, name in enumerate(FIT_PARS): axes.append(plt.subplot(3,3,i+1)) - plt.hist(pull[i],bins=bins,histtype='step',normed=True) - plt.plot(bincenters,norm.pdf(bincenters)) + plt.hist(pull[i],bins=np.linspace(0,100,101),histtype='step',normed=True) plt.title(name) for ax in axes: - ax.set_xlim((-10,10)) despine(ax=ax,left=True,trim=True) ax.get_yaxis().set_visible(False) plt.tight_layout() @@ -737,47 +673,18 @@ if __name__ == '__main__': sys.exit(0) - xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll) + xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin) prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size) prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size) plt.figure() - plt.subplot(3,3,1) - plt.hist(samples[:,0],bins=100,histtype='step') - plt.xlabel("Atmospheric Flux Scale") - despine(ax=plt.gca(),left=True,trim=True) - plt.gca().get_yaxis().set_visible(False) - plt.subplot(3,3,2) - plt.hist(samples[:,1],bins=100,histtype='step') - plt.xlabel("Electron Energy Scale") - despine(ax=plt.gca(),left=True,trim=True) - plt.gca().get_yaxis().set_visible(False) - plt.subplot(3,3,3) - plt.hist(samples[:,2],bins=100,histtype='step') - plt.xlabel("Electron Energy Resolution") - despine(ax=plt.gca(),left=True,trim=True) - plt.gca().get_yaxis().set_visible(False) - plt.subplot(3,3,4) - plt.hist(samples[:,3],bins=100,histtype='step') - plt.xlabel("Muon Energy Scale") - despine(ax=plt.gca(),left=True,trim=True) - plt.gca().get_yaxis().set_visible(False) - plt.subplot(3,3,5) - plt.hist(samples[:,4],bins=100,histtype='step') - plt.xlabel("Muon Energy Resolution") - despine(ax=plt.gca(),left=True,trim=True) - plt.gca().get_yaxis().set_visible(False) - plt.subplot(3,3,6) - plt.hist(samples[:,5],bins=100,histtype='step') - plt.xlabel("Electron + Muon Energy Resolution") - despine(ax=plt.gca(),left=True,trim=True) - plt.gca().get_yaxis().set_visible(False) - plt.subplot(3,3,7) - plt.hist(samples[:,6],bins=100,histtype='step') - plt.xlabel("Muon Scale") - despine(ax=plt.gca(),left=True,trim=True) - plt.gca().get_yaxis().set_visible(False) + for i in range(len(FIT_PARS)): + plt.subplot(3,2,i+1) + plt.hist(samples[:,i],bins=100,histtype='step') + plt.xlabel(FIT_PARS[i].title()) + despine(ax=plt.gca(),left=True,trim=True) + plt.gca().get_yaxis().set_visible(False) plt.tight_layout() if args.save: @@ -792,9 +699,9 @@ if 
__name__ == '__main__': labels = ('Data','Monte Carlo','External Muons') fig = plt.figure() - hists = get_data_hists(data,xopt,bins) - hists_muon = get_data_hists(muon,xopt,bins,scale=xopt[6]) - hists_mc = get_mc_hists(data_mc,xopt,bins,apply_norm=True) + hists = get_data_hists(data,bins) + hists_muon = get_mc_hists(muon,xopt,bins,scale=xopt[5]) + hists_mc = get_mc_hists(data_mc,xopt,bins,scale=xopt[0]) plot_hist2(hists,bins=bins,color='C0') plot_hist2(hists_mc,bins=bins,color='C1') plot_hist2(hists_muon,bins=bins,color='C2') @@ -819,12 +726,12 @@ if __name__ == '__main__': else: plt.suptitle("Without Neutron Follower") fig = plt.figure() - hists = get_data_hists(data_atm,xopt,bins) - hists_muon = get_data_hists(muon_atm,xopt,bins,scale=xopt[6]) - hists_mc = get_mc_hists(data_atm_mc,xopt,bins,apply_norm=True) + hists = get_data_hists(data_atm,bins) + hists_muon = get_mc_hists(muon_atm,xopt,bins,scale=xopt[5]) + hists_mc = get_mc_hists(data_atm_mc,xopt,bins,scale=xopt[0]) plot_hist2(hists,bins=bins,color='C0') plot_hist2(hists_mc,bins=bins,color='C1') - plot_hist2(hists_muon,bins=bins,color='C1') + plot_hist2(hists_muon,bins=bins,color='C2') for id in (20,22,2020,2022,2222): if id == 20: plt.subplot(2,3,1) diff --git a/utils/dc b/utils/dc index ba363cc..9366b6e 100755 --- a/utils/dc +++ b/utils/dc @@ -19,7 +19,6 @@ import numpy as np from scipy.stats import iqr import nlopt from scipy.stats import poisson -import contextlib import sys from math import exp import emcee @@ -29,6 +28,7 @@ from matplotlib.lines import Line2D from sddm.plot import despine from sddm.dc import * from sddm.plot_energy import * +from sddm import printoptions try: from emcee import moves @@ -36,16 +36,6 @@ except ImportError: print("emcee version 2.2.1 is required",file=sys.stderr) sys.exit(1) -# from https://stackoverflow.com/questions/2891790/how-to-pretty-print-a-numpy-array-without-scientific-notation-and-with-given-pre -@contextlib.contextmanager -def printoptions(*args, **kwargs): - original = np.get_printoptions() - np.set_printoptions(*args, **kwargs) - try: - yield - finally: - np.set_printoptions(**original) - def radius_cut(ev): ev['radius_cut'] = np.digitize((ev.r/PSUP_RADIUS)**3,(0.9,)) return ev diff --git a/utils/plot-michels b/utils/plot-michels index b313c3e..9637e13 100755 --- a/utils/plot-michels +++ b/utils/plot-michels @@ -30,17 +30,7 @@ from sddm.stats import * import emcee from sddm.dc import estimate_errors, EPSILON import nlopt -import contextlib - -# from https://stackoverflow.com/questions/2891790/how-to-pretty-print-a-numpy-array-without-scientific-notation-and-with-given-pre -@contextlib.contextmanager -def printoptions(*args, **kwargs): - original = np.get_printoptions() - np.set_printoptions(*args, **kwargs) - try: - yield - finally: - np.set_printoptions(**original) +from sddm import printoptions particle_id = {20: 'e', 22: 'u'} @@ -100,6 +90,7 @@ if __name__ == '__main__': from sddm.plot_energy import * from sddm.plot import * from sddm import setup_matplotlib + from sddm.utils import correct_energy_bias parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") @@ -108,6 +99,7 @@ if __name__ == '__main__': parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)") parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values") parser.add_argument("--steps", type=int, default=1000, 
help="number of steps in the MCMC chain") + parser.add_argument("--walkers", type=int, default=100, help="number of walkers") args = parser.parse_args() setup_matplotlib(args.save) @@ -120,7 +112,9 @@ if __name__ == '__main__': for run, df in rhdr.groupby('run'): evs.append(get_events(df.filename.values, merge_fits=True, nhit_thresh=args.nhit_thresh)) ev = pd.concat(evs) + ev = correct_energy_bias(ev) ev_mc = get_events(args.mc, merge_fits=True) + ev_mc = correct_energy_bias(ev_mc) # Drop events without fits ev = ev[~np.isnan(ev.fmin)] @@ -195,17 +189,19 @@ if __name__ == '__main__': stepsizes = estimate_errors(nll,xopt,low,high) + stepsizes[1] = 0.1 + with printoptions(precision=3, suppress=True): print("Errors: ", stepsizes) - pos = np.empty((20, len(x0)),dtype=np.double) + pos = np.empty((args.walkers, len(x0)),dtype=np.double) for i in range(pos.shape[0]): pos[i] = xopt + np.random.randn(len(x0))*stepsizes pos[i,:] = np.clip(pos[i,:],low,high) nwalkers, ndim = pos.shape - sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x)) + sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove()) with np.errstate(invalid='ignore'): sampler.run_mcmc(pos, args.steps) diff --git a/utils/plot-muons b/utils/plot-muons index 25e7d24..27d3423 100755 --- a/utils/plot-muons +++ b/utils/plot-muons @@ -75,6 +75,7 @@ if __name__ == '__main__': from sddm.plot_energy import * from sddm.plot import * from sddm import setup_matplotlib + from sddm.utils import correct_energy_bias parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") @@ -93,7 +94,9 @@ if __name__ == '__main__': for run, df in rhdr.groupby('run'): evs.append(get_events(df.filename.values, merge_fits=True, nhit_thresh=args.nhit_thresh)) ev = pd.concat(evs) + ev = correct_energy_bias(ev) ev_mc = get_events(args.mc, merge_fits=True) + ev_mc = correct_energy_bias(ev_mc) # Drop events without fits ev = ev[~np.isnan(ev.fmin)] diff --git a/utils/sddm/__init__.py b/utils/sddm/__init__.py index f213c2b..8bed558 100644 --- a/utils/sddm/__init__.py +++ b/utils/sddm/__init__.py @@ -4,6 +4,8 @@ from itertools import izip_longest import os import h5py import pandas as pd +import numpy as np +import contextlib IDP_E_MINUS = 20 IDP_MU_MINUS = 22 @@ -183,3 +185,13 @@ def read_hdf(filename, df): with h5py.File(filename,'r') as f: data = f[df][:] return pd.DataFrame(data) + +# from https://stackoverflow.com/questions/2891790/how-to-pretty-print-a-numpy-array-without-scientific-notation-and-with-given-pre +@contextlib.contextmanager +def printoptions(*args, **kwargs): + original = np.get_printoptions() + np.set_printoptions(*args, **kwargs) + try: + yield + finally: + np.set_printoptions(**original) diff --git a/utils/sddm/dc.py b/utils/sddm/dc.py index 745220b..ee62656 100755 --- a/utils/sddm/dc.py +++ b/utils/sddm/dc.py @@ -40,6 +40,15 @@ DC_BREAKDOWN = 0x800 # nans in case an expected value is predicted to be zero. EPSILON = 1e-10 +def truncnorm_scaled(low,high,loc=0,scale=1,random_state=None): + """ + Just like scipy's truncnorm.rvs() except we auto convert the clip values. 
+ """ + low = np.atleast_1d(low) + high = np.atleast_1d(high) + a, b = (low - loc)/scale, (high - loc)/scale + return truncnorm.rvs(a,b,loc,scale,random_state=random_state) + def get_proposal_func(stepsizes, low, high): """ Returns a function which produces proposal steps for a Metropolis Hastings diff --git a/utils/sddm/plot.py b/utils/sddm/plot.py old mode 100755 new mode 100644 diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py old mode 100755 new mode 100644 -- cgit