diff options
author | tlatorre <tlatorre@uchicago.edu> | 2020-06-29 07:52:43 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2020-06-29 07:52:43 -0500 |
commit | 21420a805f214203819dd19ffc45fca9451a10bd (patch) | |
tree | 864b651123c9c0347b763fc501992a50f497dbe6 | |
parent | 83a96dec3aec3d794dcf9b2d566d0cfb6547a0f5 (diff) | |
download | sddm-21420a805f214203819dd19ffc45fca9451a10bd.tar.gz sddm-21420a805f214203819dd19ffc45fca9451a10bd.tar.bz2 sddm-21420a805f214203819dd19ffc45fca9451a10bd.zip |
add energy scale to chi2 fit
-rwxr-xr-x | utils/chi2 | 132 |
1 files changed, 102 insertions, 30 deletions
@@ -33,9 +33,14 @@ from scipy.stats import iqr, norm, beta from scipy.special import spence from itertools import izip_longest +# Uncertainty on the energy scale +# FIXME: Should get real number from stopping muons +ENERGY_SCALE_MEAN = 1.0 +ENERGY_SCALE_UNCERTAINTY = 0.1 + particle_id = {20: 'e', 22: r'\mu'} -def plot_hist2(df, muons=False, norm=1.0, color=None): +def plot_hist2(df, norm=1.0, scale=1.0, color=None): for id, df_id in sorted(df.groupby('id')): if id == 20: plt.subplot(2,3,1) @@ -48,14 +53,10 @@ def plot_hist2(df, muons=False, norm=1.0, color=None): elif id == 2222: plt.subplot(2,3,6) - if muons: - plt.hist(np.log10(df_id.ke.values/1000), bins=np.linspace(0,4.5,100), histtype='step') - plt.xlabel("log10(Energy (GeV))") - else: - bins = np.logspace(np.log10(20),np.log10(10e3),21) - plt.hist(df_id.ke.values, bins=bins, histtype='step', weights=np.tile(norm,len(df_id.ke.values)),color=color) - plt.gca().set_xscale("log") - plt.xlabel("Energy (MeV)") + bins = np.logspace(np.log10(20),np.log10(10e3),21) + plt.hist(df_id.ke.values*scale, bins=bins, histtype='step', weights=np.tile(norm,len(df_id.ke.values)),color=color) + plt.gca().set_xscale("log") + plt.xlabel("Energy (MeV)") plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$') if len(df): @@ -93,8 +94,19 @@ def plot_hist(df, muons=False): if len(df): plt.tight_layout() -def make_nll(data_hists, mc_hists): +def make_nll(data, mc_hists): def nll(x, grad=None): + if x[0] < 0 or x[1] < 0: + return np.inf + + data_hists = {} + for id in (20,22,2020,2022,2222): + df_id = data[data.id == id] + if len(df_id): + data_hists[id] = np.histogram(df_id.ke.values*x[1],bins=bins)[0] + else: + data_hists[id] = np.zeros(len(bins)-1,dtype=np.int) + nll = 0 for id in data_hists: N = data_hists[id].sum() @@ -105,24 +117,53 @@ def make_nll(data_hists, mc_hists): p += 1e-10 p /= p.sum() nll -= multinomial.logpmf(data_hists[id],N,p) - return nll + return nll - norm.logpdf(x[1],ENERGY_SCALE_MEAN,ENERGY_SCALE_UNCERTAINTY) return nll def chi2(samples,expected): return np.sum((samples-expected)**2/expected,axis=-1) -def get_multinomial_prob(data, mc, size=1000): - N = mc.sum() - # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think). - mc = mc + 1e-10 - p = mc/mc.sum() - chi2_data = chi2(data,mc) +def get_mc_hist(data,x,bins): + if len(data): + return np.histogram(data,bins=bins)[0]*x[0]/100.0 + else: + return np.zeros(len(bins)-1,dtype=np.int) + +def get_data_hist(data,x,bins): + if len(data): + return np.histogram(data*x[1],bins=bins)[0] + else: + return np.zeros(len(bins)-1,dtype=np.int) + +def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000): + """ + Returns the p-value that the histogram of the data is drawn from the MC + histogram. + + Arguments: + + data: 1D array of KE values + data_mc: 1D array of MC KE values + x_samples: MCMC samples of the floated parameters in the fit + bins: bins used to bin the mc histogram + size: number of values to compute + """ + chi2_data = [] chi2_samples = [] for i in range(size): + x = x_samples[np.random.randint(x_samples.shape[0])] + mc = get_mc_hist(data_mc,x,bins) + N = mc.sum() + # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think). + mc = mc + 1e-10 + p = mc/mc.sum() + data_hist = get_data_hist(data,x,bins) + chi2_data.append(chi2(data_hist,mc)) n = np.random.poisson(N) samples = multinomial.rvs(n,p) chi2_samples.append(chi2(samples,mc)) chi2_samples = np.array(chi2_samples) + chi2_data = np.array(chi2_data) return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples) if __name__ == '__main__': @@ -135,12 +176,14 @@ if __name__ == '__main__': from sddm.plot import * from sddm import setup_matplotlib import nlopt + import emcee parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds") parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files") parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)") + parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain") args = parser.parse_args() setup_matplotlib(args.save) @@ -210,7 +253,7 @@ if __name__ == '__main__': for id in (20,22,2020,2022,2222): df_id = mc[mc.id == id] if len(df_id): - mc_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0] + mc_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100 else: mc_hists[id] = np.zeros(len(bins)-1,dtype=np.int) @@ -226,34 +269,63 @@ if __name__ == '__main__': for id in (20,22,2020,2022,2222): df_id = atm_mc[atm_mc.id == id] if len(df_id): - mc_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0] + mc_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100 else: mc_atm_hists[id] = np.zeros(len(bins)-1,dtype=np.int) - nll = make_nll(data_hists,mc_hists) + nll = make_nll(data,mc_hists) - x0 = np.array([1.0]) + x0 = np.array([1.0,1.0]) opt = nlopt.opt(nlopt.LN_SBPLX, len(x0)) opt.set_min_objective(nll) - low = np.array([1e-10]) - high = np.array([10]) + low = np.array([1e-10,1e-10]) + high = np.array([10,10]) opt.set_lower_bounds(low) opt.set_upper_bounds(high) opt.set_ftol_abs(1e-10) - opt.set_initial_step([0.01]) + opt.set_initial_step([0.01,0.01]) xopt = opt.optimize(x0) + print("xopt = ", xopt) nll_xopt = nll(xopt) print("nll(xopt) = ", nll(xopt)) + pos = np.empty((10, len(x0)),dtype=np.double) + for i in range(pos.shape[0]): + pos[i] = xopt + np.random.randn(len(x0))*0.1 + pos[i,:] = np.clip(pos[i,:],1e-10,10) + + nwalkers, ndim = pos.shape + + sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x)) + with np.errstate(invalid='ignore'): + sampler.run_mcmc(pos, args.steps) + + print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction))) + + try: + print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True)) + except Exception as e: + print(e) + + samples = sampler.chain.reshape((-1,len(x0))) + + plt.figure() + plt.subplot(2,2,1) + plt.hist(samples[:,0],bins=100,histtype='step') + plt.xlabel("Atmospheric Flux Scale") + plt.subplot(2,2,2) + plt.hist(samples[:,1],bins=100,histtype='step') + plt.xlabel("Energy Scale") + prob = {} for id in (20,22,2020,2022,2222): - prob[id] = get_multinomial_prob(data_hists[id],mc_hists[id]*xopt[0]) + prob[id] = get_multinomial_prob(data[data.id == id].ke.values,mc[mc.id == id].ke.values,samples,bins) print(id, prob[id]) prob_atm = {} for id in (20,22,2020,2022,2222): - prob_atm[id] = get_multinomial_prob(data_atm_hists[id],mc_atm_hists[id]*xopt[0]) + prob_atm[id] = get_multinomial_prob(atm[atm.id == id].ke.values,atm_mc[atm_mc.id == id].ke.values,samples,bins) print(id, prob_atm[id]) handles = [Line2D([0], [0], color='C0'), @@ -261,8 +333,8 @@ if __name__ == '__main__': labels = ('Data','Monte Carlo') fig = plt.figure() - plot_hist2(prompt,color='C0') - plot_hist2(prompt_mc,norm=xopt[0],color='C1') + plot_hist2(prompt,scale=xopt[1],color='C0') + plot_hist2(prompt_mc,norm=xopt[0]/100,color='C1') for id in (20,22,2020,2022,2222): if id == 20: plt.subplot(2,3,1) @@ -284,8 +356,8 @@ if __name__ == '__main__': else: plt.suptitle("Without Neutron Follower") fig = plt.figure() - plot_hist2(atm,color='C0') - plot_hist2(atm_mc,norm=xopt[0],color='C1') + plot_hist2(atm,scale=xopt[1],color='C0') + plot_hist2(atm_mc,norm=xopt[0]/100,color='C1') for id in (20,22,2020,2022,2222): if id == 20: plt.subplot(2,3,1) |