From 031c70a265f31fcc73379f5153570cd926138462 Mon Sep 17 00:00:00 2001
From: tlatorre
Date: Mon, 31 Aug 2020 10:43:27 -0500
Subject: add GENIE weights to chi2 analysis

---
 utils/chi2       | 66 +++++++++++++++++++++++++++++++++++++++++---------------
 utils/sddm/dc.py |  7 +++---
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/utils/chi2 b/utils/chi2
index b1c3c17..0ae7cb8 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -84,7 +84,7 @@ def plot_hist2(hists, bins, color=None):
     if len(df):
         plt.tight_layout()
 
-def get_mc_hists(data,x,bins,apply_norm=False):
+def get_mc_hists(data,x,bins,apply_norm=False,reweight=False):
     """
     Returns the expected Monte Carlo histograms for the atmospheric neutrino
     background.
@@ -113,9 +113,17 @@ def get_mc_hists(data,x,bins,apply_norm=False):
     ke_dict = {}
     for id in (20,22,2020,2022,2222):
         ke_dict[id] = data[data.id == id].ke.values
-    return get_mc_hists_fast(ke_dict,x,bins,apply_norm)
 
-def get_mc_hists_fast(ke_dict,x,bins,apply_norm=False):
+    if reweight:
+        ke_weights_dict = {}
+        for id in (20,22,2020,2022,2222):
+            ke_weights_dict[id] = data[data.id == id].weight.values
+    else:
+        ke_weights_dict = None
+
+    return get_mc_hists_fast(ke_dict,x,bins,apply_norm,weights_dict=ke_weights_dict)
+
+def get_mc_hists_fast(ke_dict,x,bins,apply_norm=False,weights_dict=None):
     """
     Same as get_mc_hists() but the first argument is a dictionary mapping
     particle id -> kinetic energy array. This is much faster than selecting the
@@ -140,7 +148,10 @@
             ke = ke*x[5]
         scale = bincenters2*max(EPSILON,x[6])
 
-        hist = np.histogram(ke,bins=bins2)[0]
+        if weights_dict:
+            hist = np.histogram(ke,bins=bins2,weights=weights_dict[id])[0]
+        else:
+            hist = np.histogram(ke,bins=bins2)[0]
         cdf = norm.cdf(bins[:,np.newaxis],bincenters2,scale)*hist
         mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1)
 
@@ -252,7 +263,7 @@ def make_nll(data, muons, mc, bins):
         return nll
     return nll
 
-def get_mc_hists_posterior(ke_dict,muon_hists,data_hists,x,bins):
+def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
     """
     Returns the posterior on the Monte Carlo histograms.
 
@@ -276,7 +287,7 @@
 
     Returns a dictionary mapping particle id combo -> histogram.
     """
-    mc_hists = get_mc_hists_fast(ke_dict,x,bins)
+    mc_hists = get_mc_hists(data_mc,x,bins,reweight=True)
     for id in (20,22,2020,2022,2222):
         mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0])
     # FIXME: does the ordering of when we add the muons matter here?
@@ -306,14 +317,20 @@
     data_hists = get_data_hists(data,bins)
     muon_hists = get_data_hists(data_muon,bins)
 
-    ke_dict = {}
-    for _id in (20,22,2020,2022,2222):
-        ke_dict[_id] = data_mc[data_mc.id == _id].ke.values
+    # Get the total number of "universes" simulated in the GENIE reweight tool
+    if len(data_mc):
+        nuniverses = data_mc['universe'].max()+1
+    else:
+        nuniverses = 0
 
     ps = []
     for i in range(size):
         x = x_samples[np.random.randint(x_samples.shape[0])]
-        mc = get_mc_hists_posterior(ke_dict,muon_hists,data_hists,x,bins)[id]
+        if nuniverses > 0:
+            universe = np.random.randint(nuniverses)
+        else:
+            universe = 0
+        mc = get_mc_hists_posterior(data_mc[data_mc.universe == universe],muon_hists,data_hists,x,bins)[id]
         N = mc.sum()
         # Fix a bug in scipy. See https://github.com/scipy/scipy/issues/8235 (I think).
         mc = mc + 1e-10
@@ -376,7 +393,7 @@ def do_fit(data,muon,data_mc,bins,steps):
     nll_xopt = nll(xopt)
     print("nll(xopt) = ", nll(xopt))
 
-    stepsizes = estimate_errors(nll,xopt,low,high,constraints)
+    stepsizes = estimate_errors(nll,xopt,low,high)
     with printoptions(precision=3, suppress=True):
         print("Errors: ", stepsizes)
 
@@ -424,6 +441,7 @@ if __name__ == '__main__':
     parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain")
     parser.add_argument("--multinomial-prob-size", type=int, default=10000, help="number of p values to compute")
    parser.add_argument("--coverage", type=int, default=0, help="plot p value coverage")
+    parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files")
     args = parser.parse_args()
 
     setup_matplotlib(args.save)
@@ -439,6 +457,7 @@
 
     ev_mc = get_events(args.mc, merge_fits=True, nhit_thresh=args.nhit_thresh)
     muon_mc = get_events(args.muon_mc, merge_fits=True, nhit_thresh=args.nhit_thresh)
+    weights = pd.concat([read_hdf(filename, "weights") for filename in args.weights],ignore_index=True)
 
     # Set all prompt events in the MC to be muons
     muon_mc.loc[muon_mc.prompt & muon_mc.filename.str.contains("cosmic"),'muon'] = True
@@ -484,6 +503,19 @@
     data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
     data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]
 
+    # Merge the reweight scales only after we've done all the data cleaning and
+    # high level cuts
+    data_mc_with_weights = pd.merge(data_mc,weights,how='left',on=['run','evn'])
+    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights,how='left',on=['run','evn'])
+
+    # Make sure we are only using data with weights
+    #
+    # FIXME: There is probably a better way to do this. For example, we could
+    # use the full dataset for the fit and then scale the normalization factor
+    # by the ratio of events with weights to the total number of events.
+    data_mc = data_mc_with_weights[data_mc_with_weights.universe == 0]
+    data_atm_mc = data_atm_mc_with_weights[data_atm_mc_with_weights.universe == 0]
+
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
 
     if args.coverage:
@@ -519,8 +551,8 @@
             n_muon_atm = np.random.poisson(N_muon_atm)
 
             # Sample data from Monte Carlo
-            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True)))
-            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+            data = pd.concat((data_mc.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
 
             # Smear the energies by the additional energy resolution
             data.ke += np.random.randn(len(data.ke))*data.ke*energy_resolution
@@ -533,8 +565,8 @@
                 std = np.std(samples[:,i])
                 pull[i].append((mean - true_values[i])/std)
 
-            prob = get_prob(data,muon,data_mc,samples,bins,size=args.multinomial_prob_size)
-            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,samples,bins,size=args.multinomial_prob_size)
+            prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
 
             for id in (20,22,2020,2022,2222):
                 p_values[id].append(prob[id])
@@ -613,8 +645,8 @@
 
     xopt, samples = do_fit(data,muon,data_mc,bins,args.steps)
 
-    prob = get_prob(data,muon,data_mc,samples,bins,size=args.multinomial_prob_size)
-    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,samples,bins,size=args.multinomial_prob_size)
+    prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
 
     plt.figure()
     plt.subplot(3,3,1)
diff --git a/utils/sddm/dc.py b/utils/sddm/dc.py
index 1146213..461769e 100755
--- a/utils/sddm/dc.py
+++ b/utils/sddm/dc.py
@@ -93,7 +93,7 @@ def get_proposal_func(stepsizes, low, high):
         return x, log_p_x0_given_x - log_p_x_given_x0
     return proposal
 
-def estimate_errors(nll, xopt, low, high, constraints):
+def estimate_errors(nll, xopt, low, high, constraints=None):
     """
     Return approximate 1 sigma errors for each parameter assuming all
     parameters are uncorrelated.
@@ -115,11 +115,12 @@ def estimate_errors(nll, xopt, low, high, constraints):
     errors = np.empty_like(xopt)
 
     for i in range(len(xopt)):
-        if np.sign(root(low[i],xopt,i)) == np.sign(nll_xopt):
+        if np.sign(root(low[i],xopt,i)) == np.sign(root(xopt[i],xopt,i)):
             xlo = xopt[i] - low[i]
         else:
             xlo = brentq(root,low[i],xopt[i],args=(xopt,i))
-        if np.sign(root(high[i],xopt,i)) == np.sign(nll_xopt):
+
+        if np.sign(root(high[i],xopt,i)) == np.sign(root(xopt[i],xopt,i)):
             xhi = high[i] - xopt[i]
         else:
             xhi = brentq(root,xopt[i],high[i],args=(xopt,i))
-- 
cgit
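
A note on the weight merge: with how='left' on ['run','evn'], every MC event is
kept and gains one row per GENIE universe, while events with no matching weight
come through with NaN in the weight columns. The universe == 0 cut therefore
also drops the unmatched events, which is exactly what the FIXME above is
flagging. A toy illustration with made-up values (column names follow the
script):

    import pandas as pd

    data_mc = pd.DataFrame({'run': [100, 100, 101],
                            'evn': [1, 2, 1],
                            'ke':  [50.0, 80.0, 120.0]})
    # Two universes of weights, for run 100 event 1 only
    weights = pd.DataFrame({'run': [100, 100],
                            'evn': [1, 1],
                            'universe': [0, 1],
                            'weight': [0.9, 1.1]})

    merged = pd.merge(data_mc, weights, how='left', on=['run', 'evn'])
    # Unmatched events have universe == NaN, so this keeps only the nominal
    # universe of the events that actually have weights
    nominal = merged[merged.universe == 0]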
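
On the reweighting itself: get_mc_hists_fast() only threads the per-event
weight array into np.histogram() before the usual energy-resolution smearing.
A minimal standalone sketch of that fill-then-smear step; smeared_hist,
fine_bins, and res_scale are stand-ins for the script's internal bins2 binning
and the x[6] resolution parameter:

    import numpy as np
    from scipy.stats import norm

    def smeared_hist(ke, bins, fine_bins, res_scale, weights=None):
        # Fill a fine-grained histogram of the kinetic energies; `weights`
        # are the per-event GENIE reweight factors when provided
        bincenters = (fine_bins[1:] + fine_bins[:-1])/2
        hist = np.histogram(ke, bins=fine_bins, weights=weights)[0]
        # Smear each fine bin with a Gaussian whose width grows linearly
        # with energy, then integrate the result over the output bins
        scale = bincenters*res_scale
        cdf = norm.cdf(bins[:, np.newaxis], bincenters, scale)*hist
        return np.sum(cdf[1:] - cdf[:-1], axis=-1)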
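
The p value loop in get_multinomial_prob() then propagates the GENIE
cross-section systematics by drawing one universe uniformly at random on every
iteration and rebuilding the posterior MC histograms from just that slice. A
sketch of the draw (draw_universe is a hypothetical helper; it assumes
universes are numbered consecutively from 0, which is what the max()+1 count
implies):

    import numpy as np

    def draw_universe(data_mc):
        # Total number of "universes" simulated by the GENIE reweight tool
        nuniverses = int(data_mc['universe'].max()) + 1 if len(data_mc) else 0
        # Fall back to the nominal universe when there are no weights at all
        universe = np.random.randint(nuniverses) if nuniverses > 0 else 0
        return data_mc[data_mc.universe == universe]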
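
Finally, the dc.py fix: root() (defined just above that hunk) is presumably
nll with parameter i set to x, shifted down by nll(xopt) + 0.5, so it is
always negative (-0.5) at the optimum. Comparing the boundary sign against
np.sign(nll_xopt) was therefore wrong whenever the NLL at the optimum happened
to be positive: the signs would differ even with no crossing, and brentq()
would be handed an interval without a sign change. A self-contained sketch of
the corrected search under that assumption (one_sigma_interval is
hypothetical, and unlike the fragment above it returns distances from the
optimum on both branches):

    import numpy as np
    from scipy.optimize import brentq

    def one_sigma_interval(nll, xopt, i, low, high):
        # Approximate 1 sigma interval for parameter i with all other
        # parameters held fixed at the optimum (i.e. assumed uncorrelated)
        nll_xopt = nll(xopt)

        def root(x, xopt, i):
            # Zero where the NLL has risen by 1/2 above its minimum
            xtest = xopt.copy()
            xtest[i] = x
            return nll(xtest) - nll_xopt - 0.5

        # root() is -0.5 at the optimum, so a matching sign at the boundary
        # means the 1/2 rise never happens inside the allowed range; fall
        # back to the distance to the boundary in that case
        if np.sign(root(low, xopt, i)) == np.sign(root(xopt[i], xopt, i)):
            xlo = xopt[i] - low
        else:
            xlo = xopt[i] - brentq(root, low, xopt[i], args=(xopt, i))
        if np.sign(root(high, xopt, i)) == np.sign(root(xopt[i], xopt, i)):
            xhi = high - xopt[i]
        else:
            xhi = brentq(root, xopt[i], high, args=(xopt, i)) - xopt[i]
        return xlo, xhi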