From 22f1ead8e6ab663978cadd42819376130037a99c Mon Sep 17 00:00:00 2001
From: tlatorre
Date: Fri, 9 Oct 2020 11:29:39 -0500
Subject: only merge weights after selecting universe

This commit updates the chi2 script to only merge the weights dataframe
with the Monte Carlo dataframe after we have selected the universe for the
weights. This *greatly* reduces the memory usage.
---
 utils/chi2 | 43 ++++++++++++++++++-------------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

(limited to 'utils')

diff --git a/utils/chi2 b/utils/chi2
index be6b3e6..9835ba3 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -273,7 +273,7 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
         mc_hists[id] += muon_hists[id]*x[5]
     return mc_hists
 
-def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percentile=50.0, size=10000):
+def get_multinomial_prob(data, data_muon, data_mc, weights, id, x_samples, bins, percentile=50.0, size=10000):
     """
     Returns the p-value that the histogram of the data is drawn from the MC
     histogram.
@@ -301,7 +301,7 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
 
     # Get the total number of "universes" simulated in the GENIE reweight tool
    if len(data_mc):
-        nuniverses = data_mc['universe'].max()+1
+        nuniverses = weights['universe'].max()+1
     else:
         nuniverses = 0
 
@@ -313,9 +313,12 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
 
         if nuniverses > 0:
             universe = np.random.randint(nuniverses)
+            data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+            data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
         else:
-            universe = 0
-        mc = get_mc_hists_posterior(data_mc[data_mc.universe == universe],muon_hists,data_hists,x,bins)[id]
+            data_mc_with_weights = data_mc.copy()
+            data_mc['weight'] = 1.0
+        mc = get_mc_hists_posterior(data_mc_with_weights,muon_hists,data_hists,x,bins)[id]
         N = mc.sum()
         # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
         mc = mc + 1e-10
@@ -337,10 +340,10 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
         ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples))
     return np.percentile(ps,percentile)
 
-def get_prob(data,muon,mc,samples,bins,size):
+def get_prob(data,muon,mc,weights,samples,bins,size):
     prob = {}
     for id in (20,22,2020,2022,2222):
-        prob[id] = get_multinomial_prob(data,muon,mc,id,samples,bins,size=size)
+        prob[id] = get_multinomial_prob(data,muon,mc,weights,id,samples,bins,size=size)
         print(id, prob[id])
     return prob
 
@@ -496,19 +499,6 @@ if __name__ == '__main__':
     data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
     data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]
 
-    # Merge the reweight scales only after we've done all the data cleaning and
-    # high level cuts
-    data_mc_with_weights = pd.merge(data_mc,weights,how='left',on=['run','evn'])
-    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights,how='left',on=['run','evn'])
-
-    # Make sure we are only using data with weights
-    #
-    # FIXME: There is probably a better way to do this. For example, we could
-    # use the full dataset for the fit and then scale the normalization factor
-    # by the ratio of events with weights to the total number of events.
-    data_mc = data_mc_with_weights[data_mc_with_weights.universe == 0]
-    data_atm_mc = data_atm_mc_with_weights[data_atm_mc_with_weights.universe == 0]
-
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
 
     if args.coverage:
@@ -520,6 +510,9 @@ if __name__ == '__main__':
 
         xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
 
+        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+        data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+
         for i in range(args.coverage):
             # Calculate expected number of events
             N = len(data_mc)*xtrue[0]
@@ -534,8 +527,8 @@ if __name__ == '__main__':
             n_muon_atm = np.random.poisson(N_muon_atm)
 
             # Sample data from Monte Carlo
-            data = pd.concat((data_mc.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
-            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
+            data = pd.concat((data_mc_with_weights.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc_with_weights.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
 
             # Smear the energies by the additional energy resolution
             data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
@@ -552,8 +545,8 @@ if __name__ == '__main__':
 
             xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
-            prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
-            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+            prob = get_prob(data,muon,data_mc,weights,samples,bins,size=args.multinomial_prob_size)
+            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,samples,bins,size=args.multinomial_prob_size)
 
             for id in (20,22,2020,2022,2222):
                 p_values[id].append(prob[id])
@@ -675,8 +668,8 @@ if __name__ == '__main__':
 
     xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
-    prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
-    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+    prob = get_prob(data,muon,data_mc,weights,samples,bins,size=args.multinomial_prob_size)
+    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,samples,bins,size=args.multinomial_prob_size)
 
     plt.figure()
     for i in range(len(FIT_PARS)):
--
cgit
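
For reference, here is a minimal standalone sketch of the merge-per-universe pattern the patch adopts. It is not part of the commit: the toy dataframes and values below are invented, and only the column names (run, evn, universe, weight) and the pandas calls mirror the chi2 script. The point is that merging the weights for a single, randomly chosen universe keeps the merged table the same length as the Monte Carlo table, instead of roughly nuniverses times longer when every universe is merged up front.

import numpy as np
import pandas as pd

# Toy Monte Carlo events (hypothetical values, for illustration only).
data_mc = pd.DataFrame({'run':    [1, 1, 2],
                        'evn':    [10, 11, 5],
                        'energy': [50.0, 120.0, 300.0]})

# Toy GENIE reweight factors: one row per (event, universe) pair.
weights = pd.DataFrame({'run':      [1,   1,   1,   1],
                        'evn':      [10,  10,  11,  11],
                        'universe': [0,   1,   0,   1],
                        'weight':   [0.9, 1.2, 1.1, 0.8]})

# Total number of "universes" simulated by the reweight tool.
nuniverses = weights['universe'].max() + 1 if len(weights) else 0

if nuniverses > 0:
    # Pick one universe at random and merge only that universe's weights.
    # The merged dataframe has the same number of rows as data_mc, rather
    # than len(data_mc)*nuniverses rows as with an up-front merge.
    universe = np.random.randint(nuniverses)
    data_mc_with_weights = pd.merge(data_mc,
                                    weights[weights.universe == universe],
                                    how='left', on=['run', 'evn'])
    # Events with no weight entry in this universe fall back to weight 1.
    data_mc_with_weights['weight'] = data_mc_with_weights['weight'].fillna(1.0)
else:
    # No reweight information at all: copy the events and use unit weights.
    data_mc_with_weights = data_mc.copy()
    data_mc_with_weights['weight'] = 1.0

print(data_mc_with_weights)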