diff options
author | tlatorre <tlatorre@uchicago.edu> | 2020-10-09 11:29:39 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2020-10-09 11:29:39 -0500 |
commit | 22f1ead8e6ab663978cadd42819376130037a99c (patch) | |
tree | 3985ef82ef83b447bb2a8a72a4b9779988e07dd7 | |
parent | d48484c29d09a944de6f9251a3c659e76279464e (diff) | |
download | sddm-22f1ead8e6ab663978cadd42819376130037a99c.tar.gz sddm-22f1ead8e6ab663978cadd42819376130037a99c.tar.bz2 sddm-22f1ead8e6ab663978cadd42819376130037a99c.zip |
only merge weights after selecting universe
This commit updates the chi2 script to only merge the weights dataframe
with the Monte Carlo dataframe after we have selected the universe for
the weights. This *greatly* reduces the memory usage.
-rwxr-xr-x | utils/chi2 | 43 |
1 files changed, 18 insertions, 25 deletions
@@ -273,7 +273,7 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
         mc_hists[id] += muon_hists[id]*x[5]
     return mc_hists
 
-def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percentile=50.0, size=10000):
+def get_multinomial_prob(data, data_muon, data_mc, weights, id, x_samples, bins, percentile=50.0, size=10000):
     """
     Returns the p-value that the histogram of the data is drawn from the MC
     histogram.
@@ -301,7 +301,7 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
 
     # Get the total number of "universes" simulated in the GENIE reweight tool
     if len(data_mc):
-        nuniverses = data_mc['universe'].max()+1
+        nuniverses = weights['universe'].max()+1
     else:
         nuniverses = 0
@@ -313,9 +313,12 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
         if nuniverses > 0:
             universe = np.random.randint(nuniverses)
+            data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+            data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
         else:
-            universe = 0
-        mc = get_mc_hists_posterior(data_mc[data_mc.universe == universe],muon_hists,data_hists,x,bins)[id]
+            data_mc_with_weights = data_mc.copy()
+            data_mc['weight'] = 1.0
+        mc = get_mc_hists_posterior(data_mc_with_weights,muon_hists,data_hists,x,bins)[id]
         N = mc.sum()
         # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).
         mc = mc + 1e-10
@@ -337,10 +340,10 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
         ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples))
     return np.percentile(ps,percentile)
 
-def get_prob(data,muon,mc,samples,bins,size):
+def get_prob(data,muon,mc,weights,samples,bins,size):
     prob = {}
     for id in (20,22,2020,2022,2222):
-        prob[id] = get_multinomial_prob(data,muon,mc,id,samples,bins,size=size)
+        prob[id] = get_multinomial_prob(data,muon,mc,weights,id,samples,bins,size=size)
         print(id, prob[id])
     return prob
@@ -496,19 +499,6 @@ if __name__ == '__main__':
     data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
     data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]
 
-    # Merge the reweight scales only after we've done all the data cleaning and
-    # high level cuts
-    data_mc_with_weights = pd.merge(data_mc,weights,how='left',on=['run','evn'])
-    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights,how='left',on=['run','evn'])
-
-    # Make sure we are only using data with weights
-    #
-    # FIXME: There is probably a better way to do this. For example, we could
-    # use the full dataset for the fit and then scale the normalization factor
-    # by the ratio of events with weights to the total number of events.
-    data_mc = data_mc_with_weights[data_mc_with_weights.universe == 0]
-    data_atm_mc = data_atm_mc_with_weights[data_atm_mc_with_weights.universe == 0]
-
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
 
     if args.coverage:
@@ -520,6 +510,9 @@ if __name__ == '__main__':
 
         xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
 
+        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+        data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+
         for i in range(args.coverage):
             # Calculate expected number of events
             N = len(data_mc)*xtrue[0]
@@ -534,8 +527,8 @@ if __name__ == '__main__':
             n_muon_atm = np.random.poisson(N_muon_atm)
 
             # Sample data from Monte Carlo
-            data = pd.concat((data_mc.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
-            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
+            data = pd.concat((data_mc_with_weights.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc_with_weights.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
 
             # Smear the energies by the additional energy resolution
             data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
@@ -552,8 +545,8 @@ if __name__ == '__main__':
 
             xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
-            prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
-            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+            prob = get_prob(data,muon,data_mc,weights,samples,bins,size=args.multinomial_prob_size)
+            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,samples,bins,size=args.multinomial_prob_size)
 
             for id in (20,22,2020,2022,2222):
                 p_values[id].append(prob[id])
@@ -675,8 +668,8 @@ if __name__ == '__main__':
         xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
-        prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
-        prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+        prob = get_prob(data,muon,data_mc,weights,samples,bins,size=args.multinomial_prob_size)
+        prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,samples,bins,size=args.multinomial_prob_size)
 
         plt.figure()
         for i in range(len(FIT_PARS)):