author    tlatorre <tlatorre@uchicago.edu>    2020-10-09 11:29:39 -0500
committer tlatorre <tlatorre@uchicago.edu>    2020-10-09 11:29:39 -0500
commit    22f1ead8e6ab663978cadd42819376130037a99c (patch)
tree      3985ef82ef83b447bb2a8a72a4b9779988e07dd7 /utils/chi2
parent    d48484c29d09a944de6f9251a3c659e76279464e (diff)
only merge weights after selecting universe
This commit updates the chi2 script to only merge the weights dataframe with the Monte Carlo dataframe after we have selected the universe for the weights. This *greatly* reduces the memory usage.
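A minimal sketch of the memory difference, using toy dataframes with made-up sizes (the real data_mc and weights frames are built elsewhere in the script; only the merge pattern mirrors the diff): the weights table holds one row per event per universe, so left-merging the full table multiplies the Monte Carlo dataframe by the number of universes, while filtering to a single universe first keeps one row per event.

import numpy as np
import pandas as pd

# Made-up sizes for illustration only.
n_events, nuniverses = 1000, 100

# One row per Monte Carlo event.
data_mc = pd.DataFrame({'run': 1,
                        'evn': np.arange(n_events),
                        'energy1': 100.0})

# One row per event per GENIE reweight "universe".
weights = pd.DataFrame({'run': 1,
                        'evn': np.tile(np.arange(n_events), nuniverses),
                        'universe': np.repeat(np.arange(nuniverses), n_events),
                        'weight': 1.0})

# Before: merging the full weights table multiplies the MC dataframe by the
# number of universes.
merged_all = pd.merge(data_mc, weights, how='left', on=['run','evn'])

# After: restrict the weights to a single universe first, so the merged
# dataframe keeps one row per event. Events with no weight entry default to
# 1.0, matching the fillna(1.0) in the diff.
universe = np.random.randint(nuniverses)
merged_one = pd.merge(data_mc, weights[weights.universe == universe],
                      how='left', on=['run','evn'])
merged_one['weight'] = merged_one['weight'].fillna(1.0)

print(len(merged_all), len(merged_one))  # 100000 vs 1000

With these sizes the early merge materializes 100,000 rows, while the late merge materializes only 1,000 per universe draw.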
Diffstat (limited to 'utils/chi2')
-rwxr-xr-x  utils/chi2  43
1 file changed, 18 insertions, 25 deletions
diff --git a/utils/chi2 b/utils/chi2
index be6b3e6..9835ba3 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -273,7 +273,7 @@ def get_mc_hists_posterior(data_mc,muon_hists,data_hists,x,bins):
         mc_hists[id] += muon_hists[id]*x[5]
     return mc_hists
 
-def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percentile=50.0, size=10000):
+def get_multinomial_prob(data, data_muon, data_mc, weights, id, x_samples, bins, percentile=50.0, size=10000):
     """
     Returns the p-value that the histogram of the data is drawn from the MC
     histogram.
@@ -301,7 +301,7 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
     # Get the total number of "universes" simulated in the GENIE reweight tool
     if len(data_mc):
-        nuniverses = data_mc['universe'].max()+1
+        nuniverses = weights['universe'].max()+1
     else:
         nuniverses = 0
@@ -313,9 +313,12 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
         if nuniverses > 0:
             universe = np.random.randint(nuniverses)
+            data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+            data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
         else:
-            universe = 0
-        mc = get_mc_hists_posterior(data_mc[data_mc.universe == universe],muon_hists,data_hists,x,bins)[id]
+            data_mc_with_weights = data_mc.copy()
+            data_mc_with_weights['weight'] = 1.0
+        mc = get_mc_hists_posterior(data_mc_with_weights,muon_hists,data_hists,x,bins)[id]
         N = mc.sum()
         # Fix a bug in scipy. See https://github.com/scipy/scipy/issues/8235 (I think).
         mc = mc + 1e-10
@@ -337,10 +340,10 @@ def get_multinomial_prob(data, data_muon, data_mc, id, x_samples, bins, percenti
         ps.append(np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples))
     return np.percentile(ps,percentile)
 
-def get_prob(data,muon,mc,samples,bins,size):
+def get_prob(data,muon,mc,weights,samples,bins,size):
     prob = {}
     for id in (20,22,2020,2022,2222):
-        prob[id] = get_multinomial_prob(data,muon,mc,id,samples,bins,size=size)
+        prob[id] = get_multinomial_prob(data,muon,mc,weights,id,samples,bins,size=size)
         print(id, prob[id])
     return prob
@@ -496,19 +499,6 @@ if __name__ == '__main__':
     data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
     data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]
 
-    # Merge the reweight scales only after we've done all the data cleaning and
-    # high level cuts
-    data_mc_with_weights = pd.merge(data_mc,weights,how='left',on=['run','evn'])
-    data_atm_mc_with_weights = pd.merge(data_atm_mc,weights,how='left',on=['run','evn'])
-
-    # Make sure we are only using data with weights
-    #
-    # FIXME: There is probably a better way to do this. For example, we could
-    # use the full dataset for the fit and then scale the normalization factor
-    # by the ratio of events with weights to the total number of events.
-    data_mc = data_mc_with_weights[data_mc_with_weights.universe == 0]
-    data_atm_mc = data_atm_mc_with_weights[data_atm_mc_with_weights.universe == 0]
-
     bins = np.logspace(np.log10(20),np.log10(10e3),21)
 
     if args.coverage:
@@ -520,6 +510,9 @@ if __name__ == '__main__':
         xtrue = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
 
+        data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+        data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','evn'])
+
         for i in range(args.coverage):
             # Calculate expected number of events
             N = len(data_mc)*xtrue[0]
@@ -534,8 +527,8 @@ if __name__ == '__main__':
             n_muon_atm = np.random.poisson(N_muon_atm)
 
             # Sample data from Monte Carlo
-            data = pd.concat((data_mc.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
-            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
+            data = pd.concat((data_mc_with_weights.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc_with_weights.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
 
             # Smear the energies by the additional energy resolution
             data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
@@ -552,8 +545,8 @@ if __name__ == '__main__':
            xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
-            prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
-            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+            prob = get_prob(data,muon,data_mc,weights,samples,bins,size=args.multinomial_prob_size)
+            prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,samples,bins,size=args.multinomial_prob_size)
 
             for id in (20,22,2020,2022,2222):
                 p_values[id].append(prob[id])
@@ -675,8 +668,8 @@ if __name__ == '__main__':
     xopt, samples = do_fit(data,muon,data_mc,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
-    prob = get_prob(data,muon,data_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
-    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc_with_weights,samples,bins,size=args.multinomial_prob_size)
+    prob = get_prob(data,muon,data_mc,weights,samples,bins,size=args.multinomial_prob_size)
+    prob_atm = get_prob(data_atm,muon_atm,data_atm_mc,weights,samples,bins,size=args.multinomial_prob_size)
 
     plt.figure()
     for i in range(len(FIT_PARS)):