author    tlatorre <tlatorre@uchicago.edu>  2020-12-08 12:06:38 -0600
committer tlatorre <tlatorre@uchicago.edu>  2020-12-08 12:06:38 -0600
commit    19cb0cc7b3e37eb9289c12772debfb65ef669b39
tree      4009b0644b4f278e02696b7fb3f16e431fc78262 /utils
parent    a22284c6fd6678990cb9d13a08d447d46fc9b8a8
update dm-search and chi2 to refit with GENIE systematics
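
The refit works in two stages: the negative log likelihood is first
evaluated at the best-fit point once per GENIE "universe", with the
Monte Carlo reweighted by that universe's per-event weights, and the
full fit (MCMC followed by SBPLX) is then re-run against the Monte
Carlo weighted by whichever universe minimizes it. The pull-plot code
at the bottom of both scripts passes refit=False to skip the second
stage.

The reweighting itself is a left join of one universe's GENIE weights
onto the Monte Carlo, with events missing from the weights table
falling back to a weight of 1. A minimal sketch of that step (the
helper name reweight_mc is illustrative; the scripts inline it, as the
diff below shows):

    import pandas as pd

    def reweight_mc(data_mc, weights, universe):
        """Attach the GENIE weights for a single universe to the Monte
        Carlo. Events with no entry in the weights table keep weight 1.0."""
        w = weights[weights.universe == universe]
        df = pd.merge(data_mc, w, how='left', on=['run', 'evn'])
        df.weight = df.weight.fillna(1.0)
        return df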
Diffstat (limited to 'utils')
-rwxr-xr-x  utils/chi2       52
-rwxr-xr-x  utils/dm-search  85
2 files changed, 128 insertions(+), 9 deletions(-)
diff --git a/utils/chi2 b/utils/chi2
index 18799ff..d493741 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -338,7 +338,7 @@ def get_prob(data,muon,mc,atmo_scale_factor,muon_scale_factor,samples,bins,size)
print(id, prob[id])
return prob
-def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10):
+def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10,refit=True):
"""
Run the fit and return the minimum along with samples from running an MCMC
starting near the minimum.
@@ -410,6 +410,54 @@ def do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,st
universe = np.argmin(nlls)
+ if refit:
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
+
+ # Create a new negative log likelihood function with the weighted Monte Carlo.
+ nll = make_nll(data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll)
+
+ # Now, we refit with the Monte Carlo weighted by the most likely GENIE
+ # systematics.
+ pos = np.empty((walkers, len(PRIORS)),dtype=np.double)
+ for i in range(pos.shape[0]):
+ pos[i] = truncnorm_scaled(PRIORS_LOW,PRIORS_HIGH,PRIORS,PRIOR_UNCERTAINTIES)
+
+ nwalkers, ndim = pos.shape
+
+ # We use the KDEMove here because I think it should sample the likelihood
+ # better. Because we have energy scale parameters and we are doing a binned
+ # likelihood, the likelihood is discontinuous. There can also be several
+ # local minima. The author of emcee recommends using the KDEMove with a lot
+ # of walkers to try to properly sample a multimodal distribution. In
+ # addition, I've found that the autocorrelation time for the KDEMove is
+ # much better than the other moves.
+ sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove())
+ with np.errstate(invalid='ignore'):
+ sampler.run_mcmc(pos, steps)
+
+ print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
+
+ try:
+ print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
+ except Exception as e:
+ print(e)
+
+ samples = sampler.get_chain(flat=True,thin=thin)
+
+ # Now, we use nlopt to find the best set of parameters. We start at the
+ # best starting point from the MCMC and then run the SBPLX routine.
+ x0 = sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()]
+ opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
+ opt.set_min_objective(nll)
+ low = np.array(PRIORS_LOW)
+ high = np.array(PRIORS_HIGH)
+ opt.set_lower_bounds(low)
+ opt.set_upper_bounds(high)
+ opt.set_ftol_abs(1e-10)
+ opt.set_initial_step([0.01]*len(x0))
+ xopt = opt.optimize(x0)
+
return xopt, universe, samples
if __name__ == '__main__':
@@ -672,7 +720,7 @@ if __name__ == '__main__':
data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
- xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+ xopt, universe, samples = do_fit(data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin,refit=False)
for i in range(len(FIT_PARS)):
# The "pull plots" we make here are actually produced via a
diff --git a/utils/dm-search b/utils/dm-search
index d75fdcc..5204b7a 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -225,7 +225,7 @@ def get_data_hists(data,bins,scale=1.0):
data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale
return data_hists
-def make_nll(dm_particle_id, dm_mass, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_nll=False):
+def make_nll(dm_particle_id, dm_mass, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, reweight=False, print_nll=False):
df_dict = {}
for id in (20,22,2020,2022,2222):
df_dict[id] = mc[mc.id == id]
@@ -249,7 +249,7 @@ def make_nll(dm_particle_id, dm_mass, dm_energy, data, muons, mc, atmo_scale_fac
# Get the Monte Carlo histograms. We need to do this within the
# likelihood function since we apply the energy resolution parameters
# to the Monte Carlo.
- mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor)
+ mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor,reweight=reweight)
muon_hists = get_mc_hists_fast(df_dict_muon,x,bins,scale=1/muon_scale_factor)
dm_hists = get_mc_hists_fast(df_dict_dm,x,bins,scale=1/len(dm_sample))
@@ -273,7 +273,7 @@ def make_nll(dm_particle_id, dm_mass, dm_energy, data, muons, mc, atmo_scale_fac
return nll
return nll
-def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10):
+def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10,refit=True):
"""
Run the fit and return the minimum along with samples from running an MCMC
starting near the minimum.
@@ -284,6 +284,7 @@ def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,
external muons
- data_mc: pandas dataframe representing the expected background from
atmospheric neutrino events
+ - weights: pandas dataframe with the GENIE weights
- bins: an array of bins to use for the fit
- steps: the number of MCMC steps to run
@@ -325,13 +326,80 @@ def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,
opt.set_min_objective(nll)
low = np.array(PRIORS_LOW)
high = np.array(PRIORS_HIGH)
+ if refit:
+ # If we are refitting, we want to do the first fit assuming no dark
+ # matter to make sure we get the best GENIE systematics for the null
+ # hypothesis.
+ x0[6] = low[6]
+ high[6] = low[6]
opt.set_lower_bounds(low)
opt.set_upper_bounds(high)
opt.set_ftol_abs(1e-10)
opt.set_initial_step([0.01]*len(x0))
xopt = opt.optimize(x0)
- return xopt, samples
+ # Get the total number of "universes" simulated in the GENIE reweight tool
+ nuniverses = weights['universe'].max()+1
+
+ nlls = []
+ for universe in range(nuniverses):
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
+
+ nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll)
+ nlls.append(nll(xopt))
+
+ universe = np.argmin(nlls)
+
+ if refit:
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
+
+ # Create a new negative log likelihood function with the weighted Monte Carlo.
+ nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll)
+
+ # Now, we refit with the Monte Carlo weighted by the most likely GENIE
+ # systematics.
+ pos = np.empty((walkers, len(PRIORS)),dtype=np.double)
+ for i in range(pos.shape[0]):
+ pos[i] = sample_priors()
+
+ nwalkers, ndim = pos.shape
+
+ # We use the KDEMove here because I think it should sample the likelihood
+ # better. Because we have energy scale parameters and we are doing a binned
+ # likelihood, the likelihood is discontinuous. There can also be several
+ # local minima. The author of emcee recommends using the KDEMove with a lot
+ # of walkers to try to properly sample a multimodal distribution. In
+ # addition, I've found that the autocorrelation time for the KDEMove is
+ # much better than the other moves.
+ sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove())
+ with np.errstate(invalid='ignore'):
+ sampler.run_mcmc(pos, steps)
+
+ print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
+
+ try:
+ print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
+ except Exception as e:
+ print(e)
+
+ samples = sampler.get_chain(flat=True,thin=thin)
+
+ # Now, we use nlopt to find the best set of parameters. We start at the
+ # best starting point from the MCMC and then run the SBPLX routine.
+ x0 = sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()]
+ opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
+ opt.set_min_objective(nll)
+ low = np.array(PRIORS_LOW)
+ high = np.array(PRIORS_HIGH)
+ opt.set_lower_bounds(low)
+ opt.set_upper_bounds(high)
+ opt.set_ftol_abs(1e-10)
+ opt.set_initial_step([0.01]*len(x0))
+ xopt = opt.optimize(x0)
+
+ return xopt, universe, samples
def sample_priors():
"""
@@ -391,7 +459,10 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
m1 = SNOMAN_MASS[id1]
m2 = SNOMAN_MASS[id2]
dm_energy = dm_mass
- xopt, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
+ xopt, universe, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
+
+ data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == universe],how='left',on=['run','evn'])
+ data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
limit = np.percentile(samples[:,6],90)
limits[dm_particle_id][i] = limit
@@ -426,7 +497,7 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
dm_hists = get_mc_hists(dm_sample,xopt,bins,scale=1/len(dm_sample))
frac = dm_hists[dm_particle_id].sum()
dm_hists[dm_particle_id] /= frac
- mc_hists = get_mc_hists(data_mc,xopt,bins,scale=xopt[0]/atmo_scale_factor)
+ mc_hists = get_mc_hists(data_mc_with_weights,xopt,bins,scale=xopt[0]/atmo_scale_factor,reweight=True)
muon_hists = get_mc_hists(muon,xopt,bins,scale=xopt[5]/muon_scale_factor)
n = (dm_hists[dm_particle_id]*(mc_hists[dm_particle_id] + muon_hists[dm_particle_id])).sum()
# Set our discovery threshold to the p-value we want divided by the
@@ -622,7 +693,7 @@ if __name__ == '__main__':
data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
- xopt, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+ xopt, universe, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin,refit=False)
for i in range(len(FIT_PARS)):
# The "pull plots" we make here are actually produced via a