#!/usr/bin/env python
# Copyright (c) 2019, Anthony Latorre <tlatorre at uchicago>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
"""
Script to do the final dark matter search analysis. To run it:

    $ ./dm-search [list of data fit results] --mc [list of atmospheric MC files] --muon-mc [list of muon MC files] --steps [steps]

After running you will get a plot showing the limits for back-to-back dark
matter decays over a range of energies.
from __future__ import print_function, division
import numpy as np
import pandas as pd
from scipy.stats import iqr, poisson, norm, beta, percentileofscore
from matplotlib.lines import Line2D
from scipy.special import spence, gammaln
from sddm.stats import *
from sddm.dc import estimate_errors, EPSILON, truncnorm_scaled
import emcee
from sddm import printoptions
from sddm.utils import fast_cdf, correct_energy_bias
from scipy.integrate import quad
from sddm.dm import *
from sddm import SNOMAN_MASS, AV_RADIUS
import nlopt
from itertools import chain, islice
# Likelihood Fit Parameters
# 0 - Atmospheric Neutrino Flux Scale
# 1 - Electron energy bias
# 2 - Electron energy resolution
# 3 - Muon energy bias
# 4 - Muon energy resolution
# 5 - External Muon scale
# 6 - Dark Matter Scale
# Number of events to use in the dark matter Monte Carlo histogram when fitting
# Ideally this would be as big as possible, but the bigger it is, the more time
# the fit takes.
DM_SAMPLES = 10000
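# Log-spaced grids of dark matter masses (in MeV) to test, keyed by the final
# state particle id: 2020 = e+e, 2222 = mu+mu.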
DM_MASSES = {2020: np.logspace(np.log10(22),np.log10(1e3),101),
2222: np.logspace(np.log10(318),np.log10(1e3),101)}
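# p-value threshold for claiming a discovery. Note: this gets divided by the
# number of bins in get_limits() to crudely account for the look elsewhere
# effect.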
DISCOVERY_P_VALUE = 0.05
FIT_PARS = [
'Atmospheric Neutrino Flux Scale',
'Electron energy bias',
'Electron energy resolution',
'Muon energy bias',
'Muon energy resolution',
'External Muon scale',
'Dark Matter Scale']
# Uncertainty on the energy scale
#
# - the muon energy scale and resolution terms come directly from measurements
# on stopping muons, so those are known well.
# - for electrons, we only have Michel electrons at the low end of our energy
# range, and therefore we don't really have any good way of constraining the
# energy scale or resolution. However, if we assume that the ~7% energy bias
# in the muons is from the single PE distribution (it seems likely to me that
# that is a major part of the bias), then the energy scale should be roughly
# the same. Since the Michel electron distributions are consistent, we leave
# the mean value at 0, but to be conservative, we set the error to 10%.
# - The energy resolution for muons was pretty much spot on, and so we expect
# the same from electrons. In addition, the Michel spectrum is consistent so
# at that energy level we don't see anything which leads us to expect a major
# difference. To be conservative, and because I don't think it really affects
# the analysis at all, I'll leave the uncertainty here at 10% anyway.
PRIORS = [
1.0, # Atmospheric Neutrino Scale
0.015, # Electron energy scale
0.0, # Electron energy resolution
0.053, # Muon energy scale
0.0, # Muon energy resolution
0.0, # Muon scale
0.0, # Dark Matter Scale
]
PRIOR_UNCERTAINTIES = [
0.2, # Atmospheric Neutrino Scale
0.03, # Electron energy scale
0.05, # Electron energy resolution
0.01, # Muon energy scale
0.013, # Muon energy resolution
10.0, # Muon scale
np.inf,# Dark Matter Scale
]
# Lower bounds for the fit parameters
PRIORS_LOW = [
EPSILON,
-10,
EPSILON,
-10,
EPSILON,
0,
0,
]
# Upper bounds for the fit parameters
PRIORS_HIGH = [
10,
10,
10,
10,
10,
1e9,
1000,
]
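# Map from SNOMAN particle id to the LaTeX label used in plots. Two particle
# final states are labelled by concatenating the ids, e.g. 2020 = e+e,
# 2222 = mu+mu, and 2022 = e+mu.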
particle_id = {20: 'e', 22: r'\mu'}
def plot_hist2(hists, bins, color=None):
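    """
    Plot the energy histograms in `hists` on a 2x3 grid of subplots, one per
    event topology, using the binning in `bins`.
    """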
for id in (20,22,2020,2022,2222):
if id == 20:
plt.subplot(2,3,1)
elif id == 22:
plt.subplot(2,3,2)
elif id == 2020:
plt.subplot(2,3,4)
elif id == 2022:
plt.subplot(2,3,5)
elif id == 2222:
plt.subplot(2,3,6)
bincenters = (bins[id][1:] + bins[id][:-1])/2
plt.hist(bincenters, bins=bins[id], histtype='step', weights=hists[id],color=color)
plt.gca().set_xscale("log")
major = np.array([10,100,1000,10000])
minor = np.unique(list(chain(*list(range(i,i*10,i) for i in major[:-1]))))
minor = np.setdiff1d(minor,major)
major = major[major <= bins[id][-1]]
minor = minor[minor <= bins[id][-1]]
plt.gca().set_xticks(major)
plt.gca().set_xticks(minor,minor=True)
plt.gca().set_xlim(10,10000)
plt.xlabel("Energy (MeV)")
plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
if len(hists):
plt.tight_layout()
def get_mc_hists(data,x,bins,scale=1.0,reweight=False):
"""
Returns the expected Monte Carlo histograms for the atmospheric neutrino
background.
Args:
- data: pandas dataframe of the Monte Carlo events
- x: fit parameters
- bins: histogram bins
        - scale: multiply histograms by an overall scale factor
        - reweight: if True, weight each Monte Carlo event by its `weight`
          column (used for the GENIE systematic universes)
This function does two basic things:
1. apply the energy bias and resolution corrections
2. histogram the results
Returns a dictionary mapping particle id combo -> histogram.
"""
df_dict = {}
for id in (20,22,2020,2022,2222):
df_dict[id] = data[data.id == id]
return get_mc_hists_fast(df_dict,x,bins,scale,reweight)
def get_mc_hists_fast(df_dict,x,bins,scale=1.0,reweight=False):
"""
Same as get_mc_hists() but the first argument is a dictionary mapping
particle id -> dataframe. This is much faster than selecting the events
from the dataframe every time.
"""
mc_hists = {}
for id in (20,22,2020,2022,2222):
df = df_dict[id]
if id == 20:
ke = df.energy1.values*(1+x[1])
resolution = df.energy1.values*max(EPSILON,x[2])
elif id == 2020:
ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[1])
resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[2]))**2)
elif id == 22:
ke = df.energy1.values*(1+x[3])
resolution = df.energy1.values*max(EPSILON,x[4])
elif id == 2222:
ke = df.energy1.values*(1+x[3]) + df.energy2.values*(1+x[3])
resolution = np.sqrt((df.energy1.values*max(EPSILON,x[4]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2)
elif id == 2022:
ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[3])
resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2)
if reweight:
cdf = fast_cdf(bins[id][:,np.newaxis],ke,resolution)*df.weight.values
else:
cdf = fast_cdf(bins[id][:,np.newaxis],ke,resolution)
if 'flux_weight' in df.columns:
cdf *= df.flux_weight.values
mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1)
mc_hists[id] *= scale
return mc_hists
def get_data_hists(data,bins,scale=1.0):
"""
Returns the data histogrammed into `bins`.
"""
data_hists = {}
for id in (20,22,2020,2022,2222):
data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins[id])[0]*scale
return data_hists
def make_nll(dm_particle_id, dm_mass, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, reweight=False, print_nll=False, dm_sample=None):
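    """
    Returns a function which computes the negative log likelihood of the fit
    parameters `x` given the data, the atmospheric neutrino and external muon
    Monte Carlo, and a dark matter signal with the given particle id, mass,
    and energy. The returned function has the signature nll(x, grad=None) so
    that it can be passed directly to nlopt.
    """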
df_dict = dict(tuple(mc.groupby('id')))
for id in (20,22,2020,2022,2222):
if id not in df_dict:
df_dict[id] = mc.iloc[:0]
df_dict_muon = dict(tuple(muons.groupby('id')))
for id in (20,22,2020,2022,2222):
if id not in df_dict_muon:
df_dict_muon[id] = muons.iloc[:0]
data_hists = get_data_hists(data,bins)
if dm_sample is None:
dm_sample = get_dm_sample(DM_SAMPLES,dm_particle_id,dm_mass,dm_energy)
df_dict_dm = {}
for id in (20,22,2020,2022,2222):
df_dict_dm[id] = dm_sample[dm_sample.id == id]
def nll(x, grad=None):
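        # Note: `grad` is unused; it is only accepted so that this function
        # can be passed directly to nlopt, which calls objectives as f(x, grad).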
if (x < PRIORS_LOW).any() or (x > PRIORS_HIGH).any():
return np.inf
# Get the Monte Carlo histograms. We need to do this within the
# likelihood function since we apply the energy resolution parameters
# to the Monte Carlo.
mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor,reweight=reweight)
muon_hists = get_mc_hists_fast(df_dict_muon,x,bins,scale=1/muon_scale_factor)
dm_hists = get_mc_hists_fast(df_dict_dm,x,bins,scale=1/len(dm_sample))
# Calculate the negative log of the likelihood of observing the data
# given the fit parameters
nll = 0
for id in data_hists:
oi = data_hists[id]
ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + dm_hists[id]*x[6] + EPSILON
N = ei.sum()
nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei))
# Add the priors
nll -= norm.logpdf(x[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]).sum()
if print_nll:
# Print the result
print("nll = %.2f" % nll)
return nll
return nll
def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10,refit=True):
"""
Run the fit and return the minimum along with samples from running an MCMC
starting near the minimum.
    Args:
        - dm_particle_id: the final state to search for (2020 = e+e, 2222 = mu+mu)
        - dm_mass: the dark matter mass in MeV
        - dm_energy: the total energy of the dark matter particle in MeV
        - data: pandas dataframe representing the data to fit
- muon: pandas dataframe representing the expected background from
external muons
- data_mc: pandas dataframe representing the expected background from
atmospheric neutrino events
- weights: pandas dataframe with the GENIE weights
- bins: an array of bins to use for the fit
- steps: the number of MCMC steps to run
    Returns a tuple (xopt, universe, samples) where universe is the index of
    the most likely GENIE systematic universe and samples is an array of MCMC
    samples with shape (number of samples, number of parameters).
"""
dm_sample = get_dm_sample(DM_SAMPLES,dm_particle_id,dm_mass,dm_energy)
    nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll=print_nll,dm_sample=dm_sample)
pos = np.empty((walkers, len(PRIORS)),dtype=np.double)
for i in range(pos.shape[0]):
pos[i] = sample_priors()
nwalkers, ndim = pos.shape
# We use the KDEMove here because I think it should sample the likelihood
# better. Because we have energy scale parameters and we are doing a binned
# likelihood, the likelihood is discontinuous. There can also be several
# local minima. The author of emcee recommends using the KDEMove with a lot
# of workers to try and properly sample a multimodal distribution. In
# addition, I've found that the autocorrelation time for the KDEMove is
# much better than the other moves.
sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove())
with np.errstate(invalid='ignore'):
sampler.run_mcmc(pos, steps)
print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
try:
print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
except Exception as e:
print(e)
samples = sampler.get_chain(flat=True,thin=thin)
# Now, we use nlopt to find the best set of parameters. We start at the
# best starting point from the MCMC and then run the SBPLX routine.
x0 = sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()]
opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
opt.set_min_objective(nll)
low = np.array(PRIORS_LOW)
high = np.array(PRIORS_HIGH)
if refit:
# If we are refitting, we want to do the first fit assuming no dark
# matter to make sure we get the best GENIE systematics for the null
# hypothesis.
x0[6] = low[6]
high[6] = low[6]
opt.set_lower_bounds(low)
opt.set_upper_bounds(high)
opt.set_ftol_abs(1e-10)
opt.set_initial_step([0.01]*len(x0))
xopt = opt.optimize(x0)
# Get the total number of "universes" simulated in the GENIE reweight tool
nuniverses = max(weights.keys())+1
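    # Evaluate the negative log likelihood at the best fit point for each
    # GENIE systematic universe and pick the universe which gives the smallest
    # value. That universe's weights are then used to reweight the atmospheric
    # Monte Carlo when refitting below.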
nlls = []
for universe in range(nuniverses):
data_mc_with_weights = pd.merge(data_mc,weights[universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll,dm_sample=dm_sample)
nlls.append(nll(xopt))
universe = np.argmin(nlls)
if refit:
data_mc_with_weights = pd.merge(data_mc,weights[universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
# Create a new negative log likelihood function with the weighted Monte Carlo.
nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc_with_weights,atmo_scale_factor,muon_scale_factor,bins,reweight=True,print_nll=print_nll,dm_sample=dm_sample)
# Now, we refit with the Monte Carlo weighted by the most likely GENIE
# systematics.
pos = np.empty((walkers, len(PRIORS)),dtype=np.double)
for i in range(pos.shape[0]):
pos[i] = sample_priors()
nwalkers, ndim = pos.shape
# We use the KDEMove here because I think it should sample the likelihood
# better. Because we have energy scale parameters and we are doing a binned
# likelihood, the likelihood is discontinuous. There can also be several
# local minima. The author of emcee recommends using the KDEMove with a lot
# of workers to try and properly sample a multimodal distribution. In
# addition, I've found that the autocorrelation time for the KDEMove is
# much better than the other moves.
sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove())
with np.errstate(invalid='ignore'):
sampler.run_mcmc(pos, steps)
print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
try:
print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
except Exception as e:
print(e)
samples = sampler.get_chain(flat=True,thin=thin)
# Now, we use nlopt to find the best set of parameters. We start at the
# best starting point from the MCMC and then run the SBPLX routine.
x0 = sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()]
opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))
opt.set_min_objective(nll)
low = np.array(PRIORS_LOW)
high = np.array(PRIORS_HIGH)
opt.set_lower_bounds(low)
opt.set_upper_bounds(high)
opt.set_ftol_abs(1e-10)
opt.set_initial_step([0.01]*len(x0))
xopt = opt.optimize(x0)
return xopt, universe, samples
def sample_priors():
"""
Returns a random sample of the fit parameters from the priors. For the
first 6 parameters we use a truncated normal distribution, and for the last
parameter we use a uniform distribution.
"""
return np.concatenate((truncnorm_scaled(PRIORS_LOW[:6],PRIORS_HIGH[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]),[np.random.uniform(PRIORS_LOW[6],PRIORS_HIGH[6])]))
def get_dm_sample(n,dm_particle_id,dm_mass,dm_energy):
"""
Returns a dataframe containing events from a dark matter particle.
Args:
- n: int
number of events
- dm_particle_id: int
the particle id of the DM particle (2020 or 2222)
        - dm_mass: float
            the mass of the DM particle
        - dm_energy: float
            the total energy of the DM particle

    Note: the kinetic energies of the two decay products are smeared by a
    fixed 5% fractional energy resolution (see the FIXME below).
    """
id1 = dm_particle_id//100
id2 = dm_particle_id % 100
m1 = SNOMAN_MASS[id1]
m2 = SNOMAN_MASS[id2]
    data = np.empty(n,dtype=[('energy1',np.double),('energy2',np.double),('ke',np.double),('id1',int),('id2',int),('id',int)])
for i, (v1, v2) in enumerate(islice(gen_decay(dm_mass,dm_energy,m1,m2),n)):
E1 = v1[0]
E2 = v2[0]
T1 = E1 - m1
T2 = E2 - m2
data[i] = T1, T2, T1 + T2, id1, id2, dm_particle_id
# FIXME: Get electron and muon resolution
data['energy1'] += norm.rvs(scale=data['energy1']*0.05)
data['energy2'] += norm.rvs(scale=data['energy2']*0.05)
return pd.DataFrame(data)
def get_limits(dm_masses,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin):
limits = {}
best_fit = {}
discovery_array = {}
for dm_particle_id in (2020,2222):
limits[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
best_fit[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
discovery_array[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
for i, dm_mass in enumerate(dm_masses[dm_particle_id]):
id1 = dm_particle_id//100
id2 = dm_particle_id % 100
m1 = SNOMAN_MASS[id1]
m2 = SNOMAN_MASS[id2]
dm_energy = dm_mass
xopt, universe, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
data_mc_with_weights = pd.merge(data_mc,weights[universe],how='left',on=['run','unique_id'])
data_mc_with_weights.weight = data_mc_with_weights.weight.fillna(1.0)
limit = np.percentile(samples[:,6],90)
limits[dm_particle_id][i] = limit
# Here, to determine if there is a discovery we make an approximate
# calculation of the number of events which would be significant.
#
# We expect the likelihood to be approximately that of a Poisson
# distribution with n background events and we are searching for a
# signal s. n is constrained by the rest of the histograms, and so
            # we can treat it as being approximately fixed. In this case, the
# likelihood looks approximately like:
#
# P(s) = e^(-(s+n))(s+n)**i/i!
#
# Where i is the actual number of events. Under the null hypothesis
# (i.e. no dark matter), we expect i to be Poisson distributed with
# mean n. Therefore s should have the same distribution but offset
# by n. Therefore, to determine the threshold, we simply look for
# the threshold we expect in n and then subtract n.
dm_kinetic_energy = dm_energy - m1 - m2
dm_sample = get_dm_sample(DM_SAMPLES,dm_particle_id,dm_mass,dm_energy)
# To calculate `n` we approximately want the number of events in
# the bin which most of the dark matter events will fall. However,
# to smoothly transition between bins, we multiply the normalized
# dark matter histogram with the expected MC histogram and then
# take the sum. In the case that the dark matter events all fall
# into a single bin, this gives us that bin, but smoothly
# interpolates between the bins.
dm_hists = get_mc_hists(dm_sample,xopt,bins,scale=1/len(dm_sample))
frac = dm_hists[dm_particle_id].sum()
dm_hists[dm_particle_id] /= frac
mc_hists = get_mc_hists(data_mc_with_weights,xopt,bins,scale=xopt[0]/atmo_scale_factor,reweight=True)
muon_hists = get_mc_hists(muon,xopt,bins,scale=xopt[5]/muon_scale_factor)
n = (dm_hists[dm_particle_id]*(mc_hists[dm_particle_id] + muon_hists[dm_particle_id])).sum()
# Set our discovery threshold to the p-value we want divided by the
# number of bins. The idea here is that the number of bins is
# approximately equal to the number of trials so we need to
# increase our discovery threshold to account for the look
# elsewhere effect.
threshold = DISCOVERY_P_VALUE/(len(bins[dm_particle_id])-1)
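            # poisson.ppf(1-threshold,n) gives the smallest total count whose
            # cumulative probability is at least 1-threshold; adding 1 moves
            # just past that threshold and subtracting n converts from a total
            # count to a number of signal events above the background.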
discovery = poisson.ppf(1-threshold,n) + 1 - n
# Here, we scale the discovery threshold by the fraction of the
# dark matter hist in the histogram range. The idea is that if only
# a small fraction of the dark matter histogram falls into the
# histogram range, the total number of dark matter events returned
# by the fit can be larger by this amount. I noticed this when
# testing under the null hypothesis that the majority of the
# "discoveries" were on the edge of the histogram.
discovery_array[dm_particle_id][i] = discovery/frac
best_fit[dm_particle_id][i] = xopt[6]
return limits, best_fit, discovery_array
if __name__ == '__main__':
import argparse
import numpy as np
import pandas as pd
import sys
import h5py
from sddm.plot_energy import *
from sddm.plot import *
from sddm import setup_matplotlib
import nlopt
from sddm.renormalize import *
parser = argparse.ArgumentParser("plot fit results")
parser.add_argument("filenames", nargs='+', help="input files")
parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds")
parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files")
parser.add_argument("--muon-mc", nargs='+', required=True, help="muon MC files")
parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)")
parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain")
parser.add_argument("--pull", type=int, default=0, help="plot pull plots")
parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files")
parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values")
parser.add_argument("--walkers", type=int, default=100, help="number of walkers")
parser.add_argument("--thin", type=int, default=10, help="number of steps to thin")
parser.add_argument("--test", type=int, default=0, help="run tests to check discovery threshold")
parser.add_argument("--run-list", default=None, help="run list")
parser.add_argument("--mcpl", nargs='+', required=True, help="GENIE MCPL files")
parser.add_argument("--run-info", required=True, help="run_info.log autosno file")
args = parser.parse_args()
setup_matplotlib(args.save)
import matplotlib.pyplot as plt
rhdr = pd.concat([read_hdf(filename, "rhdr").assign(filename=filename) for filename in args.filenames],ignore_index=True)
if args.run_list is not None:
run_list = np.genfromtxt(args.run_list)
rhdr = rhdr[rhdr.run.isin(run_list)]
# Loop over runs to prevent using too much memory
evs = []
for run, df in rhdr.groupby('run'):
evs.append(get_events(df.filename.values, merge_fits=True, nhit_thresh=args.nhit_thresh))
ev = pd.concat(evs).reset_index()
livetime = 0.0
livetime_pulse_gt = 0.0
for _ev in evs:
if not np.isnan(_ev.attrs['time_10_mhz']):
livetime += _ev.attrs['time_10_mhz']
else:
livetime += _ev.attrs['time_pulse_gt']
livetime_pulse_gt += _ev.attrs['time_pulse_gt']
print("livetime = %.2f" % livetime)
print("livetime (pulse gt) = %.2f" % livetime_pulse_gt)
if args.run_info:
livetime_run_info = 0.0
        run_info = np.genfromtxt(args.run_info,usecols=range(4),dtype=(int,int,np.double,np.double))
for run in set(ev.run.values):
for i in range(run_info.shape[0]):
if run_info[i][0] == run:
livetime_run_info += run_info[i][3]
print("livetime (run info) = %.2f" % livetime_run_info)
ev = correct_energy_bias(ev)
# Note: We loop over the MC filenames here instead of just passing the
# whole list to get_events() because I had to rerun some of the MC events
# using SNOMAN and so most of the runs actually have two different files
# and otherwise the GTIDs will clash
ev_mcs = []
for filename in args.mc:
ev_mcs.append(get_events([filename], merge_fits=True, nhit_thresh=args.nhit_thresh, mc=True))
ev_mc = pd.concat([ev_mc for ev_mc in ev_mcs if len(ev_mc) > 0]).reset_index()
if (~rhdr.run.isin(ev_mc.run)).any():
print_warning("Error! The following runs have no Monte Carlo: %s" % \
np.unique(rhdr.run[~rhdr.run.isin(ev_mc.run)].values))
muon_mc = get_events(args.muon_mc, merge_fits=True, nhit_thresh=args.nhit_thresh, mc=True)
weights = pd.concat([read_hdf(filename, "weights") for filename in args.weights],ignore_index=True)
# Add the "flux_weight" column to the ev_mc data since I stupidly simulated
# the muon neutrino flux for the tau neutrino flux in GENIE. Doh!
mcpl = load_mcpl_files(args.mcpl)
ev_mc = renormalize_data(ev_mc,mcpl)
# Merge weights with MCPL dataframe to get the unique id column in the
# weights dataframe since that is what we use to merge with the Monte
# Carlo.
weights = pd.merge(weights,mcpl[['run','evn','unique_id']],on=['run','evn'],how='left')
# There are a handful of weights which turn out to be slightly negative for
# some reason. For example:
#
# run evn universe weight
# 10970 25 597 -0.000055
# 11389 87 729 -0.021397
# 11701 204 2 -0.000268
# 11919 120 82 -0.002245
# 11976 163 48 -0.000306
# 11976 163 710 -0.000022
# 12131 76 175 -0.000513
# 12207 70 255 -0.002925
# 12207 70 282 -0.014856
# 12207 70 368 -0.030593
# 12207 70 453 -0.019011
# 12207 70 520 -0.020748
# 12207 70 834 -0.028754
# 12207 70 942 -0.020309
# 12233 230 567 -0.000143
# 12618 168 235 -0.000020
# 13428 128 42 -0.083639
# 14264 23 995 -0.017637
# 15034 69 624 -0.000143
# 15752 154 957 -0.006827
weights = weights[weights.weight > 0]
weights = dict(tuple(weights.groupby('universe')))
ev_mc = correct_energy_bias(ev_mc)
muon_mc = correct_energy_bias(muon_mc)
# Set all prompt events in the MC to be muons
muon_mc.loc[muon_mc.prompt,'muon'] = True
# 00-orphan cut
ev = ev[(ev.gtid & 0xff) != 0]
ev_mc = ev_mc[(ev_mc.gtid & 0xff) != 0]
muon_mc = muon_mc[(muon_mc.gtid & 0xff) != 0]
# remove events 200 microseconds after a muon
ev = ev.groupby('run',group_keys=False).apply(muon_follower_cut)
# Get rid of events which don't have a successful fit
ev = ev[~np.isnan(ev.fmin)]
ev_mc = ev_mc[~np.isnan(ev_mc.fmin)]
muon_mc = muon_mc[~np.isnan(muon_mc.fmin)]
# require (r < av radius)
ev = ev[ev.r < AV_RADIUS]
ev_mc = ev_mc[ev_mc.r < AV_RADIUS]
muon_mc = muon_mc[muon_mc.r < AV_RADIUS]
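    # Fiducial volume of the acrylic vessel. AV_RADIUS is assumed to be in cm,
    # so this is in cm^3; the limit plot below converts to events/m^3/year
    # with the factor of 100**3.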
fiducial_volume = (4/3)*np.pi*(AV_RADIUS)**3
# require psi < 6
ev = ev[ev.psi < 6]
ev_mc = ev_mc[ev_mc.psi < 6]
muon_mc = muon_mc[muon_mc.psi < 6]
data = ev[ev.signal & ev.prompt & ~ev.atm]
data_atm = ev[ev.signal & ev.prompt & ev.atm]
# Right now we use the muon Monte Carlo in the fit. If you want to use the
# actual data, you can comment the next two lines and then uncomment the
# two after that.
muon = muon_mc[muon_mc.muon & muon_mc.prompt & ~muon_mc.atm]
muon_atm = muon_mc[muon_mc.muon & muon_mc.prompt & muon_mc.atm]
#muon = ev[ev.muon & ev.prompt & ~ev.atm]
#muon_atm = ev[ev.muon & ev.prompt & ev.atm]
if not args.pull and not args.test:
ev_mc = ev_mc[ev_mc.run.isin(rhdr.run)]
data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]
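    # Energy bins (in MeV) used for the fit histograms, keyed by event
    # topology. The channels containing a muon use a shorter energy range
    # (the last 5 bins are dropped).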
bins = {20:np.logspace(np.log10(20),np.log10(10e3),21),
22:np.logspace(np.log10(20),np.log10(10e3),21)[:-5],
2020:np.logspace(np.log10(20),np.log10(10e3),21),
2022:np.logspace(np.log10(20),np.log10(10e3),21)[:-5],
2222:np.logspace(np.log10(20),np.log10(10e3),21)[:-5]}
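    # Overall normalization factors for the Monte Carlo: the atmospheric
    # neutrino histograms are divided by atmo_scale_factor and the external
    # muon histograms by muon_scale_factor before being multiplied by the fit
    # parameters x[0] and x[5] respectively.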
atmo_scale_factor = 100.0
muon_scale_factor = len(muon) + len(muon_atm)
if args.pull:
pull = [[] for i in range(len(FIT_PARS))]
# Set the random seed so we get reproducible results here
np.random.seed(0)
for i in range(args.pull):
xtrue = sample_priors()
# Calculate expected number of events
N = len(data_mc)*xtrue[0]/atmo_scale_factor
N_atm = len(data_atm_mc)*xtrue[0]/atmo_scale_factor
N_muon = len(muon)*xtrue[5]/muon_scale_factor
N_muon_atm = len(muon_atm)*xtrue[5]/muon_scale_factor
N_dm = xtrue[6]
# Calculate observed number of events
n = np.random.poisson(N)
n_atm = np.random.poisson(N_atm)
n_muon = np.random.poisson(N_muon)
n_muon_atm = np.random.poisson(N_muon_atm)
n_dm = np.random.poisson(N_dm)
dm_particle_id = np.random.choice([2020,2222])
dm_mass = np.random.uniform(20,10e3)
dm_energy = dm_mass
# Sample data from Monte Carlo
data = pd.concat((data_mc.sample(n=n,weights='flux_weight',replace=True), muon.sample(n=n_muon,replace=True)))
data_atm = pd.concat((data_atm_mc.sample(n=n_atm,weights='flux_weight',replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
# Smear the energies by the additional energy resolution
data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4])
data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2])
data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4])
data['ke'] = data['energy1'].fillna(0) + data['energy2'].fillna(0) + data['energy3'].fillna(0)
data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2])
data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4])
data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2])
data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
xopt, universe, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin,refit=False)
for i in range(len(FIT_PARS)):
# The "pull plots" we make here are actually produced via a
# procedure called "Simulation Based Calibration".
#
# See https://arxiv.org/abs/1804.06788.
pull[i].append(percentileofscore(samples[:,i],xtrue[i]))
fig = plt.figure()
axes = []
for i, name in enumerate(FIT_PARS):
axes.append(plt.subplot(4,2,i+1))
            counts, pull_bins, patches = plt.hist(pull[i],bins=np.linspace(0,100,11),histtype='step')
            expected = len(pull[i])/(len(pull_bins)-1)
plt.axhline(expected,color='k',ls='--',alpha=0.25)
plt.axhspan(poisson.ppf(0.005,expected), poisson.ppf(0.995,expected), facecolor='0.5', alpha=0.25)
plt.title(name)
for ax in axes:
despine(ax=ax,left=True,trim=True)
ax.get_yaxis().set_visible(False)
plt.tight_layout()
if args.save:
fig.savefig("dm_search_pull_plot.pdf")
fig.savefig("dm_search_pull_plot.eps")
else:
plt.show()
sys.exit(0)
if args.test:
# Set the random seed so we get reproducible results here
np.random.seed(0)
data_mc_with_weights = pd.merge(data_mc,weights[0],how='left',on=['run','unique_id'])
data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[0],how='left',on=['run','unique_id'])
discoveries = 0
data_mc_with_weights.weight *= data_mc_with_weights.flux_weight
data_atm_mc_with_weights.weight *= data_atm_mc_with_weights.flux_weight
for i in range(args.test):
xtrue = sample_priors()
# Calculate expected number of events
N = len(data_mc)*xtrue[0]/atmo_scale_factor
N_atm = len(data_atm_mc)*xtrue[0]/atmo_scale_factor
N_muon = len(muon)*xtrue[5]/muon_scale_factor
N_muon_atm = len(muon_atm)*xtrue[5]/muon_scale_factor
# Calculate observed number of events
n = np.random.poisson(N)
n_atm = np.random.poisson(N_atm)
n_muon = np.random.poisson(N_muon)
n_muon_atm = np.random.poisson(N_muon_atm)
# Sample data from Monte Carlo
data = pd.concat((data_mc_with_weights.sample(n=n,replace=True,weights='weight'), muon.sample(n=n_muon,replace=True)))
data_atm = pd.concat((data_atm_mc_with_weights.sample(n=n_atm,replace=True,weights='weight'), muon_atm.sample(n=n_muon_atm,replace=True)))
# Smear the energies by the additional energy resolution
data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4])
data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2])
data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4])
data['ke'] = data['energy1'].fillna(0) + data['energy2'].fillna(0) + data['energy3'].fillna(0)
data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2])
data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4])
data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2])
data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
            limits, best_fit, discovery_array = get_limits(DM_MASSES,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
for id in (2020,2222):
if (best_fit[id] > discovery_array[id]).any():
discoveries += 1
print("expected %.2f discoveries" % DISCOVERY_P_VALUE)
print("actually got %i/%i = %.2f discoveries" % (discoveries,args.test,discoveries/args.test))
sys.exit(0)
    limits, best_fit, discovery_array = get_limits(DM_MASSES,data,muon,data_mc,weights,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
fig = plt.figure()
for color, dm_particle_id in zip(('C0','C1'),(2020,2222)):
plt.plot(DM_MASSES[dm_particle_id],np.array(limits[dm_particle_id])*100**3*3600*24*365/fiducial_volume/livetime,color=color,label='$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(dm_particle_id),2)]) + '$')
plt.gca().set_xscale("log")
despine(fig,trim=True)
plt.xlabel("Energy (MeV)")
plt.ylabel("Event Rate Limit (events/$\mathrm{m}^3$/year)")
plt.legend()
plt.tight_layout()
if args.save:
plt.savefig("dm_search_limit.pdf")
plt.savefig("dm_search_limit.eps")
else:
plt.suptitle("Dark Matter Limits")
fig = plt.figure()
for color, dm_particle_id in zip(('C0','C1'),(2020,2222)):
plt.plot(DM_MASSES[dm_particle_id],best_fit[dm_particle_id],color=color,label='$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(dm_particle_id),2)]) + '$')
plt.plot(DM_MASSES[dm_particle_id],discovery_array[dm_particle_id],color=color,ls='--')
plt.gca().set_xscale("log")
despine(fig,trim=True)
plt.xlabel("Energy (MeV)")
plt.ylabel("Event Rate Limit (events)")
plt.legend()
plt.tight_layout()
if args.save:
plt.savefig("dm_best_fit_with_discovery_threshold.pdf")
plt.savefig("dm_best_fit_with_discovery_threshold.eps")
else:
plt.suptitle("Best Fit Dark Matter")
plt.show()