author | tlatorre <tlatorre@uchicago.edu> | 2020-11-03 08:10:42 -0600
---|---|---
committer | tlatorre <tlatorre@uchicago.edu> | 2020-11-03 08:10:42 -0600 |
commit | 3958f04ff6202f0f9f137285fbebdef399b8851d (patch) |
tree | 5a52a9c6ec7cf02905e941b9320e994dcb7a3ac4 |
parent | 220377d85d8cf71d37a47b160ff71bdfadce8f38 (diff) |
download | sddm-3958f04ff6202f0f9f137285fbebdef399b8851d.tar.gz sddm-3958f04ff6202f0f9f137285fbebdef399b8851d.tar.bz2 sddm-3958f04ff6202f0f9f137285fbebdef399b8851d.zip |
initial commit of a script to do the dark matter search
Mode | File | Lines changed
---|---|---
-rwxr-xr-x | utils/chi2 | 2
-rwxr-xr-x | utils/dm-search | 517

2 files changed, 517 insertions, 2 deletions
```diff
diff --git a/utils/chi2 b/utils/chi2
--- a/utils/chi2
+++ b/utils/chi2
@@ -68,8 +68,6 @@ FIT_PARS = [
 # at that energy level we don't see anything which leads us to expect a major
 # difference. To be conservative, and because I don't think it really affects
 # the analysis at all, I'll leave the uncertainty here at 10% anyways.
-# - For the electron + muon energy resolution, I don't have any real good way
-# to estimate this, so we are conservative and set the error to 10%.
 PRIORS = [
     1.0, # Atmospheric Neutrino Scale
     0.0, # Electron energy scale
diff --git a/utils/dm-search b/utils/dm-search
new file mode 100755
index 0000000..323aabc
--- /dev/null
+++ b/utils/dm-search
@@ -0,0 +1,517 @@
+#!/usr/bin/env python
+# Copyright (c) 2019, Anthony Latorre <tlatorre at uchicago>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+Script to do the final dark matter search analysis. To run it just run:
+
+    $ ./dm-search [list of data fit results] --mc [list of atmospheric MC files] --muon-mc [list of muon MC files] --steps [steps]
+
+After running you will get a plot showing the limits for back to back dark
+matter at a range of energies.
+"""
+from __future__ import print_function, division
+import numpy as np
+from scipy.stats import iqr, poisson, norm, beta, percentileofscore
+from scipy.special import spence
+from matplotlib.lines import Line2D
+from itertools import izip_longest
+from sddm.stats import *
+from sddm.dc import estimate_errors, EPSILON, truncnorm_scaled
+import emcee
+from sddm import printoptions
+from sddm.utils import fast_cdf, correct_energy_bias
+
+# Likelihood Fit Parameters
+# 0 - Atmospheric Neutrino Flux Scale
+# 1 - Electron energy bias
+# 2 - Electron energy resolution
+# 3 - Muon energy bias
+# 4 - Muon energy resolution
+# 5 - External Muon scale
+# 6 - Dark Matter Scale
+
+# Energy resolution as a fraction of energy for dark matter signal
+DM_ENERGY_RESOLUTION = 0.1
+
+FIT_PARS = [
+    'Atmospheric Neutrino Flux Scale',
+    'Electron energy bias',
+    'Electron energy resolution',
+    'Muon energy bias',
+    'Muon energy resolution',
+    'External Muon scale',
+    'Dark Matter Scale']
+
+# Uncertainty on the energy scale
+#
+# - the muon energy scale and resolution terms come directly from measurements
+# on stopping muons, so those are known well.
+# - for electrons, we only have Michel electrons at the low end of our energy
+# range, and therefore we don't really have any good way of constraining the
+# energy scale or resolution. However, if we assume that the ~7% energy bias
+# in the muons is from the single PE distribution (it seems likely to me that
+# that is a major part of the bias), then the energy scale should be roughly
+# the same. Since the Michel electron distributions are consistent, we leave
+# the mean value at 0, but to be conservative, we set the error to 10%.
+# - The energy resolution for muons was pretty much spot on, and so we expect
+# the same from electrons. In addition, the Michel spectrum is consistent so
+# at that energy level we don't see anything which leads us to expect a major
+# difference. To be conservative, and because I don't think it really affects
+# the analysis at all, I'll leave the uncertainty here at 10% anyways.
+PRIORS = [
+    1.0,   # Atmospheric Neutrino Scale
+    0.015, # Electron energy scale
+    0.0,   # Electron energy resolution
+    0.054, # Muon energy scale
+    0.0,   # Muon energy resolution
+    0.0,   # Muon scale
+    0.0,   # Dark Matter Scale
+]
+
+PRIOR_UNCERTAINTIES = [
+    0.2,   # Atmospheric Neutrino Scale
+    0.03,  # Electron energy scale
+    0.05,  # Electron energy resolution
+    0.011, # Muon energy scale
+    0.014, # Muon energy resolution
+    10.0,  # Muon scale
+    np.inf # Dark Matter Scale
+]
```
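The bias and resolution parameters above act multiplicatively on each event's reconstructed energy, as applied in `get_mc_hists_fast()` below. A minimal sketch of the convention, with illustrative numbers only:

```python
# Sketch (not part of the commit) of the energy-scale convention: a bias
# parameter shifts the mean multiplicatively and a resolution parameter sets
# a Gaussian width proportional to the energy.
from scipy.stats import norm

energy = 500.0           # reconstructed kinetic energy (MeV)
bias, res = 0.015, 0.05  # electron energy bias and resolution

mu = energy*(1 + bias)   # smeared mean: 507.5 MeV
sigma = energy*res       # smeared width: 25 MeV

# probability for this event to land between 480 and 520 MeV
p = norm.cdf(520, mu, sigma) - norm.cdf(480, mu, sigma)
print("%.3f" % p)
```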
```diff
+# Lower bounds for the fit parameters
+PRIORS_LOW = [
+    EPSILON,
+    -10,
+    EPSILON,
+    -10,
+    EPSILON,
+    EPSILON,
+    EPSILON,
+]
+
+# Upper bounds for the fit parameters
+PRIORS_HIGH = [
+    10,
+    10,
+    10,
+    10,
+    10,
+    1e9,
+    1000,
+]
+
+particle_id = {20: 'e', 22: r'\mu'}
+
+def plot_hist2(hists, bins, color=None):
+    for id in (20,22,2020,2022,2222):
+        if id == 20:
+            plt.subplot(2,3,1)
+        elif id == 22:
+            plt.subplot(2,3,2)
+        elif id == 2020:
+            plt.subplot(2,3,4)
+        elif id == 2022:
+            plt.subplot(2,3,5)
+        elif id == 2222:
+            plt.subplot(2,3,6)
+
+        bincenters = (bins[1:] + bins[:-1])/2
+        plt.hist(bincenters, bins=bins, histtype='step', weights=hists[id],color=color)
+        plt.gca().set_xscale("log")
+        plt.xlabel("Energy (MeV)")
+        plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
+
+    if len(hists):
+        plt.tight_layout()
+
+def get_mc_hists(data,x,bins,scale=1.0,reweight=False):
+    """
+    Returns the expected Monte Carlo histograms for the atmospheric neutrino
+    background.
+
+    Args:
+        - data: pandas dataframe of the Monte Carlo events
+        - x: fit parameters
+        - bins: histogram bins
+        - scale: multiply histograms by an overall scale factor
+
+    This function does two basic things:
+
+        1. apply the energy bias and resolution corrections
+        2. histogram the results
+
+    Returns a dictionary mapping particle id combo -> histogram.
+    """
+    df_dict = {}
+    for id in (20,22,2020,2022,2222):
+        df_dict[id] = data[data.id == id]
+
+    return get_mc_hists_fast(df_dict,x,bins,scale,reweight)
+
+def get_mc_hists_fast(df_dict,x,bins,scale=1.0,reweight=False):
+    """
+    Same as get_mc_hists() but the first argument is a dictionary mapping
+    particle id -> dataframe. This is much faster than selecting the events
+    from the dataframe every time.
+    """
+    mc_hists = {}
+
+    for id in (20,22,2020,2022,2222):
+        df = df_dict[id]
+
+        if id == 20:
+            ke = df.energy1.values*(1+x[1])
+            resolution = df.energy1.values*max(EPSILON,x[2])
+        elif id == 2020:
+            ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[1])
+            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[2]))**2)
+        elif id == 22:
+            ke = df.energy1.values*(1+x[3])
+            resolution = df.energy1.values*max(EPSILON,x[4])
+        elif id == 2222:
+            ke = df.energy1.values*(1+x[3]) + df.energy2.values*(1+x[3])
+            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[4]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2)
+        elif id == 2022:
+            ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[3])
+            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2)
+
+        if reweight:
+            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)*df.weight.values
+        else:
+            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)
+
+        mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1)
+        mc_hists[id] *= scale
+    return mc_hists
```
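Note that `get_mc_hists_fast()` never draws random smeared energies: each MC event contributes its exact Gaussian probability mass to every bin via differences of the CDF at the bin edges, so the expected histogram is smooth even with few events. A minimal sketch of the same trick, assuming `fast_cdf(x, mu, sigma)` behaves like a vectorized normal CDF:

```python
# Sketch of the CDF-difference histogramming used above, with scipy's norm.cdf
# standing in for fast_cdf (an assumption about its behavior).
import numpy as np
from scipy.stats import norm

bins = np.logspace(np.log10(20), np.log10(10e3), 21)
ke = np.array([300.0, 1200.0])        # smeared mean energies for two MC events
resolution = np.array([30.0, 120.0])  # per-event Gaussian widths

# shape (len(bins), len(ke)): each event's CDF evaluated at every bin edge
cdf = norm.cdf(bins[:, np.newaxis], ke, resolution)
# per-bin probability mass, summed over events -> expected histogram
hist = np.sum(cdf[1:] - cdf[:-1], axis=-1)
print(hist.sum())  # ~2.0, since both events lie well inside the energy range
```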
+ """ + mc_hists = {} + + for id in (20,22,2020,2022,2222): + df = df_dict[id] + + if id == 20: + ke = df.energy1.values*(1+x[1]) + resolution = df.energy1.values*max(EPSILON,x[2]) + elif id == 2020: + ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[1]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[2]))**2) + elif id == 22: + ke = df.energy1.values*(1+x[3]) + resolution = df.energy1.values*max(EPSILON,x[4]) + elif id == 2222: + ke = df.energy1.values*(1+x[3]) + df.energy2.values*(1+x[3]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[4]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2) + elif id == 2022: + ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[3]) + resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2) + + if reweight: + cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)*df.weight.values + else: + cdf = fast_cdf(bins[:,np.newaxis],ke,resolution) + + mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1) + mc_hists[id] *= scale + return mc_hists + +def get_data_hists(data,bins,scale=1.0): + """ + Returns the data histogrammed into `bins`. + """ + data_hists = {} + for id in (20,22,2020,2022,2222): + data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale + return data_hists + +def make_nll(dm_particle_id, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_nll=False): + df_dict = {} + for id in (20,22,2020,2022,2222): + df_dict[id] = mc[mc.id == id] + + df_dict_muon = {} + for id in (20,22,2020,2022,2222): + df_dict_muon[id] = muons[muons.id == id] + + data_hists = get_data_hists(data,bins) + + def nll(x, grad=None): + if any(x[i] < 0 for i in (0,2,4,5,6)): + return np.inf + + # Get the Monte Carlo histograms. We need to do this within the + # likelihood function since we apply the energy resolution parameters + # to the Monte Carlo. + mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor) + muon_hists = get_mc_hists_fast(df_dict_muon,x,bins,scale=1/muon_scale_factor) + + # Calculate the negative log of the likelihood of observing the data + # given the fit parameters + + nll = 0 + for id in data_hists: + oi = data_hists[id] + ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + EPSILON + if id == dm_particle_id: + cdf = fast_cdf(bins,dm_energy,dm_energy*DM_ENERGY_RESOLUTION) + dm_hist = np.sum(cdf[1:] - cdf[:-1]) + ei += dm_hist*x[6] + N = ei.sum() + nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei)) + + # Add the priors + nll -= norm.logpdf(x[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]).sum() + + if print_nll: + # Print the result + print("nll = %.2f" % nll) + + return nll + return nll + +def do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10): + """ + Run the fit and return the minimum along with samples from running an MCMC + starting near the minimum. + + Args: + - data: pandas dataframe representing the data to fit + - muon: pandas dataframe representing the expected background from + external muons + - data_mc: pandas dataframe representing the expected background from + atmospheric neutrino events + - bins: an array of bins to use for the fit + - steps: the number of MCMC steps to run + + Returns a tuple (xopt, samples) where samples is an array of shape (steps, + number of parameters). 
+ """ + nll = make_nll(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll) + + pos = np.empty((walkers, len(PRIORS)),dtype=np.double) + for i in range(pos.shape[0]): + pos[i] = sample_priors() + + nwalkers, ndim = pos.shape + + # We use the KDEMove here because I think it should sample the likelihood + # better. Because we have energy scale parameters and we are doing a binned + # likelihood, the likelihood is discontinuous. There can also be several + # local minima. The author of emcee recommends using the KDEMove with a lot + # of workers to try and properly sample a multimodal distribution. In + # addition, I've found that the autocorrelation time for the KDEMove is + # much better than the other moves. + sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove()) + with np.errstate(invalid='ignore'): + sampler.run_mcmc(pos, steps) + + print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction))) + + try: + print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True)) + except Exception as e: + print(e) + + samples = sampler.get_chain(flat=True,thin=thin) + + return sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()], samples + +def sample_priors(): + """ + Returns a random sample of the fit parameters from the priors. For the + first 6 parameters we use a truncated normal distribution, and for the last + parameter we use a uniform distribution. + """ + return np.concatenate((truncnorm_scaled(PRIORS_LOW[:6],PRIORS_HIGH[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]),[np.random.uniform(PRIORS_LOW[6],PRIORS_HIGH[6])])) + +def get_dm_sample(n,dm_particle_id,dm_energy,dm_resolution): + ke = norm.rvs(dm_energy,dm_energy*dm_resolution) + id = np.tile(dm_particle_id,n) + data = pd.DataFrame(np.concatenate((ke,id)).T,columns=['ke','id']) + return data + +if __name__ == '__main__': + import argparse + import numpy as np + import pandas as pd + import sys + import h5py + from sddm.plot_energy import * + from sddm.plot import * + from sddm import setup_matplotlib + import nlopt + + parser = argparse.ArgumentParser("plot fit results") + parser.add_argument("filenames", nargs='+', help="input files") + parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds") + parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files") + parser.add_argument("--muon-mc", nargs='+', required=True, help="muon MC files") + parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)") + parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain") + parser.add_argument("--pull", type=int, default=0, help="plot pull plots") + parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files") + parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values") + parser.add_argument("--walkers", type=int, default=100, help="number of walkers") + parser.add_argument("--thin", type=int, default=10, help="number of steps to thin") + args = parser.parse_args() + + setup_matplotlib(args.save) + + import matplotlib.pyplot as plt + + # Loop over runs to prevent using too much memory + evs = [] + rhdr = pd.concat([read_hdf(filename, "rhdr").assign(filename=filename) for filename in args.filenames],ignore_index=True) + for run, df in 
```diff
+if __name__ == '__main__':
+    import argparse
+    import numpy as np
+    import pandas as pd
+    import sys
+    import h5py
+    from sddm.plot_energy import *
+    from sddm.plot import *
+    from sddm import setup_matplotlib
+    import nlopt
+
+    parser = argparse.ArgumentParser("dark matter search")
+    parser.add_argument("filenames", nargs='+', help="input files")
+    parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds")
+    parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files")
+    parser.add_argument("--muon-mc", nargs='+', required=True, help="muon MC files")
+    parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)")
+    parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain")
+    parser.add_argument("--pull", type=int, default=0, help="number of pseudo-experiments to run for the pull plots")
+    parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files")
+    parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values")
+    parser.add_argument("--walkers", type=int, default=100, help="number of walkers")
+    parser.add_argument("--thin", type=int, default=10, help="number of steps to thin")
+    args = parser.parse_args()
+
+    setup_matplotlib(args.save)
+
+    import matplotlib.pyplot as plt
+
+    # Loop over runs to prevent using too much memory
+    evs = []
+    rhdr = pd.concat([read_hdf(filename, "rhdr").assign(filename=filename) for filename in args.filenames],ignore_index=True)
+    for run, df in rhdr.groupby('run'):
+        evs.append(get_events(df.filename.values, merge_fits=True, nhit_thresh=args.nhit_thresh))
+    ev = pd.concat(evs)
+
+    ev = correct_energy_bias(ev)
+
+    ev_mc = get_events(args.mc, merge_fits=True, nhit_thresh=args.nhit_thresh, apply_nhit_trigger=False)
+    muon_mc = get_events(args.muon_mc, merge_fits=True, nhit_thresh=args.nhit_thresh, apply_nhit_trigger=False)
+    weights = pd.concat([read_hdf(filename, "weights") for filename in args.weights],ignore_index=True)
+
+    ev_mc = correct_energy_bias(ev_mc)
+    muon_mc = correct_energy_bias(muon_mc)
+
+    # Set all prompt events in the MC to be muons
+    muon_mc.loc[muon_mc.prompt & muon_mc.filename.str.contains("cosmic"),'muon'] = True
+
+    ev = ev.reset_index()
+    ev_mc = ev_mc.reset_index()
+    muon_mc = muon_mc.reset_index()
+
+    # 00-orphan cut
+    ev = ev[(ev.gtid & 0xff) != 0]
+    ev_mc = ev_mc[(ev_mc.gtid & 0xff) != 0]
+    muon_mc = muon_mc[(muon_mc.gtid & 0xff) != 0]
+
+    # remove events 200 microseconds after a muon
+    ev = ev.groupby('run',group_keys=False).apply(muon_follower_cut)
+
+    # Get rid of events which don't have a successful fit
+    ev = ev[~np.isnan(ev.fmin)]
+    ev_mc = ev_mc[~np.isnan(ev_mc.fmin)]
+    muon_mc = muon_mc[~np.isnan(muon_mc.fmin)]
+
+    # require (r/r_psup)^3 < 0.9
+    ev = ev[ev.r_psup < 0.9]
+    ev_mc = ev_mc[ev_mc.r_psup < 0.9]
+    muon_mc = muon_mc[muon_mc.r_psup < 0.9]
+
+    # require psi < 6
+    ev = ev[ev.psi < 6]
+    ev_mc = ev_mc[ev_mc.psi < 6]
+    muon_mc = muon_mc[muon_mc.psi < 6]
+
+    data = ev[ev.signal & ev.prompt & ~ev.atm]
+    data_atm = ev[ev.signal & ev.prompt & ev.atm]
+
+    # Right now we use the muon Monte Carlo in the fit. If you want to use the
+    # actual data, you can comment the next two lines and then uncomment the
+    # two after that.
+    muon = muon_mc[muon_mc.muon & muon_mc.prompt & ~muon_mc.atm]
+    muon_atm = muon_mc[muon_mc.muon & muon_mc.prompt & muon_mc.atm]
+    #muon = ev[ev.muon & ev.prompt & ~ev.atm]
+    #muon_atm = ev[ev.muon & ev.prompt & ev.atm]
+
+    if (~rhdr.run.isin(ev_mc.run)).any():
+        print_warning("Error! The following runs have no Monte Carlo: %s" % \
+                      rhdr.run[~rhdr.run.isin(ev_mc.run)].values)
+        sys.exit(1)
+
+    ev_mc = ev_mc[ev_mc.run.isin(rhdr.run)]
+
+    data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
+    data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]
+
+    bins = np.logspace(np.log10(20),np.log10(10e3),21)
+
+    atmo_scale_factor = 100.0
+    muon_scale_factor = len(muon) + len(muon_atm)
```
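The scale factors set the units of the fit parameters: dividing the atmospheric histograms by `atmo_scale_factor` (presumably the MC statistics relative to the data livetime) makes x[0] a flux scale with nominal value 1, while dividing the muon histograms by the total muon MC count makes x[5] directly the expected number of external muon events. A quick check with hypothetical counts:

```python
# Hypothetical check of the normalization convention (numbers made up).
n_atm_mc = 1000            # atmospheric MC events passing cuts
atmo_scale_factor = 100.0
print(n_atm_mc*1.0/atmo_scale_factor)  # x[0] = 1 -> 10.0 expected events

n_muon_mc = 250                # len(muon) + len(muon_atm)
muon_scale_factor = n_muon_mc
print(n_muon_mc*7.5/muon_scale_factor)  # x[5] = 7.5 -> 7.5 expected muons
```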
```diff
+    if args.pull:
+        pull = [[] for i in range(len(FIT_PARS))]
+
+        # Set the random seed so we get reproducible results here
+        np.random.seed(0)
+
+        for i in range(args.pull):
+            xtrue = sample_priors()
+
+            # Calculate expected number of events
+            N = len(data_mc)*xtrue[0]/atmo_scale_factor
+            N_atm = len(data_atm_mc)*xtrue[0]/atmo_scale_factor
+            N_muon = len(muon)*xtrue[5]/muon_scale_factor
+            N_muon_atm = len(muon_atm)*xtrue[5]/muon_scale_factor
+            N_dm = xtrue[6]
+
+            # Calculate observed number of events
+            n = np.random.poisson(N)
+            n_atm = np.random.poisson(N_atm)
+            n_muon = np.random.poisson(N_muon)
+            n_muon_atm = np.random.poisson(N_muon_atm)
+            n_dm = np.random.poisson(N_dm)
+
+            dm_particle_id = np.random.choice([2020,2222])
+            dm_energy = np.random.uniform(20,10e3)
+
+            # Sample data from Monte Carlo
+            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True),get_dm_sample(n_dm,dm_particle_id,dm_energy,DM_ENERGY_RESOLUTION)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+
+            # Smear the energies by the additional energy resolution
+            data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
+            data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4])
+            data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2])
+            data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4])
+            # Recompute the summed kinetic energy, leaving the dark matter
+            # sample's ke intact since it has no energy1/2/3 columns
+            mask = data.id1.notnull()
+            data.loc[mask,'ke'] = data.loc[mask,'energy1'].fillna(0) + data.loc[mask,'energy2'].fillna(0) + data.loc[mask,'energy3'].fillna(0)
+
+            data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4])
+            data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
+            data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
+
+            xopt, samples = do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+
+            for i in range(len(FIT_PARS)):
+                # The "pull plots" we make here are actually produced via a
+                # procedure called "Simulation Based Calibration".
+                #
+                # See https://arxiv.org/abs/1804.06788.
+                pull[i].append(percentileofscore(samples[:,i],xtrue[i]))
+
+        fig = plt.figure()
+        axes = []
+        for i, name in enumerate(FIT_PARS):
+            axes.append(plt.subplot(4,2,i+1))
+            n, bins_, patches = plt.hist(pull[i],bins=np.linspace(0,100,11),histtype='step')
+            expected = len(pull[i])/(len(bins_)-1)
+            plt.axhline(expected,color='k',ls='--',alpha=0.25)
+            plt.axhspan(poisson.ppf(0.005,expected), poisson.ppf(0.995,expected), facecolor='0.5', alpha=0.25)
+            plt.title(name)
+        for ax in axes:
+            despine(ax=ax,left=True,trim=True)
+            ax.get_yaxis().set_visible(False)
+        plt.tight_layout()
+
+        if args.save:
+            fig.savefig("dm_search_pull_plot.pdf")
+            fig.savefig("dm_search_pull_plot.eps")
+        else:
+            plt.show()
+
+        sys.exit(0)
```
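The `--pull` mode is an instance of Simulation Based Calibration (https://arxiv.org/abs/1804.06788): if the fit is well calibrated, the percentile rank of each true parameter value within its posterior samples is uniform over repeated pseudo-experiments, which is what the histograms above check. A self-contained toy version, using a conjugate normal model where the exact posterior is known:

```python
# Toy Simulation Based Calibration: prior N(0,1), likelihood N(x,1),
# so the exact posterior after one observation is N(obs/2, 1/2).
import numpy as np
from scipy.stats import percentileofscore

np.random.seed(0)
ranks = []
for _ in range(200):
    xtrue = np.random.randn()        # true value drawn from the prior
    obs = xtrue + np.random.randn()  # one observation with unit Gaussian noise
    posterior = obs/2 + np.random.randn(1000)*np.sqrt(0.5)
    ranks.append(percentileofscore(posterior, xtrue))

# if calibrated, ranks are uniform on [0, 100]
print(np.percentile(ranks, [25, 50, 75]))  # roughly [25, 50, 75]
```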
```diff
+    dm_energies = np.logspace(np.log10(20),np.log10(10e3),10)
+    limits = {}
+    for dm_particle_id in (2020,2222):
+        limits[dm_particle_id] = []
+        for dm_energy in dm_energies:
+            xopt, samples = do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+
+            limit = np.percentile(samples[:,6],90)
+            limits[dm_particle_id].append(limit)
+
+    fig = plt.figure()
+    for dm_particle_id in (2020,2222):
+        plt.plot(dm_energies,limits[dm_particle_id],label='$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(dm_particle_id),2)]) + '$')
+    plt.gca().set_xscale("log")
+    despine(fig,trim=True)
+    plt.xlabel("Energy (MeV)")
+    plt.ylabel("Event Rate Limit (events)")
+    plt.legend()
+    plt.tight_layout()
+
+    if args.save:
+        plt.savefig("dm_search_limit.pdf")
+        plt.savefig("dm_search_limit.eps")
+    else:
+        plt.suptitle("Dark Matter Limits")
+        plt.show()
```
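The limit at each energy is read straight off the MCMC output: the 90% upper limit on the dark matter scale is the 90th percentile of the posterior samples of parameter 6. A minimal illustration with a hypothetical posterior:

```python
# Reading a 90% CL upper limit off posterior samples (hypothetical numbers).
import numpy as np

np.random.seed(0)
samples = np.random.exponential(5.0, size=10000)  # stand-in posterior for x[6]
limit = np.percentile(samples, 90)
print("90%% CL upper limit: %.1f events" % limit)
```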