#!/usr/bin/env python
# Copyright (c) 2019, Anthony Latorre
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
"""
Script to do the final dark matter search analysis. To run it just run:

    $ ./dm-search [list of data fit results] --mc [list of atmospheric MC files] --muon-mc [list of muon MC files] --steps [steps]

After running you will get a plot showing the limits for back to back dark
matter at a range of energies.
"""
from __future__ import print_function, division
import numpy as np
from scipy.stats import iqr, norm, beta, poisson, percentileofscore
from matplotlib.lines import Line2D
from scipy.special import spence, gammaln
from sddm.stats import *
from sddm.dc import estimate_errors, EPSILON, truncnorm_scaled
import emcee
from sddm import printoptions
from sddm.utils import fast_cdf, correct_energy_bias

# Likelihood Fit Parameters
# 0 - Atmospheric Neutrino Flux Scale
# 1 - Electron energy bias
# 2 - Electron energy resolution
# 3 - Muon energy bias
# 4 - Muon energy resolution
# 5 - External Muon scale
# 6 - Dark Matter Scale

# Energy resolution as a fraction of energy for dark matter signal
DM_ENERGY_RESOLUTION = 0.1

FIT_PARS = [
    'Atmospheric Neutrino Flux Scale',
    'Electron energy bias',
    'Electron energy resolution',
    'Muon energy bias',
    'Muon energy resolution',
    'External Muon scale',
    'Dark Matter Scale']

# Uncertainty on the energy scale
#
# - the muon energy scale and resolution terms come directly from measurements
#   on stopping muons, so those are known well.
# - for electrons, we only have Michel electrons at the low end of our energy
#   range, and therefore we don't really have any good way of constraining the
#   energy scale or resolution. However, if we assume that the ~7% energy bias
#   in the muons is from the single PE distribution (it seems likely to me
#   that that is a major part of the bias), then the energy scale should be
#   roughly the same. Since the Michel electron distributions are consistent,
#   we leave the mean value at 0, but to be conservative, we set the error to
#   10%.
# - The energy resolution for muons was pretty much spot on, and so we expect
#   the same from electrons. In addition, the Michel spectrum is consistent,
#   so at that energy level we don't see anything which leads us to expect a
#   major difference. To be conservative, and because I don't think it really
#   affects the analysis at all, I'll leave the uncertainty here at 10%
#   anyway.
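# For concreteness, the priors below enter the likelihood as independent
# Gaussian constraint terms (see make_nll() below). A minimal sketch of the
# penalty added to the negative log likelihood when a single nuisance
# parameter is pulled one sigma away from its prior (the numbers are the muon
# energy scale prior and uncertainty defined below):
#
#     from scipy.stats import norm
#     penalty = -norm.logpdf(0.054 + 0.011, loc=0.054, scale=0.011)
#
# The dark matter scale is deliberately given an infinite prior uncertainty,
# i.e. a flat prior, so it contributes no constraint term.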
PRIORS = [
    1.0,   # Atmospheric Neutrino Scale
    0.015, # Electron energy scale
    0.0,   # Electron energy resolution
    0.054, # Muon energy scale
    0.0,   # Muon energy resolution
    0.0,   # Muon scale
    0.0,   # Dark Matter Scale
]

PRIOR_UNCERTAINTIES = [
    0.2,   # Atmospheric Neutrino Scale
    0.03,  # Electron energy scale
    0.05,  # Electron energy resolution
    0.011, # Muon energy scale
    0.014, # Muon energy resolution
    10.0,  # Muon scale
    np.inf,# Dark Matter Scale
]

# Lower bounds for the fit parameters
PRIORS_LOW = [
    EPSILON,
    -10,
    EPSILON,
    -10,
    EPSILON,
    EPSILON,
    EPSILON,
]

# Upper bounds for the fit parameters
PRIORS_HIGH = [
    10,
    10,
    10,
    10,
    10,
    1e9,
    1000,
]

particle_id = {20: 'e', 22: r'\mu'}

def plot_hist2(hists, bins, color=None):
    for id in (20,22,2020,2022,2222):
        if id == 20:
            plt.subplot(2,3,1)
        elif id == 22:
            plt.subplot(2,3,2)
        elif id == 2020:
            plt.subplot(2,3,4)
        elif id == 2022:
            plt.subplot(2,3,5)
        elif id == 2222:
            plt.subplot(2,3,6)

        bincenters = (bins[1:] + bins[:-1])/2
        plt.hist(bincenters, bins=bins, histtype='step', weights=hists[id], color=color)
        plt.gca().set_xscale("log")
        plt.xlabel("Energy (MeV)")
        plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')

    if len(hists):
        plt.tight_layout()

def get_mc_hists(data,x,bins,scale=1.0,reweight=False):
    """
    Returns the expected Monte Carlo histograms for the atmospheric neutrino
    background.

    Args:
        - data: pandas dataframe of the Monte Carlo events
        - x: fit parameters
        - bins: histogram bins
        - scale: multiply histograms by an overall scale factor
        - reweight: weight events by the GENIE reweight factors

    This function does two basic things:

        1. apply the energy bias and resolution corrections
        2. histogram the results

    Returns a dictionary mapping particle id combo -> histogram.
    """
    df_dict = {}
    for id in (20,22,2020,2022,2222):
        df_dict[id] = data[data.id == id]

    return get_mc_hists_fast(df_dict,x,bins,scale,reweight)

def get_mc_hists_fast(df_dict,x,bins,scale=1.0,reweight=False):
    """
    Same as get_mc_hists() but the first argument is a dictionary mapping
    particle id -> dataframe. This is much faster than selecting the events
    from the dataframe every time.
    """
    mc_hists = {}

    for id in (20,22,2020,2022,2222):
        df = df_dict[id]

        if id == 20:
            ke = df.energy1.values*(1+x[1])
            resolution = df.energy1.values*max(EPSILON,x[2])
        elif id == 2020:
            ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[1])
            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[2]))**2)
        elif id == 22:
            ke = df.energy1.values*(1+x[3])
            resolution = df.energy1.values*max(EPSILON,x[4])
        elif id == 2222:
            ke = df.energy1.values*(1+x[3]) + df.energy2.values*(1+x[3])
            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[4]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2)
        elif id == 2022:
            ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[3])
            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2)

        if reweight:
            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)*df.weight.values
        else:
            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)

        mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1)
        mc_hists[id] *= scale

    return mc_hists
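# The histogramming above smears each Monte Carlo event analytically: instead
# of filling a single bin, every event deposits its Gaussian probability mass
# across all bins via CDF differences at the bin edges. A minimal sketch of
# the same technique, assuming fast_cdf(x, mu, sigma) agrees with
# scipy.stats.norm.cdf(x, mu, sigma):
#
#     from scipy.stats import norm
#     edges = np.logspace(np.log10(20), np.log10(10e3), 21)
#     ke = np.array([100.0, 250.0])    # smeared energies (MeV)
#     sigma = np.array([10.0, 25.0])   # per-event resolutions (MeV)
#     cdf = norm.cdf(edges[:, np.newaxis], ke, sigma)
#     hist = np.sum(cdf[1:] - cdf[:-1], axis=-1)
#
# hist[i] is then the expected number of the two events falling in bin i.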
""" data_hists = {} for id in (20,22,2020,2022,2222): data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale return data_hists def make_nll(dm_particle_id, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_nll=False): df_dict = {} for id in (20,22,2020,2022,2222): df_dict[id] = mc[mc.id == id] df_dict_muon = {} for id in (20,22,2020,2022,2222): df_dict_muon[id] = muons[muons.id == id] data_hists = get_data_hists(data,bins) def nll(x, grad=None): if any(x[i] < 0 for i in (0,2,4,5,6)): return np.inf # Get the Monte Carlo histograms. We need to do this within the # likelihood function since we apply the energy resolution parameters # to the Monte Carlo. mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor) muon_hists = get_mc_hists_fast(df_dict_muon,x,bins,scale=1/muon_scale_factor) # Calculate the negative log of the likelihood of observing the data # given the fit parameters nll = 0 for id in data_hists: oi = data_hists[id] ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + EPSILON if id == dm_particle_id: cdf = fast_cdf(bins,dm_energy,dm_energy*DM_ENERGY_RESOLUTION) dm_hist = np.sum(cdf[1:] - cdf[:-1]) ei += dm_hist*x[6] N = ei.sum() nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei)) # Add the priors nll -= norm.logpdf(x[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]).sum() if print_nll: # Print the result print("nll = %.2f" % nll) return nll return nll def do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10): """ Run the fit and return the minimum along with samples from running an MCMC starting near the minimum. Args: - data: pandas dataframe representing the data to fit - muon: pandas dataframe representing the expected background from external muons - data_mc: pandas dataframe representing the expected background from atmospheric neutrino events - bins: an array of bins to use for the fit - steps: the number of MCMC steps to run Returns a tuple (xopt, samples) where samples is an array of shape (steps, number of parameters). """ nll = make_nll(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll) pos = np.empty((walkers, len(PRIORS)),dtype=np.double) for i in range(pos.shape[0]): pos[i] = sample_priors() nwalkers, ndim = pos.shape # We use the KDEMove here because I think it should sample the likelihood # better. Because we have energy scale parameters and we are doing a binned # likelihood, the likelihood is discontinuous. There can also be several # local minima. The author of emcee recommends using the KDEMove with a lot # of workers to try and properly sample a multimodal distribution. In # addition, I've found that the autocorrelation time for the KDEMove is # much better than the other moves. sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove()) with np.errstate(invalid='ignore'): sampler.run_mcmc(pos, steps) print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction))) try: print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True)) except Exception as e: print(e) samples = sampler.get_chain(flat=True,thin=thin) return sampler.get_chain(flat=True)[sampler.get_log_prob(flat=True).argmax()], samples def sample_priors(): """ Returns a random sample of the fit parameters from the priors. For the first 6 parameters we use a truncated normal distribution, and for the last parameter we use a uniform distribution. 
""" return np.concatenate((truncnorm_scaled(PRIORS_LOW[:6],PRIORS_HIGH[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]),[np.random.uniform(PRIORS_LOW[6],PRIORS_HIGH[6])])) def get_dm_sample(n,dm_particle_id,dm_energy,dm_resolution): ke = norm.rvs(dm_energy,dm_energy*dm_resolution) id = np.tile(dm_particle_id,n) data = pd.DataFrame(np.concatenate((ke,id)).T,columns=['ke','id']) return data if __name__ == '__main__': import argparse import numpy as np import pandas as pd import sys import h5py from sddm.plot_energy import * from sddm.plot import * from sddm import setup_matplotlib import nlopt parser = argparse.ArgumentParser("plot fit results") parser.add_argument("filenames", nargs='+', help="input files") parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds") parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files") parser.add_argument("--muon-mc", nargs='+', required=True, help="muon MC files") parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)") parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain") parser.add_argument("--pull", type=int, default=0, help="plot pull plots") parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files") parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values") parser.add_argument("--walkers", type=int, default=100, help="number of walkers") parser.add_argument("--thin", type=int, default=10, help="number of steps to thin") args = parser.parse_args() setup_matplotlib(args.save) import matplotlib.pyplot as plt # Loop over runs to prevent using too much memory evs = [] rhdr = pd.concat([read_hdf(filename, "rhdr").assign(filename=filename) for filename in args.filenames],ignore_index=True) for run, df in rhdr.groupby('run'): evs.append(get_events(df.filename.values, merge_fits=True, nhit_thresh=args.nhit_thresh)) ev = pd.concat(evs) ev = correct_energy_bias(ev) ev_mc = get_events(args.mc, merge_fits=True, nhit_thresh=args.nhit_thresh, apply_nhit_trigger=False) muon_mc = get_events(args.muon_mc, merge_fits=True, nhit_thresh=args.nhit_thresh, apply_nhit_trigger=False) weights = pd.concat([read_hdf(filename, "weights") for filename in args.weights],ignore_index=True) ev_mc = correct_energy_bias(ev_mc) muon_mc = correct_energy_bias(muon_mc) # Set all prompt events in the MC to be muons muon_mc.loc[muon_mc.prompt & muon_mc.filename.str.contains("cosmic"),'muon'] = True ev = ev.reset_index() ev_mc = ev_mc.reset_index() muon_mc = muon_mc.reset_index() # 00-orphan cut ev = ev[(ev.gtid & 0xff) != 0] ev_mc = ev_mc[(ev_mc.gtid & 0xff) != 0] muon_mc = muon_mc[(muon_mc.gtid & 0xff) != 0] # remove events 200 microseconds after a muon ev = ev.groupby('run',group_keys=False).apply(muon_follower_cut) # Get rid of events which don't have a successful fit ev = ev[~np.isnan(ev.fmin)] ev_mc = ev_mc[~np.isnan(ev_mc.fmin)] muon_mc = muon_mc[~np.isnan(muon_mc.fmin)] # require (r/r_psup)^3 < 0.9 ev = ev[ev.r_psup < 0.9] ev_mc = ev_mc[ev_mc.r_psup < 0.9] muon_mc = muon_mc[muon_mc.r_psup < 0.9] # require psi < 6 ev = ev[ev.psi < 6] ev_mc = ev_mc[ev_mc.psi < 6] muon_mc = muon_mc[muon_mc.psi < 6] data = ev[ev.signal & ev.prompt & ~ev.atm] data_atm = ev[ev.signal & ev.prompt & ev.atm] # Right now we use the muon Monte Carlo in the fit. 
    # Right now we use the muon Monte Carlo in the fit. If you want to use
    # the actual data, you can comment the next two lines and then uncomment
    # the two after that.
    muon = muon_mc[muon_mc.muon & muon_mc.prompt & ~muon_mc.atm]
    muon_atm = muon_mc[muon_mc.muon & muon_mc.prompt & muon_mc.atm]
    #muon = ev[ev.muon & ev.prompt & ~ev.atm]
    #muon_atm = ev[ev.muon & ev.prompt & ev.atm]

    if (~rhdr.run.isin(ev_mc.run)).any():
        print_warning("Error! The following runs have no Monte Carlo: %s" % \
            rhdr.run[~rhdr.run.isin(ev_mc.run)].values)
        sys.exit(1)

    ev_mc = ev_mc[ev_mc.run.isin(rhdr.run)]

    data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
    data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]

    bins = np.logspace(np.log10(20),np.log10(10e3),21)

    atmo_scale_factor = 100.0
    muon_scale_factor = len(muon) + len(muon_atm)
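    # Normalization convention: make_nll() divides the atmospheric histograms
    # by atmo_scale_factor, so x[0] = 1 corresponds to the nominal
    # atmospheric prediction (the MC here is assumed to represent 100 times
    # the data livetime). The muon histograms are divided by the total number
    # of muon MC events, so they integrate to one and x[5] is directly the
    # expected number of external muon events. For example:
    #
    #     muon_hists = get_mc_hists_fast(df_dict_muon, x, bins,
    #                                    scale=1/muon_scale_factor)
    #     expected_muons = muon_hists[id]*x[5]  # expected muon counts per bin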
    if args.pull:
        pull = [[] for i in range(len(FIT_PARS))]

        # Set the random seed so we get reproducible results here
        np.random.seed(0)

        for i in range(args.pull):
            xtrue = sample_priors()

            # Calculate expected number of events
            N = len(data_mc)*xtrue[0]/atmo_scale_factor
            N_atm = len(data_atm_mc)*xtrue[0]/atmo_scale_factor
            N_muon = len(muon)*xtrue[5]/muon_scale_factor
            N_muon_atm = len(muon_atm)*xtrue[5]/muon_scale_factor
            N_dm = xtrue[6]

            # Calculate observed number of events
            n = np.random.poisson(N)
            n_atm = np.random.poisson(N_atm)
            n_muon = np.random.poisson(N_muon)
            n_muon_atm = np.random.poisson(N_muon_atm)
            n_dm = np.random.poisson(N_dm)

            dm_particle_id = np.random.choice([2020,2222])
            dm_energy = np.random.uniform(20,10e3)

            # Sample data from Monte Carlo
            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True)))
            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))

            # Smear the energies by the additional energy resolution
            data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
            data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4])
            data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2])
            data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4])
            data['ke'] = data['energy1'].fillna(0) + data['energy2'].fillna(0) + data['energy3'].fillna(0)

            # Add the dark matter signal events. These are added *after* `ke`
            # is recomputed above since get_dm_sample() already returns
            # smeared kinetic energies (otherwise the fillna() calls would
            # zero them out).
            data = pd.concat((data, get_dm_sample(n_dm,dm_particle_id,dm_energy,DM_ENERGY_RESOLUTION)))

            data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2])
            data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4])
            data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2])
            data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
            data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)

            xopt, samples = do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)

            for j in range(len(FIT_PARS)):
                # The "pull plots" we make here are actually produced via a
                # procedure called "Simulation Based Calibration". If the fits
                # are unbiased, the true value should look like a random draw
                # from the posterior, so these percentiles should be uniformly
                # distributed.
                #
                # See https://arxiv.org/abs/1804.06788.
                pull[j].append(percentileofscore(samples[:,j],xtrue[j]))

        fig = plt.figure()
        axes = []
        for i, name in enumerate(FIT_PARS):
            axes.append(plt.subplot(4,2,i+1))
            n, hist_bins, patches = plt.hist(pull[i],bins=np.linspace(0,100,11),histtype='step')
            expected = len(pull[i])/(len(hist_bins)-1)
            plt.axhline(expected,color='k',ls='--',alpha=0.25)
            plt.axhspan(poisson.ppf(0.005,expected), poisson.ppf(0.995,expected), facecolor='0.5', alpha=0.25)
            plt.title(name)
        for ax in axes:
            despine(ax=ax,left=True,trim=True)
            ax.get_yaxis().set_visible(False)
        plt.tight_layout()

        if args.save:
            fig.savefig("dm_search_pull_plot.pdf")
            fig.savefig("dm_search_pull_plot.eps")
        else:
            plt.show()

        sys.exit(0)

    dm_energies = np.logspace(np.log10(20),np.log10(10e3),10)
    limits = {}
    for dm_particle_id in (2020,2222):
        limits[dm_particle_id] = []
        for dm_energy in dm_energies:
            xopt, samples = do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)

            # The 90% upper limit is the 90th percentile of the marginal
            # posterior for the dark matter scale
            limit = np.percentile(samples[:,6],90)
            limits[dm_particle_id].append(limit)

    fig = plt.figure()
    for dm_particle_id in (2020,2222):
        plt.plot(dm_energies,limits[dm_particle_id],label='$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(dm_particle_id),2)]) + '$')
    plt.gca().set_xscale("log")
    despine(fig,trim=True)
    plt.xlabel("Energy (MeV)")
    plt.ylabel("Event Rate Limit (events)")
    plt.legend()
    plt.tight_layout()

    if args.save:
        plt.savefig("dm_search_limit.pdf")
        plt.savefig("dm_search_limit.eps")
    else:
        plt.suptitle("Dark Matter Limits")
        plt.show()