| author | tlatorre <tlatorre@uchicago.edu> | 2020-11-03 08:10:42 -0600 |
|---|---|---|
| committer | tlatorre <tlatorre@uchicago.edu> | 2020-11-03 08:10:42 -0600 |
| commit | 3958f04ff6202f0f9f137285fbebdef399b8851d (patch) | |
| tree | 5a52a9c6ec7cf02905e941b9320e994dcb7a3ac4 | |
| parent | 220377d85d8cf71d37a47b160ff71bdfadce8f38 (diff) | |
| download | sddm-3958f04ff6202f0f9f137285fbebdef399b8851d.tar.gz<br>sddm-3958f04ff6202f0f9f137285fbebdef399b8851d.tar.bz2<br>sddm-3958f04ff6202f0f9f137285fbebdef399b8851d.zip | |
initial commit of a script to do the dark matter search
| -rwxr-xr-x | utils/chi2 | 2 |
| -rwxr-xr-x | utils/dm-search | 517 |

2 files changed, 517 insertions(+), 2 deletions(-)
@@ -68,8 +68,6 @@ FIT_PARS = [
 #   at that energy level we don't see anything which leads us to expect a major
 #   difference. To be conservative, and because I don't think it really affects
 #   the analysis at all, I'll leave the uncertainty here at 10% anyways.
-# - For the electron + muon energy resolution, I don't have any real good way
-#   to estimate this, so we are conservative and set the error to 10%.
 PRIORS = [
     1.0,   # Atmospheric Neutrino Scale
     0.0,   # Electron energy scale
diff --git a/utils/dm-search b/utils/dm-search
new file mode 100755
index 0000000..323aabc
--- /dev/null
+++ b/utils/dm-search
@@ -0,0 +1,517 @@
+#!/usr/bin/env python
+# Copyright (c) 2019, Anthony Latorre <tlatorre at uchicago>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+Script to do the final dark matter search analysis. To run it, just run:
+
+    $ ./dm-search [list of data fit results] --mc [list of atmospheric MC files] --muon-mc [list of muon MC files] --steps [steps]
+
+After running you will get a plot showing the limits for back to back dark
+matter at a range of energies.
+"""
+from __future__ import print_function, division
+import numpy as np
+from scipy.stats import iqr, poisson, norm, beta, percentileofscore
+from scipy.special import spence, gammaln
+from matplotlib.lines import Line2D
+from itertools import izip_longest
+from sddm.stats import *
+from sddm.dc import estimate_errors, EPSILON, truncnorm_scaled
+import emcee
+from sddm import printoptions
+from sddm.utils import fast_cdf, correct_energy_bias
+
+# Likelihood Fit Parameters
+# 0 - Atmospheric Neutrino Flux Scale
+# 1 - Electron energy bias
+# 2 - Electron energy resolution
+# 3 - Muon energy bias
+# 4 - Muon energy resolution
+# 5 - External Muon scale
+# 6 - Dark Matter Scale
+
+# Energy resolution as a fraction of energy for the dark matter signal
+DM_ENERGY_RESOLUTION = 0.1
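+#
+# e.g. a signal at 1 GeV is modelled as a Gaussian with a width of 100 MeV,
+# both when building the expected signal histogram in the fit and when
+# generating fake dark matter events for the pull plots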
+
+FIT_PARS = [
+    'Atmospheric Neutrino Flux Scale',
+    'Electron energy bias',
+    'Electron energy resolution',
+    'Muon energy bias',
+    'Muon energy resolution',
+    'External Muon scale',
+    'Dark Matter Scale']
+
+# Uncertainty on the energy scale
+#
+# - the muon energy scale and resolution terms come directly from measurements
+#   on stopping muons, so those are known well.
+# - for electrons, we only have Michel electrons at the low end of our energy
+#   range, and therefore we don't really have any good way of constraining the
+#   energy scale or resolution. However, if we assume that the ~7% energy bias
+#   in the muons is from the single PE distribution (it seems likely to me
+#   that that is a major part of the bias), then the energy scale should be
+#   roughly the same. Since the Michel electron distributions are consistent,
+#   we leave the mean value at 0, but to be conservative, we set the error to
+#   10%.
+# - The energy resolution for muons was pretty much spot on, and so we expect
+#   the same from electrons. In addition, the Michel spectrum is consistent,
+#   so at that energy level we don't see anything which leads us to expect a
+#   major difference. To be conservative, and because I don't think it really
+#   affects the analysis at all, I'll leave the uncertainty here at 10%
+#   anyways.
+PRIORS = [
+    1.0,   # Atmospheric Neutrino Scale
+    0.015, # Electron energy scale
+    0.0,   # Electron energy resolution
+    0.054, # Muon energy scale
+    0.0,   # Muon energy resolution
+    0.0,   # Muon scale
+    0.0,   # Dark Matter Scale
+]
+
+PRIOR_UNCERTAINTIES = [
+    0.2,   # Atmospheric Neutrino Scale
+    0.03,  # Electron energy scale
+    0.05,  # Electron energy resolution
+    0.011, # Muon energy scale
+    0.014, # Muon energy resolution
+    10.0,  # Muon scale
+    np.inf,# Dark Matter Scale
+]
+
+# Lower bounds for the fit parameters
+PRIORS_LOW = [
+    EPSILON,
+    -10,
+    EPSILON,
+    -10,
+    EPSILON,
+    EPSILON,
+    EPSILON,
+]
+
+# Upper bounds for the fit parameters
+PRIORS_HIGH = [
+    10,
+    10,
+    10,
+    10,
+    10,
+    1e9,
+    1000,
+]
+
+particle_id = {20: 'e', 22: r'\mu'}
+
+def plot_hist2(hists, bins, color=None):
+    for id in (20,22,2020,2022,2222):
+        if id == 20:
+            plt.subplot(2,3,1)
+        elif id == 22:
+            plt.subplot(2,3,2)
+        elif id == 2020:
+            plt.subplot(2,3,4)
+        elif id == 2022:
+            plt.subplot(2,3,5)
+        elif id == 2222:
+            plt.subplot(2,3,6)
+
+        bincenters = (bins[1:] + bins[:-1])/2
+        plt.hist(bincenters, bins=bins, histtype='step', weights=hists[id], color=color)
+        plt.gca().set_xscale("log")
+        plt.xlabel("Energy (MeV)")
+        plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
+
+    if len(hists):
+        plt.tight_layout()
+
+def get_mc_hists(data,x,bins,scale=1.0,reweight=False):
+    """
+    Returns the expected Monte Carlo histograms for the atmospheric neutrino
+    background.
+
+    Args:
+        - data: pandas dataframe of the Monte Carlo events
+        - x: fit parameters
+        - bins: histogram bins
+        - scale: multiply histograms by an overall scale factor
+        - reweight: if True, apply the per-event weights when histogramming
+
+    This function does two basic things:
+
+        1. apply the energy bias and resolution corrections
+        2. histogram the results
+
+    Returns a dictionary mapping particle id combo -> histogram.
+    """
+    df_dict = {}
+    for id in (20,22,2020,2022,2222):
+        df_dict[id] = data[data.id == id]
+
+    return get_mc_hists_fast(df_dict,x,bins,scale,reweight)
+
+def get_mc_hists_fast(df_dict,x,bins,scale=1.0,reweight=False):
+    """
+    Same as get_mc_hists(), but the first argument is a dictionary mapping
+    particle id -> dataframe. This is much faster than selecting the events
+    from the dataframe every time.
+    """
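+    # Note: rather than smearing each Monte Carlo event with random numbers,
+    # the expected counts are computed by evaluating the energy response CDF
+    # at every bin edge and differencing, i.e. the expected contribution of a
+    # single event with energy `ke` and width `resolution` to bin i is
+    #
+    #     cdf(bins[i+1]; ke, resolution) - cdf(bins[i]; ke, resolution)
+    #
+    # which is then summed over all events in each particle id channel.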
+    """ +    mc_hists = {} + +    for id in (20,22,2020,2022,2222): +        df = df_dict[id] + +        if id == 20: +            ke = df.energy1.values*(1+x[1]) +            resolution = df.energy1.values*max(EPSILON,x[2]) +        elif id == 2020: +            ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[1]) +            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[2]))**2) +        elif id == 22: +            ke = df.energy1.values*(1+x[3]) +            resolution = df.energy1.values*max(EPSILON,x[4]) +        elif id == 2222: +            ke = df.energy1.values*(1+x[3]) + df.energy2.values*(1+x[3]) +            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[4]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2) +        elif id == 2022: +            ke = df.energy1.values*(1+x[1]) + df.energy2.values*(1+x[3]) +            resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2) + +        if reweight: +            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)*df.weight.values +        else: +            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution) + +        mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1) +        mc_hists[id] *= scale +    return mc_hists + +def get_data_hists(data,bins,scale=1.0): +    """ +    Returns the data histogrammed into `bins`. +    """ +    data_hists = {} +    for id in (20,22,2020,2022,2222): +        data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale +    return data_hists + +def make_nll(dm_particle_id, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_nll=False): +    df_dict = {} +    for id in (20,22,2020,2022,2222): +        df_dict[id] = mc[mc.id == id] + +    df_dict_muon = {} +    for id in (20,22,2020,2022,2222): +        df_dict_muon[id] = muons[muons.id == id] + +    data_hists = get_data_hists(data,bins) + +    def nll(x, grad=None): +        if any(x[i] < 0 for i in (0,2,4,5,6)): +            return np.inf + +        # Get the Monte Carlo histograms. We need to do this within the +        # likelihood function since we apply the energy resolution parameters +        # to the Monte Carlo. +        mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor) +        muon_hists = get_mc_hists_fast(df_dict_muon,x,bins,scale=1/muon_scale_factor) + +        # Calculate the negative log of the likelihood of observing the data +        # given the fit parameters + +        nll = 0 +        for id in data_hists: +            oi = data_hists[id] +            ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + EPSILON +            if id == dm_particle_id: +                cdf = fast_cdf(bins,dm_energy,dm_energy*DM_ENERGY_RESOLUTION) +                dm_hist = np.sum(cdf[1:] - cdf[:-1]) +                ei += dm_hist*x[6] +            N = ei.sum() +            nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei)) + +        # Add the priors +        nll -= norm.logpdf(x[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]).sum() + +        if print_nll: +            # Print the result +            print("nll = %.2f" % nll) + +        return nll +    return nll + +def do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10): +    """ +    Run the fit and return the minimum along with samples from running an MCMC +    starting near the minimum. 
+        nll = 0
+        for id in data_hists:
+            oi = data_hists[id]
+            ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + EPSILON
+            if id == dm_particle_id:
+                cdf = fast_cdf(bins,dm_energy,dm_energy*DM_ENERGY_RESOLUTION)
+                dm_hist = cdf[1:] - cdf[:-1]
+                ei += dm_hist*x[6]
+            N = ei.sum()
+            nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei))
+
+        # Add the priors
+        nll -= norm.logpdf(x[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]).sum()
+
+        if print_nll:
+            # Print the result
+            print("nll = %.2f" % nll)
+
+        return nll
+    return nll
+
+def do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10):
+    """
+    Run the fit and return the minimum along with samples from running an MCMC
+    starting near the minimum.
+
+    Args:
+        - dm_particle_id: particle id combo of the dark matter signal (2020 or
+                          2222)
+        - dm_energy: total kinetic energy of the dark matter signal
+        - data: pandas dataframe representing the data to fit
+        - muon: pandas dataframe representing the expected background from
+                external muons
+        - data_mc: pandas dataframe representing the expected background from
+                   atmospheric neutrino events
+        - bins: an array of bins to use for the fit
+        - steps: the number of MCMC steps to run
+
+    Returns a tuple (xopt, samples) where samples is an array of shape (steps,
+    number of parameters).
+    """
+    nll = make_nll(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll)
+
+    pos = np.empty((walkers, len(PRIORS)),dtype=np.double)
+    for i in range(pos.shape[0]):
+        pos[i] = sample_priors()
+
+    nwalkers, ndim = pos.shape
+
+    # We use the KDEMove here because I think it should sample the likelihood
+    # better. Because we have energy scale parameters and we are doing a binned
+    # likelihood, the likelihood is discontinuous. There can also be several
+    # local minima. The author of emcee recommends using the KDEMove with a lot
+    # of walkers to try and properly sample a multimodal distribution. In
+    # addition, I've found that the autocorrelation time for the KDEMove is
+    # much better than the other moves.
+    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x), moves=emcee.moves.KDEMove())
+    with np.errstate(invalid='ignore'):
+        sampler.run_mcmc(pos, steps)
+
+    print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))
+
+    try:
+        print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True))
+    except Exception as e:
+        print(e)
+
+    samples = sampler.get_chain(flat=True,thin=thin)
+
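+    # Return the sample with the highest log probability (approximately the
+    # maximum a posteriori estimate) along with the thinned chain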
+    """ +    return np.concatenate((truncnorm_scaled(PRIORS_LOW[:6],PRIORS_HIGH[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]),[np.random.uniform(PRIORS_LOW[6],PRIORS_HIGH[6])])) + +def get_dm_sample(n,dm_particle_id,dm_energy,dm_resolution): +    ke = norm.rvs(dm_energy,dm_energy*dm_resolution) +    id = np.tile(dm_particle_id,n) +    data = pd.DataFrame(np.concatenate((ke,id)).T,columns=['ke','id']) +    return data + +if __name__ == '__main__': +    import argparse +    import numpy as np +    import pandas as pd +    import sys +    import h5py +    from sddm.plot_energy import * +    from sddm.plot import * +    from sddm import setup_matplotlib +    import nlopt + +    parser = argparse.ArgumentParser("plot fit results") +    parser.add_argument("filenames", nargs='+', help="input files") +    parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds") +    parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files") +    parser.add_argument("--muon-mc", nargs='+', required=True, help="muon MC files") +    parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)") +    parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain") +    parser.add_argument("--pull", type=int, default=0, help="plot pull plots") +    parser.add_argument("--weights", nargs='+', required=True, help="GENIE reweight HDF5 files") +    parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values") +    parser.add_argument("--walkers", type=int, default=100, help="number of walkers") +    parser.add_argument("--thin", type=int, default=10, help="number of steps to thin") +    args = parser.parse_args() + +    setup_matplotlib(args.save) + +    import matplotlib.pyplot as plt + +    # Loop over runs to prevent using too much memory +    evs = [] +    rhdr = pd.concat([read_hdf(filename, "rhdr").assign(filename=filename) for filename in args.filenames],ignore_index=True) +    for run, df in rhdr.groupby('run'): +        evs.append(get_events(df.filename.values, merge_fits=True, nhit_thresh=args.nhit_thresh)) +    ev = pd.concat(evs) + +    ev = correct_energy_bias(ev) + +    ev_mc = get_events(args.mc, merge_fits=True, nhit_thresh=args.nhit_thresh, apply_nhit_trigger=False) +    muon_mc = get_events(args.muon_mc, merge_fits=True, nhit_thresh=args.nhit_thresh, apply_nhit_trigger=False) +    weights = pd.concat([read_hdf(filename, "weights") for filename in args.weights],ignore_index=True) + +    ev_mc = correct_energy_bias(ev_mc) +    muon_mc = correct_energy_bias(muon_mc) + +    # Set all prompt events in the MC to be muons +    muon_mc.loc[muon_mc.prompt & muon_mc.filename.str.contains("cosmic"),'muon'] = True + +    ev = ev.reset_index() +    ev_mc = ev_mc.reset_index() +    muon_mc = muon_mc.reset_index() + +    # 00-orphan cut +    ev = ev[(ev.gtid & 0xff) != 0] +    ev_mc = ev_mc[(ev_mc.gtid & 0xff) != 0] +    muon_mc = muon_mc[(muon_mc.gtid & 0xff) != 0] + +    # remove events 200 microseconds after a muon +    ev = ev.groupby('run',group_keys=False).apply(muon_follower_cut) + +    # Get rid of events which don't have a successful fit +    ev = ev[~np.isnan(ev.fmin)] +    ev_mc = ev_mc[~np.isnan(ev_mc.fmin)] +    muon_mc = muon_mc[~np.isnan(muon_mc.fmin)] + +    # require (r/r_psup)^3 < 0.9 +    ev = ev[ev.r_psup < 0.9] +    ev_mc = ev_mc[ev_mc.r_psup < 0.9] +    
+    bins = np.logspace(np.log10(20),np.log10(10e3),21)
+
+    atmo_scale_factor = 100.0
+    muon_scale_factor = len(muon) + len(muon_atm)
+
+    if args.pull:
+        pull = [[] for i in range(len(FIT_PARS))]
+
+        # Set the random seed so we get reproducible results here
+        np.random.seed(0)
+
+        for i in range(args.pull):
+            xtrue = sample_priors()
+
+            # Calculate the expected number of events
+            N = len(data_mc)*xtrue[0]/atmo_scale_factor
+            N_atm = len(data_atm_mc)*xtrue[0]/atmo_scale_factor
+            N_muon = len(muon)*xtrue[5]/muon_scale_factor
+            N_muon_atm = len(muon_atm)*xtrue[5]/muon_scale_factor
+            N_dm = xtrue[6]
+
+            # Calculate the observed number of events
+            n = np.random.poisson(N)
+            n_atm = np.random.poisson(N_atm)
+            n_muon = np.random.poisson(N_muon)
+            n_muon_atm = np.random.poisson(N_muon_atm)
+            n_dm = np.random.poisson(N_dm)
+
+            dm_particle_id = np.random.choice([2020,2222])
+            dm_energy = np.random.uniform(20,10e3)
+
+            # Sample data from the Monte Carlo
+            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+
+            # Smear the energies by the additional energy resolution
+            data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
+            data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4])
+            data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2])
+            data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4])
+            data['ke'] = data['energy1'].fillna(0) + data['energy2'].fillna(0) + data['energy3'].fillna(0)
+
+            # Add the fake dark matter events after recomputing `ke` for the
+            # background events so that the dark matter energies don't get
+            # overwritten
+            data = pd.concat((data, get_dm_sample(n_dm,dm_particle_id,dm_energy,DM_ENERGY_RESOLUTION)))
+
+            data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4])
+            data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
+            data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
+
+            xopt, samples = do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+
+            for i in range(len(FIT_PARS)):
+                # The "pull plots" we make here are actually produced via a
+                # procedure called "Simulation Based Calibration".
+                #
+                # See https://arxiv.org/abs/1804.06788.
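+                #
+                # If the model and the fitting procedure are self-consistent,
+                # the percentile of the true parameter value within the
+                # posterior samples should be uniformly distributed across
+                # pseudo-experiments, which is what the histograms below
+                # check.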
+                pull[i].append(percentileofscore(samples[:,i],xtrue[i]))
+
+        fig = plt.figure()
+        axes = []
+        for i, name in enumerate(FIT_PARS):
+            axes.append(plt.subplot(4,2,i+1))
+            n, bins, patches = plt.hist(pull[i],bins=np.linspace(0,100,11),histtype='step')
+            expected = len(pull[i])/(len(bins)-1)
+            plt.axhline(expected,color='k',ls='--',alpha=0.25)
+            plt.axhspan(poisson.ppf(0.005,expected), poisson.ppf(0.995,expected), facecolor='0.5', alpha=0.25)
+            plt.title(name)
+        for ax in axes:
+            despine(ax=ax,left=True,trim=True)
+            ax.get_yaxis().set_visible(False)
+        plt.tight_layout()
+
+        if args.save:
+            fig.savefig("dm_search_pull_plot.pdf")
+            fig.savefig("dm_search_pull_plot.eps")
+        else:
+            plt.show()
+
+        sys.exit(0)
+
+    dm_energies = np.logspace(np.log10(20),np.log10(10e3),10)
+    limits = {}
+    for dm_particle_id in (2020,2222):
+        limits[dm_particle_id] = []
+        for dm_energy in dm_energies:
+            xopt, samples = do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+
+            # The 90% CL upper limit on the number of dark matter events is
+            # the 90th percentile of the posterior distribution of the dark
+            # matter scale
+            limit = np.percentile(samples[:,6],90)
+            limits[dm_particle_id].append(limit)
+
+    fig = plt.figure()
+    for dm_particle_id in (2020,2222):
+        plt.plot(dm_energies,limits[dm_particle_id],label='$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(dm_particle_id),2)]) + '$')
+    plt.gca().set_xscale("log")
+    despine(fig,trim=True)
+    plt.xlabel("Energy (MeV)")
+    plt.ylabel("Event Rate Limit (events)")
+    plt.legend()
+    plt.tight_layout()
+
+    if args.save:
+        plt.savefig("dm_search_limit.pdf")
+        plt.savefig("dm_search_limit.eps")
+    else:
+        plt.suptitle("Dark Matter Limits")
+        plt.show()
