diff options
| author | tlatorre <tlatorre@uchicago.edu> | 2020-05-12 11:34:47 -0500 | 
|---|---|---|
| committer | tlatorre <tlatorre@uchicago.edu> | 2020-05-12 11:34:47 -0500 | 
| commit | 764bf1b496de0d3d3a22b988a0634ea68434bb26 (patch) | |
| tree | 83dde566645f6b7f1abef44ae9654bb62482a0ac | |
| parent | c24438b1fa9d368f2b05d623c7a2cb0d27852cfc (diff) | |
| download | sddm-764bf1b496de0d3d3a22b988a0634ea68434bb26.tar.gz sddm-764bf1b496de0d3d3a22b988a0634ea68434bb26.tar.bz2 sddm-764bf1b496de0d3d3a22b988a0634ea68434bb26.zip | |
add a script to do a closure test on the contamination analysis
| -rwxr-xr-x | utils/dc | 217 | ||||
| -rwxr-xr-x | utils/dc-closure-test | 555 | ||||
| -rwxr-xr-x | utils/plot-energy | 120 | ||||
| -rwxr-xr-x | utils/sddm/dc.py | 7 | ||||
| -rwxr-xr-x | utils/sddm/plot_energy.py | 139 | 
5 files changed, 741 insertions, 297 deletions
| @@ -266,178 +266,20 @@ if __name__ == '__main__':      import pandas as pd      import sys      import h5py +    from sddm import setup_matplotlib      parser = argparse.ArgumentParser("plot fit results")      parser.add_argument("filenames", nargs='+', help="input files")      parser.add_argument("--steps", type=int, default=100000, help="number of steps in the MCMC chain")      parser.add_argument("--save", action="store_true", default=False, help="save plots") +    parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files")      args = parser.parse_args() -    if args.save: -        # default \textwidth for a fullpage article in Latex is 16.50764 cm. -        # You can figure this out by compiling the following TeX document: -        # -        #    \documentclass{article} -        #    \usepackage{fullpage} -        #    \usepackage{layouts} -        #    \begin{document} -        #    textwidth in cm: \printinunitsof{cm}\prntlen{\textwidth} -        #    \end{document} - -        width = 16.50764 -        width /= 2.54 # cm -> inches -        # According to this page: -        # http://www-personal.umich.edu/~jpboyd/eng403_chap2_tuftegospel.pdf, -        # Tufte suggests an aspect ratio of 1.5 - 1.6. -        height = width/1.5 -        FIGSIZE = (width,height) - -        import matplotlib.pyplot as plt - -        font = {'family':'serif', 'serif': ['computer modern roman']} -        plt.rc('font',**font) - -        plt.rc('text', usetex=True) -    else: -        # on retina screens, the default plots are way too small -        # by using Qt5 and setting QT_AUTO_SCREEN_SCALE_FACTOR=1 -        # Qt5 will scale everything using the dpi in ~/.Xresources -        import matplotlib -        matplotlib.use("Qt5Agg") - -        import matplotlib.pyplot as plt - -        # Default figure size. Currently set to my monitor width and height so that -        # things are properly formatted -        FIGSIZE = (13.78,7.48) - -        # Make the defalt font bigger -        plt.rc('font', size=22) - -    for filename in args.filenames: -        ev = pd.read_hdf(filename,"ev") - -    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames],ignore_index=True) -    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames],ignore_index=True) -    rhdr = pd.concat([pd.read_hdf(filename, "rhdr") for filename in args.filenames],ignore_index=True) - -    first_gtid = rhdr.set_index('run').to_dict()['first_gtid'] - -    # First, remove junk events since orphans won't have a 50 MHz clock and so -    # could screw up the 50 MHz clock unwrapping -    ev = ev[ev.dc & DC_JUNK == 0] - -    # We need the events to be in time order here in order to calculate the -    # delta t between events. It's not obvious exactly how to do this. You -    # could sort by GTID, but that wraps around. Similarly we can't sort by the -    # 50 MHz clock because it also wraps around. Finally, I'm hesitant to sort -    # by the 10 MHz clock since it can be unreliable. -    # -    # Update: Phil proposed a clever way to get the events in order using the -    # GTID: -    # -    # > The GTID rollover should be easy to handle because there should never -    # > be two identical GTID's in a run. So if you order the events by GTID, -    # > you can assume that events with GTID's that come logically before the -    # > first GTID in the run must have occurred after the other events. -    # -    # Therefore, we can just add 0x1000000 to all GTIDs before the first GTID -    # in the event and sort on that. We get the first GTID from the RHDR bank. -    ev['gtid_sort'] = ev['gtid'].copy() - -    ev = ev.groupby('run',as_index=False).apply(gtid_sort,first_gtid=first_gtid).reset_index(level=0,drop=True) - -    ev = ev.sort_values(by=['run','gtid_sort'],kind='mergesort') - -    for run, ev_run in ev.groupby('run'): -        # Warn about 50 MHz clock jumps since they could indicate that the -        # events aren't in order. -        dt = np.diff(ev_run.gtr) -        if np.count_nonzero((np.abs(dt) > 1e9) & (dt > -0x7ffffffffff*20.0/2)): -            print_warning("Warning: %i 50 MHz clock jumps in run %i. Are the events in order?" % \ -                (np.count_nonzero((np.abs(dt) > 1e9) & (dt > -0x7ffffffffff*20.0/2)),run)) - -    # unwrap the 50 MHz clock within each run -    ev.gtr = ev.groupby(['run'],as_index=False)['gtr'].transform(unwrap_50_mhz_clock) - -    for run, ev_run in ev.groupby('run'): -        # Warn about GTID jumps since we could be missing a potential flasher -        # and/or breakdown, and we need all the events in order to do a -        # retrigger cut -        if np.count_nonzero(np.diff(ev_run.gtid) != 1): -            print_warning("Warning: %i GTID jumps in run %i" % (np.count_nonzero(np.diff(ev_run.gtid) != 1),run)) - -    # calculate the time difference between each event and the previous event -    # so we can tag retrigger events -    ev['dt'] = ev.groupby(['run'],as_index=False)['gtr'].transform(lambda x: np.concatenate(([1e9],np.diff(x.values)))) - -    # Calculate the approximate Ockham factor. -    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes -    # -    # Note: This is a really approximate form by assuming that the shape of -    # the likelihood space is equal to the average uncertainty in the -    # different parameters. -    fits['w'] = fits['n']*np.log(0.05/10e3) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi)) -    fits['w'] -= fits['n']*100 - -    # Note: we index on the left hand site with loc to avoid a copy error -    # -    # See https://www.dataquest.io/blog/settingwithcopywarning/ -    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2']) -    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3']) - -    fits['fmin'] = fits['fmin'] - fits['w'] - -    fits['ke'] = fits['energy1'] -    fits['id'] = fits['id1'] -    fits.loc[fits['n'] == 2, 'id'] = fits['id1']*100 + fits['id2'] -    fits.loc[fits['n'] == 3, 'id'] = fits['id1']*10000 + fits['id2']*100 + fits['id3'] -    fits['theta'] = fits['theta1'] - -    print("number of events = %i" % len(ev)) - -    # Now, select prompt events. -    # -    # We define a prompt event here as any event with an NHIT > 100 and whose -    # previous > 100 nhit event was more than 250 ms ago -    # -    # Note: It's important we do this *before* applying the data cleaning cuts -    # since otherwise we may have a prompt event identified only after the -    # cuts. -    # -    # For example, suppose there was a breakdown and for whatever reason -    # the *second* event after the breakdown didn't get tagged correctly. If we -    # apply the data cleaning cuts first and then tag prompt events then this -    # event will get tagged as a prompt event. -    ev = ev.groupby('run',as_index=False).apply(prompt_event).reset_index(level=0,drop=True) - -    print("number of events after prompt nhit cut = %i" % np.count_nonzero(ev.prompt)) - -    # flasher follower cut -    ev = ev.groupby('run',as_index=False).apply(flasher_follower_cut).reset_index(level=0,drop=True) - -    # breakdown follower cut -    ev = ev.groupby('run',as_index=False).apply(breakdown_follower_cut).reset_index(level=0,drop=True) - -    # retrigger cut -    ev = ev.groupby('run',as_index=False).apply(retrigger_cut).reset_index(level=0,drop=True) - -    ev = ev[ev.prompt] - -    ev.set_index(['run','gtid']) - -    ev = pd.merge(fits,ev,how='inner',on=['run','gtid']) -    ev_single_particle = ev[(ev.id2 == 0) & (ev.id3 == 0)] -    ev_single_particle = ev_single_particle.sort_values('fmin').groupby(['run','gtid']).nth(0) -    ev = ev.sort_values('fmin').groupby(['run','gtid']).nth(0) -    ev['psi'] /= ev.nhit_cal - -    ev['cos_theta'] = np.cos(ev_single_particle['theta1']) -    ev['r'] = np.sqrt(ev.x**2 + ev.y**2 + ev.z**2) -    ev['udotr'] = np.sin(ev_single_particle.theta1)*np.cos(ev_single_particle.phi1)*ev_single_particle.x + \ -                  np.sin(ev_single_particle.theta1)*np.sin(ev_single_particle.phi1)*ev_single_particle.y + \ -                  np.cos(ev_single_particle.theta1)*ev_single_particle.z -    ev['udotr'] /= ev.r +    setup_matplotlib(args.save) + +    import matplotlib.pyplot as plt + +    ev = get_events(args.filenames,merge_fits=True)      # figure out bins for high level variables      ev = radius_cut(ev) @@ -459,14 +301,33 @@ if __name__ == '__main__':          for _, row in ev[ev[bg]].iterrows():              data[bg][row.radius_cut][row.psi_cut][row.z_cut][row.udotr_cut] += 1 -    # FIXME: Estimate for now, needs to come from MC -    sacrifice = {bg: 0.0 for bg in ['muon','noise','neck','flasher','breakdown']} -    sacrifice['signal'] = np.zeros((len(np.unique(ev.radius_cut)),len(np.unique(ev.psi_cut)),len(np.unique(ev.cos_theta_cut)),len(np.unique(ev.z_cut))),dtype=int) -    p_r_signal = np.array([0.9,0.1]) -    p_psi_signal = np.array([1.0,0.0]) -    p_z_signal = np.array([0.5,0.5]) -    p_udotr_signal = np.array([0.25,0.75]) -    sacrifice['signal'] = p_r_signal[:,np.newaxis,np.newaxis,np.newaxis]*p_psi_signal[:,np.newaxis,np.newaxis]*p_z_signal[:,np.newaxis]*p_udotr_signal +    ev_mc = get_events(args.mc, merge_fits=True) + +    ev_mc = ev_mc[ev_mc.prompt] + +    # figure out bins for high level variables +    ev_mc = radius_cut(ev_mc) +    ev_mc = psi_cut(ev_mc) +    ev_mc = cos_theta_cut(ev_mc) +    ev_mc = z_cut(ev_mc) +    ev_mc = udotr_cut(ev_mc) + +    ev_mc['noise'] = ev_mc.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_ITC | DC_ESUM) != 0 +    ev_mc['neck'] = ((ev_mc.dc & DC_NECK) != 0) & ~ev_mc.noise +    ev_mc['flasher'] = ((ev_mc.dc & DC_FLASHER) != 0) & ~(ev_mc.noise | ev_mc.neck) & (ev_mc.nhit < 1000) +    ev_mc['breakdown'] = ((ev_mc.dc & (DC_FLASHER | DC_BREAKDOWN)) != 0) & ~(ev_mc.noise | ev_mc.neck) & (ev_mc.nhit >= 1000) +    ev_mc['muon'] = ((ev_mc.dc & DC_MUON) != 0) & ~(ev_mc.noise | ev_mc.neck | ev_mc.flasher | ev_mc.breakdown) +    ev_mc['signal'] = ~(ev_mc.noise | ev_mc.neck | ev_mc.flasher | ev_mc.breakdown | ev_mc.muon) + +    # FIXME: Double check that what I'm calculating here matches with what I +    # expect +    sacrifice = {} +    for bg in ['signal','muon','noise','neck','flasher','breakdown']: +        sacrifice[bg] = np.zeros((2,2,2,2),dtype=float) +        for _, row in ev_mc[ev_mc[bg]].iterrows(): +            sacrifice[bg][row.radius_cut][row.psi_cut][row.z_cut][row.udotr_cut] += 1 + +        sacrifice[bg] /= len(ev_mc)      constraints = [flasher_r_udotr, breakdown_r_udotr,muon_r_psi_z_udotr,neck_r_z_udotr,noise_z_udotr]      nll = make_nll(data,sacrifice,constraints) @@ -616,7 +477,7 @@ if __name__ == '__main__':      # Plot walker positions as a function of step number for the expected      # number of events -    fig, axes = plt.subplots(6, figsize=FIGSIZE, num=1, sharex=True) +    fig, axes = plt.subplots(6, num=1, sharex=True)      samples = sampler.get_chain()      labels = ["Signal","Muon","Noise","Neck","Flasher","Breakdown"]      for i, bg in enumerate(['signal','muon','noise','neck','flasher','breakdown']): @@ -631,7 +492,7 @@ if __name__ == '__main__':      # Plot walker positions as a function of step number for the background cut      # efficiencies -    fig, axes = plt.subplots(5, figsize=FIGSIZE, num=2, sharex=True) +    fig, axes = plt.subplots(5, num=2, sharex=True)      samples = sampler.get_chain()      tag_labels = ['M','N','Ne','F','B']      for i, bg in enumerate(['muon','noise','neck','flasher','breakdown']): @@ -646,7 +507,7 @@ if __name__ == '__main__':      samples = sampler.chain.reshape((-1,len(x0))) -    plt.figure(3, figsize=FIGSIZE) +    plt.figure(3)      for i, bg in enumerate(['signal','muon','noise','neck','flasher','breakdown']):          ax = plt.subplot(3,2,i+1)          plt.hist(samples[:,i],bins=100,histtype='step') @@ -656,7 +517,7 @@ if __name__ == '__main__':      plt.legend()      plt.tight_layout() -    plt.figure(4, figsize=FIGSIZE) +    plt.figure(4)      for i, bg in enumerate(['muon','noise','neck','flasher','breakdown']):          ax = plt.subplot(3,2,i+1)          plt.hist(samples[:,6+i],bins=100,histtype='step') @@ -732,7 +593,7 @@ if __name__ == '__main__':          p_psi_breakdown_lo]      ylim_max = 0 -    fig = plt.figure(5, figsize=FIGSIZE) +    fig = plt.figure(5)      axes = []      for i, bg in enumerate(['signal','muon','noise','neck','flasher','breakdown']):          axes.append(plt.subplot(3,2,i+1)) diff --git a/utils/dc-closure-test b/utils/dc-closure-test new file mode 100755 index 0000000..54d2629 --- /dev/null +++ b/utils/dc-closure-test @@ -0,0 +1,555 @@ +#!/usr/bin/env python +# Copyright (c) 2019, Anthony Latorre <tlatorre at uchicago> +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see <https://www.gnu.org/licenses/>. + +from __future__ import print_function, division +import numpy as np +from scipy.stats import iqr +import nlopt +from scipy.stats import poisson +import contextlib +import sys +from math import exp +import emcee +from scipy.optimize import brentq +from scipy.stats import truncnorm +from matplotlib.lines import Line2D +from sddm.plot import despine +from sddm.dc import * +from sddm.plot_energy import * + +try: +    from emcee import moves +except ImportError: +    print("emcee version 2.2.1 is required",file=sys.stderr) +    sys.exit(1) + +# from https://stackoverflow.com/questions/2891790/how-to-pretty-print-a-numpy-array-without-scientific-notation-and-with-given-pre +@contextlib.contextmanager +def printoptions(*args, **kwargs): +    original = np.get_printoptions() +    np.set_printoptions(*args, **kwargs) +    try: +        yield +    finally:  +        np.set_printoptions(**original) + +def radius_cut(ev): +    ev['radius_cut'] = np.digitize((ev.r/PSUP_RADIUS)**3,(0.9,)) +    return ev + +def udotr_cut(ev): +    ev['udotr_cut'] = np.digitize(ev.udotr,(-0.5,)) +    return ev + +def psi_cut(ev): +    ev['psi_cut'] = np.digitize(ev.psi,(6.0,)) +    return ev + +def cos_theta_cut(ev): +    ev['cos_theta_cut'] = np.digitize(ev.cos_theta,(-0.5,)) +    return ev + +def z_cut(ev): +    ev['z_cut'] = np.digitize(ev.z,(0.0,)) +    return ev + +# Constraint to enforce the fact that P(r,psi,z,udotr|muon) all add up to 1.0. +# In the likelihood function we set the last possibility for r and udotr equal +# to 1.0 minus the others. Therefore, we need to enforce the fact that the +# others must add up to less than 1. +muon_r_psi_z_udotr = Constraint(range(11,26)) + +# Constraint to enforce the fact that P(z,udotr|noise) all add up to 1.0. In +# the likelihood function we set the last possibility for r and udotr equal to +# 1.0 minus the others. Therefore, we need to enforce the fact that the others +# must add up to less than 1. +noise_z_udotr = Constraint(range(28,31)) + +# Constraint to enforce the fact that P(r,z,udotr|neck) all add up to 1.0. In +# the likelihood function we set the last possibility for r and udotr equal to +# 1.0 minus the others. Therefore, we need to enforce the fact that the others +# must add up to less than 1. +neck_r_z_udotr = Constraint(range(31,38)) + +# Constraint to enforce the fact that P(r,udotr|flasher) all add up to 1.0. In +# the likelihood function we set the last possibility for r and udotr equal to +# 1.0 minus the others. Therefore, we need to enforce the fact that the others +# must add up to less than 1 +flasher_r_udotr = Constraint(range(39,42)) + +# Constraint to enforce the fact that P(r,udotr|breakdown) all add up to 1.0. +# In the likelihood function we set the last possibility for r and udotr equal +# to 1.0 minus the others. Therefore, we need to enforce the fact that the +# others must add up to less than 1. +breakdown_r_udotr = Constraint(range(44,47)) + +def make_nll(data, sacrifice, constraints): +    def nll(x, grad=None, fill_value=1e9): +        if grad is not None and grad.size > 0: +            raise Exception("nll got passed grad!") + +        nll = 0.0 +        # Here we explicitly return a crazy high value if one of the +        # constraints is violated. When using nlopt it should respect all the +        # constraints, *but* later when we do the Metropolis Hastings algorithm +        # we don't have any way to add the constraints explicitly. +        for constraint in constraints: +            if constraint(x) > 0: +                nll += fill_value + 1e4*constraint(x)**2 + +        if (x <= 0).any() or (x[6:] >= 1).any(): +            nll += fill_value + 1e4*np.sum((x[x < 0])**2) + 1e4*np.sum((x[6:][x[6:] > 1]-1)**2) + +        if nll: +            return nll + +        (mu_signal, mu_muon, mu_noise, mu_neck, mu_flasher, mu_breakdown, +         contamination_muon, contamination_noise, contamination_neck, contamination_flasher, contamination_breakdown, +         p_r_psi_z_udotr_muon_lolololo, # 11 +         p_r_psi_z_udotr_muon_lololohi, +         p_r_psi_z_udotr_muon_lolohilo, +         p_r_psi_z_udotr_muon_lolohihi, +         p_r_psi_z_udotr_muon_lohilolo, +         p_r_psi_z_udotr_muon_lohilohi, +         p_r_psi_z_udotr_muon_lohihilo, +         p_r_psi_z_udotr_muon_lohihihi, +         p_r_psi_z_udotr_muon_hilololo, +         p_r_psi_z_udotr_muon_hilolohi, +         p_r_psi_z_udotr_muon_hilohilo, +         p_r_psi_z_udotr_muon_hilohihi, +         p_r_psi_z_udotr_muon_hihilolo, +         p_r_psi_z_udotr_muon_hihilohi, +         p_r_psi_z_udotr_muon_hihihilo, +         p_r_noise_lo, p_psi_noise_lo, # 26, 27 +         p_z_udotr_noise_lolo, # 28 +         p_z_udotr_noise_lohi, +         p_z_udotr_noise_hilo, +         p_r_z_udotr_neck_lololo, # 31 +         p_r_z_udotr_neck_lolohi, +         p_r_z_udotr_neck_lohilo, +         p_r_z_udotr_neck_lohihi, +         p_r_z_udotr_neck_hilolo, +         p_r_z_udotr_neck_hilohi, +         p_r_z_udotr_neck_hihilo, +         p_psi_neck_lo, # 38 +         p_r_udotr_flasher_lolo, p_r_udotr_flasher_lohi, p_r_udotr_flasher_hilo, # 39, ..., 41 +         p_psi_flasher_lo, p_z_flasher_lo, +         p_r_udotr_breakdown_lolo, p_r_udotr_breakdown_lohi, p_r_udotr_breakdown_hilo, # 44, ..., 46 +         p_psi_breakdown_lo, p_z_breakdown_lo, +         p_neck_given_muon) = x + +        p_r_udotr_flasher_hihi = 1-p_r_udotr_flasher_lolo-p_r_udotr_flasher_lohi-p_r_udotr_flasher_hilo +        p_r_udotr_breakdown_hihi = 1-p_r_udotr_breakdown_lolo-p_r_udotr_breakdown_lohi-p_r_udotr_breakdown_hilo +        p_r_psi_z_udotr_muon_hihihihi = 1 - \ +            p_r_psi_z_udotr_muon_lolololo - \ +            p_r_psi_z_udotr_muon_lololohi - \ +            p_r_psi_z_udotr_muon_lolohilo - \ +            p_r_psi_z_udotr_muon_lolohihi - \ +            p_r_psi_z_udotr_muon_lohilolo - \ +            p_r_psi_z_udotr_muon_lohilohi - \ +            p_r_psi_z_udotr_muon_lohihilo - \ +            p_r_psi_z_udotr_muon_lohihihi - \ +            p_r_psi_z_udotr_muon_hilololo - \ +            p_r_psi_z_udotr_muon_hilolohi - \ +            p_r_psi_z_udotr_muon_hilohilo - \ +            p_r_psi_z_udotr_muon_hilohihi - \ +            p_r_psi_z_udotr_muon_hihilolo - \ +            p_r_psi_z_udotr_muon_hihilohi - \ +            p_r_psi_z_udotr_muon_hihihilo +        p_r_z_udotr_neck_hihihi = 1 - p_r_z_udotr_neck_lololo - p_r_z_udotr_neck_lolohi - p_r_z_udotr_neck_lohilo - p_r_z_udotr_neck_lohihi - p_r_z_udotr_neck_hilolo - p_r_z_udotr_neck_hilohi - p_r_z_udotr_neck_hihilo +        p_z_udotr_noise_hihi = 1 - p_z_udotr_noise_lolo - p_z_udotr_noise_lohi - p_z_udotr_noise_hilo + +        # Muon events +        # first 6 parameters are the mean number of signal and bgs +        p_muon = np.array([\ +            [[[p_r_psi_z_udotr_muon_lolololo, p_r_psi_z_udotr_muon_lololohi], \ +              [p_r_psi_z_udotr_muon_lolohilo, p_r_psi_z_udotr_muon_lolohihi]], \ +             [[p_r_psi_z_udotr_muon_lohilolo, p_r_psi_z_udotr_muon_lohilohi], \ +              [p_r_psi_z_udotr_muon_lohihilo, p_r_psi_z_udotr_muon_lohihihi]]], \ +            [[[p_r_psi_z_udotr_muon_hilololo, p_r_psi_z_udotr_muon_hilolohi], \ +              [p_r_psi_z_udotr_muon_hilohilo, p_r_psi_z_udotr_muon_hilohihi]], \ +             [[p_r_psi_z_udotr_muon_hihilolo, p_r_psi_z_udotr_muon_hihilohi], \ +              [p_r_psi_z_udotr_muon_hihihilo, p_r_psi_z_udotr_muon_hihihihi]]]]) +        expected_muon = p_muon*contamination_muon*mu_muon + sacrifice['muon']*mu_signal + +        nll -= fast_poisson_logpmf(data['muon'],expected_muon).sum() + +        # Noise events +        p_r_noise = np.array([p_r_noise_lo,1-p_r_noise_lo]) +        p_psi_noise = np.array([p_psi_noise_lo,1-p_psi_noise_lo]) +        p_z_udotr_noise = np.array([\ +            [p_z_udotr_noise_lolo,p_z_udotr_noise_lohi], +            [p_z_udotr_noise_hilo,p_z_udotr_noise_hihi]]) +        p_noise = p_r_noise[:,np.newaxis,np.newaxis,np.newaxis]*p_psi_noise[:,np.newaxis,np.newaxis]*p_z_udotr_noise +        expected_noise = p_noise*contamination_noise*mu_noise + sacrifice['noise']*mu_signal + +        nll -= fast_poisson_logpmf(data['noise'],expected_noise).sum() + +        # Neck events +        # FIXME: for now assume parameterized same as muon +        p_r_z_udotr_neck = np.array([\ +            [[p_r_z_udotr_neck_lololo, p_r_z_udotr_neck_lolohi], \ +             [p_r_z_udotr_neck_lohilo, p_r_z_udotr_neck_lohihi]], \ +            [[p_r_z_udotr_neck_hilolo, p_r_z_udotr_neck_hilohi], \ +             [p_r_z_udotr_neck_hihilo, p_r_z_udotr_neck_hihihi]]]) +        p_psi_neck = np.array([p_psi_neck_lo,1-p_psi_neck_lo]) +        p_neck = p_r_z_udotr_neck[:,np.newaxis,:,:]*p_psi_neck[:,np.newaxis,np.newaxis] +        expected_neck = p_neck*contamination_neck*mu_neck + sacrifice['neck']*mu_signal +        # FIXME: pdf should be different for muon given neck +        expected_neck += p_muon*p_neck_given_muon*mu_muon + +        nll -= fast_poisson_logpmf(data['neck'],expected_neck).sum() + +        # Flasher events +        p_r_udotr_flasher = np.array([\ +            [p_r_udotr_flasher_lolo,p_r_udotr_flasher_lohi], \ +            [p_r_udotr_flasher_hilo,p_r_udotr_flasher_hihi]]) +        p_psi_flasher = np.array([p_psi_flasher_lo,1-p_psi_flasher_lo]) +        p_z_flasher = np.array([p_z_flasher_lo,1-p_z_flasher_lo]) +        p_flasher = p_r_udotr_flasher[:,np.newaxis,np.newaxis,:]*p_psi_flasher[:,np.newaxis,np.newaxis]*p_z_flasher[:,np.newaxis] +        expected_flasher = p_flasher*contamination_flasher*mu_flasher + sacrifice['flasher']*mu_signal + +        nll -= fast_poisson_logpmf(data['flasher'],expected_flasher).sum() + +        # Breakdown events +        p_r_udotr_breakdown = np.array([\ +            [p_r_udotr_breakdown_lolo,p_r_udotr_breakdown_lohi], \ +            [p_r_udotr_breakdown_hilo,p_r_udotr_breakdown_hihi]]) +        p_psi_breakdown = np.array([p_psi_breakdown_lo,1-p_psi_breakdown_lo]) +        p_z_breakdown = np.array([p_z_breakdown_lo,1-p_z_breakdown_lo]) +        p_breakdown = p_r_udotr_breakdown[:,np.newaxis,np.newaxis,:]*p_psi_breakdown[:,np.newaxis,np.newaxis]*p_z_breakdown[:,np.newaxis] +        expected_breakdown = p_breakdown*contamination_breakdown*mu_breakdown + sacrifice['breakdown']*mu_signal + +        nll -= fast_poisson_logpmf(data['breakdown'],expected_breakdown).sum() + +        # Signal like events +        expected_signal = np.zeros_like(expected_muon) +        expected_signal += mu_signal*sacrifice['signal'] +        expected_signal += p_muon*(1-contamination_muon)*mu_muon +        expected_signal += p_neck*(1-contamination_neck)*mu_neck +        expected_signal += p_noise*(1-contamination_noise)*mu_noise +        expected_signal += p_flasher*(1-contamination_flasher)*mu_flasher +        expected_signal += p_breakdown*(1-contamination_breakdown)*mu_breakdown + +        nll -= fast_poisson_logpmf(data['signal'],expected_signal).sum() + +        if not np.isfinite(nll): +            print("x = ", x) +            print("p_r_z_udotr_neck = ", p_r_z_udotr_neck) +            print("expected_muon = ", expected_muon) +            print("expected_noise = ", expected_noise) +            print("expected_neck = ", expected_neck) +            print("expected_flasher = ", expected_flasher) +            print("expected_breakdown = ", expected_breakdown) +            print("nll is not finite!") +            sys.exit(0) + +        return nll +    return nll + +def fit(data, sacrifice, steps): +    constraints = [flasher_r_udotr, breakdown_r_udotr,muon_r_psi_z_udotr,neck_r_z_udotr,noise_z_udotr] +    nll = make_nll(data,sacrifice,constraints) + +    x0 = [] +    for bg in ['signal','muon','noise','neck','flasher','breakdown']: +        x0.append(data[bg].sum()) + +    # contamination +    x0 += [0.99]*5 + +    if data['muon'].sum() > 0: +        # P(r,psi,z,udotr|muon) +        x0 += [data['muon'][0,0,0,0].sum()/data['muon'].sum()] +        x0 += [data['muon'][0,0,0,1].sum()/data['muon'].sum()] +        x0 += [data['muon'][0,0,1,0].sum()/data['muon'].sum()] +        x0 += [data['muon'][0,0,1,1].sum()/data['muon'].sum()] +        x0 += [data['muon'][0,1,0,0].sum()/data['muon'].sum()] +        x0 += [data['muon'][0,1,0,1].sum()/data['muon'].sum()] +        x0 += [data['muon'][0,1,1,0].sum()/data['muon'].sum()] +        x0 += [data['muon'][0,1,1,1].sum()/data['muon'].sum()] +        x0 += [data['muon'][1,0,0,0].sum()/data['muon'].sum()] +        x0 += [data['muon'][1,0,0,1].sum()/data['muon'].sum()] +        x0 += [data['muon'][1,0,1,0].sum()/data['muon'].sum()] +        x0 += [data['muon'][1,0,1,1].sum()/data['muon'].sum()] +        x0 += [data['muon'][1,1,0,0].sum()/data['muon'].sum()] +        x0 += [data['muon'][1,1,0,1].sum()/data['muon'].sum()] +        x0 += [data['muon'][1,1,1,0].sum()/data['muon'].sum()] +    else: +        x0 += [0.1]*15 + +    if data['noise'].sum() > 0: +        # P(r|noise) +        x0 += [data['noise'][0].sum()/data['noise'].sum()] +        # P(psi|noise) +        x0 += [data['noise'][:,0].sum()/data['noise'].sum()] +        # P(z,udotr|noise) +        x0 += [data['noise'][:,:,0,0].sum()/data['noise'].sum()] +        x0 += [data['noise'][:,:,0,1].sum()/data['noise'].sum()] +        x0 += [data['noise'][:,:,1,0].sum()/data['noise'].sum()] +    else: +        x0 += [0.1]*5 + +    if data['neck'].sum() > 0: +        # P(r,z,udotr|neck) +        x0 += [data['neck'][0,:,0,0].sum()/data['neck'].sum()] +        x0 += [data['neck'][0,:,0,1].sum()/data['neck'].sum()] +        x0 += [data['neck'][0,:,1,0].sum()/data['neck'].sum()] +        x0 += [data['neck'][0,:,1,1].sum()/data['neck'].sum()] +        x0 += [data['neck'][1,:,0,0].sum()/data['neck'].sum()] +        x0 += [data['neck'][1,:,0,1].sum()/data['neck'].sum()] +        x0 += [data['neck'][1,:,1,0].sum()/data['neck'].sum()] +        # P(psi|neck) +        x0 += [data['neck'][:,0].sum()/data['neck'].sum()] +    else: +        x0 += [0.1]*8 + +    if data['flasher'].sum() > 0: +        # P(r,udotr|flasher) +        x0 += [data['flasher'][0,:,:,0].sum()/data['flasher'].sum()] +        x0 += [data['flasher'][0,:,:,1].sum()/data['flasher'].sum()] +        x0 += [data['flasher'][1,:,:,0].sum()/data['flasher'].sum()] +        # P(psi|flasher) +        x0 += [data['flasher'][:,0].sum()/data['flasher'].sum()] +        # P(z|flasher) +        x0 += [data['flasher'][:,:,0].sum()/data['flasher'].sum()] +    else: +        x0 += [0.1]*5 + +    if data['breakdown'].sum() > 0: +        # P(r,udotr|breakdown) +        x0 += [data['breakdown'][0,:,:,0].sum()/data['breakdown'].sum()] +        x0 += [data['breakdown'][0,:,:,1].sum()/data['breakdown'].sum()] +        x0 += [data['breakdown'][1,:,:,0].sum()/data['breakdown'].sum()] +        # P(psi|breakdown) +        x0 += [data['breakdown'][:,0].sum()/data['breakdown'].sum()] +        # P(z|breakdown) +        x0 += [data['breakdown'][:,:,0].sum()/data['breakdown'].sum()] +    else: +        x0 += [0.1]*5 + +    # P(neck|muon) +    x0 += [EPSILON] + +    x0 = np.array(x0) + +    # Use the COBYLA algorithm here because it is the only derivative free +    # minimization routine which honors inequality constraints +    # Edit: SBPLX seems to work better +    opt = nlopt.opt(nlopt.LN_SBPLX, len(x0)) +    opt.set_min_objective(nll) +    # set lower bounds to 1e-10 to prevent nans if we predict something should +    # be 0 but observe an event. +    low = np.ones_like(x0)*EPSILON +    high = np.array([1e9]*6 + [1-EPSILON]*(len(x0)-6)) +    x0[x0 < low] = low[x0 < low] +    x0[x0 > high] = high[x0 > high] +    opt.set_lower_bounds(low) +    opt.set_upper_bounds(high) +    opt.set_ftol_abs(1e-10) +    opt.set_initial_step([1]*6 + [0.01]*(len(x0)-6)) +    #for constraint in constraints: +        #opt.add_inequality_constraint(constraint,0) + +    xopt = opt.optimize(x0) +    nll_xopt = nll(xopt) +    print("nll(xopt) = ", nll(xopt)) + +    while True: +        xopt = opt.optimize(xopt) +        if not nll(xopt) < nll_xopt - 1e-10: +            break +        nll_xopt = nll(xopt) +        print("nll(xopt) = ", nll(xopt)) +        #print("n = ", opt.get_numevals()) + +    stepsizes = estimate_errors(nll,xopt,constraints) +    with printoptions(precision=3, suppress=True): +        print("Errors: ", stepsizes) + +    #samples = metropolis_hastings(nll,xopt,stepsizes,100000) +    #print("nll(xopt) = %.2g" % nll(xopt)) + +    pos = np.empty((10, len(x0)),dtype=np.double) +    for i in range(pos.shape[0]): +        pos[i] = xopt + np.random.randn(len(x0))*stepsizes +        pos[i,:6] = np.clip(pos[i,:6],EPSILON,1e9) +        pos[i,6:] = np.clip(pos[i,6:],EPSILON,1-EPSILON) + +        for constraint in constraints: +            if constraint(pos[i]) >= 0: +                pos[i] = constraint.renormalize_no_fix(pos[i]) + +    nwalkers, ndim = pos.shape + +    proposal = get_proposal_func(stepsizes*0.1,low,high) +    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x, grad, fill_value: -nll(x,grad,fill_value), moves=emcee.moves.MHMove(proposal),args=[None,np.inf]) +    with np.errstate(invalid='ignore'): +        sampler.run_mcmc(pos, steps) + +    samples = sampler.chain.reshape((-1,len(x0))) + +    return samples + +if __name__ == '__main__': +    import argparse +    import numpy as np +    import pandas as pd +    import sys +    import h5py +    from sddm import setup_matplotlib + +    parser = argparse.ArgumentParser("plot fit results") +    parser.add_argument("filenames", nargs='+', help="input files") +    parser.add_argument("--steps", type=int, default=100000, help="number of steps in the MCMC chain") +    parser.add_argument("--save", action="store_true", default=False, help="save plots") +    parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files") +    parser.add_argument("-n", type=int, default=10, help="number of fits to run") +    args = parser.parse_args() + +    setup_matplotlib(args.save) + +    import matplotlib.pyplot as plt + +    ev = get_events(args.filenames,merge_fits=True) +    ev_mc = get_events(args.mc, merge_fits=True) + +    # figure out bins for high level variables +    ev = radius_cut(ev) +    ev = psi_cut(ev) +    ev = cos_theta_cut(ev) +    ev = z_cut(ev) +    ev = udotr_cut(ev) + +    ev['noise'] = ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_ITC | DC_ESUM) != 0 +    ev['neck'] = ((ev.dc & DC_NECK) != 0) & ~ev.noise +    ev['flasher'] = ((ev.dc & DC_FLASHER) != 0) & ~(ev.noise | ev.neck) & (ev.nhit < 1000) +    ev['breakdown'] = ((ev.dc & (DC_FLASHER | DC_BREAKDOWN)) != 0) & ~(ev.noise | ev.neck) & (ev.nhit >= 1000) +    ev['muon'] = ((ev.dc & DC_MUON) != 0) & ~(ev.noise | ev.neck | ev.flasher | ev.breakdown) +    ev['signal'] = ~(ev.noise | ev.neck | ev.flasher | ev.breakdown | ev.muon) + +    ev_mc = ev_mc[ev_mc.prompt] + +    # figure out bins for high level variables +    ev_mc = radius_cut(ev_mc) +    ev_mc = psi_cut(ev_mc) +    ev_mc = cos_theta_cut(ev_mc) +    ev_mc = z_cut(ev_mc) +    ev_mc = udotr_cut(ev_mc) + +    ev_mc['noise'] = ev_mc.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_ITC | DC_ESUM) != 0 +    ev_mc['neck'] = ((ev_mc.dc & DC_NECK) != 0) & ~ev_mc.noise +    ev_mc['flasher'] = ((ev_mc.dc & DC_FLASHER) != 0) & ~(ev_mc.noise | ev_mc.neck) & (ev_mc.nhit < 1000) +    ev_mc['breakdown'] = ((ev_mc.dc & (DC_FLASHER | DC_BREAKDOWN)) != 0) & ~(ev_mc.noise | ev_mc.neck) & (ev_mc.nhit >= 1000) +    ev_mc['muon'] = ((ev_mc.dc & DC_MUON) != 0) & ~(ev_mc.noise | ev_mc.neck | ev_mc.flasher | ev_mc.breakdown) +    ev_mc['signal'] = ~(ev_mc.noise | ev_mc.neck | ev_mc.flasher | ev_mc.breakdown | ev_mc.muon) + +    contamination_pull = {} + +    nbg = {} +    for bg in ['signal','muon','noise','neck','flasher','breakdown']: +        nbg[bg] = 100 +        contamination_pull[bg] = [] + +    for i in range(args.n): +        data = {} +        for bg in ['signal','muon','noise','neck','flasher','breakdown']: +            data[bg] = np.zeros((2,2,2,2),dtype=int) +            if bg == 'signal': +                for bg2 in ['signal','muon','noise','neck','flasher','breakdown']: +                    if bg2 == 'signal': +                        for _, row in ev_mc[ev_mc[bg2]].sample(n=nbg[bg2],replace=True).iterrows(): +                            data[bg][row.radius_cut][row.psi_cut][row.z_cut][row.udotr_cut] += 1 +                    else: +                        for _, row in ev[ev[bg2]].sample(n=nbg[bg2],replace=True).iterrows(): +                            data[bg][row.radius_cut][row.psi_cut][row.z_cut][row.udotr_cut] += 1 +            else: +                for _, row in ev[ev[bg]].iterrows(): +                    data[bg][row.radius_cut][row.psi_cut][row.z_cut][row.udotr_cut] += 1 + +        # FIXME: Double check that what I'm calculating here matches with what I +        # expect +        sacrifice = {} +        for bg in ['signal','muon','noise','neck','flasher','breakdown']: +            sacrifice[bg] = np.zeros((2,2,2,2),dtype=float) +            for _, row in ev_mc[ev_mc[bg]].iterrows(): +                sacrifice[bg][row.radius_cut][row.psi_cut][row.z_cut][row.udotr_cut] += 1 + +            sacrifice[bg] /= len(ev_mc) + +        samples = fit(data, sacrifice, args.steps) + +        (mu_signal, mu_muon, mu_noise, mu_neck, mu_flasher, mu_breakdown, +         contamination_muon, contamination_noise, contamination_neck, contamination_flasher, contamination_breakdown, +         p_r_psi_z_udotr_muon_lolololo, # 11 +         p_r_psi_z_udotr_muon_lololohi, +         p_r_psi_z_udotr_muon_lolohilo, +         p_r_psi_z_udotr_muon_lolohihi, +         p_r_psi_z_udotr_muon_lohilolo, +         p_r_psi_z_udotr_muon_lohilohi, +         p_r_psi_z_udotr_muon_lohihilo, +         p_r_psi_z_udotr_muon_lohihihi, +         p_r_psi_z_udotr_muon_hilololo, +         p_r_psi_z_udotr_muon_hilolohi, +         p_r_psi_z_udotr_muon_hilohilo, +         p_r_psi_z_udotr_muon_hilohihi, +         p_r_psi_z_udotr_muon_hihilolo, +         p_r_psi_z_udotr_muon_hihilohi, +         p_r_psi_z_udotr_muon_hihihilo, +         p_r_noise_lo, p_psi_noise_lo, # 26, 27 +         p_z_udotr_noise_lolo, # 28 +         p_z_udotr_noise_lohi, +         p_z_udotr_noise_hilo, +         p_r_z_udotr_neck_lololo, # 31 +         p_r_z_udotr_neck_lolohi, +         p_r_z_udotr_neck_lohilo, +         p_r_z_udotr_neck_lohihi, +         p_r_z_udotr_neck_hilolo, +         p_r_z_udotr_neck_hilohi, +         p_r_z_udotr_neck_hihilo, +         p_psi_neck_lo, # 38 +         p_r_udotr_flasher_lolo, p_r_udotr_flasher_lohi, p_r_udotr_flasher_hilo, # 39, ..., 41 +         p_psi_flasher_lo, p_z_flasher_lo, +         p_r_udotr_breakdown_lolo, p_r_udotr_breakdown_lohi, p_r_udotr_breakdown_hilo, # 44, ..., 46 +         p_psi_breakdown_lo, p_z_breakdown_lo, +         p_neck_given_muon) = samples.T + +        for i, bg in enumerate(['signal','muon','noise','neck','flasher','breakdown']): +            if i == 0: +                contamination = samples[:,i] +            else: +                contamination = samples[:,i]*(1-samples[:,5+i]) +            mean = np.mean(contamination) +            std = np.std(contamination) +            contamination_pull[bg].append((mean - nbg[bg])/std) + +    fig = plt.figure() +    axes = [] +    for i, bg in enumerate(['signal','muon','noise','neck','flasher','breakdown']): +        axes.append(plt.subplot(3,2,i+1)) +        plt.hist(contamination_pull[bg],bins=100,histtype='step') +        plt.title(bg.capitalize()) +    for ax in axes: +        ax.set_xlim((-10,10)) +        despine(ax=ax,left=True,trim=True) +        ax.get_yaxis().set_visible(False) +    plt.tight_layout() + +    if args.save: +        fig.savefig("contamination_pull_plot.pdf") +        fig.savefig("contamination_pull_plot.eps") +    else: +        plt.show() diff --git a/utils/plot-energy b/utils/plot-energy index 4cd116b..6997f3b 100755 --- a/utils/plot-energy +++ b/utils/plot-energy @@ -117,125 +117,7 @@ if __name__ == '__main__':      import matplotlib.pyplot as plt -    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in args.filenames],ignore_index=True) -    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in args.filenames],ignore_index=True) -    rhdr = pd.concat([pd.read_hdf(filename, "rhdr") for filename in args.filenames],ignore_index=True) - -    first_gtid = rhdr.set_index('run').to_dict()['first_gtid'] - -    # First, remove junk events since orphans won't have a 50 MHz clock and so -    # could screw up the 50 MHz clock unwrapping -    ev = ev[ev.dc & DC_JUNK == 0] - -    # We need the events to be in time order here in order to calculate the -    # delta t between events. It's not obvious exactly how to do this. You -    # could sort by GTID, but that wraps around. Similarly we can't sort by the -    # 50 MHz clock because it also wraps around. Finally, I'm hesitant to sort -    # by the 10 MHz clock since it can be unreliable. -    # -    # Update: Phil proposed a clever way to get the events in order using the -    # GTID: -    # -    # > The GTID rollover should be easy to handle because there should never -    # > be two identical GTID's in a run. So if you order the events by GTID, -    # > you can assume that events with GTID's that come logically before the -    # > first GTID in the run must have occurred after the other events. -    # -    # Therefore, we can just add 0x1000000 to all GTIDs before the first GTID -    # in the event and sort on that. We get the first GTID from the RHDR bank. -    ev['gtid_sort'] = ev['gtid'].copy() - -    ev = ev.groupby('run',as_index=False).apply(gtid_sort,first_gtid=first_gtid).reset_index(level=0,drop=True) - -    ev = ev.sort_values(by=['run','gtid_sort'],kind='mergesort') - -    for run, ev_run in ev.groupby('run'): -        # Warn about 50 MHz clock jumps since they could indicate that the -        # events aren't in order. -        dt = np.diff(ev_run.gtr) -        if np.count_nonzero((np.abs(dt) > 1e9) & (dt > -0x7ffffffffff*20.0/2)): -            print_warning("Warning: %i 50 MHz clock jumps in run %i. Are the events in order?" % \ -                (np.count_nonzero((np.abs(dt) > 1e9) & (dt > -0x7ffffffffff*20.0/2)),run)) - -    # unwrap the 50 MHz clock within each run -    ev.gtr = ev.groupby(['run'],group_keys=False)['gtr'].transform(unwrap_50_mhz_clock) - -    for run, ev_run in ev.groupby('run'): -        # Warn about GTID jumps since we could be missing a potential flasher -        # and/or breakdown, and we need all the events in order to do a -        # retrigger cut -        if np.count_nonzero(np.diff(ev_run.gtid) != 1): -            print_warning("Warning: %i GTID jumps in run %i" % (np.count_nonzero(np.diff(ev_run.gtid) != 1),run)) - -    # calculate the time difference between each event and the previous event -    # so we can tag retrigger events -    ev['dt'] = ev.groupby(['run'],group_keys=False)['gtr'].transform(lambda x: np.concatenate(([1e9],np.diff(x.values)))) - -    # Calculate the approximate Ockham factor. -    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes -    # -    # Note: This is a really approximate form by assuming that the shape of -    # the likelihood space is equal to the average uncertainty in the -    # different parameters. -    fits['w'] = fits['n']*np.log(0.05/10e3) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi)) - -    # Apply a fudge factor to the Ockham factor of 100 for each extra particle -    # FIXME: I chose 100 a while ago but didn't really investigate what the -    # optimal value was or exactly why it was needed. Should do this. -    fits['w'] -= fits['n']*100 - -    # Note: we index on the left hand site with loc to avoid a copy error -    # -    # See https://www.dataquest.io/blog/settingwithcopywarning/ -    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2']) -    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3']) - -    fits['fmin'] = fits['fmin'] - fits['w'] - -    # See https://stackoverflow.com/questions/11976503/how-to-keep-index-when-using-pandas-merge -    # for how to properly divide the psi column by nhit_cal which is in the ev -    # dataframe before we actually merge -    fits['psi'] /= fits.reset_index().merge(ev,on=['run','gtid']).set_index('index')['nhit_cal'] -    fits.loc[fits['n'] == 1,'ke'] = fits['energy1'] -    fits.loc[fits['n'] == 2,'ke'] = fits['energy1'] + fits['energy2'] -    fits.loc[fits['n'] == 3,'ke'] = fits['energy1'] + fits['energy2'] + fits['energy3'] -    fits['id'] = fits['id1'] -    fits.loc[fits['n'] == 2, 'id'] = fits['id1']*100 + fits['id2'] -    fits.loc[fits['n'] == 3, 'id'] = fits['id1']*10000 + fits['id2']*100 + fits['id3'] -    fits['theta'] = fits['theta1'] -    fits['r'] = np.sqrt(fits.x**2 + fits.y**2 + fits.z**2) -    fits['r_psup'] = (fits['r']/PSUP_RADIUS)**3 - -    ev['ftp_r'] = np.sqrt(ev.ftp_x**2 + ev.ftp_y**2 + ev.ftp_z**2) -    ev['ftp_r_psup'] = (ev['ftp_r']/PSUP_RADIUS)**3 - -    print("number of events = %i" % len(ev)) - -    # Now, select prompt events. -    # -    # We define a prompt event here as any event with an NHIT > 100 and whose -    # previous > 100 nhit event was more than 250 ms ago -    # -    # Note: It's important we do this *before* applying the data cleaning cuts -    # since otherwise we may have a prompt event identified only after the -    # cuts. -    # -    # For example, suppose there was a breakdown and for whatever reason -    # the *second* event after the breakdown didn't get tagged correctly. If we -    # apply the data cleaning cuts first and then tag prompt events then this -    # event will get tagged as a prompt event. -    ev = ev.groupby('run',group_keys=False).apply(prompt_event) - -    print("number of events after prompt nhit cut = %i" % np.count_nonzero(ev.prompt)) - -    # flasher follower cut -    ev = ev.groupby('run',group_keys=False).apply(flasher_follower_cut) - -    # breakdown follower cut -    ev = ev.groupby('run',group_keys=False).apply(breakdown_follower_cut) - -    # retrigger cut -    ev = ev.groupby('run',group_keys=False).apply(retrigger_cut) +    ev, fits = get_events(args.filenames)      if args.dc:          ev = ev[ev.prompt] diff --git a/utils/sddm/dc.py b/utils/sddm/dc.py index b5cf336..9081a21 100755 --- a/utils/sddm/dc.py +++ b/utils/sddm/dc.py @@ -359,3 +359,10 @@ class Constraint(object):                      continue                  x[i] -= x[i]/2.0          return x + +def fast_poisson_logpmf(k,mu): +    """ +    A fast version of poisson.logpmf(k,mu) which doesn't include the constant +    term of -log(k!) since for likelihood functions it is not necessary. +    """ +    return -mu + k*np.log(mu) diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py index 0c6dcf6..f496b0b 100755 --- a/utils/sddm/plot_energy.py +++ b/utils/sddm/plot_energy.py @@ -347,3 +347,142 @@ def michel_spectrum(T):      y[mask] = GF**2*MUON_MASS**5*x[mask]**2*(6-4*x[mask]+FINE_STRUCTURE_CONSTANT*f(x[mask])/np.pi)/(192*np.pi**3)      y *= 2*MUON_MASS      return y + +def get_events(filenames, merge_fits=False): +    ev = pd.concat([pd.read_hdf(filename, "ev") for filename in filenames],ignore_index=True) +    fits = pd.concat([pd.read_hdf(filename, "fits") for filename in filenames],ignore_index=True) +    rhdr = pd.concat([pd.read_hdf(filename, "rhdr") for filename in filenames],ignore_index=True) + +    first_gtid = rhdr.set_index('run').to_dict()['first_gtid'] + +    # First, remove junk events since orphans won't have a 50 MHz clock and so +    # could screw up the 50 MHz clock unwrapping +    ev = ev[ev.dc & DC_JUNK == 0] + +    # We need the events to be in time order here in order to calculate the +    # delta t between events. It's not obvious exactly how to do this. You +    # could sort by GTID, but that wraps around. Similarly we can't sort by the +    # 50 MHz clock because it also wraps around. Finally, I'm hesitant to sort +    # by the 10 MHz clock since it can be unreliable. +    # +    # Update: Phil proposed a clever way to get the events in order using the +    # GTID: +    # +    # > The GTID rollover should be easy to handle because there should never +    # > be two identical GTID's in a run. So if you order the events by GTID, +    # > you can assume that events with GTID's that come logically before the +    # > first GTID in the run must have occurred after the other events. +    # +    # Therefore, we can just add 0x1000000 to all GTIDs before the first GTID +    # in the event and sort on that. We get the first GTID from the RHDR bank. +    ev['gtid_sort'] = ev['gtid'].copy() + +    ev = ev.groupby('run',as_index=False).apply(gtid_sort,first_gtid=first_gtid).reset_index(level=0,drop=True) + +    ev = ev.sort_values(by=['run','gtid_sort'],kind='mergesort') + +    for run, ev_run in ev.groupby('run'): +        # Warn about 50 MHz clock jumps since they could indicate that the +        # events aren't in order. +        dt = np.diff(ev_run.gtr) +        if np.count_nonzero((np.abs(dt) > 1e9) & (dt > -0x7ffffffffff*20.0/2)): +            print_warning("Warning: %i 50 MHz clock jumps in run %i. Are the events in order?" % \ +                (np.count_nonzero((np.abs(dt) > 1e9) & (dt > -0x7ffffffffff*20.0/2)),run)) + +    # unwrap the 50 MHz clock within each run +    ev.gtr = ev.groupby(['run'],group_keys=False)['gtr'].transform(unwrap_50_mhz_clock) + +    for run, ev_run in ev.groupby('run'): +        # Warn about GTID jumps since we could be missing a potential flasher +        # and/or breakdown, and we need all the events in order to do a +        # retrigger cut +        if np.count_nonzero(np.diff(ev_run.gtid) != 1): +            print_warning("Warning: %i GTID jumps in run %i" % (np.count_nonzero(np.diff(ev_run.gtid) != 1),run)) + +    # calculate the time difference between each event and the previous event +    # so we can tag retrigger events +    ev['dt'] = ev.groupby(['run'],group_keys=False)['gtr'].transform(lambda x: np.concatenate(([1e9],np.diff(x.values)))) + +    # Calculate the approximate Ockham factor. +    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes +    # +    # Note: This is a really approximate form by assuming that the shape of +    # the likelihood space is equal to the average uncertainty in the +    # different parameters. +    fits['w'] = fits['n']*np.log(0.05/10e3) + np.log(fits['energy1']) + fits['n']*np.log(1e-4/(4*np.pi)) + +    # Apply a fudge factor to the Ockham factor of 100 for each extra particle +    # FIXME: I chose 100 a while ago but didn't really investigate what the +    # optimal value was or exactly why it was needed. Should do this. +    fits['w'] -= fits['n']*100 + +    # Note: we index on the left hand site with loc to avoid a copy error +    # +    # See https://www.dataquest.io/blog/settingwithcopywarning/ +    fits.loc[fits['n'] > 1, 'w'] += np.log(fits[fits['n'] > 1]['energy2']) +    fits.loc[fits['n'] > 2, 'w'] += np.log(fits[fits['n'] > 2]['energy3']) + +    fits['fmin'] = fits['fmin'] - fits['w'] + +    # See https://stackoverflow.com/questions/11976503/how-to-keep-index-when-using-pandas-merge +    # for how to properly divide the psi column by nhit_cal which is in the ev +    # dataframe before we actually merge +    fits['psi'] /= fits.reset_index().merge(ev,on=['run','gtid']).set_index('index')['nhit_cal'] +    fits.loc[fits['n'] == 1,'ke'] = fits['energy1'] +    fits.loc[fits['n'] == 2,'ke'] = fits['energy1'] + fits['energy2'] +    fits.loc[fits['n'] == 3,'ke'] = fits['energy1'] + fits['energy2'] + fits['energy3'] +    fits['id'] = fits['id1'] +    fits.loc[fits['n'] == 2, 'id'] = fits['id1']*100 + fits['id2'] +    fits.loc[fits['n'] == 3, 'id'] = fits['id1']*10000 + fits['id2']*100 + fits['id3'] +    fits['theta'] = fits['theta1'] +    fits['r'] = np.sqrt(fits.x**2 + fits.y**2 + fits.z**2) +    fits['r_psup'] = (fits['r']/PSUP_RADIUS)**3 + +    ev['ftp_r'] = np.sqrt(ev.ftp_x**2 + ev.ftp_y**2 + ev.ftp_z**2) +    ev['ftp_r_psup'] = (ev['ftp_r']/PSUP_RADIUS)**3 + +    # Now, select prompt events. +    # +    # We define a prompt event here as any event with an NHIT > 100 and whose +    # previous > 100 nhit event was more than 250 ms ago +    # +    # Note: It's important we do this *before* applying the data cleaning cuts +    # since otherwise we may have a prompt event identified only after the +    # cuts. +    # +    # For example, suppose there was a breakdown and for whatever reason +    # the *second* event after the breakdown didn't get tagged correctly. If we +    # apply the data cleaning cuts first and then tag prompt events then this +    # event will get tagged as a prompt event. +    ev = ev.groupby('run',group_keys=False).apply(prompt_event) + +    print("number of events after prompt nhit cut = %i" % np.count_nonzero(ev.prompt)) + +    # flasher follower cut +    ev = ev.groupby('run',group_keys=False).apply(flasher_follower_cut) + +    # breakdown follower cut +    ev = ev.groupby('run',group_keys=False).apply(breakdown_follower_cut) + +    # retrigger cut +    ev = ev.groupby('run',group_keys=False).apply(retrigger_cut) + +    if merge_fits: +        ev.set_index(['run','gtid']) + +        ev = pd.merge(fits,ev,how='inner',on=['run','gtid']) +        ev_single_particle = ev[(ev.id2 == 0) & (ev.id3 == 0)] +        ev_single_particle = ev_single_particle.sort_values('fmin').groupby(['run','gtid']).nth(0) +        ev = ev.sort_values('fmin').groupby(['run','gtid']).nth(0) +        ev['psi'] /= ev.nhit_cal + +        ev['cos_theta'] = np.cos(ev_single_particle['theta1']) +        ev['r'] = np.sqrt(ev.x**2 + ev.y**2 + ev.z**2) +        ev['udotr'] = np.sin(ev_single_particle.theta1)*np.cos(ev_single_particle.phi1)*ev_single_particle.x + \ +                      np.sin(ev_single_particle.theta1)*np.sin(ev_single_particle.phi1)*ev_single_particle.y + \ +                      np.cos(ev_single_particle.theta1)*ev_single_particle.z +        ev['udotr'] /= ev.r + +        return ev + +    return ev, fits | 
