aboutsummaryrefslogtreecommitdiff
path: root/utils/plot-energy
blob: 6867983892e19df591ce96cabd2a61c6f1d2eca1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#!/usr/bin/env python
# Copyright (c) 2019, Anthony Latorre <tlatorre at uchicago>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.

from __future__ import print_function, division
import yaml
try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml.loader import SafeLoader as Loader
import numpy as np
from scipy.stats import iqr
from matplotlib.lines import Line2D

# on retina screens, the default plots are way too small
# by using Qt5 and setting QT_AUTO_SCREEN_SCALE_FACTOR=1
# Qt5 will scale everything using the dpi in ~/.Xresources
import matplotlib
matplotlib.use("Qt5Agg")

# Rest masses (MeV) keyed by SNOMAN particle id code. The values match the
# electron (0.511 MeV) and muon (105.658 MeV) rest masses, so 20/21 are
# presumably e-/e+ and 22/23 mu-/mu+ -- TODO confirm against the SNOMAN
# documentation.
SNOMAN_MASS = {
    20: 0.511,
    21: 0.511,
    22: 105.658,
    23: 105.658
}

# Acrylic vessel radius, presumably in cm (SNO AV is 600 cm); used below for
# the fiducial-volume cuts on reconstructed positions.
AV_RADIUS = 600.0

# Data cleaning bitmasks.
# Each flag marks one data-cleaning cut; an event's `dc` word is tested
# against OR-ed combinations of these below.
DC_MUON           = 0x1
DC_JUNK           = 0x2
DC_CRATE_ISOTROPY = 0x4
DC_QVNHIT         = 0x8
DC_NECK           = 0x10
DC_FLASHER        = 0x20
DC_ESUM           = 0x40
DC_OWL            = 0x80
DC_OWL_TRIGGER    = 0x100
DC_FTS            = 0x200

def plot_hist(x, label=None):
    """Plot a histogram of `x` as an unfilled step outline.

    The bin width is chosen with the Freedman-Diaconis rule; see
    https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule

    Note: relies on `plt` (matplotlib.pyplot) being imported at module
    level by the __main__ block.
    """
    x = np.asarray(x)
    xmin = np.min(x)
    xmax = np.max(x)
    # Freedman-Diaconis bin width: 2*IQR/n^(1/3).
    h = 2*iqr(x)/len(x)**(1/3)
    if h > 0:
        # Use at least 10 bin edges even when the FD width is very coarse.
        n = max(int((xmax-xmin)/h), 10)
    else:
        # Degenerate IQR of zero (e.g. mostly repeated values) would
        # otherwise cause a division by zero; fall back to 10 edges.
        n = 10
    bins = np.linspace(xmin, xmax, n)
    plt.hist(x, bins=bins, histtype='step', label=label)

def chunks(l, n):
    """Split the sequence `l` into consecutive pieces of length `n`.

    Yields slices of `l` in order; the last piece is shorter when
    len(l) is not a multiple of `n`.
    """
    start = 0
    total = len(l)
    while start < total:
        yield l[start:start + n]
        start += n

if __name__ == '__main__':
    import argparse
    import matplotlib.pyplot as plt
    import pandas as pd
    import sys

    # Note: the first positional argument to ArgumentParser is `prog`, not
    # the description, so pass it by keyword for correct --help output.
    parser = argparse.ArgumentParser(description="plot fit results")
    parser.add_argument("filenames", nargs='+', help="input files")
    args = parser.parse_args()

    events = []
    fit_results = []

    for filename in args.filenames:
        print(filename)
        with open(filename) as f:
            data = yaml.load(f.read(),Loader=Loader)

        for event in data['data']:
            for ev in event['ev']:
                # One row per triggered event. Fields missing from the YAML
                # (no 'ftp' fit, or no/unphysical 'rsp' energy) are stored as
                # NaN so they can be filtered out later.
                events.append((
                    ev['run'],
                    ev['gtr'],
                    ev['nhit'],
                    ev['gtid'],
                    ev['dc'],
                    ev['ftp']['x'] if 'ftp' in ev else np.nan,
                    ev['ftp']['y'] if 'ftp' in ev else np.nan,
                    ev['ftp']['z'] if 'ftp' in ev else np.nan,
                    ev['rsp']['energy'] if 'rsp' in ev and ev['rsp']['energy'] > 0 else np.nan,
                    ))

                if 'fit' not in ev:
                    continue

                # Only integer keys of ev['fit'] are fit results; other keys
                # are metadata. items() (not the Python-2-only iteritems())
                # works under both Python 2 and 3.
                for particle_id, fit_result in [x for x in ev['fit'].items() if isinstance(x[0],int)]:
                    # FIXME: Should I just store the particle ids in the YAML
                    # output as a list of particle ids instead of a single
                    # integer?
                    # The combined id encodes two digits per particle; the
                    # explicit list() is required under Python 3, where map()
                    # returns an iterator and len(ids) below would fail.
                    ids = list(map(int,chunks(str(particle_id),2)))
                    energy = 0.0
                    skip = False
                    for pid, ke in zip(ids,np.atleast_1d(fit_result['energy'])):
                        # Total energy = kinetic energy + rest mass.
                        energy += ke + SNOMAN_MASS[pid]

                        # This is a bit of a hack. It appears that many times
                        # the fit will actually do much better by including a
                        # very low energy electron or muon. I believe the
                        # reason for this is that of course my likelihood
                        # function is not perfect (for example, I don't include
                        # the correct angular distribution for Rayleigh
                        # scattered light), and so the fitter often wants to
                        # add a very low energy electron or muon to fix things.
                        #
                        # Ideally I would fix the likelihood function, but for
                        # now we just discard any fit results which have a very
                        # low energy electron or muon.
                        if len(ids) > 1 and pid == 20 and ke < 20.0:
                            skip = True

                        if len(ids) > 1 and pid == 22 and ke < 200.0:
                            skip = True

                    if skip:
                        continue

                    # Calculate the approximate Ockham factor.
                    # See Chapter 20 in "Probability Theory: The Logic of Science" by Jaynes
                    #
                    # Note: This is a really approximate form by assuming that
                    # the shape of the likelihood space is equal to the average
                    # uncertainty in the different parameters.
                    w = len(ids)*np.log(0.1*0.001) + np.sum(np.log(fit_result['energy'])) + len(ids)*np.log(1e-4/(4*np.pi))

                    fit_results.append((
                        ev['run'],
                        ev['gtid'],
                        particle_id,
                        fit_result['posx'],
                        fit_result['posy'],
                        fit_result['posz'],
                        fit_result['t0'],
                        energy,
                        fit_result['fmin'] - w,
                        fit_result['psi']/ev['nhit']))

    # create a dataframe
    # note: we have to first create a numpy structured array since there is no
    # way to pass a list of data types to the DataFrame constructor. See
    # https://github.com/pandas-dev/pandas/issues/4464
    # Note: the plain builtin `int` is used as the dtype because the np.int
    # alias was deprecated in numpy 1.20 and removed in 1.24; numpy treats
    # `int` identically.
    array = np.array(fit_results,
                     dtype=[('run',int),         # run number
                            ('gtid',int),        # gtid
                            ('id',int),          # particle id
                            ('x', np.double),    # x
                            ('y',np.double),     # y
                            ('z',np.double),     # z
                            ('t0',np.double),    # t0
                            ('ke',np.double),    # kinetic energy
                            ('fmin',np.double),  # negative log likelihood
                            ('psi',np.double)]   # goodness of fit parameter
                   )
    df = pd.DataFrame.from_records(array)

    array = np.array(events,
                     dtype=[('run',int),         # run number
                            ('gtr',np.double),   # 50 MHz clock in ns
                            ('nhit',int),        # number of PMTs hit
                            ('gtid',int),        # gtid
                            ('dc',int),          # data cleaning word
                            ('ftpx',np.double),  # FTP fit x
                            ('ftpy',np.double),  # FTP fit y
                            ('ftpz',np.double),  # FTP fit z
                            ('rsp_energy',np.double)]  # RSP reconstructed energy
                   )
    df_ev = pd.DataFrame.from_records(array)

    # remove events 200 microseconds after a muon
    muons = df_ev[(df_ev.dc & DC_MUON) != 0]

    print("number of events = %i" % len(df_ev))

    print("number of muons = %i" % len(muons))

    df_ev = df_ev[(df_ev.dc & DC_MUON) == 0]

    print("number of events after muon cut = %i" % len(df_ev))

    if muons.size:
        # FIXME: need to deal with 50 MHz clock rollover
        df_ev = df_ev[~np.any((df_ev.gtr.values > muons.gtr.values[:,np.newaxis]) & (df_ev.gtr.values <= (muons.gtr.values[:,np.newaxis] + 200e3)),axis=0)]

    print("number of events after muon follower cut = %i" % len(df_ev))

    # perform prompt event data cleaning
    df_ev = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK) == 0]

    print("number of events after data cleaning = %i" % len(df_ev))

    # select prompt events
    # FIXME: how to deal with two prompt events one after another
    prompt = df_ev[df_ev.nhit >= 100]

    print("number of events after prompt nhit cut = %i" % len(prompt))

    if prompt.size:
        # FIXME: need to deal with 50 MHz clock rollover
        # neutron followers have to obey stricter set of data cleaning cuts
        neutron = df_ev[df_ev.dc & (DC_JUNK | DC_CRATE_ISOTROPY | DC_QVNHIT | DC_FLASHER | DC_NECK | DC_ESUM | DC_OWL | DC_OWL_TRIGGER | DC_FTS) == DC_FTS]
        neutron = neutron[~np.isnan(neutron.ftpx) & ~np.isnan(neutron.rsp_energy)]
        r = np.sqrt(neutron.ftpx**2 + neutron.ftpy**2 + neutron.ftpz**2)
        neutron = neutron[r < AV_RADIUS]
        neutron = neutron[neutron.rsp_energy > 4.0]
        # neutron events accepted after 20 microseconds and before 250 ms (50 ms during salt)
        df_ev = prompt[~np.any((neutron.gtr.values > prompt.gtr.values[:,np.newaxis] + 20e3) & (neutron.gtr.values < prompt.gtr.values[:,np.newaxis] + 250e6),axis=1)]
    else:
        df_ev = prompt

    print("number of events after neutron follower cut = %i" % len(df_ev))

    df = pd.merge(df,df_ev,how='inner',on=['run','gtid'])

    # get rid of events which don't have a fit
    nan = np.isnan(df.fmin.values)
    df = df[~nan]

    if np.count_nonzero(nan):
        print("skipping %i events because they are missing fit information!" % np.count_nonzero(nan),file=sys.stderr)

    # get the best fit
    df = df.sort_values('fmin').groupby(['run','gtid']).first()

    # require r < 6 meters
    df = df[np.sqrt(df.x.values**2 + df.y.values**2 + df.z.values**2) < AV_RADIUS]

    print("number of events after radius cut = %i" % len(df))

    # Note: Need to design and apply a psi based cut here, and apply the muon
    # and neutron follower cuts.

    # One subplot per particle-id hypothesis, laid out on a 3x4 grid by the
    # number of particles in the hypothesis (1, 2, or 3).
    for particle_id, df_id in sorted(df.groupby('id')):
        if particle_id == 20:
            plt.subplot(3,4,1)
        elif particle_id == 22:
            plt.subplot(3,4,2)
        elif particle_id == 2020:
            plt.subplot(3,4,5)
        elif particle_id == 2022:
            plt.subplot(3,4,6)
        elif particle_id == 2222:
            plt.subplot(3,4,7)
        elif particle_id == 202020:
            plt.subplot(3,4,9)
        elif particle_id == 202022:
            plt.subplot(3,4,10)
        elif particle_id == 202222:
            plt.subplot(3,4,11)
        elif particle_id == 222222:
            plt.subplot(3,4,12)

        plt.hist(df_id.ke.values, bins=np.linspace(20,10e3,100), histtype='step')
        plt.xlabel("Energy (MeV)")
        plt.title(str(particle_id))

    plt.tight_layout()
    plt.show()