From 5d8906508262d326450752204ac93119f355db88 Mon Sep 17 00:00:00 2001
From: tlatorre
Date: Tue, 8 Dec 2020 09:02:16 -0600
Subject: update dm-search to fix bug

This commit updates dm-search to fix a bug where I was returning lists from
get_limits() but then comparing the best fit and discovery lists as if they
were numpy arrays. I also updated how I calculate the expected number of
events based on the results from doing a toy MC.
---
 utils/dm-search | 74 +++++++++++++++++++++++----------------------------------
 1 file changed, 30 insertions(+), 44 deletions(-)

diff --git a/utils/dm-search b/utils/dm-search
index 1e9c987..5b9cbaa 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -377,49 +377,15 @@ def get_dm_sample(n,dm_particle_id,dm_mass,dm_energy):
         data[i] = T1, T2, T1 + T2, id1, id2, dm_particle_id
     return pd.DataFrame(data)
 
-def integrate_mc_hist(mc, muons, id, x, a, b, atmo_scale_factor, muon_scale_factor):
-    """
-    Integrate the Monte Carlo expected histograms from a to b.
-
-    Args:
-        - mc: DataFrame
-            Pandas dataframe of atmospheric Monte Carlo events.
-        - muons: DataFrame
-            Pandas dataframe of external muon Monte Carlo events.
-        - id: int
-            The particle ID histogram to integrate.
-        - x: array
-            The fit parameters.
-        - a, b: float
-            Bounds of the integration.
-        - atmo_scale_factor, muon_scale_factor: float
-            Scale factors for the fit parameters.
-    """
-    mc_hists = get_mc_hists(mc,x,bins,scale=x[0]/atmo_scale_factor)
-    muon_hists = get_mc_hists(muons,x,bins,scale=x[5]/muon_scale_factor)
-
-    def total_hist(x):
-        if x < bins[0] or x > bins[-1]:
-            return 0
-        i = np.digitize(x,bins)
-        if i == len(bins):
-            i = len(bins) - 1
-        return (mc_hists[id][i-1] + muon_hists[id][i-1])/(bins[i]-bins[i-1])
-
-    # Yes, this is a stupid simple integration that I don't need quad for, but
-    # I don't want to have to code all the corner cases including when a and b
-    # are in the same bin, etc.
-    return quad(total_hist,a,b,points=bins[(bins >= a) & (bins <= b)])[0]
-
 def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin):
     limits = {}
     best_fit = {}
     discovery_array = {}
     for dm_particle_id in (2020,2222):
-        limits[dm_particle_id] = []
-        best_fit[dm_particle_id] = []
-        discovery_array[dm_particle_id] = []
-        for dm_mass in dm_masses[dm_particle_id]:
+        limits[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        best_fit[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        discovery_array[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        for i, dm_mass in enumerate(dm_masses[dm_particle_id]):
             id1 = dm_particle_id//100
             id2 = dm_particle_id % 100
             m1 = SNOMAN_MASS[id1]
@@ -428,7 +394,7 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             xopt, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
 
             limit = np.percentile(samples[:,6],90)
-            limits[dm_particle_id].append(limit)
+            limits[dm_particle_id][i] = limit
 
             # Here, to determine if there is a discovery we make an approximate
             # calculation of the number of events which would be significant.
@@ -447,9 +413,22 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             # by n. Therefore, to determine the threshold, we simply look for
             # the threshold we expect in n and then subtract n.
             dm_kinetic_energy = dm_energy - m1 - m2
-            a = dm_kinetic_energy - 2*dm_kinetic_energy*DM_ENERGY_RESOLUTION
-            b = dm_kinetic_energy + 2*dm_kinetic_energy*DM_ENERGY_RESOLUTION
-            n = integrate_mc_hist(data_mc, muon, dm_particle_id, xopt, a, b, atmo_scale_factor, muon_scale_factor)
+
+            dm_sample = get_dm_sample(DM_SAMPLES,dm_particle_id,dm_mass,dm_energy)
+
+            # To calculate `n` we approximately want the number of events in
+            # the bin which most of the dark matter events will fall. However,
+            # to smoothly transition between bins, we multiply the normalized
+            # dark matter histogram with the expected MC histogram and then
+            # take the sum. In the case that the dark matter events all fall
+            # into a single bin, this gives us that bin, but smoothly
+            # interpolates between the bins.
+            dm_hists = get_mc_hists(dm_sample,xopt,bins,scale=1/len(dm_sample))
+            frac = dm_hists[dm_particle_id].sum()
+            dm_hists[dm_particle_id] /= frac
+            mc_hists = get_mc_hists(data_mc,xopt,bins,scale=xopt[0]/atmo_scale_factor)
+            muon_hists = get_mc_hists(muon,xopt,bins,scale=xopt[5]/muon_scale_factor)
+            n = (dm_hists[dm_particle_id]*(mc_hists[dm_particle_id] + muon_hists[dm_particle_id])).sum()
 
             # Set our discovery threshold to the p-value we want divided by the
             # number of bins. The idea here is that the number of bins is
             # approximately equal to the number of trials so we need to
@@ -457,8 +436,15 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             # elsewhere effect.
             threshold = DISCOVERY_P_VALUE/(len(bins)-1)
             discovery = poisson.ppf(1-threshold,n) + 1 - n
-            discovery_array[dm_particle_id].append(discovery)
-            best_fit[dm_particle_id].append(xopt[6])
+            # Here, we scale the discovery threshold by the fraction of the
+            # dark matter hist in the histogram range. The idea is that if only
+            # a small fraction of the dark matter histogram falls into the
+            # histogram range, the total number of dark matter events returned
+            # by the fit can be larger by this amount. I noticed this when
+            # testing under the null hypothesis that the majority of the
+            # "discoveries" were on the edge of the histogram.
+            discovery_array[dm_particle_id][i] = discovery/frac
+            best_fit[dm_particle_id][i] = xopt[6]
 
     return limits, best_fit, discovery_array
--
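For reference, the calculation this patch introduces amounts to a weighted
overlap between the normalized toy dark matter histogram and the expected
background histogram, followed by a Poisson threshold scaled by the in-range
fraction. Below is a minimal standalone sketch of that logic; the function and
variable names are illustrative only (dm-search itself builds the histograms
with get_mc_hists() as shown in the diff above):

    import numpy as np
    from scipy.stats import poisson

    def expected_background_overlap(dm_hist, bkg_hist):
        """Return (n, frac) for a toy DM histogram and a background histogram."""
        # Fraction of the dark matter sample that lands inside the histogram
        # range; this can be less than 1 if events leak out of the fit range.
        frac = dm_hist.sum()
        weights = dm_hist/frac
        # Weighted sum of the background: equal to the single-bin content when
        # all DM events fall in one bin, and a smooth interpolation otherwise.
        n = (weights*bkg_hist).sum()
        return n, frac

    def discovery_threshold(n, frac, p_value, nbins):
        """Events above background needed for a 'discovery', scaled by frac."""
        threshold = p_value/nbins  # crude trials (look-elsewhere) correction
        return (poisson.ppf(1 - threshold, n) + 1 - n)/frac

    if __name__ == "__main__":
        dm_hist = np.array([0.0, 0.2, 0.6, 0.1, 0.0])   # toy DM shape, frac = 0.9
        bkg_hist = np.array([5.0, 4.0, 3.0, 2.0, 1.0])  # expected background
        n, frac = expected_background_overlap(dm_hist, bkg_hist)
        print(n, discovery_threshold(n, frac, p_value=0.05, nbins=len(bkg_hist)))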
Key:
    - To do
    p Partially done
    ^ Done
    ? Not sure if it's done

- write cross section section in paper
? explain integral in paper
    - not sure what integral I was talking about here. I just checked the git
      history and this was before I had written the fitter, so it must be
      talking about the integral to compute the expected event rate.
p develop signal generator
    - made signal generator for two particle decays
p develop high energy and multi-ring fitter
^ write talk for Monday
^ write a section on atmospheric backgrounds in paper
^ check index of refraction calculation against tables in the paper
^ write more tests for the solid angle calculation
    - use table from paper
^ make sure normal direction is normalized
^ speed up likelihood calculation
    - rotate and translate path points in path_init
    - create fast interp1d for equally spaced points (sketch below)
    - optimize with -O2
    - only calculate average time if mu_direct[i] is not too small and PMT is
      hit
    - add likelihood function which calculates total expected charge and only
      loops over hit PMTs
    - add fast likelihood which doesn't use a 2D lookup table
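    - A rough illustration of the "fast interp1d for equally spaced points"
      idea (not the fitter's actual code, which lives in C; the point is just
      that an equally spaced grid lets you compute the bin index directly
      instead of doing a binary search):

          def interp1d_equal(x, x0, dx, ys):
              """Linear interpolation on an equally spaced grid where ys[i]
              is the value at x0 + i*dx."""
              i = int((x - x0)/dx)              # O(1) index, no search needed
              i = min(max(i, 0), len(ys) - 2)   # clamp to the table range
              t = (x - (x0 + i*dx))/dx
              return ys[i]*(1 - t) + ys[i+1]*t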
- add function to compute the Hessian via finite differences
- add function to calculate covariance matrix
- add function to compute the approximate volume in the likelihood space where
  the likelihood is greater than some value
^ add output to likelihood
^ add a set of seed points to start the likelihood
^ minimize the likelihood in a do while loop until it converges to a single
  point
^ once the first position is found, compute the Hough transform to look for
  other rings
p add rayleigh scattering to likelihood function
    - Done, but it's pretty hacky
p add muon shower PDF? delta rays?
    - Done for delta rays, but need to add shower photons
- calculate path length through acrylic?
    - did some tests on this and it seems to fix the radial energy bias for
      electrons, so it's definitely an issue. However, currently the small 5%
      energy bias is not really a big deal, although I would like to fix this
      eventually.
p update estar table with energies above 1 GeV
    - added values up to 10 GeV
^ speed up normalize() and interp1d()
^ add shower PDF to get_total_charge_approx()
x compute mean of starting points
^ speed up time PDF calculation for fast likelihood
p calculate psi parameter as a goodness of fit test
    - Still need to tweak this. It seems to have a strong nhit dependence
^ add a maximum length for the range
    - now only fit to the end of the PSUP
^ add a time limit for the fit
^ multi-particle fit
p optimize CHARGE_FRACTION
? find out why likelihood is sometimes returning nan
^ add pdf for cerenkov light from delta rays
^ tweak find peaks algorithm
^ update zebra to allow use of links
p minimum energy for each particle type
    - I do already have a minimum energy, which is the Cerenkov threshold;
      however, I think what I meant here was to have something significantly
      higher than that, since a particle just at the Cerenkov threshold won't
      produce many PMT hits. I think I was mostly concerned about this issue
      because many fits were always fitting better by including a low-energy
      electron; however, I have solved that problem by simply dropping
      multi-particle fits which have an electron with an energy less than 20 MeV.
^ check all zebra links
^ fix zero logical record size bug in zebra.c
p speed up fit
    - have optimized most of the likelihood function. I think further
      optimization will require a different technique for evaluating the
      integrals
^ nhit cut
p add flasher cut
p default max particles
    - submit-grid-jobs fits up to 3 particles by default
^ add environment variable to control where to look for pmt.txt and titles files
p update charge code to handle highest values for qhs and qlx
    - don't have this coded into the charge probability code, but I now use QHL
      if QHS is railed and flag any PMT with a railed QHL
^ use qhs and qlx based on variable in PMT bank
    - Note: I don't use the best charge variable in the PMT bank, but I do
      select QHL when QHS is railed
^ fix bug in path_init()
^ load DQXX file based on run number
^ update rayleigh scattering lengths
^ fix delta ray calculation
p figure out why electron energy bias is +10%
    - updated code to use a more accurate approximation for the number of
      shower photons which seems to bring the bias down to 5%
p update find peaks algorithm to do single particle quick fits
    - I did update the find peaks algorithm to work *much* better, but it still
      doesn't do single particle quick fits. I think the next improvement would
      be to add the ability to actually determine the number of rings.
^ figure out how to combine SNO fitter data with my fitter for final analysis.
  For example, how to apply neutron follower cut?
- double check that zdab-reprocess is correct for the D2O and salt phases since
  it appears to be from the NCD phase
? add code to compute expected deviation from nll_best to normalize psi
    - tried several different versions of this and nothing seemed to perform as
      well as psi/nhit.
^ add term to likelihood for probability that a channel is miscalibrated or
  didn't make it into the event
- when calculating the first order statistic for a Gaussian, is alpha = pi/8 a
  better choice than 0.375?
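    - For context, this refers to the plotting-position approximation for
      normal order statistics, E[X_(i:n)] ~ Phi^-1((i - alpha)/(n - 2*alpha + 1)),
      where alpha = 0.375 is Blom's usual choice. A quick way to compare the two
      candidate values for the first order statistic (a sketch using scipy, not
      code from this repository):

          import numpy as np
          from scipy.stats import norm

          def approx_first_order_stat(n, alpha):
              # E[min of n standard normals] ~ Phi^-1((1 - alpha)/(n - 2*alpha + 1))
              return norm.ppf((1 - alpha)/(n - 2*alpha + 1))

          print(approx_first_order_stat(100, 0.375))    # Blom's alpha
          print(approx_first_order_stat(100, np.pi/8))  # alternative from the item above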
- speed up charge initialization
- determine *real* mean and standard deviation of single PE charge
  distribution. TELLIE SNO+ data?
^ extend electron range table
    - extended the electron range table up to 1 TeV
? fix time PDF. Currently take the first order statistic of dark noise +
  indirect light + direct light all together, but this isn't correct.
    - thought more about this and it's not actually obvious if what I'm doing
      is wrong. Need to think more about this.
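    - For reference, the "first order statistic" here is the distribution of the
      earliest hit time: for n independent times with pdf f(t) and cdf F(t), the
      minimum has pdf f_(1)(t) = n*f(t)*(1 - F(t))^(n-1). A toy numerical check
      of that fact (purely illustrative, not the fitter's code):

          import numpy as np
          rng = np.random.default_rng(0)
          n = 5
          # the minimum of n Exp(1) samples is Exp(n), so its mean should be 1/n
          first = rng.exponential(size=(100000, n)).min(axis=1)
          print(first.mean(), 1.0/n)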