From 5d8906508262d326450752204ac93119f355db88 Mon Sep 17 00:00:00 2001
From: tlatorre
Date: Tue, 8 Dec 2020 09:02:16 -0600
Subject: update dm-search to fix bug

This commit updates dm-search to fix a bug where I was returning lists from
get_limits() but then comparing the best fit and discovery lists as if they
were numpy arrays. I also updated how I calculate the expected number of
events based on the results from doing a toy MC.
---
 utils/dm-search | 74 +++++++++++++++++++++++----------------------------------
 1 file changed, 30 insertions(+), 44 deletions(-)

diff --git a/utils/dm-search b/utils/dm-search
index 1e9c987..5b9cbaa 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -377,49 +377,15 @@ def get_dm_sample(n,dm_particle_id,dm_mass,dm_energy):
         data[i] = T1, T2, T1 + T2, id1, id2, dm_particle_id
     return pd.DataFrame(data)
 
-def integrate_mc_hist(mc, muons, id, x, a, b, atmo_scale_factor, muon_scale_factor):
-    """
-    Integrate the Monte Carlo expected histograms from a to b.
-
-    Args:
-        - mc: DataFrame
-            Pandas dataframe of atmospheric Monte Carlo events.
-        - muons: DataFrame
-            Pandas dataframe of external muon Monte Carlo events.
-        - id: int
-            The particle ID histogram to integrate.
-        - x: array
-            The fit parameters.
-        - a, b: float
-            Bounds of the integration.
-        - atmo_scale_factor, muon_scale_factor: float
-            Scale factors for the fit parameters.
-    """
-    mc_hists = get_mc_hists(mc,x,bins,scale=x[0]/atmo_scale_factor)
-    muon_hists = get_mc_hists(muons,x,bins,scale=x[5]/muon_scale_factor)
-
-    def total_hist(x):
-        if x < bins[0] or x > bins[-1]:
-            return 0
-        i = np.digitize(x,bins)
-        if i == len(bins):
-            i = len(bins) - 1
-        return (mc_hists[id][i-1] + muon_hists[id][i-1])/(bins[i]-bins[i-1])
-
-    # Yes, this is a stupid simple integration that I don't need quad for, but
-    # I don't want to have to code all the corner cases including when a and b
-    # are in the same bin, etc.
-    return quad(total_hist,a,b,points=bins[(bins >= a) & (bins <= b)])[0]
-
 def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin):
     limits = {}
     best_fit = {}
     discovery_array = {}
     for dm_particle_id in (2020,2222):
-        limits[dm_particle_id] = []
-        best_fit[dm_particle_id] = []
-        discovery_array[dm_particle_id] = []
-        for dm_mass in dm_masses[dm_particle_id]:
+        limits[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        best_fit[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        discovery_array[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        for i, dm_mass in enumerate(dm_masses[dm_particle_id]):
             id1 = dm_particle_id//100
             id2 = dm_particle_id % 100
             m1 = SNOMAN_MASS[id1]
@@ -428,7 +394,7 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             xopt, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
 
             limit = np.percentile(samples[:,6],90)
-            limits[dm_particle_id].append(limit)
+            limits[dm_particle_id][i] = limit
 
             # Here, to determine if there is a discovery we make an approximate
             # calculation of the number of events which would be significant.
@@ -447,9 +413,22 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             # by n. Therefore, to determine the threshold, we simply look for
             # the threshold we expect in n and then subtract n.
             dm_kinetic_energy = dm_energy - m1 - m2
-            a = dm_kinetic_energy - 2*dm_kinetic_energy*DM_ENERGY_RESOLUTION
-            b = dm_kinetic_energy + 2*dm_kinetic_energy*DM_ENERGY_RESOLUTION
-            n = integrate_mc_hist(data_mc, muon, dm_particle_id, xopt, a, b, atmo_scale_factor, muon_scale_factor)
+
+            dm_sample = get_dm_sample(DM_SAMPLES,dm_particle_id,dm_mass,dm_energy)
+
+            # To calculate `n` we approximately want the number of events in
+            # the bin which most of the dark matter events will fall. However,
+            # to smoothly transition between bins, we multiply the normalized
+            # dark matter histogram with the expected MC histogram and then
+            # take the sum. In the case that the dark matter events all fall
+            # into a single bin, this gives us that bin, but smoothly
+            # interpolates between the bins.
+            dm_hists = get_mc_hists(dm_sample,xopt,bins,scale=1/len(dm_sample))
+            frac = dm_hists[dm_particle_id].sum()
+            dm_hists[dm_particle_id] /= frac
+            mc_hists = get_mc_hists(data_mc,xopt,bins,scale=xopt[0]/atmo_scale_factor)
+            muon_hists = get_mc_hists(muon,xopt,bins,scale=xopt[5]/muon_scale_factor)
+            n = (dm_hists[dm_particle_id]*(mc_hists[dm_particle_id] + muon_hists[dm_particle_id])).sum()
 
             # Set our discovery threshold to the p-value we want divided by the
             # number of bins. The idea here is that the number of bins is
             # approximately equal to the number of trials so we need to
@@ -457,8 +436,15 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             # elsewhere effect.
             threshold = DISCOVERY_P_VALUE/(len(bins)-1)
             discovery = poisson.ppf(1-threshold,n) + 1 - n
-            discovery_array[dm_particle_id].append(discovery)
-            best_fit[dm_particle_id].append(xopt[6])
+            # Here, we scale the discovery threshold by the fraction of the
+            # dark matter hist in the histogram range. The idea is that if only
+            # a small fraction of the dark matter histogram falls into the
+            # histogram range, the total number of dark matter events returned
+            # by the fit can be larger by this amount. I noticed this when
+            # testing under the null hypothesis that the majority of the
+            # "discoveries" were on the edge of the histogram.
+            discovery_array[dm_particle_id][i] = discovery/frac
+            best_fit[dm_particle_id][i] = xopt[6]
 
     return limits, best_fit, discovery_array
--
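For reference, the calculation this patch introduces amounts to a weighted
overlap between the normalized toy dark matter histogram and the expected
background histogram, followed by a Poisson threshold scaled by the in-range
fraction. Below is a minimal standalone sketch of that logic; the function and
variable names are illustrative only (dm-search itself builds the histograms
with get_mc_hists() as shown in the diff above):

    import numpy as np
    from scipy.stats import poisson

    def expected_background_overlap(dm_hist, bkg_hist):
        """Return (n, frac) for a toy DM histogram and a background histogram."""
        # Fraction of the dark matter sample that lands inside the histogram
        # range; this can be less than 1 if events leak out of the fit range.
        frac = dm_hist.sum()
        weights = dm_hist/frac
        # Weighted sum of the background: equal to the single-bin content when
        # all DM events fall in one bin, and a smooth interpolation otherwise.
        n = (weights*bkg_hist).sum()
        return n, frac

    def discovery_threshold(n, frac, p_value, nbins):
        """Events above background needed for a 'discovery', scaled by frac."""
        threshold = p_value/nbins  # crude trials (look-elsewhere) correction
        return (poisson.ppf(1 - threshold, n) + 1 - n)/frac

    if __name__ == "__main__":
        dm_hist = np.array([0.0, 0.2, 0.6, 0.1, 0.0])   # toy DM shape, frac = 0.9
        bkg_hist = np.array([5.0, 4.0, 3.0, 2.0, 1.0])  # expected background
        n, frac = expected_background_overlap(dm_hist, bkg_hist)
        print(n, discovery_threshold(n, frac, p_value=0.05, nbins=len(bkg_hist)))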
Key:
    - To do
    p Partially done
    ^ Done
    ? Not sure if it's done

- write cross section section in paper
? explain integral in paper
    - not sure what integral I was talking about here. I just checked the git
      history and this was before I had written the fitter, so it must be
      talking about the integral to compute the expected event rate.
p develop signal generator
    - made signal generator for two particle decays
p develop high energy and multi-ring fitter
^ write talk for Monday
^ write a section on atmospheric backgrounds in paper
^ check index of refraction calculation against tables in the paper
^ write more tests for the solid angle calculation
    - use table from paper
^ make sure normal direction is normalized
^ speed up likelihood calculation
    - rotate and translate path points in path_init
    - create fast interp1d for equally spaced points (sketch below)
    - optimize with -O2
    - only calculate average time if mu_direct[i] is not too small and PMT is
      hit
    - add likelihood function which calculates total expected charge and only
      loops over hit PMTs
    - add fast likelihood which doesn't use a 2D lookup table
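    - A rough illustration of the "fast interp1d for equally spaced points"
      idea (not the fitter's actual code, which lives in C; the point is just
      that an equally spaced grid lets you compute the bin index directly
      instead of doing a binary search):

          def interp1d_equal(x, x0, dx, ys):
              """Linear interpolation on an equally spaced grid where ys[i]
              is the value at x0 + i*dx."""
              i = int((x - x0)/dx)              # O(1) index, no search needed
              i = min(max(i, 0), len(ys) - 2)   # clamp to the table range
              t = (x - (x0 + i*dx))/dx
              return ys[i]*(1 - t) + ys[i+1]*t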
- add function to compute the Hessian via finite differences
- add function to calculate covariance matrix
- add function to compute the approximate volume in the likelihood space where
  the likelihood is greater than some value
^ add output to likelihood
^ add a set of seed points to start the likelihood
^ minimize the likelihood in a do while loop until it converges to a single
  point
^ once the first position is found, compute the Hough transform to look for
  other rings
p add rayleigh scattering to likelihood function
    - Done, but it's pretty hacky
p add muon shower PDF? delta rays?
    - Done for delta rays, but need to add shower photons
- calculate path length through acrylic?
    - did some tests on this and it seems to fix the radial energy bias for
      electrons, so it's definitely an issue. However, currently the small 5%
      energy bias is not really a big deal, although I would like to fix this
      eventually.
p update estar table with energies above 1 GeV
    - added values up to 10 GeV
^ speed up normalize() and interp1d()
^ add shower PDF to get_total_charge_approx()
x compute mean of starting points
^ speed up time PDF calculation for fast likelihood
p calculate psi parameter as a goodness of fit test
    - Still need to tweak this. It seems to have a strong nhit dependence
^ add a maximum length for the range
    - now only fit to the end of the PSUP
^ add a time limit for the fit
^ multi-particle fit
p optimize CHARGE_FRACTION
? find out why likelihood is sometimes returning nan
^ add pdf for cerenkov light from delta rays
^ tweak find peaks algorithm
^ update zebra to allow use of links
p minimum energy for each particle type
    - I do already have a minimum energy, which is the Cerenkov threshold;
      however, I think what I meant here was to have something significantly
      higher than that, since a particle just at the Cerenkov threshold won't
      produce many PMT hits. I think I was mostly concerned about this issue
      because many fits were always fitting better by including a low-energy
      electron; however, I have solved that problem by simply dropping
      multi-particle fits which have an electron with an energy less than 20 MeV.
^ check all zebra links
^ fix zero logical record size bug in zebra.c
p speed up fit
    - have optimized most of the likelihood function. I think further
      optimization will require a different technique for evaluating the
      integrals
^ nhit cut
p add flasher cut
p default max particles
    - submit-grid-jobs fits up to 3 particles by default
^ add environment variable to control where to look for pmt.txt and titles files
p update charge code to handle highest values for qhs and qlx
    - don't have this coded into the charge probability code, but I now use QHL
      if QHS is railed and flag any PMT with a railed QHL
^ use qhs and qlx based on variable in PMT bank
    - Note: I don't use the best charge variable in the PMT bank, but I do
      select QHL when QHS is railed
^ fix bug in path_init()
^ load DQXX file based on run number
^ update rayleigh scattering lengths
^ fix delta ray calculation
p figure out why electron energy bias is +10%
    - updated code to use a more accurate approximation for the number of
      shower photons which seems to bring the bias down to 5%
p update find peaks algorithm to do single particle quick fits
    - I did update the find peaks algorithm to work *much* better, but it still
      doesn't do single particle quick fits. I think the next improvement would
      be to add the ability to actually determine the number of rings.
^ figure out how to combine SNO fitter data with my fitter for final analysis.
  For example, how to apply neutron follower cut?
- double check that zdab-reprocess is correct for the D2O and salt phases since
  it appears to be from the NCD phase
? add code to compute expected deviation from nll_best to normalize psi
    - tried several different versions of this and nothing seemed to perform as
      well as psi/nhit.
^ add term to likelihood for probability that a channel is miscalibrated or
  didn't make it into the event
- when calculating the first order statistic for a Gaussian, is alpha = pi/8 a
  better choice than 0.375?
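    - For context, this refers to the plotting-position approximation for
      normal order statistics, E[X_(i:n)] ~ Phi^-1((i - alpha)/(n - 2*alpha + 1)),
      where alpha = 0.375 is Blom's usual choice. A quick way to compare the two
      candidate values for the first order statistic (a sketch using scipy, not
      code from this repository):

          import numpy as np
          from scipy.stats import norm

          def approx_first_order_stat(n, alpha):
              # E[min of n standard normals] ~ Phi^-1((1 - alpha)/(n - 2*alpha + 1))
              return norm.ppf((1 - alpha)/(n - 2*alpha + 1))

          print(approx_first_order_stat(100, 0.375))    # Blom's alpha
          print(approx_first_order_stat(100, np.pi/8))  # alternative from the item above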
- speed up charge initialization
- determine *real* mean and standard deviation of single PE charge
  distribution. TELLIE SNO+ data?
^ extend electron range table
    - extended the electron range table up to 1 TeV
? fix time PDF. Currently take the first order statistic of dark noise +
  indirect light + direct light all together, but this isn't correct.
    - thought more about this and it's not actually obvious if what I'm doing
      is wrong. Need to think more about this.
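    - For reference, the "first order statistic" here is the distribution of the
      earliest hit time: for n independent times with pdf f(t) and cdf F(t), the
      minimum has pdf f_(1)(t) = n*f(t)*(1 - F(t))^(n-1). A toy numerical check
      of that fact (purely illustrative, not the fitter's code):

          import numpy as np
          rng = np.random.default_rng(0)
          n = 5
          # the minimum of n Exp(1) samples is Exp(n), so its mean should be 1/n
          first = rng.exponential(size=(100000, n)).min(axis=1)
          print(first.mean(), 1.0/n)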