| author | tlatorre <tlatorre@uchicago.edu> | 2020-12-08 09:02:16 -0600 |
|---|---|---|
| committer | tlatorre <tlatorre@uchicago.edu> | 2020-12-08 09:02:16 -0600 |
| commit | 5d8906508262d326450752204ac93119f355db88 (patch) | |
| tree | 983f82312909c17c630962a259250e49fffa236d /utils/dm-search | |
| parent | ed6302abb7bf4c3304f2e8f7cb6c7462c7caaf4f (diff) | |
update dm-search to fix bug
This commit updates dm-search to fix a bug where I was returning lists
from get_limits() but then comparing the best fit and discovery lists as
if they were numpy arrays.
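For context, a minimal sketch of the failure mode (illustrative values, not the actual dm-search data): comparing plain Python lists with `>` is lexicographic and yields a single bool, while the calling code assumed an elementwise numpy comparison.

```python
import numpy as np

# Illustrative values only, not the actual dm-search output.
best_fit = [1.0, 5.0, 2.0]    # what get_limits() used to return (lists)
discovery = [3.0, 3.0, 3.0]

# Lists compare lexicographically: this is a single bool (False here,
# since 1.0 > 3.0 fails on the first element), not an elementwise test.
print(best_fit > discovery)                           # False

# numpy arrays broadcast the comparison elementwise, which is what the
# downstream code assumed. Returning arrays from get_limits() fixes this.
print(np.asarray(best_fit) > np.asarray(discovery))   # [False  True False]
```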
I also updated how I calculate the expected number of events: instead of
integrating the expected Monte Carlo histograms, I now compute it from a toy
MC of the dark matter signal (see the sketch below).
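A self-contained sketch of the new weighted-sum calculation (the full change is in the diff below; the toy histograms here are hypothetical stand-ins for the real `get_dm_sample()` and `get_mc_hists()` output):

```python
import numpy as np

# Hypothetical stand-ins for the real inputs: in dm-search the dark matter
# histogram comes from histogramming DM_SAMPLES toy events drawn with
# get_dm_sample(), and the background histogram from get_mc_hists() on the
# atmospheric and muon Monte Carlo.
bins = np.linspace(20, 10e3, 47)
n_toys = 1000
toy_energies = np.random.normal(5e3, 250, n_toys)   # toy dark matter events
dm_hist, _ = np.histogram(toy_energies, bins=bins)
bkg_hist = np.full(len(bins) - 1, 10.0)             # expected background/bin

# Fraction of the toy events that land inside the histogram range.
frac = dm_hist.sum() / n_toys

# Normalize the dark matter histogram, then take the weighted sum with the
# background expectation. If all toys fall in a single bin, this returns
# that bin's expected count; otherwise it smoothly interpolates between bins.
n = (dm_hist / dm_hist.sum() * bkg_hist).sum()
```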
Diffstat (limited to 'utils/dm-search')
| -rwxr-xr-x | utils/dm-search | 74 | 
1 file changed, 30 insertions, 44 deletions
```diff
diff --git a/utils/dm-search b/utils/dm-search
index 1e9c987..5b9cbaa 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -377,49 +377,15 @@ def get_dm_sample(n,dm_particle_id,dm_mass,dm_energy):
         data[i] = T1, T2, T1 + T2, id1, id2, dm_particle_id
     return pd.DataFrame(data)
 
-def integrate_mc_hist(mc, muons, id, x, a, b, atmo_scale_factor, muon_scale_factor):
-    """
-    Integrate the Monte Carlo expected histograms from a to b.
-
-    Args:
-        - mc: DataFrame
-            Pandas dataframe of atmospheric Monte Carlo events.
-        - muons: DataFrame
-            Pandas dataframe of external muon Monte Carlo events.
-        - id: int
-            The particle ID histogram to integrate.
-        - x: array
-            The fit parameters.
-        - a, b: float
-            Bounds of the integration.
-        - atmo_scale_factor, muon_scale_factor: float
-            Scale factors for the fit parameters.
-    """
-    mc_hists = get_mc_hists(mc,x,bins,scale=x[0]/atmo_scale_factor)
-    muon_hists = get_mc_hists(muons,x,bins,scale=x[5]/muon_scale_factor)
-
-    def total_hist(x):
-        if x < bins[0] or x > bins[-1]:
-            return 0
-        i = np.digitize(x,bins)
-        if i == len(bins):
-            i = len(bins) - 1
-        return (mc_hists[id][i-1] + muon_hists[id][i-1])/(bins[i]-bins[i-1])
-
-    # Yes, this is a stupid simple integration that I don't need quad for, but
-    # I don't want to have to code all the corner cases including when a and b
-    # are in the same bin, etc.
-    return quad(total_hist,a,b,points=bins[(bins >= a) & (bins <= b)])[0]
-
 def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin):
     limits = {}
     best_fit = {}
     discovery_array = {}
     for dm_particle_id in (2020,2222):
-        limits[dm_particle_id] = []
-        best_fit[dm_particle_id] = []
-        discovery_array[dm_particle_id] = []
-        for dm_mass in dm_masses[dm_particle_id]:
+        limits[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        best_fit[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        discovery_array[dm_particle_id] = np.empty(len(dm_masses[dm_particle_id]))
+        for i, dm_mass in enumerate(dm_masses[dm_particle_id]):
             id1 = dm_particle_id//100
             id2 = dm_particle_id % 100
             m1 = SNOMAN_MASS[id1]
@@ -428,7 +394,7 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             xopt, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
 
             limit = np.percentile(samples[:,6],90)
-            limits[dm_particle_id].append(limit)
+            limits[dm_particle_id][i] = limit
 
             # Here, to determine if there is a discovery we make an approximate
             # calculation of the number of events which would be significant.
@@ -447,9 +413,22 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             # by n. Therefore, to determine the threshold, we simply look for
             # the threshold we expect in n and then subtract n.
             dm_kinetic_energy = dm_energy - m1 - m2
-            a = dm_kinetic_energy - 2*dm_kinetic_energy*DM_ENERGY_RESOLUTION
-            b = dm_kinetic_energy + 2*dm_kinetic_energy*DM_ENERGY_RESOLUTION
-            n = integrate_mc_hist(data_mc, muon, dm_particle_id, xopt, a, b, atmo_scale_factor, muon_scale_factor)
+
+            dm_sample = get_dm_sample(DM_SAMPLES,dm_particle_id,dm_mass,dm_energy)
+
+            # To calculate `n` we approximately want the number of events in
+            # the bin which most of the dark matter events will fall. However,
+            # to smoothly transition between bins, we multiply the normalized
+            # dark matter histogram with the expected MC histogram and then
+            # take the sum. In the case that the dark matter events all fall
+            # into a single bin, this gives us that bin, but smoothly
+            # interpolates between the bins.
+            dm_hists = get_mc_hists(dm_sample,xopt,bins,scale=1/len(dm_sample))
+            frac = dm_hists[dm_particle_id].sum()
+            dm_hists[dm_particle_id] /= frac
+            mc_hists = get_mc_hists(data_mc,xopt,bins,scale=xopt[0]/atmo_scale_factor)
+            muon_hists = get_mc_hists(muon,xopt,bins,scale=xopt[5]/muon_scale_factor)
+            n = (dm_hists[dm_particle_id]*(mc_hists[dm_particle_id] + muon_hists[dm_particle_id])).sum()
             # Set our discovery threshold to the p-value we want divided by the
             # number of bins. The idea here is that the number of bins is
             # approximately equal to the number of trials so we need to
@@ -457,8 +436,15 @@ def get_limits(dm_masses,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,b
             # elsewhere effect.
             threshold = DISCOVERY_P_VALUE/(len(bins)-1)
             discovery = poisson.ppf(1-threshold,n) + 1 - n
-            discovery_array[dm_particle_id].append(discovery)
-            best_fit[dm_particle_id].append(xopt[6])
+            # Here, we scale the discovery threshold by the fraction of the
+            # dark matter hist in the histogram range. The idea is that if only
+            # a small fraction of the dark matter histogram falls into the
+            # histogram range, the total number of dark matter events returned
+            # by the fit can be larger by this amount. I noticed this when
+            # testing under the null hypothesis that the majority of the
+            # "discoveries" were on the edge of the histogram.
+            discovery_array[dm_particle_id][i] = discovery/frac
+            best_fit[dm_particle_id][i] = xopt[6]
     return limits, best_fit, discovery_array
```
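To make the threshold logic above concrete, here is a short sketch of the discovery calculation after this commit, with hypothetical numbers standing in for `n`, `frac`, the p-value, and the bin count:

```python
from scipy.stats import poisson

# Hypothetical values; in dm-search these come from the fit and the toy MC.
DISCOVERY_P_VALUE = 0.05
n = 12.5      # expected background events weighted by the dark matter hist
frac = 0.8    # fraction of the dark matter histogram inside the fit range
nbins = 46    # len(bins) - 1

# Poor man's trials factor: divide the target p-value by the number of bins,
# treating each bin as an independent trial.
threshold = DISCOVERY_P_VALUE / nbins

# Smallest excess above the background expectation n whose Poisson p-value
# is below the corrected threshold.
discovery = poisson.ppf(1 - threshold, n) + 1 - n

# Scale by 1/frac: the fit counts all dark matter events, including the
# fraction outside the histogram range, so the fitted number can exceed the
# in-range excess by this factor.
print(discovery / frac)
```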
