author | tlatorre <tlatorre@uchicago.edu> | 2020-11-16 07:55:17 -0600
committer | tlatorre <tlatorre@uchicago.edu> | 2020-11-16 07:55:17 -0600
commit | fbcdf03f205840c5cd550689e3f60f5168f8ecca (patch)
tree | 8733e490193a7243a87dda563a86597a58a96289 /utils/dm-search
parent | 31bf8df0a8d222310bbbd772067f336140c24baa (diff)
download | sddm-fbcdf03f205840c5cd550689e3f60f5168f8ecca.tar.gz, sddm-fbcdf03f205840c5cd550689e3f60f5168f8ecca.tar.bz2, sddm-fbcdf03f205840c5cd550689e3f60f5168f8ecca.zip
update dm-search script
Diffstat (limited to 'utils/dm-search')
-rwxr-xr-x | utils/dm-search | 250
1 file changed, 220 insertions, 30 deletions
diff --git a/utils/dm-search b/utils/dm-search
index 323aabc..1d14d05 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -33,6 +33,8 @@ from sddm.dc import estimate_errors, EPSILON, truncnorm_scaled
 import emcee
 from sddm import printoptions
 from sddm.utils import fast_cdf, correct_energy_bias
+from scipy.integrate import quad
+from sddm.dm import *
 
 # Likelihood Fit Parameters
 # 0 - Atmospheric Neutrino Flux Scale
@@ -43,6 +45,15 @@ from sddm.utils import fast_cdf, correct_energy_bias
 # 5 - External Muon scale
 # 6 - Dark Matter Scale
 
+# Number of events to use in the dark matter Monte Carlo histogram when fitting.
+# Ideally this would be as big as possible, but the bigger it is, the more time
+# the fit takes.
+DM_SAMPLES = 10000
+
+DM_ENERGIES = np.logspace(np.log10(20),np.log10(10e3),10)
+
+DISCOVERY_P_VALUE = 0.05
+
 # Energy resolution as a fraction of energy for dark matter signal
 DM_ENERGY_RESOLUTION = 0.1
 
@@ -75,7 +86,7 @@ PRIORS = [
     1.0, # Atmospheric Neutrino Scale
     0.015, # Electron energy scale
     0.0, # Electron energy resolution
-    0.054, # Muon energy scale
+    0.053, # Muon energy scale
     0.0, # Muon energy resolution
     0.0, # Muon scale
     0.0, # Dark Matter Scale
@@ -85,8 +96,8 @@ PRIOR_UNCERTAINTIES = [
     0.2, # Atmospheric Neutrino Scale
     0.03, # Electron energy scale
     0.05, # Electron energy resolution
-    0.011, # Muon energy scale
-    0.014, # Muon energy resolution
+    0.01, # Muon energy scale
+    0.013, # Muon energy resolution
     10.0, # Muon scale
     np.inf,# Dark Matter Scale
 ]
@@ -206,7 +217,7 @@ def get_data_hists(data,bins,scale=1.0):
         data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale
     return data_hists
 
-def make_nll(dm_particle_id, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_nll=False):
+def make_nll(dm_particle_id, dm_mass, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, print_nll=False):
     df_dict = {}
     for id in (20,22,2020,2022,2222):
         df_dict[id] = mc[mc.id == id]
@@ -217,6 +228,12 @@ def make_nll(dm_particle_id, dm_energy, data, muons, mc, atmo_scale_factor, muon
 
     data_hists = get_data_hists(data,bins)
 
+    dm_sample = get_dm_sample(DM_SAMPLES,dm_particle_id,dm_mass,dm_energy)
+
+    df_dict_dm = {}
+    for id in (20,22,2020,2022,2222):
+        df_dict_dm[id] = dm_sample[dm_sample.id == id]
+
     def nll(x, grad=None):
         if any(x[i] < 0 for i in (0,2,4,5,6)):
             return np.inf
@@ -226,6 +243,7 @@ def make_nll(dm_particle_id, dm_energy, data, muons, mc, atmo_scale_factor, muon
         # to the Monte Carlo.
         mc_hists = get_mc_hists_fast(df_dict,x,bins,scale=1/atmo_scale_factor)
         muon_hists = get_mc_hists_fast(df_dict_muon,x,bins,scale=1/muon_scale_factor)
+        dm_hists = get_mc_hists_fast(df_dict_dm,x,bins,scale=1/len(dm_sample))
 
         # Calculate the negative log of the likelihood of observing the data
         # given the fit parameters
@@ -233,11 +251,7 @@ def make_nll(dm_particle_id, dm_energy, data, muons, mc, atmo_scale_factor, muon
         nll = 0
         for id in data_hists:
             oi = data_hists[id]
-            ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + EPSILON
-            if id == dm_particle_id:
-                cdf = fast_cdf(bins,dm_energy,dm_energy*DM_ENERGY_RESOLUTION)
-                dm_hist = np.sum(cdf[1:] - cdf[:-1])
-                ei += dm_hist*x[6]
+            ei = mc_hists[id]*x[0] + muon_hists[id]*x[5] + dm_hists[id]*x[6] + EPSILON
             N = ei.sum()
             nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei))
 
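The expected counts in each bin are now a sum of three scaled templates (atmospheric neutrinos, external muons, and the dark matter sample), and the fit minimizes a binned extended Poisson likelihood. A minimal sketch of the quantity being accumulated, with illustrative `observed`/`expected` arrays standing in for the script's `oi` and `ei`:

    import numpy as np
    from scipy.special import gammaln

    def binned_poisson_nll(observed, expected):
        # -log L for independent Poisson bins:
        #   -log prod_i e^{-e_i} e_i^{o_i} / o_i!
        #    = sum_i e_i - o_i*log(e_i) + log(o_i!)
        observed = np.asarray(observed, dtype=float)
        expected = np.asarray(expected, dtype=float)
        return np.sum(expected - observed*np.log(expected) + gammaln(observed + 1))

    rng = np.random.default_rng(0)
    expected = np.array([10.0, 5.0, 2.0]) + 1e-10  # like EPSILON, keeps log() finite
    observed = rng.poisson(expected)
    print(binned_poisson_nll(observed, expected))

This is algebraically the same quantity the nll closure accumulates with `nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei))`, since `N = ei.sum()`.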
@@ -251,7 +265,7 @@ def make_nll(dm_particle_id, dm_energy, data, muons, mc, atmo_scale_factor, muon
             return nll
     return nll
 
-def do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10):
+def do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll=False,walkers=100,thin=10):
     """
     Run the fit and return the minimum along with samples from running an MCMC
     starting near the minimum.
@@ -268,7 +282,7 @@ def do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_sca
     Returns a tuple (xopt, samples) where samples is an array of shape (steps,
     number of parameters).
     """
-    nll = make_nll(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll)
+    nll = make_nll(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,print_nll)
 
     pos = np.empty((walkers, len(PRIORS)),dtype=np.double)
     for i in range(pos.shape[0]):
@@ -306,11 +320,121 @@ def sample_priors():
     """
     return np.concatenate((truncnorm_scaled(PRIORS_LOW[:6],PRIORS_HIGH[:6],PRIORS[:6],PRIOR_UNCERTAINTIES[:6]),[np.random.uniform(PRIORS_LOW[6],PRIORS_HIGH[6])]))
 
-def get_dm_sample(n,dm_particle_id,dm_energy,dm_resolution):
-    ke = norm.rvs(dm_energy,dm_energy*dm_resolution)
-    id = np.tile(dm_particle_id,n)
-    data = pd.DataFrame(np.concatenate((ke,id)).T,columns=['ke','id'])
-    return data
+def get_dm_sample(n,dm_particle_id,dm_mass,dm_energy):
+    """
+    Returns a dataframe containing events from a dark matter particle.
+
+    Args:
+
+        - n: int
+            number of events
+        - dm_particle_id: int
+            the particle id of the DM particle (2020 or 2222)
+        - dm_mass: float
+            The mass of the DM particle
+        - dm_energy: float
+            The total kinetic energy of the DM particle
+    """
+    id1 = dm_particle_id//100
+    id2 = dm_particle_id % 100
+    m1 = SNOMAN_MASS[id1]
+    m2 = SNOMAN_MASS[id2]
+    data = np.empty(n,dtype=[('energy1',np.double),('energy2',np.double),('ke',np.double),('id1',np.int),('id2',np.int),('id',np.int)])
+    for i, (v1, v2) in enumerate(islice(gen_decay(dm_mass,dm_energy,m1,m2),n)):
+        pos = rand_ball(PSUP_RADIUS)
+        E1 = v1[0]
+        E2 = v2[0]
+        p1 = np.linalg.norm(v1[1:])
+        p2 = np.linalg.norm(v2[1:])
+        T1 = E1 - m1
+        T2 = E2 - m2
+        # FIXME: Get electron and muon resolution
+        T1 += norm.rvs(scale=T1*0.05)
+        T2 += norm.rvs(scale=T2*0.05)
+        data[i] = T1, T2, T1 + T2, id1, id2, dm_particle_id
+    return pd.DataFrame(data)
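get_dm_sample now generates the signal from two-body decay kinematics rather than a single Gaussian-smeared total energy. gen_decay, rand_ball, SNOMAN_MASS, and PSUP_RADIUS come from sddm.dm and are not shown in this diff; presumably gen_decay yields the two daughters' four-vectors (E, px, py, pz). A sketch of the rest-frame kinematics such a generator is built on (natural units, MeV):

    import numpy as np

    def two_body_decay(M, m1, m2):
        # For X -> 1 + 2 at rest, energy-momentum conservation fixes the
        # daughter energies and the common (back-to-back) momentum magnitude.
        E1 = (M**2 + m1**2 - m2**2)/(2*M)
        E2 = (M**2 + m2**2 - m1**2)/(2*M)
        p = np.sqrt(E1**2 - m1**2)
        return E1, E2, p

    # Example: a 100 MeV particle decaying to e+ e- (m_e ~ 0.511 MeV)
    E1, E2, p = two_body_decay(100.0, 0.511, 0.511)
    print(E1, E2, p)  # each daughter carries half the parent's mass energy

A moving parent (dm_energy above dm_mass) would add a Lorentz boost on top of this, which is presumably what gen_decay handles.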
+ """ + id1 = dm_particle_id//100 + id2 = dm_particle_id % 100 + m1 = SNOMAN_MASS[id1] + m2 = SNOMAN_MASS[id2] + energy1 = [] + data = np.empty(n,dtype=[('energy1',np.double),('energy2',np.double),('ke',np.double),('id1',np.int),('id2',np.int),('id',np.int)]) + for i, (v1, v2) in enumerate(islice(gen_decay(dm_mass,dm_energy,m1,m2),n)): + pos = rand_ball(PSUP_RADIUS) + E1 = v1[0] + E2 = v2[0] + p1 = np.linalg.norm(v1[1:]) + p2 = np.linalg.norm(v2[1:]) + T1 = E1 - m1 + T2 = E2 - m2 + # FIXME: Get electron and muon resolution + T1 += norm.rvs(scale=T1*0.05) + T2 += norm.rvs(scale=T2*0.05) + data[i] = T1, T2, T1 + T2, id1, id2, dm_particle_id + return pd.DataFrame(data) + +def integrate_mc_hist(mc, muons, id, x, a, b, atmo_scale_factor, muon_scale_factor): + """ + Integrate the Monte Carlo expected histograms from a to b. + + Args: + - mc: DataFrame + Pandas dataframe of atmospheric Monte Carlo events. + - muons: DataFrame + Pandas dataframe of external muon Monte Carlo events. + - id: int + The particle ID histogram to integrate. + - x: array + The fit parameters. + - a, b: float + Bounds of the integration. + - atmo_scale_factor, muon_scale_factor: float + Scale factors for the fit parameters. + """ + mc_hists = get_mc_hists(mc,x,bins,scale=x[0]/atmo_scale_factor) + muon_hists = get_mc_hists(muons,x,bins,scale=x[5]/muon_scale_factor) + + def total_hist(x): + if x < bins[0] or x > bins[-1]: + return 0 + i = np.digitize(x,bins) + if i == len(bins): + i = len(bins) - 1 + return mc_hists[id][i-1] + muon_hists[id][i-1] + + # Yes, this is a stupid simple integration that I don't need quad for, but + # I don't want to have to code all the corner cases including when a and b + # are in the same bin, etc. + return quad(total_hist,a,b,points=bins[(bins >= a) & (bins <= b)])[0] + +def get_limits(dm_energies,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin): + limits = {} + best_fit = {} + discovery_array = {} + for dm_particle_id in (2020,2222): + limits[dm_particle_id] = [] + best_fit[dm_particle_id] = [] + discovery_array[dm_particle_id] = [] + for dm_energy in dm_energies: + dm_mass = dm_energy + xopt, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin) + + limit = np.percentile(samples[:,6],90) + limits[dm_particle_id].append(limit) + + # Here, to determine if there is a discovery we make an approximate + # calculation of the number of events which would be significant. + # + # We expect the likelihood to be approximately that of a Poisson + # distribution with n background events and we are searching for a + # signal s. n is constrained by the rest of the histograms, and so + # we can treat is as being approximately fixed. In this case, the + # likelihood looks approximately like: + # + # P(s) = e^(-(s+n))(s+n)**i/i! + # + # Where i is the actual number of events. Under the null hypothesis + # (i.e. no dark matter), we expect i to be Poisson distributed with + # mean n. Therefore s should have the same distribution but offset + # by n. Therefore, to determine the threshold, we simply look for + # the threshold we expect in n and then subtract n. + a = dm_energy - 2*dm_energy*DM_ENERGY_RESOLUTION + b = dm_energy + 2*dm_energy*DM_ENERGY_RESOLUTION + n = integrate_mc_hist(data_mc, muon, dm_particle_id, xopt, a, b, atmo_scale_factor, muon_scale_factor) + # Set our discovery threshold to the p-value we want divided by the + # number of bins. 
+
+def get_limits(dm_energies,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin):
+    limits = {}
+    best_fit = {}
+    discovery_array = {}
+    for dm_particle_id in (2020,2222):
+        limits[dm_particle_id] = []
+        best_fit[dm_particle_id] = []
+        discovery_array[dm_particle_id] = []
+        for dm_energy in dm_energies:
+            dm_mass = dm_energy
+            xopt, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,steps,print_nll,walkers,thin)
+
+            limit = np.percentile(samples[:,6],90)
+            limits[dm_particle_id].append(limit)
+
+            # Here, to determine if there is a discovery we make an approximate
+            # calculation of the number of events which would be significant.
+            #
+            # We expect the likelihood to be approximately that of a Poisson
+            # distribution with n background events, and we are searching for a
+            # signal s. n is constrained by the rest of the histograms, and so
+            # we can treat it as being approximately fixed. In this case, the
+            # likelihood looks approximately like:
+            #
+            #     P(s) = e^(-(s+n))(s+n)**i/i!
+            #
+            # where i is the actual number of events. Under the null hypothesis
+            # (i.e. no dark matter), we expect i to be Poisson distributed with
+            # mean n. Therefore s should have the same distribution but offset
+            # by n. Therefore, to determine the threshold, we simply look for
+            # the threshold we expect in n and then subtract n.
+            a = dm_energy - 2*dm_energy*DM_ENERGY_RESOLUTION
+            b = dm_energy + 2*dm_energy*DM_ENERGY_RESOLUTION
+            n = integrate_mc_hist(data_mc, muon, dm_particle_id, xopt, a, b, atmo_scale_factor, muon_scale_factor)
+            # Set our discovery threshold to the p-value we want divided by the
+            # number of bins. The idea here is that the number of bins is
+            # approximately equal to the number of trials, so we need to
+            # increase our discovery threshold to account for the look
+            # elsewhere effect.
+            threshold = DISCOVERY_P_VALUE/(len(bins)-1)
+            discovery = poisson.ppf(1-threshold,n) + 1 - n
+            discovery_array[dm_particle_id].append(discovery)
+            best_fit[dm_particle_id].append(xopt[6])
+
+    return limits, best_fit, discovery_array
 
 if __name__ == '__main__':
     import argparse
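The discovery criterion above is a Bonferroni-style correction: divide the global p-value by the number of bins (a proxy for the number of independent places a fluctuation could appear), then find the smallest excess over the expected background n that would be that improbable under a background-only Poisson. Stand-alone, with illustrative numbers:

    from scipy.stats import poisson

    DISCOVERY_P_VALUE = 0.05

    def discovery_threshold(n_background, n_bins):
        # Per-bin (look-elsewhere-corrected) p-value.
        local_p = DISCOVERY_P_VALUE/n_bins
        # poisson.ppf(1 - local_p, n) is the largest count still consistent
        # with background; one more event than that counts as a discovery,
        # quoted here as an excess above the background expectation.
        return poisson.ppf(1 - local_p, n_background) + 1 - n_background

    print(discovery_threshold(n_background=10.0, n_bins=50))  # roughly a dozen extra events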
@@ -335,6 +459,7 @@
     parser.add_argument("--print-nll", action='store_true', default=False, help="print nll values")
     parser.add_argument("--walkers", type=int, default=100, help="number of walkers")
     parser.add_argument("--thin", type=int, default=10, help="number of steps to thin")
+    parser.add_argument("--test", type=int, default=0, help="run tests to check discovery threshold")
     args = parser.parse_args()
 
     setup_matplotlib(args.save)
@@ -439,10 +564,11 @@
         n_dm = np.random.poisson(N_dm)
 
         dm_particle_id = np.random.choice([2020,2222])
-        dm_energy = np.random.uniform(20,10e3)
+        dm_mass = np.random.uniform(20,10e3)
+        dm_energy = dm_mass
 
         # Sample data from Monte Carlo
-        data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True),get_dm_sample(n_dm,dm_particle_id,dm_energy,DM_ENERGY_RESOLUTION)))
+        data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True),get_dm_sample(n_dm,dm_particle_id,dm_mass,dm_energy)))
         data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
 
         # Smear the energies by the additional energy resolution
@@ -458,7 +584,7 @@
         data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
         data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
 
-        xopt, samples = do_fit(data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+        xopt, samples = do_fit(dm_particle_id,dm_mass,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
         for i in range(len(FIT_PARS)):
             # The "pull plots" we make here are actually produced via a
@@ -470,7 +596,7 @@
         fig = plt.figure()
         axes = []
         for i, name in enumerate(FIT_PARS):
-            axes.append(plt.subplot(3,2,i+1))
+            axes.append(plt.subplot(4,2,i+1))
             n, bins, patches = plt.hist(pull[i],bins=np.linspace(0,100,11),histtype='step')
             expected = len(pull[i])/(len(bins)-1)
             plt.axhline(expected,color='k',ls='--',alpha=0.25)
@@ -489,19 +615,66 @@
         sys.exit(0)
 
-    dm_energies = np.logspace(np.log10(20),np.log10(10e3),10)
-    limits = {}
-    for dm_particle_id in (2020,2222):
-        limits[dm_particle_id] = []
-        for dm_energy in dm_energies:
-            xopt, samples = do_fit(dm_particle_id,dm_energy,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+    if args.test:
+        # Set the random seed so we get reproducible results here
+        np.random.seed(0)
 
-            limit = np.percentile(samples[:,6],90)
-            limits[dm_particle_id].append(limit)
+        discoveries = 0
+
+        for i in range(args.test):
+            xtrue = sample_priors()
+
+            # Calculate expected number of events
+            N = len(data_mc)*xtrue[0]/atmo_scale_factor
+            N_atm = len(data_atm_mc)*xtrue[0]/atmo_scale_factor
+            N_muon = len(muon)*xtrue[5]/muon_scale_factor
+            N_muon_atm = len(muon_atm)*xtrue[5]/muon_scale_factor
+            N_dm = xtrue[6]
+
+            # Calculate observed number of events
+            n = np.random.poisson(N)
+            n_atm = np.random.poisson(N_atm)
+            n_muon = np.random.poisson(N_muon)
+            n_muon_atm = np.random.poisson(N_muon_atm)
+            n_dm = np.random.poisson(N_dm)
+
+            dm_particle_id = np.random.choice([2020,2222])
+            dm_mass = np.random.uniform(20,10e3)
+            dm_energy = dm_mass
+
+            # Sample data from Monte Carlo
+            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True),get_dm_sample(n_dm,dm_particle_id,dm_mass,dm_energy)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+
+            # Smear the energies by the additional energy resolution
+            data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
+            data.loc[data.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id1 == 22))*xtrue[4])
+            data.loc[data.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id2 == 20))*xtrue[2])
+            data.loc[data.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data.id2 == 22))*xtrue[4])
+            data['ke'] = data['energy1'].fillna(0) + data['energy2'].fillna(0) + data['energy3'].fillna(0)
+
+            data_atm.loc[data_atm.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id1 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id1 == 22,'energy1'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id1 == 22))*xtrue[4])
+            data_atm.loc[data_atm.id2 == 20,'energy2'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data_atm.id2 == 20))*xtrue[2])
+            data_atm.loc[data_atm.id2 == 22,'energy2'] *= (1+xtrue[3]+np.random.randn(np.count_nonzero(data_atm.id2 == 22))*xtrue[4])
+            data_atm['ke'] = data_atm['energy1'].fillna(0) + data_atm['energy2'].fillna(0) + data_atm['energy3'].fillna(0)
+
+            limits, best_fit, discovery_array = get_limits(DM_ENERGIES,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
+
+            for id in (2020,2222):
+                if (np.array(best_fit[id]) > np.array(discovery_array[id])).any():
+                    discoveries += 1
+
+        print("expected %.2f discoveries" % DISCOVERY_P_VALUE)
+        print("actually got %i/%i = %.2f discoveries" % (discoveries,args.test,discoveries/args.test))
+
+        sys.exit(0)
+
+    limits, best_fit, discovery_array = get_limits(DM_ENERGIES,data,muon,data_mc,atmo_scale_factor,muon_scale_factor,bins,args.steps,args.print_nll,args.walkers,args.thin)
 
     fig = plt.figure()
-    for dm_particle_id in (2020,2222):
-        plt.plot(dm_energies,limits[dm_particle_id],label='$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(dm_particle_id),2)]) + '$')
+    for color, dm_particle_id in zip(('C0','C1'),(2020,2222)):
+        plt.plot(DM_ENERGIES,limits[dm_particle_id],color=color,label='$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(dm_particle_id),2)]) + '$')
     plt.gca().set_xscale("log")
     despine(fig,trim=True)
     plt.xlabel("Energy (MeV)")
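With --test N the script runs N pseudo-experiments and counts how often the best-fit dark matter scale exceeds the discovery threshold, then compares the observed fraction with DISCOVERY_P_VALUE. Whether an observed count is consistent with that rate can be judged with a simple binomial test; a sketch, not part of the script, assuming each pseudo-experiment independently yields a false discovery with probability DISCOVERY_P_VALUE under the null, with illustrative numbers:

    from scipy.stats import binom

    n_trials = 100   # e.g. --test 100
    observed = 8     # discoveries counted by the loop above
    p = 0.05         # DISCOVERY_P_VALUE

    # Two-sided binomial p-value for seeing `observed` discoveries when
    # each trial fires with probability p under the null hypothesis.
    p_two_sided = 2*min(binom.cdf(observed, n_trials, p),
                        binom.sf(observed - 1, n_trials, p))
    print(p_two_sided)  # small values suggest the threshold is miscalibrated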
@@ -514,4 +687,21 @@
         plt.savefig("dm_search_limit.eps")
     else:
         plt.suptitle("Dark Matter Limits")
+
+    fig = plt.figure()
+    for color, dm_particle_id in zip(('C0','C1'),(2020,2222)):
+        plt.plot(DM_ENERGIES,best_fit[dm_particle_id],color=color,label='$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(dm_particle_id),2)]) + '$')
+        plt.plot(DM_ENERGIES,discovery_array[dm_particle_id],color=color,ls='--')
+    plt.gca().set_xscale("log")
+    despine(fig,trim=True)
+    plt.xlabel("Energy (MeV)")
+    plt.ylabel("Event Rate Limit (events)")
+    plt.legend()
+    plt.tight_layout()
+
+    if args.save:
+        plt.savefig("dm_best_fit_with_discovery_threshold.pdf")
+        plt.savefig("dm_best_fit_with_discovery_threshold.eps")
+    else:
+        plt.suptitle("Best Fit Dark Matter")
     plt.show()