| author | tlatorre <tlatorre@uchicago.edu> | 2020-12-21 00:02:04 -0600 | 
|---|---|---|
| committer | tlatorre <tlatorre@uchicago.edu> | 2020-12-21 00:02:04 -0600 | 
| commit | cf2d8db90bbb7757b0336cab7801a03ef7043f31 (patch) | |
| tree | d918df4826af0985907ab042bf1ccb5d389475e4 | |
| parent | 834df6cfc5737db2b2f4c0a1432c97960ed65e76 (diff) | |
update bins for muon histograms
This commit updates the bins for the muon histograms so that they don't
extend too far above 2 GeV. The reason is that at these energies most muons
will exit the detector and my energy reconstruction doesn't work very well.

I also updated chi2 and dm-search to use the flux_weight column when sampling
from the MC.
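As a rough sketch of both changes, assuming only numpy and pandas (the toy `mc` frame below is illustrative, not the scripts' actual data):

```python
import numpy as np
import pandas as pd

# 21 log-spaced bin edges from 20 MeV to 10 GeV, exactly as in the patch.
full_bins = np.logspace(np.log10(20), np.log10(10e3), 21)

# The muon channels (IDs 22, 2022, 2222 in the patch) drop the last five
# edges, so their highest bin edge lands near 2.1 GeV instead of 10 GeV.
bins = {20: full_bins,
        22: full_bins[:-5],
        2020: full_bins,
        2022: full_bins[:-5],
        2222: full_bins[:-5]}
print(bins[22][-1])  # ~2114.7 MeV, i.e. just above 2 GeV

# Flux-weighted sampling from the MC: each row is drawn with probability
# proportional to its flux_weight column, as in the patched sample() calls.
mc = pd.DataFrame({'ke': np.random.uniform(20, 10e3, 1000),
                   'flux_weight': np.random.uniform(0.5, 1.5, 1000)})
sample = mc.sample(n=100, weights='flux_weight', replace=True)
```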
| -rwxr-xr-x | utils/chi2 | 60 |
| -rwxr-xr-x | utils/dm-search | 37 |

2 files changed, 67 insertions, 30 deletions
diff --git a/utils/chi2 b/utils/chi2
--- a/utils/chi2
+++ b/utils/chi2
@@ -122,14 +122,17 @@ def plot_hist2(hists, bins, color=None):
         elif id == 2222:
             plt.subplot(2,3,6)

-        bincenters = (bins[1:] + bins[:-1])/2
-        plt.hist(bincenters, bins=bins, histtype='step', weights=hists[id],color=color)
+        bincenters = (bins[id][1:] + bins[id][:-1])/2
+        plt.hist(bincenters, bins=bins[id], histtype='step', weights=hists[id],color=color)
         plt.gca().set_xscale("log")
         major = np.array([10,100,1000,10000])
         minor = np.unique(list(chain(*list(range(i,i*10,i) for i in major[:-1]))))
         minor = np.setdiff1d(minor,major)
+        major = major[major <= bins[id][-1]]
+        minor = minor[minor <= bins[id][-1]]
         plt.gca().set_xticks(major)
         plt.gca().set_xticks(minor,minor=True)
+        plt.gca().set_xlim(10,10000)
         plt.xlabel("Energy (MeV)")
         plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
@@ -188,9 +191,9 @@ def get_mc_hists_fast(df_dict,x,bins,scale=1.0,reweight=False):
             resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2)

         if reweight:
-            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)*df.weight.values
+            cdf = fast_cdf(bins[id][:,np.newaxis],ke,resolution)*df.weight.values
         else:
-            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)
+            cdf = fast_cdf(bins[id][:,np.newaxis],ke,resolution)

         if 'flux_weight' in df.columns:
             cdf *= df.flux_weight.values
@@ -205,7 +208,7 @@ def get_data_hists(data,bins,scale=1.0):
     """
     data_hists = {}
     for id in (20,22,2020,2022,2222):
-        data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale
+        data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins[id])[0]*scale
     return data_hists

 def make_nll(data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, reweight=False, print_nll=False):
@@ -479,7 +482,7 @@ if __name__ == '__main__':
     import nlopt
     from sddm.renormalize import *

-    parser = argparse.ArgumentParser("plot fit results")
+    parser = argparse.ArgumentParser("run null hypothesis test")
     parser.add_argument("filenames", nargs='+', help="input files")
     parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds")
     parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files")
@@ -501,16 +504,17 @@ if __name__ == '__main__':
     import matplotlib.pyplot as plt

+    if args.run_list is not None:
+        run_list = np.genfromtxt(args.run_list)
+
     # Loop over runs to prevent using too much memory
     evs = []
     rhdr = pd.concat([read_hdf(filename, "rhdr").assign(filename=filename) for filename in args.filenames],ignore_index=True)
     for run, df in rhdr.groupby('run'):
+        if args.run_list and run not in run_list:
+            continue
         evs.append(get_events(df.filename.values, merge_fits=True, nhit_thresh=args.nhit_thresh))
-    ev = pd.concat(evs)
-
-    if args.run_list is not None:
-        runs = np.genfromtxt(args.run_list)
-        ev = ev[ev.run.isin(runs)]
+    ev = pd.concat(evs).reset_index()

     ev = correct_energy_bias(ev)
@@ -521,10 +525,25 @@ if __name__ == '__main__':
     ev_mcs = []
     for filename in args.mc:
         ev_mcs.append(get_events([filename], merge_fits=True, nhit_thresh=args.nhit_thresh, mc=True))
-    ev_mc = pd.concat(ev_mcs)
+    ev_mc = pd.concat(ev_mcs).reset_index()

     muon_mc = get_events(args.muon_mc, merge_fits=True, nhit_thresh=args.nhit_thresh, mc=True)

     weights = pd.concat([read_hdf(filename, "weights") for filename in args.weights],ignore_index=True)

+    # The next two things we have to do are reweight the Monte Carlo data since
+    # I accidentally simulated the muon neutrino flux instead of the tau
+    # neutrino flux and load in the GENIE systematics weights.
+    #
+    # Both of these are a bit tricky because of the fact that I had to
+    # resimulate some MC events since they failed to simulate (there was a
+    # packer error and occasionally a geometry error that was causing ~10% of
+    # the MC events to fail). Since I had to resimulate them, it's not possible
+    # to connect the GENIE weights to the MC events by just using the event
+    # number.
+    #
+    # Therefore, I decided to use a "unique_id" field which I compute by
+    # hashing the position of the event. This unique_id along with the run
+    # should completely specify a unique mapping between the events.
+
     # Add the "flux_weight" column to the ev_mc data since I stupidly simulated
     # the muon neutrino flux for the tau neutrino flux in GENIE. Doh!
     mcpl = load_mcpl_files(args.mcpl)
@@ -605,9 +624,9 @@ if __name__ == '__main__':
     #muon = ev[ev.muon & ev.prompt & ~ev.atm]
     #muon_atm = ev[ev.muon & ev.prompt & ev.atm]

-    if (~rhdr.run.isin(ev_mc.run)).any():
+    if (~ev.run.isin(ev_mc.run)).any():
         print_warning("Error! The following runs have no Monte Carlo: %s" % \
-            rhdr.run[~rhdr.run.isin(ev_mc.run)].values)
+            np.unique(ev.run[~ev.run.isin(ev_mc.run)].values))
         sys.exit(1)

     if not args.pull and not args.coverage:
@@ -616,7 +635,11 @@ if __name__ == '__main__':
     data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
     data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]

-    bins = np.logspace(np.log10(20),np.log10(10e3),21)
+    bins = {20:np.logspace(np.log10(20),np.log10(10e3),21),
+            22:np.logspace(np.log10(20),np.log10(10e3),21)[:-5],
+            2020:np.logspace(np.log10(20),np.log10(10e3),21),
+            2022:np.logspace(np.log10(20),np.log10(10e3),21)[:-5],
+            2222:np.logspace(np.log10(20),np.log10(10e3),21)[:-5]}

     atmo_scale_factor = 100.0
     muon_scale_factor = len(muon) + len(muon_atm)
@@ -633,6 +656,9 @@ if __name__ == '__main__':
         data_mc_with_weights = pd.merge(data_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])
         data_atm_mc_with_weights = pd.merge(data_atm_mc,weights[weights.universe == 0],how='left',on=['run','unique_id'])

+        data_mc_with_weights.weight *= data_mc_with_weights.flux_weight
+        data_atm_mc_with_weights.weight *= data_atm_mc_with_weights.flux_weight
+
         for i in range(args.coverage):
             # Calculate expected number of events
             N = len(data_mc)*xtrue[0]/atmo_scale_factor
@@ -748,8 +774,8 @@ if __name__ == '__main__':
             n_muon_atm = np.random.poisson(N_muon_atm)

             # Sample data from Monte Carlo
-            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True)))
-            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+            data = pd.concat((data_mc.sample(n=n,weights='flux_weight',replace=True), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,weights='flux_weight',replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))

             # Smear the energies by the additional energy resolution
             data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
diff --git a/utils/dm-search b/utils/dm-search
index dd4946b..12d2a4b 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -139,14 +139,17 @@ def plot_hist2(hists, bins, color=None):
         elif id == 2222:
             plt.subplot(2,3,6)

-        bincenters = (bins[1:] + bins[:-1])/2
-        plt.hist(bincenters, bins=bins, histtype='step', weights=hists[id],color=color)
+        bincenters = (bins[id][1:] + bins[id][:-1])/2
+        plt.hist(bincenters, bins=bins[id], histtype='step', weights=hists[id],color=color)
         plt.gca().set_xscale("log")
         major = np.array([10,100,1000,10000])
         minor = np.unique(list(chain(*list(range(i,i*10,i) for i in major[:-1]))))
         minor = np.setdiff1d(minor,major)
+        major = major[major <= bins[id][-1]]
+        minor = minor[minor <= bins[id][-1]]
         plt.gca().set_xticks(major)
         plt.gca().set_xticks(minor,minor=True)
+        plt.gca().set_xlim(10,10000)
         plt.xlabel("Energy (MeV)")
         plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')
@@ -205,9 +208,9 @@ def get_mc_hists_fast(df_dict,x,bins,scale=1.0,reweight=False):
             resolution = np.sqrt((df.energy1.values*max(EPSILON,x[2]))**2 + (df.energy2.values*max(EPSILON,x[4]))**2)

         if reweight:
-            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)*df.weight.values
+            cdf = fast_cdf(bins[id][:,np.newaxis],ke,resolution)*df.weight.values
         else:
-            cdf = fast_cdf(bins[:,np.newaxis],ke,resolution)
+            cdf = fast_cdf(bins[id][:,np.newaxis],ke,resolution)

         if 'flux_weight' in df.columns:
             cdf *= df.flux_weight.values
@@ -222,7 +225,7 @@ def get_data_hists(data,bins,scale=1.0):
     """
     data_hists = {}
     for id in (20,22,2020,2022,2222):
-        data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins)[0]*scale
+        data_hists[id] = np.histogram(data[data.id == id].ke.values,bins=bins[id])[0]*scale
     return data_hists

 def make_nll(dm_particle_id, dm_mass, dm_energy, data, muons, mc, atmo_scale_factor, muon_scale_factor, bins, reweight=False, print_nll=False, dm_sample=None):
@@ -559,17 +562,18 @@ if __name__ == '__main__':
     import matplotlib.pyplot as plt

+    if args.run_list is not None:
+        run_list = np.genfromtxt(args.run_list)
+
     # Loop over runs to prevent using too much memory
     evs = []
     rhdr = pd.concat([read_hdf(filename, "rhdr").assign(filename=filename) for filename in args.filenames],ignore_index=True)
     for run, df in rhdr.groupby('run'):
+        if args.run_list and run not in run_list:
+            continue
         evs.append(get_events(df.filename.values, merge_fits=True, nhit_thresh=args.nhit_thresh))
     ev = pd.concat(evs).reset_index()

-    if args.run_list is not None:
-        runs = np.genfromtxt(args.run_list)
-        ev = ev[ev.run.isin(runs)]
-
     livetime = 0.0
     livetime_pulse_gt = 0.0
     for _ev in evs:
@@ -600,7 +604,7 @@ if __name__ == '__main__':
     ev_mcs = []
     for filename in args.mc:
         ev_mcs.append(get_events([filename], merge_fits=True, nhit_thresh=args.nhit_thresh, mc=True))
-    ev_mc = pd.concat(ev_mcs)
+    ev_mc = pd.concat(ev_mcs).reset_index()

     muon_mc = get_events(args.muon_mc, merge_fits=True, nhit_thresh=args.nhit_thresh, mc=True)

     weights = pd.concat([read_hdf(filename, "weights") for filename in args.weights],ignore_index=True)
@@ -697,7 +701,11 @@ if __name__ == '__main__':
     data_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ~ev_mc.atm]
     data_atm_mc = ev_mc[ev_mc.signal & ev_mc.prompt & ev_mc.atm]

-    bins = np.logspace(np.log10(20),np.log10(10e3),21)
+    bins = {20:np.logspace(np.log10(20),np.log10(10e3),21),
+            22:np.logspace(np.log10(20),np.log10(10e3),21)[:-5],
+            2020:np.logspace(np.log10(20),np.log10(10e3),21),
+            2022:np.logspace(np.log10(20),np.log10(10e3),21)[:-5],
+            2222:np.logspace(np.log10(20),np.log10(10e3),21)[:-5]}

     atmo_scale_factor = 100.0
     muon_scale_factor = len(muon) + len(muon_atm)
@@ -730,8 +738,8 @@ if __name__ == '__main__':
             dm_energy = dm_mass

             # Sample data from Monte Carlo
-            data = pd.concat((data_mc.sample(n=n,replace=True), muon.sample(n=n_muon,replace=True),get_dm_sample(n_dm,dm_particle_id,dm_mass,dm_energy)))
-            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))
+            data = pd.concat((data_mc.sample(n=n,weights='flux_weight',replace=True), muon.sample(n=n_muon,replace=True)))
+            data_atm = pd.concat((data_atm_mc.sample(n=n_atm,weights='flux_weight',replace=True), muon_atm.sample(n=n_muon_atm,replace=True)))

             # Smear the energies by the additional energy resolution
             data.loc[data.id1 == 20,'energy1'] *= (1+xtrue[1]+np.random.randn(np.count_nonzero(data.id1 == 20))*xtrue[2])
@@ -786,6 +794,9 @@ if __name__ == '__main__':
         discoveries = 0

+        data_mc_with_weights.weight *= data_mc_with_weights.flux_weight
+        data_atm_mc_with_weights.weight *= data_atm_mc_with_weights.flux_weight
+
         for i in range(args.test):
             xtrue = sample_priors()
