about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authortlatorre <tlatorre@uchicago.edu>2021-01-03 11:40:47 -0600
committertlatorre <tlatorre@uchicago.edu>2021-01-03 11:40:47 -0600
commit6fc97e22fa2a8cc48a96b2ce22c4d55a8fd2b80f (patch)
treef5bf72de18002e0d126b8e37c628f68ec3926fb6
parentc71532924c8c2be07026041b7cd62819e90e8c64 (diff)
downloadsddm-6fc97e22fa2a8cc48a96b2ce22c4d55a8fd2b80f.tar.gz
sddm-6fc97e22fa2a8cc48a96b2ce22c4d55a8fd2b80f.tar.bz2
sddm-6fc97e22fa2a8cc48a96b2ce22c4d55a8fd2b80f.zip
cache results from get_events()
-rwxr-xr-xutils/chi28
-rwxr-xr-xutils/dm-search8
-rwxr-xr-xutils/plot-michels3
-rwxr-xr-xutils/plot-muons5
-rw-r--r--utils/sddm/__init__.py15
-rw-r--r--utils/sddm/plot_energy.py33
6 files changed, 48 insertions, 24 deletions
diff --git a/utils/chi2 b/utils/chi2
index c883deb..f4d94c4 100755
--- a/utils/chi2
+++ b/utils/chi2
@@ -552,7 +552,7 @@ if __name__ == '__main__':
# Add the "flux_weight" column to the ev_mc data since I stupidly simulated
# the muon neutrino flux for the tau neutrino flux in GENIE. Doh!
mcpl = load_mcpl_files(args.mcpl)
- ev_mc = renormalize_data(ev_mc.reset_index(),mcpl)
+ ev_mc = renormalize_data(ev_mc,mcpl)
# Merge weights with MCPL dataframe to get the unique id column in the
# weights dataframe since that is what we use to merge with the Monte
@@ -591,11 +591,7 @@ if __name__ == '__main__':
muon_mc = correct_energy_bias(muon_mc)
# Set all prompt events in the MC to be muons
- muon_mc.loc[muon_mc.prompt & muon_mc.filename.str.contains("cosmic"),'muon'] = True
-
- ev = ev.reset_index()
- ev_mc = ev_mc.reset_index()
- muon_mc = muon_mc.reset_index()
+ muon_mc.loc[muon_mc.prompt ,'muon'] = True
# 00-orphan cut
ev = ev[(ev.gtid & 0xff) != 0]
diff --git a/utils/dm-search b/utils/dm-search
index 8e2473c..cd91eab 100755
--- a/utils/dm-search
+++ b/utils/dm-search
@@ -616,7 +616,7 @@ if __name__ == '__main__':
# Add the "flux_weight" column to the ev_mc data since I stupidly simulated
# the muon neutrino flux for the tau neutrino flux in GENIE. Doh!
mcpl = load_mcpl_files(args.mcpl)
- ev_mc = renormalize_data(ev_mc.reset_index(),mcpl)
+ ev_mc = renormalize_data(ev_mc,mcpl)
# Merge weights with MCPL dataframe to get the unique id column in the
# weights dataframe since that is what we use to merge with the Monte
@@ -653,11 +653,7 @@ if __name__ == '__main__':
muon_mc = correct_energy_bias(muon_mc)
# Set all prompt events in the MC to be muons
- muon_mc.loc[muon_mc.prompt & muon_mc.filename.str.contains("cosmic"),'muon'] = True
-
- ev = ev.reset_index()
- ev_mc = ev_mc.reset_index()
- muon_mc = muon_mc.reset_index()
+ muon_mc.loc[muon_mc.prompt,'muon'] = True
# 00-orphan cut
ev = ev[(ev.gtid & 0xff) != 0]
diff --git a/utils/plot-michels b/utils/plot-michels
index dbd6fd5..e0e4dd7 100755
--- a/utils/plot-michels
+++ b/utils/plot-michels
@@ -131,9 +131,6 @@ if __name__ == '__main__':
ev = ev.reset_index()
ev_mc = ev_mc.reset_index()
- # Set all prompt events in the MC to be muons
- ev_mc.loc[ev_mc.prompt & ev_mc.filename.str.contains("cosmic"),'muon'] = True
-
# First, do basic data cleaning which is done for all events.
ev = ev[ev.signal | ev.muon]
ev_mc = ev_mc[ev_mc.signal | ev_mc.muon]
diff --git a/utils/plot-muons b/utils/plot-muons
index 2616ad2..a37f223 100755
--- a/utils/plot-muons
+++ b/utils/plot-muons
@@ -102,11 +102,8 @@ if __name__ == '__main__':
ev = ev[~np.isnan(ev.fmin)]
ev_mc = ev_mc[~np.isnan(ev_mc.fmin)]
- ev = ev.reset_index()
- ev_mc = ev_mc.reset_index()
-
# Set all prompt events in the MC to be muons
- ev_mc.loc[ev_mc.prompt & ev_mc.filename.str.contains("cosmic"),'muon'] = True
+ ev_mc.loc[ev_mc.prompt,'muon'] = True
# First, do basic data cleaning which is done for all events.
ev = ev[ev.signal | ev.muon]
diff --git a/utils/sddm/__init__.py b/utils/sddm/__init__.py
index c6f2ddc..fb0fd69 100644
--- a/utils/sddm/__init__.py
+++ b/utils/sddm/__init__.py
@@ -193,7 +193,20 @@ def read_hdf(filename, df):
"""
with h5py.File(filename,'r') as f:
data = f[df][:]
- return pd.DataFrame(data)
+ df = pd.DataFrame(data)
+ df.attrs = dict(f.attrs)
+ return df
+
+def write_hdf(filename, df, name):
+ """
+ Similar to pd.read_hdf() but doesn't require pytables. I'm adding this
+ function since the open science grid login nodes don't have pytables.
+ """
+ with h5py.File(filename,'w') as f:
+ f.create_dataset(name,data=df.to_records())
+ if hasattr(df,'attrs'):
+ for key, value in df.attrs.items():
+ f.attrs[key] = value
# from https://stackoverflow.com/questions/2891790/how-to-pretty-print-a-numpy-array-without-scientific-notation-and-with-given-pre
@contextlib.contextmanager
diff --git a/utils/sddm/plot_energy.py b/utils/sddm/plot_energy.py
index 391fa66..f81df67 100644
--- a/utils/sddm/plot_energy.py
+++ b/utils/sddm/plot_energy.py
@@ -19,11 +19,11 @@ import numpy as np
from scipy.stats import poisson, dirichlet, multinomial, beta
from scipy.special import spence, gammaln
from .dc import DC_MUON, DC_JUNK, DC_CRATE_ISOTROPY, DC_QVNHIT, DC_NECK, DC_FLASHER, DC_ESUM, DC_OWL, DC_OWL_TRIGGER, DC_FTS, DC_ITC, DC_BREAKDOWN
-from . import grouper, print_warning, AV_RADIUS, PSUP_RADIUS, read_hdf
+from . import grouper, print_warning, AV_RADIUS, PSUP_RADIUS, read_hdf, write_hdf
import pandas as pd
from scipy.special import logsumexp
-from os.path import exists, join
-from os import environ
+from os.path import exists, join, split, getmtime
+from os import environ, mkdir, remove
import hashlib
EPSILON = 1e-10
@@ -482,7 +482,21 @@ def burst_cut(ev):
return ev.groupby(burst.cumsum()).filter(lambda ev: len(ev[ev.prompt_50]) <= BURST_MAX_EVENTS).reset_index(drop=True)
def get_events(filenames, merge_fits=False, nhit_thresh=None, mc=False):
- ev = pd.concat([read_hdf(filename, "ev").assign(filename=filename) for filename in filenames],ignore_index=True)
+ h = hashlib.sha1()
+
+ for filename in filenames:
+ head, tail = split(filename)
+ mtime = getmtime(filename)
+ h.update(tail.encode("utf-8"))
+ h.update(str(mtime).encode("utf-8"))
+
+ hash_filename = join(environ['HOME'],'.cached_hdf5','%s.hdf5' % h.hexdigest())
+
+ if exists(hash_filename) and merge_fits:
+ print("loading result from '%s'" % hash_filename)
+ return read_hdf(hash_filename,"ev")
+
+ ev = pd.concat([read_hdf(filename, "ev") for filename in filenames],ignore_index=True)
fits = pd.concat([read_hdf(filename, "fits") for filename in filenames],ignore_index=True)
rhdr = pd.concat([read_hdf(filename, "rhdr") for filename in filenames],ignore_index=True)
if mc:
@@ -727,10 +741,21 @@ def get_events(filenames, merge_fits=False, nhit_thresh=None, mc=False):
np.cos(ev_single_particle.theta1)*ev_single_particle.z
ev['udotr'] /= ev_single_particle.r
+ ev = ev.reset_index()
+
# Yes, this is super hacky, but I don't want to change what's returned from
# this function.
ev.attrs = {'time_pulse_gt': time_pulse_gt, 'time_10_mhz': time_10_mhz}
+ head, tail = split(hash_filename)
+ if not exists(head):
+ mkdir(head)
+
+ try:
+ write_hdf(hash_filename,ev,"ev")
+ except KeyboardInterrupt as e:
+ remove(hash_filename)
+
return ev
# Yes, this is super hacky, but I don't want to change what's returned from