1 files changed, 87 insertions, 118 deletions
diff --git a/benchmark.py b/benchmark.py
index 4c46a1f..f839f61 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -1,95 +1,61 @@
 #!/usr/bin/env python
 import numpy as np
+import pycuda.driver as cuda
 from pycuda import gpuarray as ga
 import time
 from uncertainties import ufloat
 import sys
-
-from chroma.gpu import GPU, to_float3
-from chroma.camera import get_rays
-from chroma.event import Photons
-from chroma.sample import uniform_sphere
-from chroma.generator.photon import G4ParallelGenerator
-from chroma.optics import water_wcsim
-from chroma.generator.vertex import constant_particle_gun
-from chroma.tools import profile_if_possible
+import itertools
+
+from chroma import gpu
+from chroma import camera
+from chroma import event
+from chroma import sample
+from chroma import generator
+from chroma import tools
+from chroma import optics
 # Generator processes need to fork BEFORE the GPU context is setup
-g4generator = G4ParallelGenerator(4, water_wcsim)
-
-def progress(seq):
-    "Print progress while iterating over `seq`."
-    n = len(seq)
-    print '[' + ' '*21 + ']\r[',
-    sys.stdout.flush()
-    for i, item in enumerate(seq):
-        if i % (n//10) == 0:
-            print '.',
-            sys.stdout.flush()
-        yield item
-    print ']'
-    sys.stdout.flush()
-
-def ray_trace(gpu, number=1000):
-    """
-    Return the number of mean and standard deviation of the number of ray
-    intersections per second as a ufloat for the geometry loaded onto `gpu`.
-
-    .. note::
-        The rays are thrown from a camera sitting *outside* of the geometry.
+g4generator = generator.photon.G4ParallelGenerator(4, optics.water_wcsim)
 
-    Args:
-        - gpu, chroma.gpu.GPU
-            The GPU object with a geometry already loaded.
-        - number, int
-            The number of kernel calls to average.
-    """
-    lb, ub = gpu.geometry.mesh.get_bounds()
-    scale = np.linalg.norm(ub-lb)
-    point = [0,scale,(lb[2]+ub[2])/2]
+def intersect(gpu_instance, number=100, nphotons=500000, nthreads_per_block=64, max_blocks=1024):
+    "Returns the average number of ray intersections per second."
+    distances_gpu = ga.empty(nphotons, dtype=np.float32)
 
-    size = (800,600)
-    width, height = size
+    run_times = []
+    for i in tools.progress(range(number)):
+        pos = np.zeros((nphotons,3), dtype=np.float32)
+        dir = sample.uniform_sphere(nphotons)
 
-    origins, directions = get_rays(point, size, 0.035, focal_length=0.018)
+        gpu_rays = gpu.GPURays(pos, dir)
 
-    origins_gpu = ga.to_gpu(to_float3(origins))
-    directions_gpu = ga.to_gpu(to_float3(directions))
-    pixels_gpu = ga.zeros(width*height, dtype=np.int32)
+        gpu_funcs = gpu.GPUFuncs(gpu_instance.module)
 
-    run_times = []
-    for i in progress(range(number)):
         t0 = time.time()
-        gpu.kernels.ray_trace(np.int32(pixels_gpu.size), origins_gpu, directions_gpu, pixels_gpu, block=(gpu.nthreads_per_block,1,1), grid=(pixels_gpu.size//gpu.nthreads_per_block+1,1))
-        gpu.context.synchronize()
+        gpu_funcs.distance_to_mesh(np.int32(gpu_rays.pos.size), gpu_rays.pos, gpu_rays.dir, distances_gpu, block=(nthreads_per_block,1,1), grid=(gpu_rays.pos.size//nthreads_per_block+1,1))
+        cuda.Context.get_current().synchronize()
         elapsed = time.time() - t0
 
         if i > 0:
             # first kernel call incurs some driver overhead
             run_times.append(elapsed)
 
-    return pixels_gpu.size/ufloat((np.mean(run_times),np.std(run_times)))
+    return gpu_rays.pos.size/ufloat((np.mean(run_times),np.std(run_times)))
 
-def load_photons(gpu, number=10, nphotons=500000):
-    """
-    Return the mean and standard deviation of the number of photons loaded
-    onto `gpu` per second.
-
-    Args:
-        - gpu, chroma.gpu.GPU
-            The GPU object with a geometry already loaded.
-        - number, int
-            The number of loads to average
-        - nphotons, int
-            The number of photons to load per trial
-    """
-    gpu.setup_propagate()
-    photons = Photons(np.zeros((nphotons,3)), uniform_sphere(nphotons), np.random.uniform(400,800,size=nphotons))
+def load_photons(number=100, nphotons=500000):
+    """Returns the average number of photons moved to the GPU device memory
+    per second."""
+    pos = np.zeros((nphotons,3))
+    dir = sample.uniform_sphere(nphotons)
+    pol = np.cross(sample.uniform_sphere(nphotons), dir)
+    pol /= np.apply_along_axis(np.linalg.norm,1,pol)[:,np.newaxis]
+    wavelengths = np.random.uniform(400,800,size=nphotons)
+    photons = event.Photons(pos, dir, pol, wavelengths)
 
     run_times = []
-    for i in progress(range(number)):
+    for i in tools.progress(range(number)):
         t0 = time.time()
-        gpu.load_photons(photons)
-        gpu.context.synchronize()
+        gpu_photons = gpu.GPUPhotons(photons)
+        cuda.Context.get_current().synchronize()
         elapsed = time.time() - t0
 
         if i > 0:
@@ -98,28 +64,23 @@ def load_photons(gpu, number=10, nphotons=500000):
 
     return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
-def propagate(gpu, number=10, nphotons=500000):
-    """
-    Return the mean and standard deviation of the number of photons propagated
-    per second as a ufloat for the geometry loaded onto `gpu`.
-
-    Args:
-        - gpu, chroma.gpu.GPU
-            The GPU object with a geometry already loaded.
-        - number, int
-            The number of kernel calls to average.
-        - nphotons, int
-            The number of photons to propagate per kernel call.
-    """
-    gpu.setup_propagate()
+def propagate(gpu_instance, number=10, nphotons=500000, nthreads_per_block=64, max_blocks=1024):
+    "Returns the average number of photons propagated on the GPU per second."
+    rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks)
 
     run_times = []
-    for i in progress(range(number)):
-        photons = Photons(np.zeros((nphotons,3)), uniform_sphere(nphotons), np.random.uniform(400,800,size=nphotons))
-        gpu.load_photons(photons)
+    for i in tools.progress(range(number)):
+        pos = np.zeros((nphotons,3))
+        dir = sample.uniform_sphere(nphotons)
+        pol = np.cross(sample.uniform_sphere(nphotons), dir)
+        pol /= np.apply_along_axis(np.linalg.norm,1,pol)[:,np.newaxis]
+        wavelengths = np.random.uniform(400,800,size=nphotons)
+        photons = event.Photons(pos, dir, pol, wavelengths)
+        gpu_photons = gpu.GPUPhotons(photons)
+
         t0 = time.time()
-        gpu.propagate()
-        gpu.context.synchronize()
+        gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks)
+        cuda.Context.get_current().synchronize()
         elapsed = time.time() - t0
 
         if i > 0:
@@ -128,16 +89,15 @@ def propagate(gpu, number=10, nphotons=500000):
 
     return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
-@profile_if_possible
-def pdf(gpu, max_pmt_id, npdfs=10, nevents=100, nreps=1):
+@tools.profile_if_possible
+def pdf(gpu_instance, gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1, nthreads_per_block=64, max_blocks=1024):
     """
-    Return the mean and standard deviation of the number of 100 MeV
-    events per second that can be histogrammed a ufloat for the
-    geometry loaded onto `gpu`.
+    Returns the average number of 100 MeV events per second that can be
+    histogrammed per second.
 
     Args:
-        - gpu, chroma.gpu.GPU
-            The GPU object with a geometry already loaded.
+        - gpu_instance, chroma.gpu.GPU
+            The GPU instance passed to the GPUGeometry constructor.
         - max_pmt_id, int
             The channel number of the highest PMT
         - npdfs, int
@@ -147,27 +107,28 @@ def pdf(gpu, max_pmt_id, npdfs=10, nevents=100, nreps=1):
         - nreps, int
             The number of times to propagate each event and add to PDF
     """
-    gpu.setup_propagate()
-
-    run_times = []
+    rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks)
 
-    gpu.setup_propagate()
-    gpu.setup_daq(max_pmt_id)
-    gpu.setup_pdf(max_pmt_id, 100, (-0.5, 999.5), 10, (-0.5, 9.5))
-    vertex_gen = constant_particle_gun('e-', (0,0,0), (1,0,0), 100)
+    gpu_daq = gpu.GPUDaq(gpu_geometry, max_pmt_id)
+    gpu_pdf = gpu.GPUPDF()
+    gpu_pdf.setup_pdf(max_pmt_id, 100, (-0.5, 999.5), 10, (-0.5, 9.5))
 
-    for i in progress(range(npdfs)):
+    run_times = []
+    for i in tools.progress(range(npdfs)):
         t0 = time.time()
-        gpu.clear_pdf()
+        gpu_pdf.clear_pdf()
+
+        vertex_gen = generator.vertex.constant_particle_gun('e-', (0,0,0), (1,0,0), 100)
+        vertex_iter = itertools.islice(vertex_gen, nevents)
 
-        for ev in g4generator.generate_events(nevents, vertex_gen):
+        for ev in g4generator.generate_events(vertex_iter):
             for j in xrange(nreps):
-                gpu.load_photons(ev.photon_start)
-                gpu.propagate()
-                gpu.run_daq()
-                gpu.add_hits_to_pdf()
+                gpu_photons = gpu.GPUPhotons(ev.photons_beg)
+                gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks)
+                gpu_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks)
+                gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
 
-        hitcount, pdf = gpu.get_pdfs()
+        hitcount, pdf = gpu_pdf.get_pdfs()
 
         elapsed = time.time() - t0
 
@@ -178,15 +139,23 @@ def pdf(gpu, max_pmt_id, npdfs=10, nevents=100, nreps=1):
     return nevents*nreps/ufloat((np.mean(run_times),np.std(run_times)))
 
 if __name__ == '__main__':
-    from chroma.detectors import build_lbne_200kton, build_minilbne
+    from chroma import detectors
+    import gc
 
-    lbne = build_lbne_200kton()
+    lbne = detectors.build_lbne_200kton()
     lbne.build(bits=11)
 
-    gpu = GPU()
-    gpu.load_geometry(lbne, print_usage=False)
+    gpu_instance = gpu.GPU()
+    gpu_geometry = gpu.GPUGeometry(gpu_instance, lbne)
     
-    print '%s track steps/s' % ray_trace(gpu)
-    print '%s loaded photons/s' % load_photons(gpu)
-    print '%s propagated photons/s' % propagate(gpu)
-    print '%s histogrammed 100 MeV events/s' % pdf(gpu, max(lbne.pmtids))
+    gpu_instance.print_mem_info()
+    print '%s ray intersections/sec.' % tools.ufloat_to_str(intersect(gpu_instance))
+    gc.collect()
+    gpu_instance.print_mem_info()
+    print '%s photons loaded/sec.' % tools.ufloat_to_str(load_photons())
+    gc.collect()
+    gpu_instance.print_mem_info()
+    print '%s photons propagated/sec.' % tools.ufloat_to_str(propagate(gpu_instance))
+    gc.collect()
+    gpu_instance.print_mem_info()
+    print '%s 100 MeV events histogrammed/s' % tools.ufloat_to_str(pdf(gpu_instance, gpu_geometry, max(lbne.pmtids)))