1 files changed, 28 insertions, 19 deletions
diff --git a/benchmark.py b/benchmark.py
index f839f61..31c145c 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -17,21 +17,21 @@ from chroma import optics
 # Generator processes need to fork BEFORE the GPU context is setup
 g4generator = generator.photon.G4ParallelGenerator(4, optics.water_wcsim)
 
-def intersect(gpu_instance, number=100, nphotons=500000, nthreads_per_block=64, max_blocks=1024):
+def intersect(gpu_geometry, number=100, nphotons=500000, nthreads_per_block=64,
+              max_blocks=1024):
     "Returns the average number of ray intersections per second."
     distances_gpu = ga.empty(nphotons, dtype=np.float32)
 
+    module = gpu.get_cu_module('mesh.h', options=('--use_fast_math',))
+    gpu_funcs = gpu.GPUFuncs(module)
+
     run_times = []
     for i in tools.progress(range(number)):
-        pos = np.zeros((nphotons,3), dtype=np.float32)
-        dir = sample.uniform_sphere(nphotons)
-
-        gpu_rays = gpu.GPURays(pos, dir)
-
-        gpu_funcs = gpu.GPUFuncs(gpu_instance.module)
+        pos = ga.zeros(nphotons, dtype=ga.vec.float3)
+        dir = ga.to_gpu(gpu.to_float3(sample.uniform_sphere(nphotons)))
 
         t0 = time.time()
-        gpu_funcs.distance_to_mesh(np.int32(gpu_rays.pos.size), gpu_rays.pos, gpu_rays.dir, distances_gpu, block=(nthreads_per_block,1,1), grid=(gpu_rays.pos.size//nthreads_per_block+1,1))
+        gpu_funcs.distance_to_mesh(np.int32(pos.size), pos, dir, gpu_geometry.gpudata, distances_gpu, block=(nthreads_per_block,1,1), grid=(pos.size//nthreads_per_block+1,1))
         cuda.Context.get_current().synchronize()
         elapsed = time.time() - t0
 
@@ -39,7 +39,7 @@ def intersect(gpu_instance, number=100, nphotons=500000, nthreads_per_block=64,
             # first kernel call incurs some driver overhead
             run_times.append(elapsed)
 
-    return gpu_rays.pos.size/ufloat((np.mean(run_times),np.std(run_times)))
+    return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
 def load_photons(number=100, nphotons=500000):
     """Returns the average number of photons moved to the GPU device memory
@@ -64,7 +64,8 @@ def load_photons(number=100, nphotons=500000):
 
     return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
-def propagate(gpu_instance, number=10, nphotons=500000, nthreads_per_block=64, max_blocks=1024):
+def propagate(gpu_geometry, number=10, nphotons=500000, nthreads_per_block=64,
+              max_blocks=1024):
     "Returns the average number of photons propagated on the GPU per second."
     rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks)
 
@@ -79,7 +80,8 @@ def propagate(gpu_instance, number=10, nphotons=500000, nthreads_per_block=64, m
         gpu_photons = gpu.GPUPhotons(photons)
 
         t0 = time.time()
-        gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks)
+        gpu_photons.propagate(gpu_geometry, rng_states, nthreads_per_block,
+                              max_blocks)
         cuda.Context.get_current().synchronize()
         elapsed = time.time() - t0
 
@@ -90,7 +92,8 @@ def propagate(gpu_instance, number=10, nphotons=500000, nthreads_per_block=64, m
     return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
 @tools.profile_if_possible
-def pdf(gpu_instance, gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1, nthreads_per_block=64, max_blocks=1024):
+def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
+        nthreads_per_block=64, max_blocks=1024):
     """
     Returns the average number of 100 MeV events per second that can be
     histogrammed per second.
@@ -118,14 +121,17 @@ def pdf(gpu_instance, gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
         t0 = time.time()
         gpu_pdf.clear_pdf()
 
-        vertex_gen = generator.vertex.constant_particle_gun('e-', (0,0,0), (1,0,0), 100)
+        vertex_gen = generator.vertex.constant_particle_gun('e-', (0,0,0),
+                                                            (1,0,0), 100)
         vertex_iter = itertools.islice(vertex_gen, nevents)
 
         for ev in g4generator.generate_events(vertex_iter):
             for j in xrange(nreps):
                 gpu_photons = gpu.GPUPhotons(ev.photons_beg)
-                gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks)
-                gpu_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks)
+                gpu_photons.propagate(gpu_geometry, rng_states,
+                                      nthreads_per_block, max_blocks)
+                gpu_channels = gpu_daq.acquire(gpu_photons, rng_states,
+                                               nthreads_per_block, max_blocks)
                 gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
 
         hitcount, pdf = gpu_pdf.get_pdfs()
@@ -146,16 +152,19 @@ if __name__ == '__main__':
     lbne.build(bits=11)
 
     gpu_instance = gpu.GPU()
-    gpu_geometry = gpu.GPUGeometry(gpu_instance, lbne)
+    gpu_geometry = gpu.GPUGeometry(lbne)
     
     gpu_instance.print_mem_info()
-    print '%s ray intersections/sec.' % tools.ufloat_to_str(intersect(gpu_instance))
+    print '%s ray intersections/sec.' % \
+        tools.ufloat_to_str(intersect(gpu_geometry))
     gc.collect()
     gpu_instance.print_mem_info()
     print '%s photons loaded/sec.' % tools.ufloat_to_str(load_photons())
     gc.collect()
     gpu_instance.print_mem_info()
-    print '%s photons propagated/sec.' % tools.ufloat_to_str(propagate(gpu_instance))
+    print '%s photons propagated/sec.' % \
+        tools.ufloat_to_str(propagate(gpu_geometry))
     gc.collect()
     gpu_instance.print_mem_info()
-    print '%s 100 MeV events histogrammed/s' % tools.ufloat_to_str(pdf(gpu_instance, gpu_geometry, max(lbne.pmtids)))
+    print '%s 100 MeV events histogrammed/s' % \
+        tools.ufloat_to_str(pdf(gpu_geometry, max(lbne.pmtids)))