1 files changed, 338 insertions, 317 deletions
diff --git a/gpu.py b/gpu.py
index 585ab5f..721ea67 100644
--- a/gpu.py
+++ b/gpu.py
@@ -1,67 +1,215 @@
 import numpy as np
 import numpy.ma as ma
-from pycuda.tools import make_default_context
-from pycuda.compiler import SourceModule
-from pycuda.characterize import sizeof
-import pycuda.driver as cuda
-from pycuda import gpuarray
 from copy import copy
 from itertools import izip
-from chroma.tools import timeit
 from os.path import dirname
+import sys
+
+import pytools
+import pycuda.tools
+from pycuda.compiler import SourceModule
+import pycuda.characterize
+import pycuda.driver as cuda
+from pycuda import gpuarray as ga
 
 import chroma.src
+from chroma.tools import timeit
 from chroma.geometry import standard_wavelengths
 from chroma.color import map_to_color
-import sys
-import event
+from chroma import event
 
 cuda.init()
 
+#@pycuda.tools.context_dependent_memoize
+def get_cu_module(name, options=None, include_source_directory=True):
+    """Returns a pycuda.compiler.SourceModule object from a CUDA source file
+    located in the chroma src directory at src/[name].cu."""
+    if options is None:
+        options = []
+
+    srcdir = dirname(chroma.src.__file__)
+
+    if include_source_directory:
+        options += ['-I' + srcdir]
+
+    with open('%s/%s.cu' % (srcdir, name)) as f:
+        source = f.read()
+
+    return pycuda.compiler.SourceModule(source, options=options,
+                                        no_extern_c=True)
+
+class GPUFuncs(object):
+    """Simple container class for GPU functions as attributes."""
+    def __init__(self, module):
+        self.module = module
+        self.funcs = {}
+
+    def __getattr__(self, name):
+        try:
+            return self.funcs[name]
+        except KeyError:
+            f = self.module.get_function(name)
+            self.funcs[name] = f
+            return f
+
+init_rng_src = """
+#include <curand_kernel.h>
+
+extern "C"
+{
+
+__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
+{
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (id >= nthreads)
+		return;
+
+	curand_init(seed, id, offset, &s[id]);
+}
+
+} // extern "C"
+"""
+
+def get_rng_states(size, seed=1):
+    "Return `size` number of CUDA random number generator states."
+    rng_states = cuda.mem_alloc(size*pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+
+    module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
+    init_rng = module.get_function('init_rng')
+
+    init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1))
+
+    return rng_states
+
 def to_float3(arr):
-    return arr.astype(np.float32).view(gpuarray.vec.float3)[:,0]
-
-def boolean_argsort(condition):
-    '''Returns two arrays of indicies indicating which elements of the
-    boolean array `condition` should be exchanged so that all of the
-    True entries occur before the false entries.
-
-       Returns: tuple (a,b, ntrue) which give the indices for elements
-       to exchange in a and b, and the number of true entries in the array
-       in ntrue.  This function in general requires fewer swaps than 
-       numpy.argsort.
-    '''
-
-    true_indices = condition.nonzero()[0][::-1] # reverse
-    false_indices = (~condition).nonzero()[0]
-    
-    length = min(len(true_indices), len(false_indices))
-    
-    cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True)
-    return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices))
-
-
-def chunk_iterator(nelements, nthreads_per_block, max_blocks):
-    '''Iterator that yields tuples with the values requried to process
+    "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array."
+    if not arr.flags['C_CONTIGUOUS']:
+        arr = np.asarray(arr, order='c')
+    return arr.astype(np.float32).view(ga.vec.float3)[:,0]
+
+def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
+    """Iterator that yields tuples with the values requried to process
     a long array in multiple kernel passes on the GPU.
 
-       Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration)
+    Each yielded value is of the form,
+        (first_index, elements_this_iteration, nblocks_this_iteration)
 
-    >>> list(chunk_iterator(300, 32, 2))
-    [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
-    '''
+    Example:
+        >>> list(chunk_iterator(300, 32, 2))
+        [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
+    """
     first = 0
     while first < nelements:
         elements_left = nelements - first
         blocks = int(elements_left // nthreads_per_block)
         if elements_left % nthreads_per_block != 0:
-            blocks += 1 #Round up only if needed
+            blocks += 1 # Round up only if needed
         blocks = min(max_blocks, blocks)
         elements_this_round = min(elements_left, blocks * nthreads_per_block)
 
         yield (first, elements_this_round, blocks)
         first += elements_this_round
-    
+
+class GPUPhotons(object):
+    def __init__(self, photons):
+        self.pos = ga.to_gpu(to_float3(photons.pos))
+        self.dir = ga.to_gpu(to_float3(photons.dir))
+        self.pol = ga.to_gpu(to_float3(photons.pol))
+        self.wavelengths = ga.to_gpu(photons.wavelengths.astype(np.float32))
+        self.t = ga.to_gpu(photons.t.astype(np.float32))
+        self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32))
+        self.flags = ga.to_gpu(photons.flags.astype(np.uint32))
+
+    def get(self):
+        pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
+        dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
+        pol = self.pol.get().view(np.float32).reshape((len(self.pol),3))
+        wavelengths = self.wavelengths.get()
+        t = self.t.get()
+        last_hit_triangles = self.last_hit_triangles.get()
+        flags = self.flags.get()
+        return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
+
+class GPUChannels(object):
+    def __init__(self, t, q, flags):
+        self.t = t
+        self.q = q
+        self.flags = flags
+
+    def get(self):
+        t = self.t.get()
+        q = self.q.get().astype(np.float32)
+
+        # For now, assume all channels with small
+        # enough hit time were hit.
+        return event.Channels(t<1e8, t, q, self.flags.get())
+
+def propagate(gpu, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10):
+    """Propagate photons on GPU to termination or max_steps, whichever
+    comes first.
+
+    May be called repeatedly without reloading photon information if
+    single-stepping through photon history.
+
+    ..warning::
+        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
+        number of curandStates.
+    """
+    nphotons = gpuphotons.pos.size
+    step = 0
+    input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+    input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
+    input_queue_gpu = ga.to_gpu(input_queue)
+    output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+    output_queue[0] = 1
+    output_queue_gpu = ga.to_gpu(output_queue)
+
+    propagate = gpu.module.get_function('propagate')
+
+    while step < max_steps:
+        # Just finish the rest of the steps if the # of photons is low
+        if nphotons < nthreads_per_block * 16 * 8:
+            nsteps = max_steps - step
+        else:
+            nsteps = 1
+
+        for first_photon, photons_this_round, blocks in \
+                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+            propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, gpuphotons.pos, gpuphotons.dir, gpuphotons.wavelengths, gpuphotons.pol, gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, np.int32(nsteps), block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+        step += nsteps
+
+        if step < max_steps:
+            temp = input_queue_gpu
+            input_queue_gpu = output_queue_gpu
+            output_queue_gpu = temp
+            output_queue_gpu[:1].set(np.uint32(1))
+            nphotons = input_queue_gpu[:1].get()[0] - 1
+
+    if ga.max(gpuphotons.flags).get() & (1 << 31):
+        print >>sys.stderr, "WARNING: ABORTED PHOTONS"
+
+class GPURays(object):
+    def __init__(self, pos, dir, nblocks=64):
+        self.pos = ga.to_gpu(to_float3(pos))
+        self.dir = ga.to_gpu(to_float3(dir))
+
+        self.nblocks = nblocks
+
+        self.module = get_cu_module('transform')
+        self.gpu_funcs = GPUFuncs(self.module)
+
+    def rotate(self, phi, n):
+        self.gpu_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+        self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+
+    def rotate_around_point(self, phi, n, point):
+        self.gpu_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+        self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+
+    def translate(self, v):
+        self.gpu_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
 
 def format_size(size):
     if size < 1e3:
@@ -77,317 +225,165 @@ def format_array(name, array):
     return '%-15s %6s %6s' % \
         (name, format_size(len(array)), format_size(array.nbytes))
 
-class CUDAFuncs(object):
-    '''simple container class for GPU functions as attributes'''
-    def __init__(self, cuda_module, func_names):
-        '''Extract __global__ functions listed in func_names from
-        the PyCUDA module object.  The functions are assigned
-        to attributes of this object with the same name.'''
-        for name in func_names:
-            setattr(self, name, cuda_module.get_function(name))
-
-class GPU(object):
-    def __init__(self, device_id=None, nthreads_per_block=64, max_blocks=1024):
-        '''Initialize a GPU context on the specified device.  If device_id==None,
-        the default device is used.'''
-
-        if device_id is None:
-            self.context = make_default_context()
-        else:
-            device = cuda.Device(device_id)
-            self.context = device.make_context()
-
-        print 'device %s' % self.context.get_device().name()
-
-        self.nthreads_per_block = nthreads_per_block
-        self.max_blocks = max_blocks
-
-        self.context.set_cache_config(cuda.func_cache.PREFER_L1)
-
-        cuda_options = ['-I' + dirname(chroma.src.__file__), '--use_fast_math', '--ptxas-options=-v']
-
-        self.module = SourceModule(chroma.src.kernel, options=cuda_options, no_extern_c=True)
-        self.kernels = CUDAFuncs(self.module, ['ray_trace', 'ray_trace_alpha', 'distance_to_mesh', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng'])
-
-        self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
-
-        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])
-        self.daq_module = SourceModule(chroma.src.daq, options=cuda_options, no_extern_c=True)
-        self.daq_funcs = CUDAFuncs(self.daq_module, 
-                              ['reset_earliest_time_int', 'run_daq',
-                               'convert_sortable_int_to_float', 'bin_hits',
-                               'accumulate_pdf_eval'])
+class GPUGeometry(object):
+    def __init__(self, gpu, geometry, load=True, activate=True, print_usage=True):
+        self.geometry = geometry
 
-    def print_device_usage(self):
-        print 'device usage:'
-        print format_array('vertices', self.vertices_gpu)
-        print format_array('triangles', self.triangles_gpu)
-        print format_array('lower_bounds', self.lower_bounds_gpu)
-        print format_array('upper_bounds', self.upper_bounds_gpu)
-        print format_array('node_map', self.node_map_gpu)
-        print format_array('node_map_end', self.node_map_end_gpu)
-        print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
+        self.module = gpu.module
+        self.gpu_funcs = GPUFuncs(gpu.module)
 
-    def load_geometry(self, geometry, print_usage=True):
-        if not hasattr(geometry, 'mesh'):
-            print 'WARNING: building geometry with 8-bits'
-            geometry.build(bits=8)
+        if load:
+            self.load(activate, print_usage)
 
-        self.geo_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1))
+    def load(self, activate=True, print_usage=True):
+        self.gpu_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1))
 
         self.materials = []
-        for i in range(len(geometry.unique_materials)):
-            material = copy(geometry.unique_materials[i])
+        for i in range(len(self.geometry.unique_materials)):
+            material = copy(self.geometry.unique_materials[i])
 
             if material is None:
                 raise Exception('one or more triangles is missing a material.')
 
-            material.refractive_index_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32))
-            material.absorption_length_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32))
-            material.scattering_length_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32))
+            material.refractive_index_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32))
+            material.absorption_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32))
+            material.scattering_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32))
 
-            self.geo_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1))
+            self.gpu_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1))
 
             self.materials.append(material)
 
         self.surfaces = []
-        for i in range(len(geometry.unique_surfaces)):
-            surface = copy(geometry.unique_surfaces[i])
+        for i in range(len(self.geometry.unique_surfaces)):
+            surface = copy(self.geometry.unique_surfaces[i])
 
             if surface is None:
                 continue
 
-            surface.detect_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32))
-            surface.absorb_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32))
-            surface.reflect_diffuse_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32))
-            surface.reflect_specular_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32))
+            surface.detect_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32))
+            surface.absorb_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32))
+            surface.reflect_diffuse_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32))
+            surface.reflect_specular_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32))
 
-            self.geo_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1))
+            self.gpu_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1))
 
             self.surfaces.append(surface)
 
-        self.vertices_gpu = gpuarray.to_gpu(to_float3(geometry.mesh.vertices))
+        self.vertices_gpu = ga.to_gpu(to_float3(self.geometry.mesh.vertices))
 
         triangles = \
-            np.empty(len(geometry.mesh.triangles), dtype=gpuarray.vec.uint4)
-        triangles['x'] = geometry.mesh.triangles[:,0]
-        triangles['y'] = geometry.mesh.triangles[:,1]
-        triangles['z'] = geometry.mesh.triangles[:,2]
-        triangles['w'] = ((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8)
-        self.triangles_gpu = gpuarray.to_gpu(triangles)
+            np.empty(len(self.geometry.mesh.triangles), dtype=ga.vec.uint4)
+        triangles['x'] = self.geometry.mesh.triangles[:,0]
+        triangles['y'] = self.geometry.mesh.triangles[:,1]
+        triangles['z'] = self.geometry.mesh.triangles[:,2]
+        triangles['w'] = ((self.geometry.material1_index & 0xff) << 24) | ((self.geometry.material2_index & 0xff) << 16) | ((self.geometry.surface_index & 0xff) << 8)
+        self.triangles_gpu = ga.to_gpu(triangles)
 
-        self.lower_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.lower_bounds))
+        self.lower_bounds_gpu = ga.to_gpu(to_float3(self.geometry.lower_bounds))
 
-        self.upper_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.upper_bounds))
+        self.upper_bounds_gpu = ga.to_gpu(to_float3(self.geometry.upper_bounds))
 
-        self.colors_gpu = gpuarray.to_gpu(geometry.colors.astype(np.uint32))
-        self.node_map_gpu = gpuarray.to_gpu(geometry.node_map.astype(np.uint32))
-        self.node_map_end_gpu = gpuarray.to_gpu(geometry.node_map_end.astype(np.uint32))
-        self.solid_id_map_gpu = gpuarray.to_gpu(geometry.solid_id.astype(np.uint32))
-
-        self.geo_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(geometry.node_map.size-1), np.uint32(geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1))
+        self.colors_gpu = ga.to_gpu(self.geometry.colors.astype(np.uint32))
+        self.node_map_gpu = ga.to_gpu(self.geometry.node_map.astype(np.uint32))
+        self.node_map_end_gpu = ga.to_gpu(self.geometry.node_map_end.astype(np.uint32))
+        self.solid_id_map_gpu = ga.to_gpu(self.geometry.solid_id.astype(np.uint32))
 
         self.node_map_tex = self.module.get_texref('node_map')
         self.node_map_end_tex = self.module.get_texref('node_map_end')
 
+        if print_usage:
+            self.print_device_usage()
+
+        if activate:
+            self.activate()
+
+    def activate(self):
+        self.gpu_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(self.geometry.node_map.size-1), np.uint32(self.geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1))
+
         self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes)
         self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes)
 
         self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
         self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
 
-        self.geometry = geometry
-
-        if print_usage:
-            print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]))
-            print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1))
-            print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1))
-            self.print_device_usage()
+    def print_device_usage(self):
+        print 'device usage:'
+        print '-'*10
+        print format_array('vertices', self.vertices_gpu)
+        print format_array('triangles', self.triangles_gpu)
+        print format_array('lower_bounds', self.lower_bounds_gpu)
+        print format_array('upper_bounds', self.upper_bounds_gpu)
+        print format_array('node_map', self.node_map_gpu)
+        print format_array('node_map_end', self.node_map_end_gpu)
+        print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
+        print '-'*10
+        free, total = cuda.mem_get_info()
+        print '%-15s %6s %6s' % ('device total', '', format_size(total))
+        print '%-15s %6s %6s' % ('device used', '', format_size(total-free))
+        print '%-15s %6s %6s' % ('device free', '', format_size(free))
+        print
 
     def reset_colors(self):
         self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32))
 
     def color_solids(self, solid_hit, colors):
-        solid_hit_gpu = gpuarray.to_gpu(np.array(solid_hit, dtype=np.bool))
-        solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
+        solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
+        solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))
 
-        for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthreads_per_block, self.max_blocks):
-            self.geo_funcs.color_solids(np.int32(first_triangle),
+        for first_triangle, triangles_this_round, blocks in \
+                chunk_iterator(self.triangles_gpu.size):
+            self.gpu_funcs.color_solids(np.int32(first_triangle),
                                         np.int32(triangles_this_round),
                                         self.solid_id_map_gpu, 
                                         solid_hit_gpu, 
                                         solid_colors_gpu, 
-                                        block=(self.nthreads_per_block,1,1), 
+                                        block=(64,1,1), 
                                         grid=(blocks,1))
-        self.context.synchronize()
-
-    def setup_propagate(self, seed=1):
-        self.rng_states_gpu = cuda.mem_alloc(self.nthreads_per_block * self.max_blocks
-                                             * sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
-        self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthreads_per_block), self.rng_states_gpu, 
-                                 np.uint64(seed), np.uint64(0), 
-                                 block=(self.nthreads_per_block,1,1), 
-                                 grid=(self.max_blocks,1))
-        #self.context.synchronize()
-
-    def load_photons(self, photons):
-        '''Load photons onto the GPU from an event.Photons object.
-
-           If photon.histories or photon.last_hit_triangles are set to none,
-           they will be initialized to 0 and -1 on the GPU, respectively.
-        '''
-        self.nphotons = len(photons.positions)
-        assert len(photons.directions) == self.nphotons
-        assert len(photons.polarizations) == self.nphotons
-        assert len(photons.wavelengths) == self.nphotons
-
-        self.positions_gpu = gpuarray.to_gpu(to_float3(photons.positions))
-        self.directions_gpu = gpuarray.to_gpu(to_float3(photons.directions))
-        self.polarizations_gpu = gpuarray.to_gpu(to_float3(photons.polarizations))
-        self.wavelengths_gpu = gpuarray.to_gpu(photons.wavelengths.astype(np.float32))
-
-        if photons.times is not None:
-            self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32))
-        else:
-            self.times_gpu = gpuarray.zeros(self.nphotons, dtype=np.float32)
 
-        if photons.histories is not None:
-            self.histories_gpu = gpuarray.to_gpu(photons.histories.astype(np.uint32))
-        else:
-            self.histories_gpu = gpuarray.zeros(self.nphotons, dtype=np.uint32)
-
-        if photons.last_hit_triangles is not None:
-            self.last_hit_triangles_gpu = gpuarray.to_gpu(photons.last_hit_triangles.astype(np.int32))
-        else:
-            self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32)
-            self.last_hit_triangles_gpu.fill(-1)
-
-
-    def propagate(self, max_steps=10):
-        '''Propagate photons on GPU to termination or max_steps, whichever comes first.
-
-        May be called repeatedly without reloading photon information if single-stepping
-        through photon history.
-        '''
-        nphotons = self.nphotons
-        step = 0
-        input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
-        input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32)
-        input_queue_gpu = gpuarray.to_gpu(input_queue)
-        output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
-        output_queue[0] = 1
-        output_queue_gpu = gpuarray.to_gpu(output_queue)
-
-
-        while step < max_steps:
-            ## Just finish the rest of the steps if the # of photons is low
-            if nphotons < self.nthreads_per_block * 16 * 8:
-                nsteps = max_steps - step
-            else:
-                nsteps = 1
-
-            for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthreads_per_block, self.max_blocks):
-                self.prop_funcs.propagate(np.int32(first_photon),
-                                          np.int32(photons_this_round),
-                                          input_queue_gpu[1:],
-                                          output_queue_gpu,
-                                          self.rng_states_gpu, self.positions_gpu, 
-                                          self.directions_gpu, 
-                                          self.wavelengths_gpu, self.polarizations_gpu, 
-                                          self.times_gpu, self.histories_gpu, 
-                                          self.last_hit_triangles_gpu, 
-                                          np.int32(nsteps), 
-                                          block=(self.nthreads_per_block,1,1),
-                                          grid=(blocks, 1))
-        
-            step += nsteps
-
-            if step < max_steps:
-                temp = input_queue_gpu
-                input_queue_gpu = output_queue_gpu
-                output_queue_gpu = temp
-                output_queue_gpu[:1].set(np.uint32(1))
-                nphotons = input_queue_gpu[:1].get()[0] - 1
-        
-        if gpuarray.max(self.histories_gpu).get() & (1 << 31):
-            print >>sys.stderr, "WARNING: ABORTED PHOTONS IN THIS EVENT"
-
-        if 'profile' in __builtins__:
-            self.context.synchronize()
-
-    def get_photons(self):
-        '''Returns a dictionary of current photon state information.
-
-        Contents of dictionary have the same names as the parameters to load_photons().
-        '''
-        return event.Photons(positions=self.positions_gpu.get().view(np.float32).reshape(self.positions_gpu.size, 3),
-                             directions=self.directions_gpu.get().view(np.float32).reshape(self.directions_gpu.size, 3),
-                             polarizations=self.polarizations_gpu.get().view(np.float32).reshape(self.polarizations_gpu.size, 3),
-                             wavelengths=self.wavelengths_gpu.get(),
-                             times=self.times_gpu.get(),
-                             histories=self.histories_gpu.get(),
-                             last_hit_triangles=self.last_hit_triangles_gpu.get())
-    
-    def setup_daq(self, max_pmt_id, pmt_rms=1.2e-9):
-        self.earliest_time_gpu = gpuarray.GPUArray(shape=(max_pmt_id+1,), dtype=np.float32)
-        self.earliest_time_int_gpu = gpuarray.GPUArray(shape=self.earliest_time_gpu.shape, 
-                                                       dtype=np.uint32)
-        self.channel_history_gpu = gpuarray.zeros_like(self.earliest_time_int_gpu)
-        self.channel_q_gpu = gpuarray.zeros_like(self.earliest_time_int_gpu)
+class GPUDaq(object):
+    def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9):
+        self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32)
+        self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32)
+        self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
+        self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.daq_pmt_rms = pmt_rms
+        self.solid_id_map_gpu = gpu_geometry.solid_id_map_gpu
+
+        self.module = get_cu_module('daq', include_source_directory=False)
+        self.gpu_funcs = GPUFuncs(self.module)
 
-    def run_daq(self):
-        self.daq_funcs.reset_earliest_time_int(np.float32(1e9), 
-                                               np.int32(len(self.earliest_time_int_gpu)), 
-                                               self.earliest_time_int_gpu, 
-                                               block=(self.nthreads_per_block,1,1), 
-                                               grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1))
+    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
+        self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
         self.channel_q_gpu.fill(0)
         self.channel_history_gpu.fill(0)
 
-        #self.context.synchronize()
-        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthreads_per_block, self.max_blocks):
-            self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), 
-                                   np.float32(self.daq_pmt_rms), 
-                                   np.int32(first_photon),
-                                   np.int32(photons_this_round), 
-                                   self.times_gpu, self.histories_gpu,
-                                   self.last_hit_triangles_gpu, 
-                                   self.solid_id_map_gpu, 
-                                   np.int32(len(self.earliest_time_int_gpu)), 
-                                   self.earliest_time_int_gpu,
-                                   self.channel_q_gpu,
-                                   self.channel_history_gpu,
-                                   block=(self.nthreads_per_block,1,1), grid=(blocks,1))
-        #self.context.synchronize()
-        self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), 
-                                                     self.earliest_time_int_gpu, 
-                                                     self.earliest_time_gpu, 
-                                                     block=(self.nthreads_per_block,1,1), 
-                                                     grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1))
-        if 'profile' in __builtins__:
-            self.context.synchronize()
-        
+        n = len(gpuphotons.pos)
 
-    def get_hits(self):
-        t = self.earliest_time_gpu.get()
-        # For now, assume all channels with small enough hit time were hit
-        return event.Channels(hit=t<1e8, t=t, 
-                        q=self.channel_q_gpu.get().astype(np.float32),
-                        histories=self.channel_history_gpu.get())
+        for first_photon, photons_this_round, blocks in \
+                chunk_iterator(n, nthreads_per_block, max_blocks):
+            self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1))
+
+        self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
+
+        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu)
+
+class GPUPDF(object):
+    def __init__(self):
+        self.module = get_cu_module('daq')
+        self.gpu_funcs = GPUFuncs(self.module)
 
     def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange):
-        '''Setup GPU arrays to hold PDF information.
+        """Setup GPU arrays to hold PDF information.
 
            max_pmt_id: int, largest PMT id #
            tbins: number of time bins
            trange: tuple of (min, max) time in PDF
            qbins: number of charge bins
            qrange: tuple of (min, max) charge in PDF
-        '''
+        """
         self.events_in_histogram = 0
-        self.hitcount_gpu = gpuarray.zeros(max_pmt_id+1, dtype=np.uint32)
-        self.pdf_gpu = gpuarray.zeros(shape=(max_pmt_id+1, tbins, qbins), 
+        self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32)
+        self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins), 
                                       dtype=np.uint32)
         self.tbins = tbins
         self.trange = trange
@@ -395,16 +391,14 @@ class GPU(object):
         self.qrange = qrange
 
     def clear_pdf(self):
-        '''Rezero the PDF counters.'''
+        """Rezero the PDF counters."""
         self.hitcount_gpu.fill(0)
         self.pdf_gpu.fill(0)
 
-    def add_hits_to_pdf(self):
-        '''Put the most recent results of run_daq() into the PDF.'''
-
-        self.daq_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
-                                self.channel_q_gpu,
-                                self.earliest_time_gpu,
+    def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
+        self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
+                                gpuchannels.q,
+                                gpuchannels.t,
                                 self.hitcount_gpu,
                                 np.int32(self.tbins),
                                 np.float32(self.trange[0]),
@@ -413,20 +407,21 @@ class GPU(object):
                                 np.float32(self.qrange[0]),
                                 np.float32(self.qrange[1]),
                                 self.pdf_gpu,
-                                block=(self.nthreads_per_block,1,1), 
-                                grid=(len(self.earliest_time_gpu)//self.nthreads_per_block+1,1))
+                                block=(nthreads_per_block,1,1), 
+                                grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
 
 
         self.events_in_histogram += 1
 
     def get_pdfs(self):
-        '''Returns the 1D hitcount array and the 3D [channel, time, charge] histogram'''
+        """Returns the 1D hitcount array and the 3D [channel, time, charge]
+        histogram."""
         return self.hitcount_gpu.get(), self.pdf_gpu.get()
 
-    def setup_pdf_eval(self, event_hit, event_time, event_charge,
-                       min_twidth, trange, min_qwidth, qrange, min_bin_content=10,
+    def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
+                       trange, min_qwidth, qrange, min_bin_content=10,
                        time_only=True):
-        '''Setup GPU arrays to compute PDF values for the given event.
+        """Setup GPU arrays to compute PDF values for the given event.
         The pdf_eval calculation allows the PDF to be evaluated at a
         single point for each channel as the Monte Carlo is run.  The
         effective bin size will be as small as (`min_twidth`,
@@ -455,14 +450,14 @@ class GPU(object):
               The bin will be expanded to include at least this many events
             time_only: bool
               If True, only the time observable will be used in the PDF.
-        '''
-        self.event_hit_gpu = gpuarray.to_gpu(event_hit.astype(np.uint32))
-        self.event_time_gpu = gpuarray.to_gpu(event_time.astype(np.float32))
-        self.event_charge_gpu = gpuarray.to_gpu(event_charge.astype(np.float32))
-
-        self.eval_hitcount_gpu = gpuarray.zeros(len(event_hit), dtype=np.uint32)
-        self.eval_bincount_gpu = gpuarray.zeros(len(event_hit), dtype=np.uint32)
-        self.nearest_mc_gpu = gpuarray.empty(shape=len(event_hit) * min_bin_content, 
+        """
+        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
+        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
+        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
+
+        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
+        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
+        self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, 
                                              dtype=np.float32)
         self.nearest_mc_gpu.fill(1e9)
 
@@ -474,21 +469,20 @@ class GPU(object):
         self.time_only = time_only
 
     def clear_pdf_eval(self):
-        '''Reset PDF evaluation counters to start accumulating new Monte Carlo.'''
+        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
         self.eval_hitcount_gpu.fill(0)
         self.eval_bincount_gpu.fill(0)
         self.nearest_mc_gpu.fill(1e9)
 
-    def accumulate_pdf_eval(self):
-        '''Add the most recent results of run_daq() to the PDF evaluation.'''
-        
-        self.daq_funcs.accumulate_pdf_eval(np.int32(self.time_only),
+    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64):
+        "Add the most recent results of run_daq() to the PDF evaluation."
+        self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only),
                                            np.int32(len(self.event_hit_gpu)),
                                            self.event_hit_gpu,
                                            self.event_time_gpu,
                                            self.event_charge_gpu,
-                                           self.earliest_time_gpu,
-                                           self.channel_q_gpu,
+                                           gpuchannels.t,
+                                           gpuchannels.q,
                                            self.eval_hitcount_gpu,
                                            self.eval_bincount_gpu,
                                            np.float32(self.min_twidth),
@@ -499,8 +493,8 @@ class GPU(object):
                                            np.float32(self.qrange[1]),
                                            self.nearest_mc_gpu,
                                            np.int32(self.min_bin_content),
-                                           block=(self.nthreads_per_block,1,1), 
-                                           grid=(len(self.earliest_time_gpu)//self.nthreads_per_block+1,1))
+                                           block=(nthreads_per_block,1,1), 
+                                           grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
 
     def get_pdf_eval(self):
         evhit = self.event_hit_gpu.get().astype(bool)
@@ -525,8 +519,9 @@ class GPU(object):
 
         nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content))
 
-        # Deal with the case where we did not even get min_bin_content events in the PDF
-        # but also clamp the lower range to ensure we don't index by a negative number in 2 lines
+        # Deal with the case where we did not even get min_bin_content events
+        # in the PDF but also clamp the lower range to ensure we don't index
+        # by a negative number in 2 lines
         last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
         distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]
 
@@ -544,5 +539,31 @@ class GPU(object):
 
         return hitcount, pdf_value, pdf_value * pdf_frac_uncert
 
+class GPU(object):
+    def __init__(self, device_id=None):
+        """Initialize a GPU context on the specified device.
+        If device_id is None, the default device is used."""
+
+        if device_id is None:
+            self.context = pycuda.tools.make_default_context()
+        else:
+            device = cuda.Device(device_id)
+            self.context = device.make_context()
+
+        print 'device %s' % self.context.get_device().name()
+
+        self.print_mem_info()
+
+        self.context.set_cache_config(cuda.func_cache.PREFER_L1)
+
+        cuda_options = ['--use_fast_math']#, '--ptxas-options=-v']
+
+        self.module = get_cu_module('kernel', options=cuda_options)
+
+    def print_mem_info(self):
+        free, total = cuda.mem_get_info()
+
+        print '%.1f %% of device memory is free.' % ((free/float(total))*100)
+
     def __del__(self):
         self.context.pop()