-rw-r--r--   chroma/gpu.py          | 799 -
-rw-r--r--   chroma/gpu/__init__.py |   5 +
-rw-r--r--   chroma/gpu/daq.py      |  49 +
-rw-r--r--   chroma/gpu/geometry.py | 174 +
-rw-r--r--   chroma/gpu/pdf.py      | 177 +
-rw-r--r--   chroma/gpu/photon.py   | 175 +
-rw-r--r--   chroma/gpu/render.py   |  66 +
-rw-r--r--   chroma/gpu/tools.py    | 180 +
8 files changed, 826 insertions(+), 799 deletions(-)
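Before the diff proper, a quick sketch of the resulting import surface (not part of the commit, inferred from the new chroma/gpu/__init__.py below): the package re-exports tools, geometry, photon, daq and pdf via star imports, so call sites written against the old monolithic chroma/gpu.py keep working, while render is the one submodule that is not re-exported and therefore needs an explicit import.

    # Post-refactor import surface (class and function names taken from the diff below).
    from chroma.gpu import GPUGeometry, GPUPhotons, GPUDaq, GPUPDF   # re-exported by __init__.py
    from chroma.gpu import get_rng_states, create_cuda_context       # come from chroma.gpu.tools
    from chroma.gpu.render import GPURays    # render.py is not star-imported by __init__.py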
diff --git a/chroma/gpu.py b/chroma/gpu.py deleted file mode 100644 index af83c75..0000000 --- a/chroma/gpu.py +++ /dev/null @@ -1,799 +0,0 @@ -import numpy as np -import numpy.ma as ma -from copy import copy -from itertools import izip -import os -import sys -import gc - -import pytools -import pycuda.tools -from pycuda.compiler import SourceModule -from pycuda import characterize -import pycuda.driver as cuda -from pycuda import gpuarray as ga - -import chroma.cuda -from chroma.tools import timeit, profile_if_possible -from chroma.geometry import standard_wavelengths -from chroma import event - -cuda.init() - -# standard nvcc options -cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v'] - -@pycuda.tools.context_dependent_memoize -def get_cu_module(name, options=None, include_source_directory=True): - """Returns a pycuda.compiler.SourceModule object from a CUDA source file - located in the chroma cuda directory at cuda/[name].""" - if options is None: - options = [] - elif isinstance(options, tuple): - options = list(options) - else: - raise TypeError('`options` must be a tuple.') - - from chroma.cuda import srcdir - - if include_source_directory: - options += ['-I' + srcdir] - - with open('%s/%s' % (srcdir, name)) as f: - source = f.read() - - return pycuda.compiler.SourceModule(source, options=options, - no_extern_c=True) - -@pycuda.tools.memoize -def get_cu_source(name): - """Get the source code for a CUDA source file located in the chroma cuda - directory at src/[name].""" - from chroma.cuda import srcdir - with open('%s/%s' % (srcdir, name)) as f: - source = f.read() - return source - -class GPUFuncs(object): - """Simple container class for GPU functions as attributes.""" - def __init__(self, module): - self.module = module - self.funcs = {} - - def __getattr__(self, name): - try: - return self.funcs[name] - except KeyError: - f = self.module.get_function(name) - self.funcs[name] = f - return f - -init_rng_src = """ -#include <curand_kernel.h> - -extern "C" -{ - -__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) -{ - int id = blockIdx.x*blockDim.x + threadIdx.x; - - if (id >= nthreads) - return; - - curand_init(seed, id, offset, &s[id]); -} - -} // extern "C" -""" - -def get_rng_states(size, seed=1): - "Return `size` number of CUDA random number generator states." - rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - - module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True) - init_rng = module.get_function('init_rng') - - init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1)) - - return rng_states - -def to_float3(arr): - "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array." - if not arr.flags['C_CONTIGUOUS']: - arr = np.asarray(arr, order='c') - return arr.astype(np.float32).view(ga.vec.float3)[:,0] - -def to_uint3(arr): - "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array." - if not arr.flags['C_CONTIGUOUS']: - arr = np.asarray(arr, order='c') - return arr.astype(np.uint32).view(ga.vec.uint3)[:,0] - -def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024): - """Iterator that yields tuples with the values requried to process - a long array in multiple kernel passes on the GPU. 
- - Each yielded value is of the form, - (first_index, elements_this_iteration, nblocks_this_iteration) - - Example: - >>> list(chunk_iterator(300, 32, 2)) - [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] - """ - first = 0 - while first < nelements: - elements_left = nelements - first - blocks = int(elements_left // nthreads_per_block) - if elements_left % nthreads_per_block != 0: - blocks += 1 # Round up only if needed - blocks = min(max_blocks, blocks) - elements_this_round = min(elements_left, blocks * nthreads_per_block) - - yield (first, elements_this_round, blocks) - first += elements_this_round - -class GPUPhotons(object): - def __init__(self, photons, ncopies=1): - """Load ``photons`` onto the GPU, replicating as requested. - - Args: - - photons: chroma.Event.Photons - Photon state information to load onto GPU - - ncopies: int, *optional* - Number of times to replicate the photons - on the GPU. This is used if you want - to propagate the same event many times, - for example in a likelihood calculation. - - The amount of GPU storage will be proportionally - larger if ncopies > 1, so be careful. - """ - nphotons = len(photons) - self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) - self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) - self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) - self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32) - self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32) - self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32) - self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32) - - # Assign the provided photons to the beginning (possibly - # the entire array if ncopies is 1 - self.pos[:nphotons].set(to_float3(photons.pos)) - self.dir[:nphotons].set(to_float3(photons.dir)) - self.pol[:nphotons].set(to_float3(photons.pol)) - self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32)) - self.t[:nphotons].set(photons.t.astype(np.float32)) - self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32)) - self.flags[:nphotons].set(photons.flags.astype(np.uint32)) - - module = get_cu_module('propagate.cu', options=cuda_options) - self.gpu_funcs = GPUFuncs(module) - - # Replicate the photons to the rest of the slots if needed - if ncopies > 1: - max_blocks = 1024 - nthreads_per_block = 64 - for first_photon, photons_this_round, blocks in \ - chunk_iterator(nphotons, nthreads_per_block, max_blocks): - self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round), - self.pos, self.dir, self.wavelengths, self.pol, self.t, - self.flags, self.last_hit_triangles, - np.int32(ncopies-1), - np.int32(nphotons), - block=(nthreads_per_block,1,1), grid=(blocks, 1)) - - - # Save the duplication information for the iterate_copies() method - self.true_nphotons = nphotons - self.ncopies = ncopies - - def get(self): - pos = self.pos.get().view(np.float32).reshape((len(self.pos),3)) - dir = self.dir.get().view(np.float32).reshape((len(self.dir),3)) - pol = self.pol.get().view(np.float32).reshape((len(self.pol),3)) - wavelengths = self.wavelengths.get() - t = self.t.get() - last_hit_triangles = self.last_hit_triangles.get() - flags = self.flags.get() - return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) - - def iterate_copies(self): - '''Returns an iterator that yields GPUPhotonsSlice objects - corresponding to the event copies stored in ``self``.''' - for i in xrange(self.ncopies): - window 
= slice(self.true_nphotons*i, self.true_nphotons*(i+1)) - yield GPUPhotonsSlice(pos=self.pos[window], - dir=self.dir[window], - pol=self.pol[window], - wavelengths=self.wavelengths[window], - t=self.t[window], - last_hit_triangles=self.last_hit_triangles[window], - flags=self.flags[window]) - - @profile_if_possible - def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, - max_blocks=1024, max_steps=10): - """Propagate photons on GPU to termination or max_steps, whichever - comes first. - - May be called repeatedly without reloading photon information if - single-stepping through photon history. - - ..warning:: - `rng_states` must have at least `nthreads_per_block`*`max_blocks` - number of curandStates. - """ - nphotons = self.pos.size - step = 0 - input_queue = np.empty(shape=nphotons+1, dtype=np.uint32) - input_queue[0] = 0 - # Order photons initially in the queue to put the clones next to each other - for copy in xrange(self.ncopies): - input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons - input_queue_gpu = ga.to_gpu(input_queue) - output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) - output_queue[0] = 1 - output_queue_gpu = ga.to_gpu(output_queue) - - while step < max_steps: - # Just finish the rest of the steps if the # of photons is low - if nphotons < nthreads_per_block * 16 * 8: - nsteps = max_steps - step - else: - nsteps = 1 - - for first_photon, photons_this_round, blocks in \ - chunk_iterator(nphotons, nthreads_per_block, max_blocks): - self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1)) - - step += nsteps - - if step < max_steps: - temp = input_queue_gpu - input_queue_gpu = output_queue_gpu - output_queue_gpu = temp - output_queue_gpu[:1].set(np.uint32(1)) - nphotons = input_queue_gpu[:1].get()[0] - 1 - - if ga.max(self.flags).get() & (1 << 31): - print >>sys.stderr, "WARNING: ABORTED PHOTONS" - - def __del__(self): - del self.pos - del self.dir - del self.pol - del self.wavelengths - del self.t - del self.flags - del self.last_hit_triangles - # Free up GPU memory quickly if now available - gc.collect() - -class GPUPhotonsSlice(GPUPhotons): - '''A `slice`-like view of a subrange of another GPU photons array. - Works exactly like an instance of GPUPhotons, but the GPU storage - is taken from another GPUPhotons instance. - - Returned by the GPUPhotons.iterate_copies() iterator.''' - def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles, - flags): - '''Create new object using slices of GPUArrays from an instance - of GPUPhotons. NOTE THESE ARE NOT CPU ARRAYS!''' - self.pos = pos - self.dir = dir - self.pol = pol - self.wavelengths = wavelengths - self.t = t - self.last_hit_triangles = last_hit_triangles - self.flags = flags - - module = get_cu_module('propagate.cu', options=cuda_options) - self.gpu_funcs = GPUFuncs(module) - - self.true_nphotons = len(pos) - self.ncopies = 1 - - def __del__(self): - pass # Do nothing, because we don't own any of our GPU memory - -class GPUChannels(object): - def __init__(self, t, q, flags): - self.t = t - self.q = q - self.flags = flags - - def get(self): - t = self.t.get() - q = self.q.get().astype(np.float32) - - # For now, assume all channels with small - # enough hit time were hit. 
- return event.Channels(t<1e8, t, q, self.flags.get()) - -class GPURays(object): - """The GPURays class holds arrays of ray positions and directions - on the GPU that are used to render a geometry.""" - def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64): - self.pos = ga.to_gpu(to_float3(pos)) - self.dir = ga.to_gpu(to_float3(dir)) - - self.max_alpha_depth = max_alpha_depth - - self.nblocks = nblocks - - transform_module = get_cu_module('transform.cu', options=cuda_options) - self.transform_funcs = GPUFuncs(transform_module) - - render_module = get_cu_module('render.cu', options=cuda_options) - self.render_funcs = GPUFuncs(render_module) - - self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32) - self.color = ga.empty(self.dx.size, dtype=ga.vec.float4) - self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32) - - def rotate(self, phi, n): - "Rotate by an angle phi around the axis `n`." - self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) - - def rotate_around_point(self, phi, n, point): - """"Rotate by an angle phi around the axis `n` passing through - the point `point`.""" - self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) - - def translate(self, v): - "Translate the ray positions by the vector `v`." - self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - - def render(self, gpu_geometry, pixels, alpha_depth=10, - keep_last_render=False): - """Render `gpu_geometry` and fill the GPU array `pixels` with pixel - colors.""" - if not keep_last_render: - self.dxlen.fill(0) - - if alpha_depth > self.max_alpha_depth: - raise Exception('alpha_depth > max_alpha_depth') - - if not isinstance(pixels, ga.GPUArray): - raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray) - - if pixels.size != self.pos.size: - raise ValueError('`pixels`.size != number of rays') - - self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - - def snapshot(self, gpu_geometry, alpha_depth=10): - "Render `gpu_geometry` and return a numpy array of pixel colors." - pixels = ga.empty(self.pos.size, dtype=np.uint32) - self.render(gpu_geometry, pixels, alpha_depth) - return pixels.get() - -def make_gpu_struct(size, members): - struct = cuda.mem_alloc(size) - - i = 0 - for member in members: - if isinstance(member, ga.GPUArray): - member = member.gpudata - - if isinstance(member, cuda.DeviceAllocation): - if i % 8: - raise Exception('cannot align 64-bit pointer. 
' - 'arrange struct member variables in order of ' - 'decreasing size.') - - cuda.memcpy_htod(int(struct)+i, np.intp(int(member))) - i += 8 - elif np.isscalar(member): - cuda.memcpy_htod(int(struct)+i, member) - i += member.nbytes - else: - raise TypeError('expected a GPU device pointer or scalar type.') - - return struct - -def format_size(size): - if size < 1e3: - return '%.1f%s' % (size, ' ') - elif size < 1e6: - return '%.1f%s' % (size/1e3, 'K') - elif size < 1e9: - return '%.1f%s' % (size/1e6, 'M') - else: - return '%.1f%s' % (size/1e9, 'G') - -def format_array(name, array): - return '%-15s %6s %6s' % \ - (name, format_size(len(array)), format_size(array.nbytes)) - -class GPUGeometry(object): - def __init__(self, geometry, wavelengths=None, print_usage=False): - if wavelengths is None: - wavelengths = standard_wavelengths - - try: - wavelength_step = np.unique(np.diff(wavelengths)).item() - except ValueError: - raise ValueError('wavelengths must be equally spaced apart.') - - geometry_source = get_cu_source('geometry.h') - material_struct_size = characterize.sizeof('Material', geometry_source) - surface_struct_size = characterize.sizeof('Surface', geometry_source) - geometry_struct_size = characterize.sizeof('Geometry', geometry_source) - - self.material_data = [] - self.material_ptrs = [] - - def interp_material_property(wavelengths, property): - # note that it is essential that the material properties be - # interpolated linearly. this fact is used in the propagation - # code to guarantee that probabilities still sum to one. - return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32) - - for i in range(len(geometry.unique_materials)): - material = geometry.unique_materials[i] - - if material is None: - raise Exception('one or more triangles is missing a material.') - - refractive_index = interp_material_property(wavelengths, material.refractive_index) - refractive_index_gpu = ga.to_gpu(refractive_index) - absorption_length = interp_material_property(wavelengths, material.absorption_length) - absorption_length_gpu = ga.to_gpu(absorption_length) - scattering_length = interp_material_property(wavelengths, material.scattering_length) - scattering_length_gpu = ga.to_gpu(scattering_length) - - self.material_data.append(refractive_index_gpu) - self.material_data.append(absorption_length_gpu) - self.material_data.append(scattering_length_gpu) - - material_gpu = \ - make_gpu_struct(material_struct_size, - [refractive_index_gpu, absorption_length_gpu, - scattering_length_gpu, - np.uint32(len(wavelengths)), - np.float32(wavelength_step), - np.float32(wavelengths[0])]) - - self.material_ptrs.append(material_gpu) - - self.material_pointer_array = \ - make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs) - - self.surface_data = [] - self.surface_ptrs = [] - - for i in range(len(geometry.unique_surfaces)): - surface = geometry.unique_surfaces[i] - - if surface is None: - # need something to copy to the surface array struct - # that is the same size as a 64-bit pointer. - # this pointer will never be used by the simulation. 
- self.surface_ptrs.append(np.uint64(0)) - continue - - detect = interp_material_property(wavelengths, surface.detect) - detect_gpu = ga.to_gpu(detect) - absorb = interp_material_property(wavelengths, surface.absorb) - absorb_gpu = ga.to_gpu(absorb) - reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse) - reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse) - reflect_specular = interp_material_property(wavelengths, surface.reflect_specular) - reflect_specular_gpu = ga.to_gpu(reflect_specular) - - self.surface_data.append(detect_gpu) - self.surface_data.append(absorb_gpu) - self.surface_data.append(reflect_diffuse_gpu) - self.surface_data.append(reflect_specular_gpu) - - surface_gpu = \ - make_gpu_struct(surface_struct_size, - [detect_gpu, absorb_gpu, - reflect_diffuse_gpu, - reflect_specular_gpu, - np.uint32(len(wavelengths)), - np.float32(wavelength_step), - np.float32(wavelengths[0])]) - - self.surface_ptrs.append(surface_gpu) - - self.surface_pointer_array = \ - make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs) - - self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices)) - self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles)) - - material_codes = (((geometry.material1_index & 0xff) << 24) | - ((geometry.material2_index & 0xff) << 16) | - ((geometry.surface_index & 0xff) << 8)).astype(np.uint32) - - self.material_codes = ga.to_gpu(material_codes) - - self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds)) - self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds)) - self.colors = ga.to_gpu(geometry.colors.astype(np.uint32)) - self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32)) - self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32)) - self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32)) - - self.gpudata = make_gpu_struct(geometry_struct_size, - [self.vertices, self.triangles, - self.material_codes, - self.colors, self.lower_bounds, - self.upper_bounds, self.node_map, - self.node_map_end, - self.material_pointer_array, - self.surface_pointer_array, - np.uint32(geometry.start_node), - np.uint32(geometry.first_node)]) - - self.geometry = geometry - - if print_usage: - self.print_device_usage() - - def print_device_usage(self): - print 'device usage:' - print '-'*10 - print format_array('vertices', self.vertices) - print format_array('triangles', self.triangles) - print format_array('lower_bounds', self.lower_bounds) - print format_array('upper_bounds', self.upper_bounds) - print format_array('node_map', self.node_map) - print format_array('node_map_end', self.node_map_end) - print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes)) - print '-'*10 - free, total = cuda.mem_get_info() - print '%-15s %6s %6s' % ('device total', '', format_size(total)) - print '%-15s %6s %6s' % ('device used', '', format_size(total-free)) - print '%-15s %6s %6s' % ('device free', '', format_size(free)) - print - - def reset_colors(self): - self.colors.set_async(self.geometry.colors.astype(np.uint32)) - - def color_solids(self, solid_hit, colors, nblocks_per_thread=64, - max_blocks=1024): - solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool)) - solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32)) - - module = get_cu_module('mesh.h', options=cuda_options) - color_solids = module.get_function('color_solids') - - for first_triangle, triangles_this_round, 
blocks in \ - chunk_iterator(self.triangles.size, nblocks_per_thread, - max_blocks): - color_solids(np.int32(first_triangle), - np.int32(triangles_this_round), self.solid_id_map, - solid_hit_gpu, solid_colors_gpu, self.gpudata, - block=(nblocks_per_thread,1,1), - grid=(blocks,1)) - -class GPUDaq(object): - def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9): - self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32) - self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32) - self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) - self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu) - self.daq_pmt_rms = pmt_rms - self.solid_id_map_gpu = gpu_geometry.solid_id_map - - self.module = get_cu_module('daq.cu', options=cuda_options, - include_source_directory=False) - self.gpu_funcs = GPUFuncs(self.module) - - def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): - self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) - self.channel_q_gpu.fill(0) - self.channel_history_gpu.fill(0) - - n = len(gpuphotons.pos) - - for first_photon, photons_this_round, blocks in \ - chunk_iterator(n, nthreads_per_block, max_blocks): - self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1)) - - self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) - - return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu) - -class GPUPDF(object): - def __init__(self): - self.module = get_cu_module('daq.cu', options=cuda_options, - include_source_directory=False) - self.gpu_funcs = GPUFuncs(self.module) - - def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange): - """Setup GPU arrays to hold PDF information. 
- - max_pmt_id: int, largest PMT id # - tbins: number of time bins - trange: tuple of (min, max) time in PDF - qbins: number of charge bins - qrange: tuple of (min, max) charge in PDF - """ - self.events_in_histogram = 0 - self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32) - self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins), - dtype=np.uint32) - self.tbins = tbins - self.trange = trange - self.qbins = qbins - self.qrange = qrange - - def clear_pdf(self): - """Rezero the PDF counters.""" - self.hitcount_gpu.fill(0) - self.pdf_gpu.fill(0) - - def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64): - self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)), - gpuchannels.q, - gpuchannels.t, - self.hitcount_gpu, - np.int32(self.tbins), - np.float32(self.trange[0]), - np.float32(self.trange[1]), - np.int32(self.qbins), - np.float32(self.qrange[0]), - np.float32(self.qrange[1]), - self.pdf_gpu, - block=(nthreads_per_block,1,1), - grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) - - - self.events_in_histogram += 1 - - def get_pdfs(self): - """Returns the 1D hitcount array and the 3D [channel, time, charge] - histogram.""" - return self.hitcount_gpu.get(), self.pdf_gpu.get() - - def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth, - trange, min_qwidth, qrange, min_bin_content=10, - time_only=True): - """Setup GPU arrays to compute PDF values for the given event. - The pdf_eval calculation allows the PDF to be evaluated at a - single point for each channel as the Monte Carlo is run. The - effective bin size will be as small as (`min_twidth`, - `min_qwidth`) around the point of interest, but will be large - enough to ensure that `min_bin_content` Monte Carlo events - fall into the bin. - - event_hit: ndarray - Hit or not-hit status for each channel in the detector. - event_time: ndarray - Hit time for each channel in the detector. If channel - not hit, the time will be ignored. - event_charge: ndarray - Integrated charge for each channel in the detector. - If channel not hit, the charge will be ignored. - - min_twidth: float - Minimum bin size in the time dimension - trange: (float, float) - Range of time dimension in PDF - min_qwidth: float - Minimum bin size in charge dimension - qrange: (float, float) - Range of charge dimension in PDF - min_bin_content: int - The bin will be expanded to include at least this many events - time_only: bool - If True, only the time observable will be used in the PDF. - """ - self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) - self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) - self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) - - self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) - self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) - self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, - dtype=np.float32) - self.nearest_mc_gpu.fill(1e9) - - self.min_twidth = min_twidth - self.trange = trange - self.min_qwidth = min_qwidth - self.qrange = qrange - self.min_bin_content = min_bin_content - self.time_only = time_only - - def clear_pdf_eval(self): - "Reset PDF evaluation counters to start accumulating new Monte Carlo." - self.eval_hitcount_gpu.fill(0) - self.eval_bincount_gpu.fill(0) - self.nearest_mc_gpu.fill(1e9) - - def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64): - "Add the most recent results of run_daq() to the PDF evaluation." 
- self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only), - np.int32(len(self.event_hit_gpu)), - self.event_hit_gpu, - self.event_time_gpu, - self.event_charge_gpu, - gpuchannels.t, - gpuchannels.q, - self.eval_hitcount_gpu, - self.eval_bincount_gpu, - np.float32(self.min_twidth), - np.float32(self.trange[0]), - np.float32(self.trange[1]), - np.float32(self.min_qwidth), - np.float32(self.qrange[0]), - np.float32(self.qrange[1]), - self.nearest_mc_gpu, - np.int32(self.min_bin_content), - block=(nthreads_per_block,1,1), - grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) - - def get_pdf_eval(self): - evhit = self.event_hit_gpu.get().astype(bool) - hitcount = self.eval_hitcount_gpu.get() - bincount = self.eval_bincount_gpu.get() - - pdf_value = np.zeros(len(hitcount), dtype=float) - pdf_frac_uncert = np.zeros_like(pdf_value) - - # PDF value for high stats bins - high_stats = bincount >= self.min_bin_content - if high_stats.any(): - if self.time_only: - pdf_value[high_stats] = bincount[high_stats].astype(float) / hitcount[high_stats] / self.min_twidth - else: - assert Exception('Unimplemented 2D (time,charge) mode!') - - pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats]) - - # PDF value for low stats bins - low_stats = ~high_stats & (hitcount > 0) & evhit - - nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content)) - - # Deal with the case where we did not even get min_bin_content events - # in the PDF but also clamp the lower range to ensure we don't index - # by a negative number in 2 lines - last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) - distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry] - - if low_stats.any(): - if self.time_only: - pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0 - else: - assert Exception('Unimplemented 2D (time,charge) mode!') - - pdf_frac_uncert[low_stats] = 1.0/np.sqrt(last_valid_entry[low_stats] + 1) - - # PDFs with no stats got zero by default during array creation - - print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum() - - return hitcount, pdf_value, pdf_value * pdf_frac_uncert - -def create_cuda_context(device_id=None): - """Initialize and return a CUDA context on the specified device. 
- If device_id is None, the default device is used.""" - if device_id is None: - try: - context = pycuda.tools.make_default_context() - except cuda.LogicError: - # initialize cuda - cuda.init() - context = pycuda.tools.make_default_context() - else: - try: - device = cuda.Device(device_id) - except cuda.LogicError: - # initialize cuda - cuda.init() - device = cuda.Device(device_id) - context = device.make_context() - - context.set_cache_config(cuda.func_cache.PREFER_L1) - - return context diff --git a/chroma/gpu/__init__.py b/chroma/gpu/__init__.py new file mode 100644 index 0000000..8a87fc7 --- /dev/null +++ b/chroma/gpu/__init__.py @@ -0,0 +1,5 @@ +from chroma.gpu.tools import * +from chroma.gpu.geometry import * +from chroma.gpu.photon import * +from chroma.gpu.daq import * +from chroma.gpu.pdf import * diff --git a/chroma/gpu/daq.py b/chroma/gpu/daq.py new file mode 100644 index 0000000..82958dc --- /dev/null +++ b/chroma/gpu/daq.py @@ -0,0 +1,49 @@ +import numpy as np +from pycuda import gpuarray as ga + +from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \ + chunk_iterator +from chroma import event + +class GPUChannels(object): + def __init__(self, t, q, flags): + self.t = t + self.q = q + self.flags = flags + + def get(self): + t = self.t.get() + q = self.q.get().astype(np.float32) + + # For now, assume all channels with small + # enough hit time were hit. + return event.Channels(t<1e8, t, q, self.flags.get()) + +class GPUDaq(object): + def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9): + self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32) + self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32) + self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) + self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu) + self.daq_pmt_rms = pmt_rms + self.solid_id_map_gpu = gpu_geometry.solid_id_map + + self.module = get_cu_module('daq.cu', options=cuda_options, + include_source_directory=False) + self.gpu_funcs = GPUFuncs(self.module) + + def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): + self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) + self.channel_q_gpu.fill(0) + self.channel_history_gpu.fill(0) + + n = len(gpuphotons.pos) + + for first_photon, photons_this_round, blocks in \ + chunk_iterator(n, nthreads_per_block, max_blocks): + self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1)) + + self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) + + return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu) + diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py new file mode 100644 index 0000000..a223006 --- /dev/null +++ b/chroma/gpu/geometry.py @@ -0,0 +1,174 @@ +import numpy as np +import pycuda.driver as cuda +from pycuda import gpuarray as ga +from pycuda import 
characterize + +from chroma.geometry import standard_wavelengths +from chroma.gpu.tools import get_cu_module, get_cu_source, cuda_options, \ + chunk_iterator, format_array, format_size, to_uint3, to_float3, \ + make_gpu_struct + +class GPUGeometry(object): + def __init__(self, geometry, wavelengths=None, print_usage=False): + if wavelengths is None: + wavelengths = standard_wavelengths + + try: + wavelength_step = np.unique(np.diff(wavelengths)).item() + except ValueError: + raise ValueError('wavelengths must be equally spaced apart.') + + geometry_source = get_cu_source('geometry.h') + material_struct_size = characterize.sizeof('Material', geometry_source) + surface_struct_size = characterize.sizeof('Surface', geometry_source) + geometry_struct_size = characterize.sizeof('Geometry', geometry_source) + + self.material_data = [] + self.material_ptrs = [] + + def interp_material_property(wavelengths, property): + # note that it is essential that the material properties be + # interpolated linearly. this fact is used in the propagation + # code to guarantee that probabilities still sum to one. + return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32) + + for i in range(len(geometry.unique_materials)): + material = geometry.unique_materials[i] + + if material is None: + raise Exception('one or more triangles is missing a material.') + + refractive_index = interp_material_property(wavelengths, material.refractive_index) + refractive_index_gpu = ga.to_gpu(refractive_index) + absorption_length = interp_material_property(wavelengths, material.absorption_length) + absorption_length_gpu = ga.to_gpu(absorption_length) + scattering_length = interp_material_property(wavelengths, material.scattering_length) + scattering_length_gpu = ga.to_gpu(scattering_length) + + self.material_data.append(refractive_index_gpu) + self.material_data.append(absorption_length_gpu) + self.material_data.append(scattering_length_gpu) + + material_gpu = \ + make_gpu_struct(material_struct_size, + [refractive_index_gpu, absorption_length_gpu, + scattering_length_gpu, + np.uint32(len(wavelengths)), + np.float32(wavelength_step), + np.float32(wavelengths[0])]) + + self.material_ptrs.append(material_gpu) + + self.material_pointer_array = \ + make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs) + + self.surface_data = [] + self.surface_ptrs = [] + + for i in range(len(geometry.unique_surfaces)): + surface = geometry.unique_surfaces[i] + + if surface is None: + # need something to copy to the surface array struct + # that is the same size as a 64-bit pointer. + # this pointer will never be used by the simulation. 
+ self.surface_ptrs.append(np.uint64(0)) + continue + + detect = interp_material_property(wavelengths, surface.detect) + detect_gpu = ga.to_gpu(detect) + absorb = interp_material_property(wavelengths, surface.absorb) + absorb_gpu = ga.to_gpu(absorb) + reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse) + reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse) + reflect_specular = interp_material_property(wavelengths, surface.reflect_specular) + reflect_specular_gpu = ga.to_gpu(reflect_specular) + + self.surface_data.append(detect_gpu) + self.surface_data.append(absorb_gpu) + self.surface_data.append(reflect_diffuse_gpu) + self.surface_data.append(reflect_specular_gpu) + + surface_gpu = \ + make_gpu_struct(surface_struct_size, + [detect_gpu, absorb_gpu, + reflect_diffuse_gpu, + reflect_specular_gpu, + np.uint32(len(wavelengths)), + np.float32(wavelength_step), + np.float32(wavelengths[0])]) + + self.surface_ptrs.append(surface_gpu) + + self.surface_pointer_array = \ + make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs) + + self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices)) + self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles)) + + material_codes = (((geometry.material1_index & 0xff) << 24) | + ((geometry.material2_index & 0xff) << 16) | + ((geometry.surface_index & 0xff) << 8)).astype(np.uint32) + + self.material_codes = ga.to_gpu(material_codes) + + self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds)) + self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds)) + self.colors = ga.to_gpu(geometry.colors.astype(np.uint32)) + self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32)) + self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32)) + self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32)) + + self.gpudata = make_gpu_struct(geometry_struct_size, + [self.vertices, self.triangles, + self.material_codes, + self.colors, self.lower_bounds, + self.upper_bounds, self.node_map, + self.node_map_end, + self.material_pointer_array, + self.surface_pointer_array, + np.uint32(geometry.start_node), + np.uint32(geometry.first_node)]) + + self.geometry = geometry + + if print_usage: + self.print_device_usage() + + def print_device_usage(self): + print 'device usage:' + print '-'*10 + print format_array('vertices', self.vertices) + print format_array('triangles', self.triangles) + print format_array('lower_bounds', self.lower_bounds) + print format_array('upper_bounds', self.upper_bounds) + print format_array('node_map', self.node_map) + print format_array('node_map_end', self.node_map_end) + print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes)) + print '-'*10 + free, total = cuda.mem_get_info() + print '%-15s %6s %6s' % ('device total', '', format_size(total)) + print '%-15s %6s %6s' % ('device used', '', format_size(total-free)) + print '%-15s %6s %6s' % ('device free', '', format_size(free)) + print + + def reset_colors(self): + self.colors.set_async(self.geometry.colors.astype(np.uint32)) + + def color_solids(self, solid_hit, colors, nblocks_per_thread=64, + max_blocks=1024): + solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool)) + solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32)) + + module = get_cu_module('mesh.h', options=cuda_options) + color_solids = module.get_function('color_solids') + + for first_triangle, triangles_this_round, 
blocks in \ + chunk_iterator(self.triangles.size, nblocks_per_thread, + max_blocks): + color_solids(np.int32(first_triangle), + np.int32(triangles_this_round), self.solid_id_map, + solid_hit_gpu, solid_colors_gpu, self.gpudata, + block=(nblocks_per_thread,1,1), + grid=(blocks,1)) + diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py new file mode 100644 index 0000000..053d5f3 --- /dev/null +++ b/chroma/gpu/pdf.py @@ -0,0 +1,177 @@ +import numpy as np +from pycuda import gpuarray as ga + +from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs + +class GPUPDF(object): + def __init__(self): + self.module = get_cu_module('daq.cu', options=cuda_options, + include_source_directory=False) + self.gpu_funcs = GPUFuncs(self.module) + + def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange): + """Setup GPU arrays to hold PDF information. + + max_pmt_id: int, largest PMT id # + tbins: number of time bins + trange: tuple of (min, max) time in PDF + qbins: number of charge bins + qrange: tuple of (min, max) charge in PDF + """ + self.events_in_histogram = 0 + self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32) + self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins), + dtype=np.uint32) + self.tbins = tbins + self.trange = trange + self.qbins = qbins + self.qrange = qrange + + def clear_pdf(self): + """Rezero the PDF counters.""" + self.hitcount_gpu.fill(0) + self.pdf_gpu.fill(0) + + def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64): + self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)), + gpuchannels.q, + gpuchannels.t, + self.hitcount_gpu, + np.int32(self.tbins), + np.float32(self.trange[0]), + np.float32(self.trange[1]), + np.int32(self.qbins), + np.float32(self.qrange[0]), + np.float32(self.qrange[1]), + self.pdf_gpu, + block=(nthreads_per_block,1,1), + grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) + + + self.events_in_histogram += 1 + + def get_pdfs(self): + """Returns the 1D hitcount array and the 3D [channel, time, charge] + histogram.""" + return self.hitcount_gpu.get(), self.pdf_gpu.get() + + def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth, + trange, min_qwidth, qrange, min_bin_content=10, + time_only=True): + """Setup GPU arrays to compute PDF values for the given event. + The pdf_eval calculation allows the PDF to be evaluated at a + single point for each channel as the Monte Carlo is run. The + effective bin size will be as small as (`min_twidth`, + `min_qwidth`) around the point of interest, but will be large + enough to ensure that `min_bin_content` Monte Carlo events + fall into the bin. + + event_hit: ndarray + Hit or not-hit status for each channel in the detector. + event_time: ndarray + Hit time for each channel in the detector. If channel + not hit, the time will be ignored. + event_charge: ndarray + Integrated charge for each channel in the detector. + If channel not hit, the charge will be ignored. + + min_twidth: float + Minimum bin size in the time dimension + trange: (float, float) + Range of time dimension in PDF + min_qwidth: float + Minimum bin size in charge dimension + qrange: (float, float) + Range of charge dimension in PDF + min_bin_content: int + The bin will be expanded to include at least this many events + time_only: bool + If True, only the time observable will be used in the PDF. 
+ """ + self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) + self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) + self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) + + self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) + self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) + self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, + dtype=np.float32) + self.nearest_mc_gpu.fill(1e9) + + self.min_twidth = min_twidth + self.trange = trange + self.min_qwidth = min_qwidth + self.qrange = qrange + self.min_bin_content = min_bin_content + self.time_only = time_only + + def clear_pdf_eval(self): + "Reset PDF evaluation counters to start accumulating new Monte Carlo." + self.eval_hitcount_gpu.fill(0) + self.eval_bincount_gpu.fill(0) + self.nearest_mc_gpu.fill(1e9) + + def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64): + "Add the most recent results of run_daq() to the PDF evaluation." + self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only), + np.int32(len(self.event_hit_gpu)), + self.event_hit_gpu, + self.event_time_gpu, + self.event_charge_gpu, + gpuchannels.t, + gpuchannels.q, + self.eval_hitcount_gpu, + self.eval_bincount_gpu, + np.float32(self.min_twidth), + np.float32(self.trange[0]), + np.float32(self.trange[1]), + np.float32(self.min_qwidth), + np.float32(self.qrange[0]), + np.float32(self.qrange[1]), + self.nearest_mc_gpu, + np.int32(self.min_bin_content), + block=(nthreads_per_block,1,1), + grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) + + def get_pdf_eval(self): + evhit = self.event_hit_gpu.get().astype(bool) + hitcount = self.eval_hitcount_gpu.get() + bincount = self.eval_bincount_gpu.get() + + pdf_value = np.zeros(len(hitcount), dtype=float) + pdf_frac_uncert = np.zeros_like(pdf_value) + + # PDF value for high stats bins + high_stats = bincount >= self.min_bin_content + if high_stats.any(): + if self.time_only: + pdf_value[high_stats] = bincount[high_stats].astype(float) / hitcount[high_stats] / self.min_twidth + else: + assert Exception('Unimplemented 2D (time,charge) mode!') + + pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats]) + + # PDF value for low stats bins + low_stats = ~high_stats & (hitcount > 0) & evhit + + nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content)) + + # Deal with the case where we did not even get min_bin_content events + # in the PDF but also clamp the lower range to ensure we don't index + # by a negative number in 2 lines + last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) + distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry] + + if low_stats.any(): + if self.time_only: + pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0 + else: + assert Exception('Unimplemented 2D (time,charge) mode!') + + pdf_frac_uncert[low_stats] = 1.0/np.sqrt(last_valid_entry[low_stats] + 1) + + # PDFs with no stats got zero by default during array creation + + print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum() + + return hitcount, pdf_value, pdf_value * pdf_frac_uncert diff --git a/chroma/gpu/photon.py b/chroma/gpu/photon.py new file mode 100644 index 0000000..2a9a1ff --- /dev/null +++ b/chroma/gpu/photon.py @@ -0,0 +1,175 @@ +import numpy as np +import sys +import gc +from pycuda import gpuarray as ga + +from chroma.tools import profile_if_possible +from chroma import event +from chroma.gpu.tools 
import get_cu_module, cuda_options, GPUFuncs, \ + chunk_iterator, to_float3 + + +class GPUPhotons(object): + def __init__(self, photons, ncopies=1): + """Load ``photons`` onto the GPU, replicating as requested. + + Args: + - photons: chroma.Event.Photons + Photon state information to load onto GPU + - ncopies: int, *optional* + Number of times to replicate the photons + on the GPU. This is used if you want + to propagate the same event many times, + for example in a likelihood calculation. + + The amount of GPU storage will be proportionally + larger if ncopies > 1, so be careful. + """ + nphotons = len(photons) + self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) + self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) + self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) + self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32) + self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32) + self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32) + self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32) + + # Assign the provided photons to the beginning (possibly + # the entire array if ncopies is 1 + self.pos[:nphotons].set(to_float3(photons.pos)) + self.dir[:nphotons].set(to_float3(photons.dir)) + self.pol[:nphotons].set(to_float3(photons.pol)) + self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32)) + self.t[:nphotons].set(photons.t.astype(np.float32)) + self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32)) + self.flags[:nphotons].set(photons.flags.astype(np.uint32)) + + module = get_cu_module('propagate.cu', options=cuda_options) + self.gpu_funcs = GPUFuncs(module) + + # Replicate the photons to the rest of the slots if needed + if ncopies > 1: + max_blocks = 1024 + nthreads_per_block = 64 + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round), + self.pos, self.dir, self.wavelengths, self.pol, self.t, + self.flags, self.last_hit_triangles, + np.int32(ncopies-1), + np.int32(nphotons), + block=(nthreads_per_block,1,1), grid=(blocks, 1)) + + + # Save the duplication information for the iterate_copies() method + self.true_nphotons = nphotons + self.ncopies = ncopies + + def get(self): + pos = self.pos.get().view(np.float32).reshape((len(self.pos),3)) + dir = self.dir.get().view(np.float32).reshape((len(self.dir),3)) + pol = self.pol.get().view(np.float32).reshape((len(self.pol),3)) + wavelengths = self.wavelengths.get() + t = self.t.get() + last_hit_triangles = self.last_hit_triangles.get() + flags = self.flags.get() + return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) + + def iterate_copies(self): + '''Returns an iterator that yields GPUPhotonsSlice objects + corresponding to the event copies stored in ``self``.''' + for i in xrange(self.ncopies): + window = slice(self.true_nphotons*i, self.true_nphotons*(i+1)) + yield GPUPhotonsSlice(pos=self.pos[window], + dir=self.dir[window], + pol=self.pol[window], + wavelengths=self.wavelengths[window], + t=self.t[window], + last_hit_triangles=self.last_hit_triangles[window], + flags=self.flags[window]) + + @profile_if_possible + def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, + max_blocks=1024, max_steps=10): + """Propagate photons on GPU to termination or max_steps, whichever + comes first. 
+ + May be called repeatedly without reloading photon information if + single-stepping through photon history. + + ..warning:: + `rng_states` must have at least `nthreads_per_block`*`max_blocks` + number of curandStates. + """ + nphotons = self.pos.size + step = 0 + input_queue = np.empty(shape=nphotons+1, dtype=np.uint32) + input_queue[0] = 0 + # Order photons initially in the queue to put the clones next to each other + for copy in xrange(self.ncopies): + input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons + input_queue_gpu = ga.to_gpu(input_queue) + output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) + output_queue[0] = 1 + output_queue_gpu = ga.to_gpu(output_queue) + + while step < max_steps: + # Just finish the rest of the steps if the # of photons is low + if nphotons < nthreads_per_block * 16 * 8: + nsteps = max_steps - step + else: + nsteps = 1 + + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1)) + + step += nsteps + + if step < max_steps: + temp = input_queue_gpu + input_queue_gpu = output_queue_gpu + output_queue_gpu = temp + output_queue_gpu[:1].set(np.uint32(1)) + nphotons = input_queue_gpu[:1].get()[0] - 1 + + if ga.max(self.flags).get() & (1 << 31): + print >>sys.stderr, "WARNING: ABORTED PHOTONS" + + def __del__(self): + del self.pos + del self.dir + del self.pol + del self.wavelengths + del self.t + del self.flags + del self.last_hit_triangles + # Free up GPU memory quickly if now available + gc.collect() + +class GPUPhotonsSlice(GPUPhotons): + '''A `slice`-like view of a subrange of another GPU photons array. + Works exactly like an instance of GPUPhotons, but the GPU storage + is taken from another GPUPhotons instance. + + Returned by the GPUPhotons.iterate_copies() iterator.''' + def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles, + flags): + '''Create new object using slices of GPUArrays from an instance + of GPUPhotons. 
NOTE THESE ARE NOT CPU ARRAYS!''' + self.pos = pos + self.dir = dir + self.pol = pol + self.wavelengths = wavelengths + self.t = t + self.last_hit_triangles = last_hit_triangles + self.flags = flags + + module = get_cu_module('propagate.cu', options=cuda_options) + self.gpu_funcs = GPUFuncs(module) + + self.true_nphotons = len(pos) + self.ncopies = 1 + + def __del__(self): + pass # Do nothing, because we don't own any of our GPU memory diff --git a/chroma/gpu/render.py b/chroma/gpu/render.py new file mode 100644 index 0000000..e645da4 --- /dev/null +++ b/chroma/gpu/render.py @@ -0,0 +1,66 @@ +import numpy as np +from pycuda import gpuarray as ga + +from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \ + to_float3 + +class GPURays(object): + """The GPURays class holds arrays of ray positions and directions + on the GPU that are used to render a geometry.""" + def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64): + self.pos = ga.to_gpu(to_float3(pos)) + self.dir = ga.to_gpu(to_float3(dir)) + + self.max_alpha_depth = max_alpha_depth + + self.nblocks = nblocks + + transform_module = get_cu_module('transform.cu', options=cuda_options) + self.transform_funcs = GPUFuncs(transform_module) + + render_module = get_cu_module('render.cu', options=cuda_options) + self.render_funcs = GPUFuncs(render_module) + + self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32) + self.color = ga.empty(self.dx.size, dtype=ga.vec.float4) + self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32) + + def rotate(self, phi, n): + "Rotate by an angle phi around the axis `n`." + self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + + def rotate_around_point(self, phi, n, point): + """"Rotate by an angle phi around the axis `n` passing through + the point `point`.""" + self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + + def translate(self, v): + "Translate the ray positions by the vector `v`." + self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + + def render(self, gpu_geometry, pixels, alpha_depth=10, + keep_last_render=False): + """Render `gpu_geometry` and fill the GPU array `pixels` with pixel + colors.""" + if not keep_last_render: + self.dxlen.fill(0) + + if alpha_depth > self.max_alpha_depth: + raise Exception('alpha_depth > max_alpha_depth') + + if not isinstance(pixels, ga.GPUArray): + raise TypeError('`pixels` must be a %s instance.' 
% ga.GPUArray) + + if pixels.size != self.pos.size: + raise ValueError('`pixels`.size != number of rays') + + self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + + def snapshot(self, gpu_geometry, alpha_depth=10): + "Render `gpu_geometry` and return a numpy array of pixel colors." + pixels = ga.empty(self.pos.size, dtype=np.uint32) + self.render(gpu_geometry, pixels, alpha_depth) + return pixels.get() + diff --git a/chroma/gpu/tools.py b/chroma/gpu/tools.py new file mode 100644 index 0000000..a11a561 --- /dev/null +++ b/chroma/gpu/tools.py @@ -0,0 +1,180 @@ +import numpy as np +import pycuda.tools +from pycuda import characterize +import pycuda.driver as cuda +from pycuda import gpuarray as ga + +from chroma.cuda import srcdir + +cuda.init() + +# standard nvcc options +cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v'] + +@pycuda.tools.context_dependent_memoize +def get_cu_module(name, options=None, include_source_directory=True): + """Returns a pycuda.compiler.SourceModule object from a CUDA source file + located in the chroma cuda directory at cuda/[name].""" + if options is None: + options = [] + elif isinstance(options, tuple): + options = list(options) + else: + raise TypeError('`options` must be a tuple.') + + if include_source_directory: + options += ['-I' + srcdir] + + with open('%s/%s' % (srcdir, name)) as f: + source = f.read() + + return pycuda.compiler.SourceModule(source, options=options, + no_extern_c=True) + +@pycuda.tools.memoize +def get_cu_source(name): + """Get the source code for a CUDA source file located in the chroma cuda + directory at src/[name].""" + with open('%s/%s' % (srcdir, name)) as f: + source = f.read() + return source + +class GPUFuncs(object): + """Simple container class for GPU functions as attributes.""" + def __init__(self, module): + self.module = module + self.funcs = {} + + def __getattr__(self, name): + try: + return self.funcs[name] + except KeyError: + f = self.module.get_function(name) + self.funcs[name] = f + return f + +init_rng_src = """ +#include <curand_kernel.h> + +extern "C" +{ + +__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + curand_init(seed, id, offset, &s[id]); +} + +} // extern "C" +""" + +def get_rng_states(size, seed=1): + "Return `size` number of CUDA random number generator states." + rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) + + module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True) + init_rng = module.get_function('init_rng') + + init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1)) + + return rng_states + +def to_float3(arr): + "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array." + if not arr.flags['C_CONTIGUOUS']: + arr = np.asarray(arr, order='c') + return arr.astype(np.float32).view(ga.vec.float3)[:,0] + +def to_uint3(arr): + "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array." 
+ if not arr.flags['C_CONTIGUOUS']: + arr = np.asarray(arr, order='c') + return arr.astype(np.uint32).view(ga.vec.uint3)[:,0] + +def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024): + """Iterator that yields tuples with the values requried to process + a long array in multiple kernel passes on the GPU. + + Each yielded value is of the form, + (first_index, elements_this_iteration, nblocks_this_iteration) + + Example: + >>> list(chunk_iterator(300, 32, 2)) + [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] + """ + first = 0 + while first < nelements: + elements_left = nelements - first + blocks = int(elements_left // nthreads_per_block) + if elements_left % nthreads_per_block != 0: + blocks += 1 # Round up only if needed + blocks = min(max_blocks, blocks) + elements_this_round = min(elements_left, blocks * nthreads_per_block) + + yield (first, elements_this_round, blocks) + first += elements_this_round + +def create_cuda_context(device_id=None): + """Initialize and return a CUDA context on the specified device. + If device_id is None, the default device is used.""" + if device_id is None: + try: + context = pycuda.tools.make_default_context() + except cuda.LogicError: + # initialize cuda + cuda.init() + context = pycuda.tools.make_default_context() + else: + try: + device = cuda.Device(device_id) + except cuda.LogicError: + # initialize cuda + cuda.init() + device = cuda.Device(device_id) + context = device.make_context() + + context.set_cache_config(cuda.func_cache.PREFER_L1) + + return context + +def make_gpu_struct(size, members): + struct = cuda.mem_alloc(size) + + i = 0 + for member in members: + if isinstance(member, ga.GPUArray): + member = member.gpudata + + if isinstance(member, cuda.DeviceAllocation): + if i % 8: + raise Exception('cannot align 64-bit pointer. ' + 'arrange struct member variables in order of ' + 'decreasing size.') + + cuda.memcpy_htod(int(struct)+i, np.intp(int(member))) + i += 8 + elif np.isscalar(member): + cuda.memcpy_htod(int(struct)+i, member) + i += member.nbytes + else: + raise TypeError('expected a GPU device pointer or scalar type.') + + return struct + +def format_size(size): + if size < 1e3: + return '%.1f%s' % (size, ' ') + elif size < 1e6: + return '%.1f%s' % (size/1e3, 'K') + elif size < 1e9: + return '%.1f%s' % (size/1e6, 'M') + else: + return '%.1f%s' % (size/1e9, 'G') + +def format_array(name, array): + return '%-15s %6s %6s' % \ + (name, format_size(len(array)), format_size(array.nbytes)) |
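Taken together, the split preserves the original workflow: tools.py owns context/RNG setup and the GPU-struct helpers, geometry.py and photon.py hold the device-side state, and daq.py/pdf.py consume the propagated photons. A minimal end-to-end sketch follows, assuming a CUDA-capable device plus an already-built chroma Geometry (geometry), a chroma.event.Photons instance (photons), and a detector channel count (max_pmt_id); those three names are placeholders for illustration, not defined in this commit.

    from chroma import gpu

    context = gpu.create_cuda_context()          # chroma/gpu/tools.py
    rng_states = gpu.get_rng_states(64*1024)     # >= nthreads_per_block*max_blocks curand states

    gpu_geometry = gpu.GPUGeometry(geometry)     # chroma/gpu/geometry.py (geometry is a placeholder)
    gpu_photons = gpu.GPUPhotons(photons)        # chroma/gpu/photon.py (photons is a placeholder)
    gpu_photons.propagate(gpu_geometry, rng_states,
                          nthreads_per_block=64, max_blocks=1024, max_steps=10)

    daq = gpu.GPUDaq(gpu_geometry, max_pmt_id)   # chroma/gpu/daq.py (max_pmt_id is a placeholder)
    channels = daq.acquire(gpu_photons, rng_states).get()   # -> chroma.event.Channels

    context.pop()                                # release the pycuda context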