1 files changed, 273 insertions, 140 deletions
diff --git a/gpu.py b/gpu.py
index 721ea67..de99e0a 100644
--- a/gpu.py
+++ b/gpu.py
@@ -8,7 +8,7 @@ import sys
 import pytools
 import pycuda.tools
 from pycuda.compiler import SourceModule
-import pycuda.characterize
+from pycuda import characterize
 import pycuda.driver as cuda
 from pycuda import gpuarray as ga
 
@@ -20,24 +20,37 @@ from chroma import event
 
 cuda.init()
 
-#@pycuda.tools.context_dependent_memoize
+# standard nvcc options
+cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v']
+
+@pycuda.tools.context_dependent_memoize
 def get_cu_module(name, options=None, include_source_directory=True):
     """Returns a pycuda.compiler.SourceModule object from a CUDA source file
     located in the chroma src directory at src/[name].cu."""
     if options is None:
         options = []
+    elif isinstance(options, tuple):
+        options = list(options)
+    else:
+        raise TypeError('`options` must be a tuple.')
 
     srcdir = dirname(chroma.src.__file__)
 
     if include_source_directory:
         options += ['-I' + srcdir]
 
-    with open('%s/%s.cu' % (srcdir, name)) as f:
+    with open('%s/%s' % (srcdir, name)) as f:
         source = f.read()
 
     return pycuda.compiler.SourceModule(source, options=options,
                                         no_extern_c=True)
 
+def get_cu_source(name):
+    srcdir = dirname(chroma.src.__file__)
+    with open('%s/%s' % (srcdir, name)) as f:
+        source = f.read()
+    return source
+
 class GPUFuncs(object):
     """Simple container class for GPU functions as attributes."""
     def __init__(self, module):
@@ -73,7 +86,7 @@ __global__ void init_rng(int nthreads, curandState *s, unsigned long long seed,
 
 def get_rng_states(size, seed=1):
     "Return `size` number of CUDA random number generator states."
-    rng_states = cuda.mem_alloc(size*pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+    rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
 
     module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
     init_rng = module.get_function('init_rng')
@@ -88,6 +101,12 @@ def to_float3(arr):
         arr = np.asarray(arr, order='c')
     return arr.astype(np.float32).view(ga.vec.float3)[:,0]
 
+def to_uint3(arr):
+    "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array."
+    if not arr.flags['C_CONTIGUOUS']:
+        arr = np.asarray(arr, order='c')
+    return arr.astype(np.uint32).view(ga.vec.uint3)[:,0]
+
 def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
     """Iterator that yields tuples with the values requried to process
     a long array in multiple kernel passes on the GPU.
@@ -121,6 +140,11 @@ class GPUPhotons(object):
         self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32))
         self.flags = ga.to_gpu(photons.flags.astype(np.uint32))
 
+        #cuda_options = ('--use_fast_math', '-w')#, '--ptxas-options=-v']
+
+        module = get_cu_module('propagate.cu', options=cuda_options)
+        self.gpu_funcs = GPUFuncs(module)
+
     def get(self):
         pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
         dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
@@ -131,6 +155,50 @@ class GPUPhotons(object):
         flags = self.flags.get()
         return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
 
+    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
+                  max_blocks=1024, max_steps=10):
+        """Propagate photons on GPU to termination or max_steps, whichever
+        comes first.
+
+        May be called repeatedly without reloading photon information if
+        single-stepping through photon history.
+
+        ..warning::
+            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
+            number of curandStates.
+        """
+        nphotons = self.pos.size
+        step = 0
+        input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+        input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
+        input_queue_gpu = ga.to_gpu(input_queue)
+        output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+        output_queue[0] = 1
+        output_queue_gpu = ga.to_gpu(output_queue)
+
+        while step < max_steps:
+            # Just finish the rest of the steps if the # of photons is low
+            if nphotons < nthreads_per_block * 16 * 8:
+                nsteps = max_steps - step
+            else:
+                nsteps = 1
+
+            for first_photon, photons_this_round, blocks in \
+                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+                self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+            step += nsteps
+
+            if step < max_steps:
+                temp = input_queue_gpu
+                input_queue_gpu = output_queue_gpu
+                output_queue_gpu = temp
+                output_queue_gpu[:1].set(np.uint32(1))
+                nphotons = input_queue_gpu[:1].get()[0] - 1
+
+        if ga.max(self.flags).get() & (1 << 31):
+            print >>sys.stderr, "WARNING: ABORTED PHOTONS"
+
 class GPUChannels(object):
     def __init__(self, t, q, flags):
         self.t = t
@@ -145,71 +213,89 @@ class GPUChannels(object):
         # enough hit time were hit.
         return event.Channels(t<1e8, t, q, self.flags.get())
 
-def propagate(gpu, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10):
-    """Propagate photons on GPU to termination or max_steps, whichever
-    comes first.
-
-    May be called repeatedly without reloading photon information if
-    single-stepping through photon history.
-
-    ..warning::
-        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
-        number of curandStates.
-    """
-    nphotons = gpuphotons.pos.size
-    step = 0
-    input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
-    input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
-    input_queue_gpu = ga.to_gpu(input_queue)
-    output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
-    output_queue[0] = 1
-    output_queue_gpu = ga.to_gpu(output_queue)
-
-    propagate = gpu.module.get_function('propagate')
-
-    while step < max_steps:
-        # Just finish the rest of the steps if the # of photons is low
-        if nphotons < nthreads_per_block * 16 * 8:
-            nsteps = max_steps - step
-        else:
-            nsteps = 1
-
-        for first_photon, photons_this_round, blocks in \
-                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
-            propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, gpuphotons.pos, gpuphotons.dir, gpuphotons.wavelengths, gpuphotons.pol, gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, np.int32(nsteps), block=(nthreads_per_block,1,1), grid=(blocks, 1))
-
-        step += nsteps
-
-        if step < max_steps:
-            temp = input_queue_gpu
-            input_queue_gpu = output_queue_gpu
-            output_queue_gpu = temp
-            output_queue_gpu[:1].set(np.uint32(1))
-            nphotons = input_queue_gpu[:1].get()[0] - 1
-
-    if ga.max(gpuphotons.flags).get() & (1 << 31):
-        print >>sys.stderr, "WARNING: ABORTED PHOTONS"
-
 class GPURays(object):
-    def __init__(self, pos, dir, nblocks=64):
+    """The GPURays class holds arrays of ray positions and directions
+    on the GPU that are used to render a geometry."""
+    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
         self.pos = ga.to_gpu(to_float3(pos))
         self.dir = ga.to_gpu(to_float3(dir))
 
+        self.max_alpha_depth = max_alpha_depth
+
         self.nblocks = nblocks
 
-        self.module = get_cu_module('transform')
-        self.gpu_funcs = GPUFuncs(self.module)
+        transform_module = get_cu_module('transform.cu', options=cuda_options)
+        self.transform_funcs = GPUFuncs(transform_module)
+
+        render_module = get_cu_module('render.cu', options=cuda_options)
+        self.render_funcs = GPUFuncs(render_module)
+
+        self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
+        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
+        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
 
     def rotate(self, phi, n):
-        self.gpu_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
-        self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+        "Rotate by an angle phi around the axis `n`."
+        self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
 
     def rotate_around_point(self, phi, n, point):
-        self.gpu_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
-        self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+        """"Rotate by an angle phi around the axis `n` passing through
+        the point `point`."""
+        self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
 
     def translate(self, v):
-        self.gpu_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+        "Translate the ray positions by the vector `v`."
+        self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+
+    def render(self, gpu_geometry, pixels, alpha_depth=10,
+               keep_last_render=False):
+        """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
+        colors."""
+        if not keep_last_render:
+            self.dxlen.fill(0)
+
+        if alpha_depth > self.max_alpha_depth:
+            raise Exception('alpha_depth > max_alpha_depth')
+
+        if not isinstance(pixels, ga.GPUArray):
+            raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)
+
+        if pixels.size != self.pos.size:
+            raise ValueError('`pixels`.size != number of rays')
+
+        self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+
+    def snapshot(self, gpu_geometry, alpha_depth=10):
+        "Render `gpu_geometry` and return a numpy array of pixel colors."
+        pixels = ga.empty(self.pos.size, dtype=np.uint32)
+        self.render(gpu_geometry, pixels, alpha_depth)
+        return pixels.get()
+
+def make_gpu_struct(size, members):
+    struct = cuda.mem_alloc(size)
+
+    i = 0
+    for member in members:
+        if isinstance(member, ga.GPUArray):
+            member = member.gpudata
+
+        if isinstance(member, cuda.DeviceAllocation):
+            if i % 8:
+                raise Exception('cannot align 64-bit pointer. '
+                                'arrange struct member variables in order of '
+                                'decreasing size.')
+
+            cuda.memcpy_htod(int(struct)+i, np.intp(int(member)))
+            i += 8
+        elif np.isscalar(member):
+            cuda.memcpy_htod(int(struct)+i, member)
+            i += member.nbytes
+        else:
+            raise TypeError('expected a GPU device pointer or scalar type.')
+
+    return struct
 
 def format_size(size):
     if size < 1e3:
@@ -226,96 +312,142 @@ def format_array(name, array):
         (name, format_size(len(array)), format_size(array.nbytes))
 
 class GPUGeometry(object):
-    def __init__(self, gpu, geometry, load=True, activate=True, print_usage=True):
-        self.geometry = geometry
+    def __init__(self, geometry, wavelengths=None, print_usage=True):
+        if wavelengths is None:
+            wavelengths = standard_wavelengths
+
+        try:
+            wavelength_step = np.unique(np.diff(wavelengths)).item()
+        except ValueError:
+            raise ValueError('wavelengths must be equally spaced apart.')
 
-        self.module = gpu.module
-        self.gpu_funcs = GPUFuncs(gpu.module)
+        geometry_source = get_cu_source('geometry.h')
+        material_struct_size = characterize.sizeof('Material', geometry_source)
+        surface_struct_size = characterize.sizeof('Surface', geometry_source)
+        geometry_struct_size = characterize.sizeof('Geometry', geometry_source)
 
-        if load:
-            self.load(activate, print_usage)
+        self.material_data = []
+        self.material_ptrs = []
 
-    def load(self, activate=True, print_usage=True):
-        self.gpu_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1))
+        def interp_material_property(wavelengths, property):
+            # note that it is essential that the material properties be
+            # interpolated linearly. this fact is used in the propagation
+            # code to guarantee that probabilities still sum to one.
+            return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)
 
-        self.materials = []
-        for i in range(len(self.geometry.unique_materials)):
-            material = copy(self.geometry.unique_materials[i])
+        for i in range(len(geometry.unique_materials)):
+            material = geometry.unique_materials[i]
 
             if material is None:
                 raise Exception('one or more triangles is missing a material.')
 
-            material.refractive_index_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32))
-            material.absorption_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32))
-            material.scattering_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32))
+            refractive_index = interp_material_property(wavelengths, material.refractive_index)
+            refractive_index_gpu = ga.to_gpu(refractive_index)
+            absorption_length = interp_material_property(wavelengths, material.absorption_length)
+            absorption_length_gpu = ga.to_gpu(absorption_length)
+            scattering_length = interp_material_property(wavelengths, material.scattering_length)
+            scattering_length_gpu = ga.to_gpu(scattering_length)
 
-            self.gpu_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1))
+            self.material_data.append(refractive_index_gpu)
+            self.material_data.append(absorption_length_gpu)
+            self.material_data.append(scattering_length_gpu)
 
-            self.materials.append(material)
+            material_gpu = \
+                make_gpu_struct(material_struct_size,
+                                [refractive_index_gpu, absorption_length_gpu,
+                                 scattering_length_gpu,
+                                 np.uint32(len(wavelengths)),
+                                 np.float32(wavelength_step),
+                                 np.float32(wavelengths[0])])
 
-        self.surfaces = []
-        for i in range(len(self.geometry.unique_surfaces)):
-            surface = copy(self.geometry.unique_surfaces[i])
-
-            if surface is None:
-                continue
+            self.material_ptrs.append(material_gpu)
 
-            surface.detect_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32))
-            surface.absorb_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32))
-            surface.reflect_diffuse_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32))
-            surface.reflect_specular_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32))
+        self.material_pointer_array = \
+            make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)
 
-            self.gpu_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1))
+        self.surface_data = []
+        self.surface_ptrs = []
 
-            self.surfaces.append(surface)
+        for i in range(len(geometry.unique_surfaces)):
+            surface = geometry.unique_surfaces[i]
 
-        self.vertices_gpu = ga.to_gpu(to_float3(self.geometry.mesh.vertices))
-
-        triangles = \
-            np.empty(len(self.geometry.mesh.triangles), dtype=ga.vec.uint4)
-        triangles['x'] = self.geometry.mesh.triangles[:,0]
-        triangles['y'] = self.geometry.mesh.triangles[:,1]
-        triangles['z'] = self.geometry.mesh.triangles[:,2]
-        triangles['w'] = ((self.geometry.material1_index & 0xff) << 24) | ((self.geometry.material2_index & 0xff) << 16) | ((self.geometry.surface_index & 0xff) << 8)
-        self.triangles_gpu = ga.to_gpu(triangles)
-
-        self.lower_bounds_gpu = ga.to_gpu(to_float3(self.geometry.lower_bounds))
-
-        self.upper_bounds_gpu = ga.to_gpu(to_float3(self.geometry.upper_bounds))
+            if surface is None:
+                # need something to copy to the surface array struct
+                # that is the same size as a 64-bit pointer.
+                # this pointer will never be used by the simulation.
+                self.surface_ptrs.append(np.uint64(0))
+                continue
 
-        self.colors_gpu = ga.to_gpu(self.geometry.colors.astype(np.uint32))
-        self.node_map_gpu = ga.to_gpu(self.geometry.node_map.astype(np.uint32))
-        self.node_map_end_gpu = ga.to_gpu(self.geometry.node_map_end.astype(np.uint32))
-        self.solid_id_map_gpu = ga.to_gpu(self.geometry.solid_id.astype(np.uint32))
+            detect = interp_material_property(wavelengths, surface.detect)
+            detect_gpu = ga.to_gpu(detect)
+            absorb = interp_material_property(wavelengths, surface.absorb)
+            absorb_gpu = ga.to_gpu(absorb)
+            reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
+            reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
+            reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
+            reflect_specular_gpu = ga.to_gpu(reflect_specular)
+
+            self.surface_data.append(detect_gpu)
+            self.surface_data.append(absorb_gpu)
+            self.surface_data.append(reflect_diffuse_gpu)
+            self.surface_data.append(reflect_specular_gpu)
+
+            surface_gpu = \
+                make_gpu_struct(surface_struct_size,
+                                [detect_gpu, absorb_gpu,
+                                 reflect_diffuse_gpu,
+                                 reflect_specular_gpu,
+                                 np.uint32(len(wavelengths)),
+                                 np.float32(wavelength_step),
+                                 np.float32(wavelengths[0])])
+
+            self.surface_ptrs.append(surface_gpu)
+
+        self.surface_pointer_array = \
+            make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)
+
+        self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices))
+        self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles))
+
+        material_codes = (((geometry.material1_index & 0xff) << 24) |
+                          ((geometry.material2_index & 0xff) << 16) |
+                          ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
+
+        self.material_codes = ga.to_gpu(material_codes)
+
+        self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds))
+        self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds))
+        self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
+        self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32))
+        self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32))
+        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
+
+        self.gpudata = make_gpu_struct(geometry_struct_size,
+                                       [self.vertices, self.triangles,
+                                        self.material_codes,
+                                        self.colors, self.lower_bounds,
+                                        self.upper_bounds, self.node_map,
+                                        self.node_map_end,
+                                        self.material_pointer_array,
+                                        self.surface_pointer_array,
+                                        np.uint32(geometry.start_node),
+                                        np.uint32(geometry.first_node)])
 
-        self.node_map_tex = self.module.get_texref('node_map')
-        self.node_map_end_tex = self.module.get_texref('node_map_end')
+        self.geometry = geometry
 
         if print_usage:
             self.print_device_usage()
 
-        if activate:
-            self.activate()
-
-    def activate(self):
-        self.gpu_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(self.geometry.node_map.size-1), np.uint32(self.geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1))
-
-        self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes)
-        self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes)
-
-        self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
-        self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
-
     def print_device_usage(self):
         print 'device usage:'
         print '-'*10
-        print format_array('vertices', self.vertices_gpu)
-        print format_array('triangles', self.triangles_gpu)
-        print format_array('lower_bounds', self.lower_bounds_gpu)
-        print format_array('upper_bounds', self.upper_bounds_gpu)
-        print format_array('node_map', self.node_map_gpu)
-        print format_array('node_map_end', self.node_map_end_gpu)
-        print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
+        print format_array('vertices', self.vertices)
+        print format_array('triangles', self.triangles)
+        print format_array('lower_bounds', self.lower_bounds)
+        print format_array('upper_bounds', self.upper_bounds)
+        print format_array('node_map', self.node_map)
+        print format_array('node_map_end', self.node_map_end)
+        print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes))
         print '-'*10
         free, total = cuda.mem_get_info()
         print '%-15s %6s %6s' % ('device total', '', format_size(total))
@@ -324,21 +456,24 @@ class GPUGeometry(object):
         print
 
     def reset_colors(self):
-        self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32))
+        self.colors.set_async(self.geometry.colors.astype(np.uint32))
 
-    def color_solids(self, solid_hit, colors):
+    def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
+                     max_blocks=1024):
         solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
         solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))
 
+        module = get_cu_module('mesh.h', options=cuda_options)
+        color_solids = module.get_function('color_solids')
+
         for first_triangle, triangles_this_round, blocks in \
-                chunk_iterator(self.triangles_gpu.size):
-            self.gpu_funcs.color_solids(np.int32(first_triangle),
-                                        np.int32(triangles_this_round),
-                                        self.solid_id_map_gpu, 
-                                        solid_hit_gpu, 
-                                        solid_colors_gpu, 
-                                        block=(64,1,1), 
-                                        grid=(blocks,1))
+                chunk_iterator(self.triangles.size, nblocks_per_thread,
+                               max_blocks):
+            color_solids(np.int32(first_triangle),
+                         np.int32(triangles_this_round), self.solid_id_map,
+                         solid_hit_gpu, solid_colors_gpu, self.gpudata,
+                         block=(nblocks_per_thread,1,1), 
+                         grid=(blocks,1))
 
 class GPUDaq(object):
     def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9):
@@ -347,9 +482,10 @@ class GPUDaq(object):
         self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.daq_pmt_rms = pmt_rms
-        self.solid_id_map_gpu = gpu_geometry.solid_id_map_gpu
+        self.solid_id_map_gpu = gpu_geometry.solid_id_map
 
-        self.module = get_cu_module('daq', include_source_directory=False)
+        self.module = get_cu_module('daq.cu', options=cuda_options, 
+                                    include_source_directory=False)
         self.gpu_funcs = GPUFuncs(self.module)
 
     def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
@@ -369,7 +505,8 @@ class GPUDaq(object):
 
 class GPUPDF(object):
     def __init__(self):
-        self.module = get_cu_module('daq')
+        self.module = get_cu_module('daq.cu', options=cuda_options,
+                                    include_source_directory=False)
         self.gpu_funcs = GPUFuncs(self.module)
 
     def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange):
@@ -556,10 +693,6 @@ class GPU(object):
 
         self.context.set_cache_config(cuda.func_cache.PREFER_L1)
 
-        cuda_options = ['--use_fast_math']#, '--ptxas-options=-v']
-
-        self.module = get_cu_module('kernel', options=cuda_options)
-
     def print_mem_info(self):
         free, total = cuda.mem_get_info()