28 files changed, 1927 insertions, 1736 deletions
diff --git a/benchmark.py b/benchmark.py
index f839f61..31c145c 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -17,21 +17,21 @@ from chroma import optics
 # Generator processes need to fork BEFORE the GPU context is setup
 g4generator = generator.photon.G4ParallelGenerator(4, optics.water_wcsim)
 
-def intersect(gpu_instance, number=100, nphotons=500000, nthreads_per_block=64, max_blocks=1024):
+def intersect(gpu_geometry, number=100, nphotons=500000, nthreads_per_block=64,
+              max_blocks=1024):
     "Returns the average number of ray intersections per second."
     distances_gpu = ga.empty(nphotons, dtype=np.float32)
 
+    module = gpu.get_cu_module('mesh.h', options=('--use_fast_math',))
+    gpu_funcs = gpu.GPUFuncs(module)
+
     run_times = []
     for i in tools.progress(range(number)):
-        pos = np.zeros((nphotons,3), dtype=np.float32)
-        dir = sample.uniform_sphere(nphotons)
-
-        gpu_rays = gpu.GPURays(pos, dir)
-
-        gpu_funcs = gpu.GPUFuncs(gpu_instance.module)
+        pos = ga.zeros(nphotons, dtype=ga.vec.float3)
+        dir = ga.to_gpu(gpu.to_float3(sample.uniform_sphere(nphotons)))
 
         t0 = time.time()
-        gpu_funcs.distance_to_mesh(np.int32(gpu_rays.pos.size), gpu_rays.pos, gpu_rays.dir, distances_gpu, block=(nthreads_per_block,1,1), grid=(gpu_rays.pos.size//nthreads_per_block+1,1))
+        gpu_funcs.distance_to_mesh(np.int32(pos.size), pos, dir, gpu_geometry.gpudata, distances_gpu, block=(nthreads_per_block,1,1), grid=(pos.size//nthreads_per_block+1,1))
         cuda.Context.get_current().synchronize()
         elapsed = time.time() - t0
 
@@ -39,7 +39,7 @@ def intersect(gpu_instance, number=100, nphotons=500000, nthreads_per_block=64,
             # first kernel call incurs some driver overhead
             run_times.append(elapsed)
 
-    return gpu_rays.pos.size/ufloat((np.mean(run_times),np.std(run_times)))
+    return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
 def load_photons(number=100, nphotons=500000):
     """Returns the average number of photons moved to the GPU device memory
@@ -64,7 +64,8 @@ def load_photons(number=100, nphotons=500000):
 
     return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
-def propagate(gpu_instance, number=10, nphotons=500000, nthreads_per_block=64, max_blocks=1024):
+def propagate(gpu_geometry, number=10, nphotons=500000, nthreads_per_block=64,
+              max_blocks=1024):
     "Returns the average number of photons propagated on the GPU per second."
     rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks)
 
@@ -79,7 +80,8 @@ def propagate(gpu_instance, number=10, nphotons=500000, nthreads_per_block=64, m
         gpu_photons = gpu.GPUPhotons(photons)
 
         t0 = time.time()
-        gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks)
+        gpu_photons.propagate(gpu_geometry, rng_states, nthreads_per_block,
+                              max_blocks)
         cuda.Context.get_current().synchronize()
         elapsed = time.time() - t0
 
@@ -90,7 +92,8 @@ def propagate(gpu_instance, number=10, nphotons=500000, nthreads_per_block=64, m
     return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
 @tools.profile_if_possible
-def pdf(gpu_instance, gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1, nthreads_per_block=64, max_blocks=1024):
+def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
+        nthreads_per_block=64, max_blocks=1024):
     """
     Returns the average number of 100 MeV events per second that can be
     histogrammed per second.
@@ -118,14 +121,17 @@ def pdf(gpu_instance, gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
         t0 = time.time()
         gpu_pdf.clear_pdf()
 
-        vertex_gen = generator.vertex.constant_particle_gun('e-', (0,0,0), (1,0,0), 100)
+        vertex_gen = generator.vertex.constant_particle_gun('e-', (0,0,0),
+                                                            (1,0,0), 100)
         vertex_iter = itertools.islice(vertex_gen, nevents)
 
         for ev in g4generator.generate_events(vertex_iter):
             for j in xrange(nreps):
                 gpu_photons = gpu.GPUPhotons(ev.photons_beg)
-                gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks)
-                gpu_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks)
+                gpu_photons.propagate(gpu_geometry, rng_states,
+                                      nthreads_per_block, max_blocks)
+                gpu_channels = gpu_daq.acquire(gpu_photons, rng_states,
+                                               nthreads_per_block, max_blocks)
                 gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
 
         hitcount, pdf = gpu_pdf.get_pdfs()
@@ -146,16 +152,19 @@ if __name__ == '__main__':
     lbne.build(bits=11)
 
     gpu_instance = gpu.GPU()
-    gpu_geometry = gpu.GPUGeometry(gpu_instance, lbne)
+    gpu_geometry = gpu.GPUGeometry(lbne)
     
     gpu_instance.print_mem_info()
-    print '%s ray intersections/sec.' % tools.ufloat_to_str(intersect(gpu_instance))
+    print '%s ray intersections/sec.' % \
+        tools.ufloat_to_str(intersect(gpu_geometry))
     gc.collect()
     gpu_instance.print_mem_info()
     print '%s photons loaded/sec.' % tools.ufloat_to_str(load_photons())
     gc.collect()
     gpu_instance.print_mem_info()
-    print '%s photons propagated/sec.' % tools.ufloat_to_str(propagate(gpu_instance))
+    print '%s photons propagated/sec.' % \
+        tools.ufloat_to_str(propagate(gpu_geometry))
     gc.collect()
     gpu_instance.print_mem_info()
-    print '%s 100 MeV events histogrammed/s' % tools.ufloat_to_str(pdf(gpu_instance, gpu_geometry, max(lbne.pmtids)))
+    print '%s 100 MeV events histogrammed/s' % \
+        tools.ufloat_to_str(pdf(gpu_geometry, max(lbne.pmtids)))
diff --git a/camera.py b/camera.py
index ea76f28..1197e8c 100755
--- a/camera.py
+++ b/camera.py
@@ -102,17 +102,18 @@ def encode_movie(dir):
     print 'movie saved to %s.' % filename
 
 class Camera(Thread):
-    def __init__(self, geometry, size=(800,600), device_id=None, enable3d=False, green_magenta=False):
+    def __init__(self, geometry, size=(800,600), device_id=None, enable3d=False, green_magenta=False, alpha_depth=10):
         Thread.__init__(self)
         self.geometry = geometry
         self.device_id = device_id
         self.size = size
         self.enable3d = enable3d
         self.green_magenta = green_magenta
+        self.alpha_depth = alpha_depth
 
         self.unique_bvh_layers = np.unique(self.geometry.layers)
         self.currentlayer = None
-        self.bvh_layers = {self.currentlayer : self.geometry}
+        self.bvh_layers = {}
 
         try: 
             import spnav as spnav_module
@@ -123,15 +124,19 @@ class Camera(Thread):
 
     def init_gpu(self):
         self.gpu_instance = gpu.GPU(self.device_id)
-        self.gpu_geometry = gpu.GPUGeometry(self.gpu_instance, self.geometry)
-        self.gpu_funcs = gpu.GPUFuncs(self.gpu_instance.module)
+        self.gpu_geometry = gpu.GPUGeometry(self.geometry)
+        self.gpu_funcs = gpu.GPUFuncs(gpu.get_cu_module('mesh.h'))
+        self.hybrid_funcs = gpu.GPUFuncs(gpu.get_cu_module('hybrid_render.cu'))
+
+        self.gpu_geometries = [self.gpu_geometry]
 
         self.width, self.height = self.size
 
         self.npixels = self.width*self.height
 
         pygame.init()
-        self.screen = pygame.display.set_mode(self.size)
+        self.window = pygame.display.set_mode(self.size)
+        self.screen = pygame.Surface(self.size, pygame.SRCALPHA)
         pygame.display.set_caption('')
         self.clock = pygame.time.Clock()
 
@@ -182,9 +187,12 @@ class Camera(Thread):
 
             self.rays = gpu.GPURays(pos, dir)
 
+            self.distance_array = ga.empty(self.alpha_depth*self.rays.pos.size, dtype=np.float32)
+            self.index_array = ga.empty(self.alpha_depth*self.rays.pos.size, dtype=np.uint32)
+            self.n_array = ga.zeros(self.rays.pos.size, dtype=np.uint32)
+
             self.pixels_gpu = ga.empty(self.npixels, dtype=np.int32)
 
-        self.alpha = True
         self.movie = False
         self.movie_index = 0
         self.movie_dir = None
@@ -218,7 +226,7 @@ class Camera(Thread):
         for wavelength, rgb_tuple in \
                 zip([685.0, 545.0, 445.0],[(1,0,0),(0,1,0),(0,0,1)]):
             for i in range(self.xyz_lookup1_gpu.size//(self.npixels)+1):
-                self.gpu_funcs.update_xyz_lookup(np.int32(self.npixels), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.npixels), ga.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(wavelength), ga.vec.make_float3(*rgb_tuple), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.npixels//self.nblocks+1,1))
+                self.hybrid_funcs.update_xyz_lookup(np.int32(self.npixels), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.npixels), ga.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(wavelength), ga.vec.make_float3(*rgb_tuple), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), self.gpu_geometry.gpudata, block=(self.nblocks,1,1), grid=(self.npixels//self.nblocks+1,1))
 
         self.nlookup_calls += 1
 
@@ -234,7 +242,7 @@ class Camera(Thread):
     def update_image_from_rays(self, image, rays):
         for wavelength, rgb_tuple in \
                 zip([685.0, 545.0, 445.0],[(1,0,0),(0,1,0),(0,0,1)]):
-            self.gpu_funcs.update_xyz_image(np.int32(rays.pos.size), self.rng_states_gpu, rays.pos, rays.dir, np.float32(wavelength), ga.vec.make_float3(*rgb_tuple), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, image, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(rays.pos.size//self.nblocks+1,1))
+            self.hybrid_funcs.update_xyz_image(np.int32(rays.pos.size), self.rng_states_gpu, rays.pos, rays.dir, np.float32(wavelength), ga.vec.make_float3(*rgb_tuple), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, image, np.int32(self.nlookup_calls), np.int32(self.max_steps), self.gpu_geometry.gpudata, block=(self.nblocks,1,1), grid=(rays.pos.size//self.nblocks+1,1))
 
     def update_image(self):
         if self.enable3d:
@@ -247,10 +255,10 @@ class Camera(Thread):
 
     def process_image(self):
         if self.enable3d:
-            self.gpu_funcs.process_image(np.int32(self.pixels1_gpu.size), self.image1_gpu, self.pixels1_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels1_gpu.size)//self.nblocks+1,1))
-            self.gpu_funcs.process_image(np.int32(self.pixels2_gpu.size), self.image2_gpu, self.pixels2_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels2_gpu.size)//self.nblocks+1,1))
+            self.hybrid_funcs.process_image(np.int32(self.pixels1_gpu.size), self.image1_gpu, self.pixels1_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels1_gpu.size)//self.nblocks+1,1))
+            self.hybrid_funcs.process_image(np.int32(self.pixels2_gpu.size), self.image2_gpu, self.pixels2_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels2_gpu.size)//self.nblocks+1,1))
         else:
-            self.gpu_funcs.process_image(np.int32(self.pixels_gpu.size), self.image_gpu, self.pixels_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels_gpu.size)//self.nblocks+1,1))
+            self.hybrid_funcs.process_image(np.int32(self.pixels_gpu.size), self.image_gpu, self.pixels_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels_gpu.size)//self.nblocks+1,1))
 
     def screenshot(self, dir='', start=0):
         root, ext = 'screenshot', 'png'
@@ -320,29 +328,28 @@ class Camera(Thread):
 
             self.update()
 
-    def update_pixels(self):
+    def update_pixels(self, gpu_geometry=None, keep_last_render=False):
+        if gpu_geometry is None:
+            gpu_geometry = self.gpu_geometry
+
         if self.render:
             while self.nlookup_calls < 10:
                 self.update_xyz_lookup(self.source_position)
             self.update_image()
             self.process_image()
         else:
-            if self.alpha:
-                if self.enable3d:
-                    self.gpu_funcs.ray_trace_alpha(np.int32(self.rays1.pos.size), self.rays1.pos, self.rays1.dir, self.pixels1_gpu, block=(self.nblocks,1,1), grid=(self.rays2.pos.size//self.nblocks+1,1))
-                    self.gpu_funcs.ray_trace_alpha(np.int32(self.rays2.pos.size), self.rays2.pos, self.rays2.dir, self.pixels2_gpu, block=(self.nblocks,1,1), grid=(self.rays2.pos.size//self.nblocks+1,1))
-                else:
-                    self.gpu_funcs.ray_trace_alpha(np.int32(self.rays.pos.size), self.rays.pos, self.rays.dir, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.rays.pos.size//self.nblocks+1,1))
+            if self.enable3d:
+                self.rays1.render(gpu_geometry, self.pixels1_gpu,
+                                  self.alpha_depth, keep_last_render)
+                self.rays2.render(gpu_geometry, self.pixels2_gpu,
+                                  self.alpha_depth, keep_last_render)
             else:
-                if self.enable3d:
-                    self.gpu_funcs.ray_trace(np.int32(self.rays1.pos.size), self.rays1.pos, self.rays1.dir, self.pixels1_gpu, block=(self.nblocks,1,1), grid=(self.rays1.pos.size//self.nblocks+1,1))
-                    self.gpu_funcs.ray_trace(np.int32(self.rays2.pos.size), self.rays2.pos, self.rays2.dir, self.pixels2_gpu, block=(self.nblocks,1,1), grid=(self.rays2.pos.size//self.nblocks+1,1))
-                else:
-                    self.gpu_funcs.ray_trace(np.int32(self.rays.pos.size), self.rays.pos, self.rays.dir, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.rays.pos.size//self.nblocks+1,1))
+                self.rays.render(gpu_geometry, self.pixels_gpu,
+                                 self.alpha_depth, keep_last_render)
 
-    def update(self):
+    def update_viewing_angle(self):
         if self.enable3d:
-            self.gpu_funcs.distance_to_mesh(np.int32(self.scope_rays.pos.size), self.scope_rays.pos, self.scope_rays.dir, self.distances_gpu, block=(self.nblocks,1,1), grid=(self.scope_rays.pos.size//self.nblocks,1))
+            self.gpu_funcs.distance_to_mesh(np.int32(self.scope_rays.pos.size), self.scope_rays.pos, self.scope_rays.dir, self.gpu_geometry.gpudata, self.distances_gpu, block=(self.nblocks,1,1), grid=(self.scope_rays.pos.size//self.nblocks,1))
 
             baseline = ga.min(self.distances_gpu).get().item()
 
@@ -377,7 +384,16 @@ class Camera(Thread):
 
             self.viewing_angle = new_viewing_angle
 
-        self.update_pixels()
+    def update(self):
+        if self.enable3d:
+            self.update_viewing_angle()
+
+        n = len(self.gpu_geometries)
+        for i, gpu_geometry in enumerate(self.gpu_geometries):
+            if i == 0:
+                self.update_pixels(gpu_geometry)
+            else:
+                self.update_pixels(gpu_geometry, keep_last_render=True)
 
         if self.enable3d:
             pixels1 = self.pixels1_gpu.get()
@@ -387,12 +403,18 @@ class Camera(Thread):
                 pixels = (pixels1 & 0x00ff00) | (pixels2 & 0xff00ff)
             else:
                 pixels = (pixels1 & 0xff0000) | (pixels2 & 0x00ffff)
+
+            alpha = ((0xff & (pixels1 >> 24)) + (0xff & (pixels2 >> 24)))/2
+
+            pixels |= (alpha << 24)
         else:
             pixels = self.pixels_gpu.get()
 
         pygame.surfarray.blit_array(self.screen, pixels.reshape(self.size))
         if self.doom_mode:
             self.screen.blit(self.doom_hud, self.doom_rect)
+        self.window.fill(0)
+        self.window.blit(self.screen, (0,0))
         pygame.display.flip()
 
         if self.movie:
@@ -400,12 +422,17 @@ class Camera(Thread):
             self.movie_index += 1
 
     def loadlayer(self, layer):
-        try:
-            layergeometry = self.bvh_layers[layer]
-            layergeometry.activate()
-        except KeyError:
-            layergeometry = build(bvh_mesh(self.geometry, layer),8)
-            self.bvh_layers[layer] = gpu.GPUGeometry(self.gpu, layergeometry)
+        if layer is None:
+            self.gpu_geometries = [self.gpu_geometry]
+        else:
+            try:
+                gpu_geometry = self.bvh_layers[layer]
+            except KeyError:
+                geometry = build(bvh_mesh(self.geometry, layer), 8)
+                gpu_geometry = gpu.GPUGeometry(geometry)
+                self.bvh_layers[layer] = gpu_geometry
+
+            self.gpu_geometries = [self.gpu_geometry, gpu_geometry]
 
         self.update()
 
@@ -484,6 +511,15 @@ class Camera(Thread):
                 self.done = True
                 return
 
+            elif event.key == K_EQUALS:
+                self.alpha_depth += 1
+                self.update()
+
+            elif event.key == K_MINUS:
+                if self.alpha_depth > 0:
+                    self.alpha_depth -= 1
+                    self.update()
+
             elif event.key == K_PAGEDOWN:
                 if self.currentlayer is None:
                     self.currentlayer = self.unique_bvh_layers[-1]
@@ -517,10 +553,6 @@ class Camera(Thread):
                 self.clear_image()
                 self.update()
 
-            elif event.key == K_F8:
-                self.alpha = not self.alpha
-                self.update()
-
             elif event.key == K_m:
                 if self.movie:
                     encode_movie(self.movie_dir)
@@ -630,6 +662,20 @@ class EventViewer(Camera):
         Camera.__init__(self, geometry, **kwargs)
         self.rr = RootReader(filename)
 
+    def render_particle_track(self):
+        marker = Solid(make.cube(0.1), vacuum, vacuum)
+
+        geometry = Geometry()
+        for pos in self.ev.photons_beg.pos[::100]:
+            geometry.add_solid(marker, displacement=pos)
+
+        geometry.build(bits=11)
+        gpu_geometry = gpu.GPUGeometry(geometry)
+
+        self.gpu_geometries = [self.gpu_geometry, gpu_geometry]
+
+        self.update()
+
     def color_hit_pmts(self):
         self.gpu_geometry.reset_colors()
 
@@ -651,6 +697,7 @@ class EventViewer(Camera):
                     pass
                 else:
                     self.color_hit_pmts()
+                    self.render_particle_track()
                 return
 
             elif event.key == K_PAGEDOWN:
@@ -660,6 +707,7 @@ class EventViewer(Camera):
                     pass
                 else:
                     self.color_hit_pmts()
+                    self.render_particle_track()
                 return
 
         Camera.process_event(self, event)
diff --git a/geometry.py b/geometry.py
index 1b1f004..5b89110 100644
--- a/geometry.py
+++ b/geometry.py
@@ -383,6 +383,8 @@ class Geometry(object):
             if unique_zvalues.size == 1:
                 break
 
+        self.start_node = self.node_map.size - 1
+
         if use_cache:
             print >>sys.stderr, 'Writing BVH to cache directory...'
 
@@ -391,7 +393,7 @@ class Geometry(object):
 
             with gzip.GzipFile(cache_path, 'wb', compresslevel=1) as f:
                 data = {}
-                for key in ['lower_bounds', 'upper_bounds', 'node_map', 'node_map_end', 'layers', 'first_node']:
+                for key in ['lower_bounds', 'upper_bounds', 'node_map', 'node_map_end', 'layers', 'first_node', 'start_node']:
                     data[key] = getattr(self, key)
                 data['reorder'] = reorder
                 pickle.dump(data, f, -1)
diff --git a/gpu.py b/gpu.py
index 721ea67..de99e0a 100644
--- a/gpu.py
+++ b/gpu.py
@@ -8,7 +8,7 @@ import sys
 import pytools
 import pycuda.tools
 from pycuda.compiler import SourceModule
-import pycuda.characterize
+from pycuda import characterize
 import pycuda.driver as cuda
 from pycuda import gpuarray as ga
 
@@ -20,24 +20,37 @@ from chroma import event
 
 cuda.init()
 
-#@pycuda.tools.context_dependent_memoize
+# standard nvcc options
+cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v']
+
+@pycuda.tools.context_dependent_memoize
 def get_cu_module(name, options=None, include_source_directory=True):
     """Returns a pycuda.compiler.SourceModule object from a CUDA source file
     located in the chroma src directory at src/[name].cu."""
     if options is None:
         options = []
+    elif isinstance(options, tuple):
+        options = list(options)
+    else:
+        raise TypeError('`options` must be a tuple.')
 
     srcdir = dirname(chroma.src.__file__)
 
     if include_source_directory:
         options += ['-I' + srcdir]
 
-    with open('%s/%s.cu' % (srcdir, name)) as f:
+    with open('%s/%s' % (srcdir, name)) as f:
         source = f.read()
 
     return pycuda.compiler.SourceModule(source, options=options,
                                         no_extern_c=True)
 
+def get_cu_source(name):
+    srcdir = dirname(chroma.src.__file__)
+    with open('%s/%s' % (srcdir, name)) as f:
+        source = f.read()
+    return source
+
 class GPUFuncs(object):
     """Simple container class for GPU functions as attributes."""
     def __init__(self, module):
@@ -73,7 +86,7 @@ __global__ void init_rng(int nthreads, curandState *s, unsigned long long seed,
 
 def get_rng_states(size, seed=1):
     "Return `size` number of CUDA random number generator states."
-    rng_states = cuda.mem_alloc(size*pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+    rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
 
     module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
     init_rng = module.get_function('init_rng')
@@ -88,6 +101,12 @@ def to_float3(arr):
         arr = np.asarray(arr, order='c')
     return arr.astype(np.float32).view(ga.vec.float3)[:,0]
 
+def to_uint3(arr):
+    "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array."
+    if not arr.flags['C_CONTIGUOUS']:
+        arr = np.asarray(arr, order='c')
+    return arr.astype(np.uint32).view(ga.vec.uint3)[:,0]
+
 def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
     """Iterator that yields tuples with the values requried to process
     a long array in multiple kernel passes on the GPU.
@@ -121,6 +140,11 @@ class GPUPhotons(object):
         self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32))
         self.flags = ga.to_gpu(photons.flags.astype(np.uint32))
 
+        #cuda_options = ('--use_fast_math', '-w')#, '--ptxas-options=-v']
+
+        module = get_cu_module('propagate.cu', options=cuda_options)
+        self.gpu_funcs = GPUFuncs(module)
+
     def get(self):
         pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
         dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
@@ -131,6 +155,50 @@ class GPUPhotons(object):
         flags = self.flags.get()
         return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
 
+    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
+                  max_blocks=1024, max_steps=10):
+        """Propagate photons on GPU to termination or max_steps, whichever
+        comes first.
+
+        May be called repeatedly without reloading photon information if
+        single-stepping through photon history.
+
+        ..warning::
+            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
+            number of curandStates.
+        """
+        nphotons = self.pos.size
+        step = 0
+        input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+        input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
+        input_queue_gpu = ga.to_gpu(input_queue)
+        output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+        output_queue[0] = 1
+        output_queue_gpu = ga.to_gpu(output_queue)
+
+        while step < max_steps:
+            # Just finish the rest of the steps if the # of photons is low
+            if nphotons < nthreads_per_block * 16 * 8:
+                nsteps = max_steps - step
+            else:
+                nsteps = 1
+
+            for first_photon, photons_this_round, blocks in \
+                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+                self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+            step += nsteps
+
+            if step < max_steps:
+                temp = input_queue_gpu
+                input_queue_gpu = output_queue_gpu
+                output_queue_gpu = temp
+                output_queue_gpu[:1].set(np.uint32(1))
+                nphotons = input_queue_gpu[:1].get()[0] - 1
+
+        if ga.max(self.flags).get() & (1 << 31):
+            print >>sys.stderr, "WARNING: ABORTED PHOTONS"
+
 class GPUChannels(object):
     def __init__(self, t, q, flags):
         self.t = t
@@ -145,71 +213,89 @@ class GPUChannels(object):
         # enough hit time were hit.
         return event.Channels(t<1e8, t, q, self.flags.get())
 
-def propagate(gpu, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10):
-    """Propagate photons on GPU to termination or max_steps, whichever
-    comes first.
-
-    May be called repeatedly without reloading photon information if
-    single-stepping through photon history.
-
-    ..warning::
-        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
-        number of curandStates.
-    """
-    nphotons = gpuphotons.pos.size
-    step = 0
-    input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
-    input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
-    input_queue_gpu = ga.to_gpu(input_queue)
-    output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
-    output_queue[0] = 1
-    output_queue_gpu = ga.to_gpu(output_queue)
-
-    propagate = gpu.module.get_function('propagate')
-
-    while step < max_steps:
-        # Just finish the rest of the steps if the # of photons is low
-        if nphotons < nthreads_per_block * 16 * 8:
-            nsteps = max_steps - step
-        else:
-            nsteps = 1
-
-        for first_photon, photons_this_round, blocks in \
-                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
-            propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, gpuphotons.pos, gpuphotons.dir, gpuphotons.wavelengths, gpuphotons.pol, gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, np.int32(nsteps), block=(nthreads_per_block,1,1), grid=(blocks, 1))
-
-        step += nsteps
-
-        if step < max_steps:
-            temp = input_queue_gpu
-            input_queue_gpu = output_queue_gpu
-            output_queue_gpu = temp
-            output_queue_gpu[:1].set(np.uint32(1))
-            nphotons = input_queue_gpu[:1].get()[0] - 1
-
-    if ga.max(gpuphotons.flags).get() & (1 << 31):
-        print >>sys.stderr, "WARNING: ABORTED PHOTONS"
-
 class GPURays(object):
-    def __init__(self, pos, dir, nblocks=64):
+    """The GPURays class holds arrays of ray positions and directions
+    on the GPU that are used to render a geometry."""
+    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
         self.pos = ga.to_gpu(to_float3(pos))
         self.dir = ga.to_gpu(to_float3(dir))
 
+        self.max_alpha_depth = max_alpha_depth
+
         self.nblocks = nblocks
 
-        self.module = get_cu_module('transform')
-        self.gpu_funcs = GPUFuncs(self.module)
+        transform_module = get_cu_module('transform.cu', options=cuda_options)
+        self.transform_funcs = GPUFuncs(transform_module)
+
+        render_module = get_cu_module('render.cu', options=cuda_options)
+        self.render_funcs = GPUFuncs(render_module)
+
+        self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
+        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
+        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
 
     def rotate(self, phi, n):
-        self.gpu_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
-        self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+        "Rotate by an angle phi around the axis `n`."
+        self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
 
     def rotate_around_point(self, phi, n, point):
-        self.gpu_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
-        self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+        """"Rotate by an angle phi around the axis `n` passing through
+        the point `point`."""
+        self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
 
     def translate(self, v):
-        self.gpu_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+        "Translate the ray positions by the vector `v`."
+        self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+
+    def render(self, gpu_geometry, pixels, alpha_depth=10,
+               keep_last_render=False):
+        """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
+        colors."""
+        if not keep_last_render:
+            self.dxlen.fill(0)
+
+        if alpha_depth > self.max_alpha_depth:
+            raise Exception('alpha_depth > max_alpha_depth')
+
+        if not isinstance(pixels, ga.GPUArray):
+            raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)
+
+        if pixels.size != self.pos.size:
+            raise ValueError('`pixels`.size != number of rays')
+
+        self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+
+    def snapshot(self, gpu_geometry, alpha_depth=10):
+        "Render `gpu_geometry` and return a numpy array of pixel colors."
+        pixels = ga.empty(self.pos.size, dtype=np.uint32)
+        self.render(gpu_geometry, pixels, alpha_depth)
+        return pixels.get()
+
+def make_gpu_struct(size, members):
+    struct = cuda.mem_alloc(size)
+
+    i = 0
+    for member in members:
+        if isinstance(member, ga.GPUArray):
+            member = member.gpudata
+
+        if isinstance(member, cuda.DeviceAllocation):
+            if i % 8:
+                raise Exception('cannot align 64-bit pointer. '
+                                'arrange struct member variables in order of '
+                                'decreasing size.')
+
+            cuda.memcpy_htod(int(struct)+i, np.intp(int(member)))
+            i += 8
+        elif np.isscalar(member):
+            cuda.memcpy_htod(int(struct)+i, member)
+            i += member.nbytes
+        else:
+            raise TypeError('expected a GPU device pointer or scalar type.')
+
+    return struct
 
 def format_size(size):
     if size < 1e3:
@@ -226,96 +312,142 @@ def format_array(name, array):
         (name, format_size(len(array)), format_size(array.nbytes))
 
 class GPUGeometry(object):
-    def __init__(self, gpu, geometry, load=True, activate=True, print_usage=True):
-        self.geometry = geometry
+    def __init__(self, geometry, wavelengths=None, print_usage=True):
+        if wavelengths is None:
+            wavelengths = standard_wavelengths
+
+        try:
+            wavelength_step = np.unique(np.diff(wavelengths)).item()
+        except ValueError:
+            raise ValueError('wavelengths must be equally spaced apart.')
 
-        self.module = gpu.module
-        self.gpu_funcs = GPUFuncs(gpu.module)
+        geometry_source = get_cu_source('geometry.h')
+        material_struct_size = characterize.sizeof('Material', geometry_source)
+        surface_struct_size = characterize.sizeof('Surface', geometry_source)
+        geometry_struct_size = characterize.sizeof('Geometry', geometry_source)
 
-        if load:
-            self.load(activate, print_usage)
+        self.material_data = []
+        self.material_ptrs = []
 
-    def load(self, activate=True, print_usage=True):
-        self.gpu_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1))
+        def interp_material_property(wavelengths, property):
+            # note that it is essential that the material properties be
+            # interpolated linearly. this fact is used in the propagation
+            # code to guarantee that probabilities still sum to one.
+            return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)
 
-        self.materials = []
-        for i in range(len(self.geometry.unique_materials)):
-            material = copy(self.geometry.unique_materials[i])
+        for i in range(len(geometry.unique_materials)):
+            material = geometry.unique_materials[i]
 
             if material is None:
                 raise Exception('one or more triangles is missing a material.')
 
-            material.refractive_index_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32))
-            material.absorption_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32))
-            material.scattering_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32))
+            refractive_index = interp_material_property(wavelengths, material.refractive_index)
+            refractive_index_gpu = ga.to_gpu(refractive_index)
+            absorption_length = interp_material_property(wavelengths, material.absorption_length)
+            absorption_length_gpu = ga.to_gpu(absorption_length)
+            scattering_length = interp_material_property(wavelengths, material.scattering_length)
+            scattering_length_gpu = ga.to_gpu(scattering_length)
 
-            self.gpu_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1))
+            self.material_data.append(refractive_index_gpu)
+            self.material_data.append(absorption_length_gpu)
+            self.material_data.append(scattering_length_gpu)
 
-            self.materials.append(material)
+            material_gpu = \
+                make_gpu_struct(material_struct_size,
+                                [refractive_index_gpu, absorption_length_gpu,
+                                 scattering_length_gpu,
+                                 np.uint32(len(wavelengths)),
+                                 np.float32(wavelength_step),
+                                 np.float32(wavelengths[0])])
 
-        self.surfaces = []
-        for i in range(len(self.geometry.unique_surfaces)):
-            surface = copy(self.geometry.unique_surfaces[i])
-
-            if surface is None:
-                continue
+            self.material_ptrs.append(material_gpu)
 
-            surface.detect_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32))
-            surface.absorb_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32))
-            surface.reflect_diffuse_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32))
-            surface.reflect_specular_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32))
+        self.material_pointer_array = \
+            make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)
 
-            self.gpu_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1))
+        self.surface_data = []
+        self.surface_ptrs = []
 
-            self.surfaces.append(surface)
+        for i in range(len(geometry.unique_surfaces)):
+            surface = geometry.unique_surfaces[i]
 
-        self.vertices_gpu = ga.to_gpu(to_float3(self.geometry.mesh.vertices))
-
-        triangles = \
-            np.empty(len(self.geometry.mesh.triangles), dtype=ga.vec.uint4)
-        triangles['x'] = self.geometry.mesh.triangles[:,0]
-        triangles['y'] = self.geometry.mesh.triangles[:,1]
-        triangles['z'] = self.geometry.mesh.triangles[:,2]
-        triangles['w'] = ((self.geometry.material1_index & 0xff) << 24) | ((self.geometry.material2_index & 0xff) << 16) | ((self.geometry.surface_index & 0xff) << 8)
-        self.triangles_gpu = ga.to_gpu(triangles)
-
-        self.lower_bounds_gpu = ga.to_gpu(to_float3(self.geometry.lower_bounds))
-
-        self.upper_bounds_gpu = ga.to_gpu(to_float3(self.geometry.upper_bounds))
+            if surface is None:
+                # need something to copy to the surface array struct
+                # that is the same size as a 64-bit pointer.
+                # this pointer will never be used by the simulation.
+                self.surface_ptrs.append(np.uint64(0))
+                continue
 
-        self.colors_gpu = ga.to_gpu(self.geometry.colors.astype(np.uint32))
-        self.node_map_gpu = ga.to_gpu(self.geometry.node_map.astype(np.uint32))
-        self.node_map_end_gpu = ga.to_gpu(self.geometry.node_map_end.astype(np.uint32))
-        self.solid_id_map_gpu = ga.to_gpu(self.geometry.solid_id.astype(np.uint32))
+            detect = interp_material_property(wavelengths, surface.detect)
+            detect_gpu = ga.to_gpu(detect)
+            absorb = interp_material_property(wavelengths, surface.absorb)
+            absorb_gpu = ga.to_gpu(absorb)
+            reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
+            reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
+            reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
+            reflect_specular_gpu = ga.to_gpu(reflect_specular)
+
+            self.surface_data.append(detect_gpu)
+            self.surface_data.append(absorb_gpu)
+            self.surface_data.append(reflect_diffuse_gpu)
+            self.surface_data.append(reflect_specular_gpu)
+
+            surface_gpu = \
+                make_gpu_struct(surface_struct_size,
+                                [detect_gpu, absorb_gpu,
+                                 reflect_diffuse_gpu,
+                                 reflect_specular_gpu,
+                                 np.uint32(len(wavelengths)),
+                                 np.float32(wavelength_step),
+                                 np.float32(wavelengths[0])])
+
+            self.surface_ptrs.append(surface_gpu)
+
+        self.surface_pointer_array = \
+            make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)
+
+        self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices))
+        self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles))
+
+        material_codes = (((geometry.material1_index & 0xff) << 24) |
+                          ((geometry.material2_index & 0xff) << 16) |
+                          ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
+
+        self.material_codes = ga.to_gpu(material_codes)
+
+        self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds))
+        self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds))
+        self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
+        self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32))
+        self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32))
+        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
+
+        self.gpudata = make_gpu_struct(geometry_struct_size,
+                                       [self.vertices, self.triangles,
+                                        self.material_codes,
+                                        self.colors, self.lower_bounds,
+                                        self.upper_bounds, self.node_map,
+                                        self.node_map_end,
+                                        self.material_pointer_array,
+                                        self.surface_pointer_array,
+                                        np.uint32(geometry.start_node),
+                                        np.uint32(geometry.first_node)])
 
-        self.node_map_tex = self.module.get_texref('node_map')
-        self.node_map_end_tex = self.module.get_texref('node_map_end')
+        self.geometry = geometry
 
         if print_usage:
             self.print_device_usage()
 
-        if activate:
-            self.activate()
-
-    def activate(self):
-        self.gpu_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(self.geometry.node_map.size-1), np.uint32(self.geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1))
-
-        self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes)
-        self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes)
-
-        self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
-        self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
-
     def print_device_usage(self):
         print 'device usage:'
         print '-'*10
-        print format_array('vertices', self.vertices_gpu)
-        print format_array('triangles', self.triangles_gpu)
-        print format_array('lower_bounds', self.lower_bounds_gpu)
-        print format_array('upper_bounds', self.upper_bounds_gpu)
-        print format_array('node_map', self.node_map_gpu)
-        print format_array('node_map_end', self.node_map_end_gpu)
-        print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
+        print format_array('vertices', self.vertices)
+        print format_array('triangles', self.triangles)
+        print format_array('lower_bounds', self.lower_bounds)
+        print format_array('upper_bounds', self.upper_bounds)
+        print format_array('node_map', self.node_map)
+        print format_array('node_map_end', self.node_map_end)
+        print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes))
         print '-'*10
         free, total = cuda.mem_get_info()
         print '%-15s %6s %6s' % ('device total', '', format_size(total))
@@ -324,21 +456,24 @@ class GPUGeometry(object):
         print
 
     def reset_colors(self):
-        self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32))
+        self.colors.set_async(self.geometry.colors.astype(np.uint32))
 
-    def color_solids(self, solid_hit, colors):
+    def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
+                     max_blocks=1024):
         solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
         solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))
 
+        module = get_cu_module('mesh.h', options=cuda_options)
+        color_solids = module.get_function('color_solids')
+
         for first_triangle, triangles_this_round, blocks in \
-                chunk_iterator(self.triangles_gpu.size):
-            self.gpu_funcs.color_solids(np.int32(first_triangle),
-                                        np.int32(triangles_this_round),
-                                        self.solid_id_map_gpu, 
-                                        solid_hit_gpu, 
-                                        solid_colors_gpu, 
-                                        block=(64,1,1), 
-                                        grid=(blocks,1))
+                chunk_iterator(self.triangles.size, nblocks_per_thread,
+                               max_blocks):
+            color_solids(np.int32(first_triangle),
+                         np.int32(triangles_this_round), self.solid_id_map,
+                         solid_hit_gpu, solid_colors_gpu, self.gpudata,
+                         block=(nblocks_per_thread,1,1), 
+                         grid=(blocks,1))
 
 class GPUDaq(object):
     def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9):
@@ -347,9 +482,10 @@ class GPUDaq(object):
         self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.daq_pmt_rms = pmt_rms
-        self.solid_id_map_gpu = gpu_geometry.solid_id_map_gpu
+        self.solid_id_map_gpu = gpu_geometry.solid_id_map
 
-        self.module = get_cu_module('daq', include_source_directory=False)
+        self.module = get_cu_module('daq.cu', options=cuda_options, 
+                                    include_source_directory=False)
         self.gpu_funcs = GPUFuncs(self.module)
 
     def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
@@ -369,7 +505,8 @@ class GPUDaq(object):
 
 class GPUPDF(object):
     def __init__(self):
-        self.module = get_cu_module('daq')
+        self.module = get_cu_module('daq.cu', options=cuda_options,
+                                    include_source_directory=False)
         self.gpu_funcs = GPUFuncs(self.module)
 
     def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange):
@@ -556,10 +693,6 @@ class GPU(object):
 
         self.context.set_cache_config(cuda.func_cache.PREFER_L1)
 
-        cuda_options = ['--use_fast_math']#, '--ptxas-options=-v']
-
-        self.module = get_cu_module('kernel', options=cuda_options)
-
     def print_mem_info(self):
         free, total = cuda.mem_get_info()
 
diff --git a/make.py b/make.py
index 523b10f..fa66c4d 100644
--- a/make.py
+++ b/make.py
@@ -72,12 +72,12 @@ def box(dx, dy, dz, center=(0,0,0)):
     "Return a box with linear dimensions `dx`, `dy`, and `dz`."
     return linear_extrude([-dx/2.0,dx/2.0,dx/2.0,-dx/2.0],[-dy/2.0,-dy/2.0,dy/2.0,dy/2.0],height=dz,center=center)
 
-def cube(size=1, height=None):
+def cube(size=1, height=None, center=(0,0,0)):
     "Return a cube mesh whose sides have length `size`."
     if height is None:
         height = size
 
-    return linear_extrude([-size/2.0,size/2.0,size/2.0,-size/2.0],[-size/2.0,-size/2.0,size/2.0,size/2.0], height=size)
+    return linear_extrude([-size/2.0,size/2.0,size/2.0,-size/2.0],[-size/2.0,-size/2.0,size/2.0,size/2.0], height=size, center=center)
 
 def cylinder(radius=1, height=2, radius2=None, nsteps=64):
     """
diff --git a/sim.py b/sim.py
index 931858f..53f7f9a 100755
--- a/sim.py
+++ b/sim.py
@@ -48,10 +48,7 @@ class Simulation(object):
             detector.build(bits=bvh_bits, use_cache=use_cache)
         
         self.gpu = gpu.GPU(cuda_device)
-
-        # geometry is loaded onto gpu by default
-        self.gpu_geometry = gpu.GPUGeometry(self.gpu, detector)
-        
+        self.gpu_geometry = gpu.GPUGeometry(detector)
         self.gpu_daq = gpu.GPUDaq(self.gpu_geometry, max(self.detector.pmtids))
         self.gpu_pdf = gpu.GPUPDF()
 
@@ -77,7 +74,7 @@ class Simulation(object):
         for ev in iterable:
             gpu_photons = gpu.GPUPhotons(ev.photons_beg)
 
-            gpu.propagate(self.gpu, gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks, max_steps=max_steps)
+            gpu_photons.propagate(self.gpu_geometry, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks, max_steps=max_steps)
 
             ev.nphotons = len(ev.photons_beg.pos)
 
@@ -114,7 +111,9 @@ class Simulation(object):
 
         for ev in iterable:
             gpu_photons = gpu.GPUPhotons(ev.photons_beg)
-            gpu.propagate(self.gpu, gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+            gpu_photons.propagate(self.gpu_geometry, self.rng_states,
+                                  nthreads_per_block=self.nthreads_per_block,
+                                  max_blocks=self.max_blocks)
             gpu_channels = self.gpu_daq.acquire(gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
             self.gpu_pdf.add_hits_to_pdf(gpu_channels)
         
@@ -143,7 +142,9 @@ class Simulation(object):
 
         for ev in iterable:
             gpu_photons = gpu.GPUPhotons(ev.photons_beg)
-            gpu.propagate(self.gpu, gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+            gpu_photons.propagate(self.gpu_geometry, self.rng_states,
+                                  nthreads_per_block=self.nthreads_per_block,
+                                  max_blocks=self.max_blocks)
             gpu_channels = self.gpu_daq.acquire(gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
             self.gpu_pdf.accumulate_pdf_eval(gpu_channels)
         
diff --git a/src/__init__.py b/src/__init__.py
index f2f5215..e69de29 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -1,7 +0,0 @@
-from os.path import dirname
-
-with open(dirname(__file__) + '/kernel.cu') as f:
-    kernel = f.read()
-
-with open(dirname(__file__) + '/daq.cu') as f:
-    daq = f.read()
diff --git a/src/alpha.h b/src/alpha.h
deleted file mode 100644
index ac75834..0000000
--- a/src/alpha.h
+++ /dev/null
@@ -1,141 +0,0 @@
-#ifndef __ALPHA_H__
-#define __ALPHA_H__
-
-#include "linalg.h"
-#include "intersect.h"
-#include "mesh.h"
-#include "sorting.h"
-
-#include "stdio.h"
-
-#define ALPHA_DEPTH 10
-
-__device__ int get_color_alpha(const float3 &origin, const float3& direction)
-{
-	float distance;
-
-	if (!intersect_node(origin, direction, g_start_node, -1.0f))
-		return 0;
-
-	unsigned int stack[STACK_SIZE];
-
-	unsigned int *head = &stack[0];
-	unsigned int *node = &stack[1];
-	unsigned int *tail = &stack[STACK_SIZE-1];
-	*node = g_start_node;
-
-	unsigned int i;
-
-	float distances[ALPHA_DEPTH];
-	unsigned int indices[ALPHA_DEPTH];
-	unsigned int n=0;
-
-	do
-	{
-		unsigned int first_child = tex1Dfetch(node_map, *node);
-		unsigned int stop = tex1Dfetch(node_map_end, *node);
-
-		while (*node >= g_first_node && stop == first_child+1)
-		{
-			*node = first_child;
-			first_child = tex1Dfetch(node_map, *node);
-			stop = tex1Dfetch(node_map_end, *node);
-		}
-		
-		if (*node >= g_first_node)
-		{
-			for (i=first_child; i < stop; i++)
-			{
-				if (intersect_node(origin, direction, i, -1.0f))
-				{
-					*node = i;
-					node++;
-				}
-			}
-
-			node--;
-		}
-		else // node is a leaf
-		{
-			for (i=first_child; i < stop; i++)
-			{
-				uint4 triangle_data = g_triangles[i];
-
-				float3 v0 = g_vertices[triangle_data.x];
-				float3 v1 = g_vertices[triangle_data.y];
-				float3 v2 = g_vertices[triangle_data.z];
-
-				if (intersect_triangle(origin, direction, v0, v1, v2, distance))
-				{
-					if (n < 1)
-					{
-						distances[0] = distance;
-						indices[0] = i;
-					}
-					else
-					{
-						unsigned long j = searchsorted(n, distances, distance);
-
-						if (j <= ALPHA_DEPTH-1)
-						{
-							insert(ALPHA_DEPTH, distances, j, distance);
-							insert(ALPHA_DEPTH, indices, j, i);
-						}
-					}
-					
-					if (n < ALPHA_DEPTH)
-						n++;
-				}
-				
-			} // triangle loop
-
-			node--;
-
-		} // node is a leaf
-
-	} // while loop
-	while (node != head);
-
-	if (n < 1)
-		return 0;
-
-	float scale = 1.0f;
-	float fr = 0.0f;
-	float fg = 0.0f;
-	float fb = 0.0f;
-	unsigned int index;
-	for (i=0; i < n; i++)
-	{
-		index = indices[i];
-
-		uint4 triangle_data = g_triangles[index];
-
-		float3 v0 = g_vertices[triangle_data.x];
-		float3 v1 = g_vertices[triangle_data.y];
-		float3 v2 = g_vertices[triangle_data.z];
-
-		float cos_theta = dot(normalize(cross(v1-v0,v2-v1)),-direction);
-
-		if (cos_theta < 0.0f)
-			cos_theta = dot(-normalize(cross(v1-v0,v2-v1)),-direction);
-
-		unsigned int r0 = 0xff & (g_colors[index] >> 16);
-		unsigned int g0 = 0xff & (g_colors[index] >> 8);
-		unsigned int b0 = 0xff & g_colors[index];
-
-		float alpha = (255 - (0xff & (g_colors[index] >> 24)))/255.0f;
-
-		fr += r0*scale*cos_theta*alpha;
-		fg += g0*scale*cos_theta*alpha;
-		fb += b0*scale*cos_theta*alpha;
-
-		scale *= (1.0f-alpha);
-	}
-	unsigned int r = floorf(fr);
-	unsigned int g = floorf(fg);
-	unsigned int b = floorf(fb);
-
-	return r << 16 | g << 8 | b;
-}
-
-#endif
diff --git a/src/daq.cu b/src/daq.cu
index cb53d56..5a68846 100644
--- a/src/daq.cu
+++ b/src/daq.cu
@@ -1,208 +1,198 @@
 // -*-c++-*-
 #include <curand_kernel.h>
 
-__device__ unsigned int float_to_sortable_int(float f)
+__device__ unsigned int
+float_to_sortable_int(float f)
 {
-  return __float_as_int(f);
-  //int i = __float_as_int(f);
-  //unsigned int mask = -(int)(i >> 31) | 0x80000000;
-  //return i ^ mask;
+    return __float_as_int(f);
+    //int i = __float_as_int(f);
+    //unsigned int mask = -(int)(i >> 31) | 0x80000000;
+    //return i ^ mask;
 }
 
-__device__ float sortable_int_to_float(unsigned int i)
+__device__ float
+sortable_int_to_float(unsigned int i)
 {
-  return __int_as_float(i);
-  //unsigned int mask = ((i >> 31) - 1) | 0x80000000;
-  //return __int_as_float(i ^ mask);
+    return __int_as_float(i);
+    //unsigned int mask = ((i >> 31) - 1) | 0x80000000;
+    //return __int_as_float(i ^ mask);
 }
 
 extern "C"
 {
 
-__global__ void reset_earliest_time_int(float maxtime, int ntime_ints, unsigned int *time_ints)
+__global__ void
+reset_earliest_time_int(float maxtime, int ntime_ints, unsigned int *time_ints)
 {
-	int id = threadIdx.x + blockDim.x * blockIdx.x;
-	if (id < ntime_ints) {
-		unsigned int maxtime_int = float_to_sortable_int(maxtime);
-		time_ints[id] = maxtime_int;
-	}
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    if (id < ntime_ints) {
+	unsigned int maxtime_int = float_to_sortable_int(maxtime);
+	time_ints[id] = maxtime_int;
+    }
 }
 
-__global__ void run_daq(curandState *s, unsigned int detection_state,
-			float time_rms,
-			int first_photon,
-			int nphotons, float *photon_times,
-			unsigned int *photon_histories,
-			int *last_hit_triangles, int *solid_map,
-			int nsolids, unsigned int *earliest_time_int,
-			unsigned int *channel_q,
-			unsigned int *channel_histories)
+__global__ void
+run_daq(curandState *s, unsigned int detection_state, float time_rms,
+	int first_photon, int nphotons, float *photon_times,
+	unsigned int *photon_histories, int *last_hit_triangles,
+	int *solid_map, int nsolids, unsigned int *earliest_time_int,
+	unsigned int *channel_q, unsigned int *channel_histories)
 {
 
-	int id = threadIdx.x + blockDim.x * blockIdx.x;
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
 
-	if (id < nphotons)
-	{
-		curandState rng = s[id];
-		int photon_id = id + first_photon;
-		int triangle_id = last_hit_triangles[photon_id];
+    if (id < nphotons) {
+	curandState rng = s[id];
+	int photon_id = id + first_photon;
+	int triangle_id = last_hit_triangles[photon_id];
 		
-		if (triangle_id > -1)
-		{
-			int solid_id = solid_map[triangle_id];
-			unsigned int history = photon_histories[photon_id];
-
-			if (solid_id < nsolids && (history & detection_state)) 
-			{
-			  float time = photon_times[photon_id] + curand_normal(&rng) * time_rms;
-			  unsigned int time_int = float_to_sortable_int(time);
-			  
-			  atomicMin(earliest_time_int + solid_id, time_int);
-			  atomicAdd(channel_q + solid_id, 1);
-			  atomicOr(channel_histories + solid_id, history);
-			}
-
-		}
+	if (triangle_id > -1) {
+	    int solid_id = solid_map[triangle_id];
+	    unsigned int history = photon_histories[photon_id];
 
-		s[id] = rng;
+	    if (solid_id < nsolids && (history & detection_state)) {
+		float time = photon_times[photon_id] + curand_normal(&rng) * time_rms;
+		unsigned int time_int = float_to_sortable_int(time);
+			  
+		atomicMin(earliest_time_int + solid_id, time_int);
+		atomicAdd(channel_q + solid_id, 1);
+		atomicOr(channel_histories + solid_id, history);
+	    }
 
 	}
 
+	s[id] = rng;
+	
+    }
+    
 }
 
-__global__ void convert_sortable_int_to_float(int n, 
-					      unsigned int *sortable_ints, 
-					      float *float_output)
+__global__ void
+convert_sortable_int_to_float(int n, unsigned int *sortable_ints,
+			      float *float_output)
 {
-	int id = threadIdx.x + blockDim.x * blockIdx.x;
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
 	
-	if (id < n)
-		float_output[id] = sortable_int_to_float(sortable_ints[id]);
+    if (id < n)
+	float_output[id] = sortable_int_to_float(sortable_ints[id]);
 }
 
-__global__ void bin_hits(int nchannels,
-			 unsigned int *channel_q, float *channel_time,
-			 unsigned int *hitcount, 
-			 int tbins, float tmin, float tmax,
-			 int qbins, float qmin, float qmax,
-			 unsigned int *pdf)
+__global__ void
+bin_hits(int nchannels, unsigned int *channel_q, float *channel_time,
+	 unsigned int *hitcount, int tbins, float tmin, float tmax, int qbins,
+	 float qmin, float qmax, unsigned int *pdf)
 {
-	int id = threadIdx.x + blockDim.x * blockIdx.x;
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
 
-	if (id >= nchannels)
-	  return;
+    if (id >= nchannels)
+	return;
 
-	unsigned int q = channel_q[id];
-	float t = channel_time[id];
+    unsigned int q = channel_q[id];
+    float t = channel_time[id];
 	
-	if (t < 1e8 && t >= tmin && t < tmax && q >= qmin && q < qmax) {
-		hitcount[id] += 1;
+    if (t < 1e8 && t >= tmin && t < tmax && q >= qmin && q < qmax) {
+	hitcount[id] += 1;
 		
-		int tbin = (t - tmin) / (tmax - tmin) * tbins;
-		int qbin = (q - qmin) / (qmax - qmin) * qbins;
+	int tbin = (t - tmin) / (tmax - tmin) * tbins;
+	int qbin = (q - qmin) / (qmax - qmin) * qbins;
 	    
-		// row major order (channel, t, q)
-		int bin = id * (tbins * qbins) + tbin * qbins + qbin;
-		pdf[bin] += 1;
-	}
+	// row major order (channel, t, q)
+	int bin = id * (tbins * qbins) + tbin * qbins + qbin;
+	pdf[bin] += 1;
+    }
 }
 	
-__global__ void accumulate_pdf_eval(int time_only,
-				    int nchannels,
-				    unsigned int *event_hit,
-				    float *event_time,
-				    float *event_charge,
-				    float *mc_time,
-				    unsigned int *mc_charge, // quirk of DAQ!
-				    unsigned int *hitcount, 
-				    unsigned int *bincount,
-				    float min_twidth, float tmin, float tmax,
-				    float min_qwidth, float qmin, float qmax,
-				    float *nearest_mc,
-				    int min_bin_content)
+__global__ void
+accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
+		    float *event_time, float *event_charge, float *mc_time,
+		    unsigned int *mc_charge, // quirk of DAQ!
+		    unsigned int *hitcount, unsigned int *bincount,
+		    float min_twidth, float tmin, float tmax,
+		    float min_qwidth, float qmin, float qmax,
+		    float *nearest_mc, int min_bin_content)
 {
-	int id = threadIdx.x + blockDim.x * blockIdx.x;
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
 	
-	if (id >= nchannels)
-		return;
+    if (id >= nchannels)
+	return;
 	
-	// Was this channel hit in the Monte Carlo?
-	float channel_mc_time = mc_time[id];
-	if (channel_mc_time >= 1e8)
-		return; // Nothing else to do
+    // Was this channel hit in the Monte Carlo?
+    float channel_mc_time = mc_time[id];
+    if (channel_mc_time >= 1e8)
+	return; // Nothing else to do
 	
-	// Is this channel inside the range of the PDF?
-	float distance;
-	int channel_bincount = bincount[id];
-	if (time_only) {
-		if (channel_mc_time < tmin || channel_mc_time > tmax)
-			return;  // Nothing else to do
+    // Is this channel inside the range of the PDF?
+    float distance;
+    int channel_bincount = bincount[id];
+    if (time_only) {
+	if (channel_mc_time < tmin || channel_mc_time > tmax)
+	    return;  // Nothing else to do
 		
-		// This MC information is contained in the PDF
-		hitcount[id] += 1;
+	// This MC information is contained in the PDF
+	hitcount[id] += 1;
 	  
-		// Was this channel hit in the event-of-interest?
-		int channel_event_hit = event_hit[id];
-		if (!channel_event_hit)
-			return; // No need to update PDF value for unhit channel
+	// Was this channel hit in the event-of-interest?
+	int channel_event_hit = event_hit[id];
+	if (!channel_event_hit)
+	    return; // No need to update PDF value for unhit channel
 		
-		// Are we inside the minimum size bin?
-		float channel_event_time = event_time[id];
-		distance = fabsf(channel_mc_time - channel_event_time);
-		if (distance < min_twidth/2.0)
-		  bincount[id] = channel_bincount + 1;
-
-	} else { // time and charge PDF
-		float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+	// Are we inside the minimum size bin?
+	float channel_event_time = event_time[id];
+	distance = fabsf(channel_mc_time - channel_event_time);
+	if (distance < min_twidth/2.0)
+	    bincount[id] = channel_bincount + 1;
+
+    }
+    else { // time and charge PDF
+	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
 	
-		if (channel_mc_time < tmin || channel_mc_time > tmax ||
-		    channel_mc_charge < qmin || channel_mc_charge > qmax)
-			return;  // Nothing else to do
+	if (channel_mc_time < tmin || channel_mc_time > tmax ||
+	    channel_mc_charge < qmin || channel_mc_charge > qmax)
+	    return;  // Nothing else to do
 		
-		// This MC information is contained in the PDF
-		hitcount[id] += 1;
+	// This MC information is contained in the PDF
+	hitcount[id] += 1;
 	  
-		// Was this channel hit in the event-of-interest?
-		int channel_event_hit = event_hit[id];
-		if (!channel_event_hit)
-			return; // No need to update PDF value for unhit channel
+	// Was this channel hit in the event-of-interest?
+	int channel_event_hit = event_hit[id];
+	if (!channel_event_hit)
+	    return; // No need to update PDF value for unhit channel
 		
-		// Are we inside the minimum size bin?
-		float channel_event_time = event_time[id];
-		float channel_event_charge = event_charge[id];
-		float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
-		float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
-		distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
-
-		if (distance < 1.0f)
-		  bincount[id] = channel_bincount + 1;
-	}
-
-	// Do we need to keep updating the nearest_mc list?
-	if (channel_bincount + 1 >= min_bin_content)
-		return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
-
-	// insertion sort the distance into the array of nearest MC points
-	int offset = min_bin_content * id;
-	int entry = min_bin_content - 1;
+	// Are we inside the minimum size bin?
+	float channel_event_time = event_time[id];
+	float channel_event_charge = event_charge[id];
+	float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
+	float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
+	distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
+
+	if (distance < 1.0f)
+	    bincount[id] = channel_bincount + 1;
+    }
+
+    // Do we need to keep updating the nearest_mc list?
+    if (channel_bincount + 1 >= min_bin_content)
+	return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
+
+    // insertion sort the distance into the array of nearest MC points
+    int offset = min_bin_content * id;
+    int entry = min_bin_content - 1;
 	
-	// If last entry less than new entry, nothing to update
-	if (distance > nearest_mc[offset + entry])
-		return;
-
-	// Find where to insert the new entry while sliding the rest
-	// to the right
+    // If last entry less than new entry, nothing to update
+    if (distance > nearest_mc[offset + entry])
+	return;
+
+    // Find where to insert the new entry while sliding the rest
+    // to the right
+    entry--;
+    while (entry >= 0) {
+	if (nearest_mc[offset+entry] >= distance)
+	    nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
+	else
+	    break;
 	entry--;
-	while (entry >= 0) {
-		if (nearest_mc[offset+entry] >= distance)
-			nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
-		else
-			break;
-		entry--;
-	}
+    }
 
-	nearest_mc[offset+entry+1] = distance;
+    nearest_mc[offset+entry+1] = distance;
 }
 
-
 } // extern "C"
diff --git a/src/geometry.h b/src/geometry.h
new file mode 100644
index 0000000..2b5eacb
--- /dev/null
+++ b/src/geometry.h
@@ -0,0 +1,74 @@
+#ifndef __GEOMETRY_H__
+#define __GEOMETRY_H__
+
+struct Material
+{
+    float *refractive_index;
+    float *absorption_length;
+    float *scattering_length;
+    unsigned int n;
+    float step;
+    float wavelength0;
+};
+
+struct Surface
+{
+    float *detect;
+    float *absorb;
+    float *reflect_diffuse;
+    float *reflect_specular;
+    unsigned int n;
+    float step;
+    float wavelength0;
+};
+
+struct Triangle
+{
+    float3 v0, v1, v2;
+};
+
+struct Geometry
+{
+    float3 *vertices;
+    uint3 *triangles;
+    unsigned int *material_codes;
+    unsigned int *colors;
+    float3 *lower_bounds;
+    float3 *upper_bounds;
+    unsigned int *node_map;
+    unsigned int *node_map_end;
+    Material **materials;
+    Surface **surfaces;
+    unsigned int start_node;
+    unsigned int first_node;
+};
+
+__device__ Triangle
+get_triangle(Geometry *geometry, const unsigned int &i)
+{
+    uint3 triangle_data = geometry->triangles[i];
+
+    Triangle triangle;
+    triangle.v0 = geometry->vertices[triangle_data.x];
+    triangle.v1 = geometry->vertices[triangle_data.y];
+    triangle.v2 = geometry->vertices[triangle_data.z];
+
+    return triangle;
+}
+
+template <class T>
+__device__ float
+interp_property(T *m, const float &x, const float *fp)
+{
+    if (x < m->wavelength0)
+	return fp[0];
+
+    if (x > (m->wavelength0 + (m->n-1)*m->step))
+	return fp[m->n-1];
+
+    int jl = (x-m->wavelength0)/m->step;
+
+    return fp[jl] + (x-(m->wavelength0 + jl*m->step))*(fp[jl+1]-fp[jl])/m->step;
+}
+
+#endif
diff --git a/src/hybrid_render.cu b/src/hybrid_render.cu
new file mode 100644
index 0000000..29edefa
--- /dev/null
+++ b/src/hybrid_render.cu
@@ -0,0 +1,202 @@
+//-*-c-*-
+#include <math_constants.h>
+#include <curand_kernel.h>
+
+#include "linalg.h"
+#include "matrix.h"
+#include "rotate.h"
+#include "mesh.h"
+#include "geometry.h"
+#include "photon.h"
+
+__device__ void
+fAtomicAdd(float *addr, float data)
+{
+    while (data)
+	data = atomicExch(addr, data+atomicExch(addr, 0.0f));
+}
+
+__device__ void
+to_diffuse(Photon &p, State &s, Geometry *g, curandState &rng, int max_steps)
+{
+    int steps = 0;
+    while (steps < max_steps) {
+	steps++;
+
+	int command;
+
+	fill_state(s, p, g);
+
+	if (p.last_hit_triangle == -1)
+	    break;
+
+	command = propagate_to_boundary(p, s, rng);
+
+	if (command == BREAK)
+	    break;
+
+	if (command == CONTINUE)
+	    continue;
+
+	if (s.surface_index != -1) {
+	    command = propagate_at_surface(p, s, rng, g);
+
+	    if (p.history & REFLECT_DIFFUSE)
+		break;
+
+	    if (command == BREAK)
+		break;
+
+	    if (command == CONTINUE)
+		continue;
+	}
+
+	propagate_at_boundary(p, s, rng);
+
+    } // while (steps < max_steps)
+
+} // to_diffuse
+
+extern "C"
+{
+
+__global__ void
+update_xyz_lookup(int nthreads, int total_threads, int offset, float3 position,
+		  curandState *rng_states, float wavelength, float3 xyz,
+		  float3 *xyz_lookup1, float3 *xyz_lookup2, int max_steps,
+		  Geometry *g)
+{
+    int kernel_id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = kernel_id + offset;
+
+    if (kernel_id >= nthreads || id >= total_threads)
+	return;
+
+    curandState rng = rng_states[kernel_id];
+
+    Triangle t = get_triangle(g, id);
+
+    float a = curand_uniform(&rng);
+    float b = uniform(&rng, 0.0f, (1.0f - a));
+    float c = 1.0f - a - b;
+
+    float3 direction = a*t.v0 + b*t.v1 + c*t.v2 - position;
+    direction /= norm(direction);
+
+    float distance;
+    int triangle_index = intersect_mesh(position, direction, g, distance);
+
+    if (triangle_index != id) {
+	rng_states[kernel_id] = rng;
+	return;
+    }
+
+    float3 v01 = t.v1 - t.v0;
+    float3 v12 = t.v2 - t.v1;
+    
+    float3 surface_normal = normalize(cross(v01,v12));
+
+    float cos_theta = dot(surface_normal,-direction);
+
+    if (cos_theta < 0.0f)
+	cos_theta = dot(-surface_normal,-direction);
+
+    Photon p;
+    p.position = position;
+    p.direction = direction;
+    p.wavelength = wavelength;
+    p.polarization = uniform_sphere(&rng);
+    p.last_hit_triangle = -1;
+    p.time = 0;
+    p.history = 0;
+
+    State s;
+    to_diffuse(p, s, g, rng, max_steps);
+
+    if (p.history & REFLECT_DIFFUSE) {
+	if (s.inside_to_outside) {
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].x, cos_theta*xyz.x);
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].y, cos_theta*xyz.y);
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].z, cos_theta*xyz.z);
+	}
+	else {
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].x, cos_theta*xyz.x);
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].y, cos_theta*xyz.y);
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].z, cos_theta*xyz.z);
+	}
+    }
+
+    rng_states[kernel_id] = rng;
+
+} // update_xyz_lookup
+
+__global__ void
+update_xyz_image(int nthreads, curandState *rng_states, float3 *positions,
+		 float3 *directions, float wavelength, float3 xyz,
+		 float3 *xyz_lookup1, float3 *xyz_lookup2, float3 *image,
+		 int nlookup_calls, int max_steps, Geometry *g)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    curandState rng = rng_states[id];
+
+    Photon p;
+    p.position = positions[id];
+    p.direction = directions[id];
+    p.direction /= norm(p.direction);
+    p.wavelength = wavelength;
+    p.polarization = uniform_sphere(&rng);
+    p.last_hit_triangle = -1;
+    p.time = 0;
+    p.history = 0;
+
+    State s;
+    to_diffuse(p, s, g, rng, max_steps);
+
+    if (p.history & REFLECT_DIFFUSE) {
+	if (s.inside_to_outside)
+	    image[id] += xyz*xyz_lookup1[p.last_hit_triangle]/nlookup_calls;
+	else
+	    image[id] += xyz*xyz_lookup2[p.last_hit_triangle]/nlookup_calls;
+    }
+
+    rng_states[id] = rng;
+
+} // update_xyz_image
+
+__global__ void
+process_image(int nthreads, float3 *image, unsigned int *pixels, int nimages)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    float3 rgb = image[id]/nimages;
+
+    if (rgb.x < 0.0f)
+	rgb.x = 0.0f;
+    if (rgb.y < 0.0f)
+	rgb.y = 0.0f;
+    if (rgb.z < 0.0f)
+	rgb.z = 0.0f;
+
+    if (rgb.x > 1.0f)
+	rgb.x = 1.0f;
+    if (rgb.y > 1.0f)
+	rgb.y = 1.0f;
+    if (rgb.z > 1.0f)
+	rgb.z = 1.0f;
+
+    unsigned int r = floorf(rgb.x*255.0f);
+    unsigned int g = floorf(rgb.y*255.0f);
+    unsigned int b = floorf(rgb.z*255.0f);
+
+    pixels[id] = 255 << 24 | r << 16 | g << 8 | b;
+
+} // process_image
+
+} // extern "c"
diff --git a/src/intersect.h b/src/intersect.h
index 26e1d7e..978bde8 100644
--- a/src/intersect.h
+++ b/src/intersect.h
@@ -3,30 +3,32 @@
 #ifndef __INTERSECT_H__
 #define __INTERSECT_H__
 
-#include <math_constants.h>
-
 #include "linalg.h"
 #include "matrix.h"
-#include "rotate.h"
+#include "geometry.h"
 
 #define EPSILON 0.0f
 
-/* Test the intersection between a ray starting from `origin` traveling in the
-   direction `direction` and a triangle defined by the vertices `v0`, `v1`, and
-   `v2`. If the ray intersects the triangle, set `distance` to the distance
-   between `origin` and the intersection and return true, else return false.
-
-   `direction` must be normalized. */
-__device__ bool intersect_triangle(const float3 &origin, const float3 &direction, const float3 &v0, const float3 &v1, const float3 &v2, float &distance)
+/* Tests the intersection between a ray and a triangle.
+   If the ray intersects the triangle, set `distance` to the distance from
+   `origin` to the intersection and return true, else return false.
+   `direction` must be normalized to one. */
+__device__ bool
+intersect_triangle(const float3 &origin, const float3 &direction,
+		   const Triangle &triangle, float &distance)
 {
-	Matrix m = make_matrix(v1-v0, v2-v0, -direction);
+	float3 m1 = triangle.v1-triangle.v0;
+	float3 m2 = triangle.v2-triangle.v0;
+	float3 m3 = -direction;
+
+	Matrix m = make_matrix(m1, m2, m3);
 	
 	float determinant = det(m);
 
 	if (determinant == 0.0f)
 		return false;
 
-	float3 b = origin-v0;
+	float3 b = origin-triangle.v0;
 
 	float u1 = ((m.a11*m.a22 - m.a12*m.a21)*b.x +
 		    (m.a02*m.a21 - m.a01*m.a22)*b.y +
@@ -54,38 +56,17 @@ __device__ bool intersect_triangle(const float3 &origin, const float3 &direction
 	return true;
 }
 
-/* Return the 32 bit color associated with the intersection between a ray
-   starting from `origin` traveling in the direction `direction` and the
-   plane defined by the points `v0`, `v1`, and `v2` using the cosine of the
-   angle between the ray and the plane normal to determine the brightness.
-
-   `direction` must be normalized. */
-__device__ unsigned int get_color(const float3 &direction, const float3 &v0, const float3& v1, const float3 &v2, unsigned int base_color=0xFFFFFF)
-{
-	float scale = dot(normalize(cross(v1-v0,v2-v1)),-direction);
-
-	if (scale < 0.0f)
-	{
-		base_color = 0xff0000;
-		scale = dot(-normalize(cross(v1-v0,v2-v1)),-direction);
-	}
-
-	unsigned int r = 0xFF & (base_color >> 16);
-	unsigned int g = 0xFF & (base_color >> 8);
-	unsigned int b = 0xFF & base_color;
-
-	r = floorf(r*scale);
-	g = floorf(g*scale);
-	b = floorf(b*scale);
-
-	return r << 16 | g << 8 | b;
-}
-
-/* Test the intersection between a ray starting from `origin` traveling in the
-   direction `direction` and the axis-aligned box defined by the opposite
-   vertices `lower_bound` and `upper_bound`. If the ray intersects the box
-   return True, else return False. */
-__device__ bool intersect_box(const float3 &origin, const float3 &direction, const float3 &lower_bound, const float3 &upper_bound, float& distance_to_box)
+/* Tests the intersection between a ray and an axis-aligned box defined by
+   an upper and lower bound. If the ray intersects the box, set
+   `distance_to_box` to the distance from `origin` to the intersection and
+   return true, else return false. `direction` must be normalized to one.
+
+   Source: "An Efficient and Robust Ray-Box Intersection Algorithm."
+   by Williams, et. al. */
+__device__ bool
+intersect_box(const float3 &origin, const float3 &direction,
+	      const float3 &lower_bound, const float3 &upper_bound,
+	      float& distance_to_box)
 {
 	float kmin, kmax, kymin, kymax, kzmin, kzmax;
 
diff --git a/src/kernel.cu b/src/kernel.cu
deleted file mode 100644
index 4418305..0000000
--- a/src/kernel.cu
+++ /dev/null
@@ -1,389 +0,0 @@
-//-*-c-*-
-#include <math_constants.h>
-#include <curand_kernel.h>
-
-#include "linalg.h"
-#include "matrix.h"
-#include "rotate.h"
-#include "mesh.h"
-#include "photon.h"
-#include "alpha.h"
-
-__device__ void fAtomicAdd(float *addr, float data)
-{
-	while (data)
-	{
-		data = atomicExch(addr, data+atomicExch(addr, 0.0f));
-	}
-}
-
-__device__ void to_diffuse(Photon &p, State &s, curandState &rng, const int &max_steps)
-{
-	int steps = 0;
-	while (steps < max_steps)
-	{
-		steps++;
-
-		int command;
-
-		fill_state(s, p);
-
-		if (p.last_hit_triangle == -1)
-			break;
-
-		command = propagate_to_boundary(p, s, rng);
-
-		if (command == BREAK)
-			break;
-
-		if (command == CONTINUE)
-			continue;
-
-		if (s.surface_index != -1)
-		{
-			command = propagate_at_surface(p, s, rng);
-
-			if (p.history & REFLECT_DIFFUSE)
-				break;
-
-			if (command == BREAK)
-				break;
-
-			if (command == CONTINUE)
-				continue;
-		}
-
-		propagate_at_boundary(p, s, rng);
-
-	} // while (steps < max_steps)
-
-} // to_diffuse
-
-extern "C"
-{
-
-__global__ void update_xyz_lookup(int nthreads, int total_threads, int offset, float3 position, curandState *rng_states, float wavelength, float3 xyz, float3 *xyz_lookup1, float3 *xyz_lookup2, int max_steps)
-{
-	int kernel_id = blockIdx.x*blockDim.x + threadIdx.x;
-	int id = kernel_id + offset;
-
-	if (kernel_id >= nthreads || id >= total_threads)
-		return;
-
-	curandState rng = rng_states[kernel_id];
-
-	uint4 triangle_data = g_triangles[id];
-
-	float3 v0 = g_vertices[triangle_data.x];
-	float3 v1 = g_vertices[triangle_data.y];
-	float3 v2 = g_vertices[triangle_data.z];
-
-	float a = curand_uniform(&rng);
-	float b = uniform(&rng, 0.0f, (1.0f - a));
-	float c = 1.0f - a - b;
-
-	float3 direction = a*v0 + b*v1 + c*v2 - position;
-	direction /= norm(direction);
-
-	float distance;
-	int triangle_index = intersect_mesh(position, direction, distance);
-
-	if (triangle_index != id)
-	{
-		rng_states[kernel_id] = rng;
-		return;
-	}
-
-	float cos_theta = dot(normalize(cross(v1-v0,v2-v1)),-direction);
-
-	if (cos_theta < 0.0f)
-		cos_theta = dot(-normalize(cross(v1-v0,v2-v1)),-direction);
-
-	Photon p;
-	p.position = position;
-	p.direction = direction;
-	p.wavelength = wavelength;
-	p.polarization = uniform_sphere(&rng);
-	p.last_hit_triangle = -1;
-	p.time = 0;
-	p.history = 0;
-
-	State s;
-	to_diffuse(p, s, rng, max_steps);
-
-	if (p.history & REFLECT_DIFFUSE)
-	{
-		if (s.inside_to_outside)
-		{
-			fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].x, cos_theta*xyz.x);
-			fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].y, cos_theta*xyz.y);
-			fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].z, cos_theta*xyz.z);
-		}
-		else
-		{
-			fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].x, cos_theta*xyz.x);
-			fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].y, cos_theta*xyz.y);
-			fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].z, cos_theta*xyz.z);
-		}
-	}
-
-	rng_states[kernel_id] = rng;
-
-} // update_xyz_lookup
-
-__global__ void update_xyz_image(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float wavelength, float3 xyz, float3 *xyz_lookup1, float3 *xyz_lookup2, float3 *image, int nlookup_calls, int max_steps)
-{
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if (id >= nthreads)
-		return;
-
-	curandState rng = rng_states[id];
-
-	Photon p;
-	p.position = positions[id];
-	p.direction = directions[id];
-	p.direction /= norm(p.direction);
-	p.wavelength = wavelength;
-	p.polarization = uniform_sphere(&rng);
-	p.last_hit_triangle = -1;
-	p.time = 0;
-	p.history = 0;
-
-	State s;
-	to_diffuse(p, s, rng, max_steps);
-
-	if (p.history & REFLECT_DIFFUSE)
-	{
-		if (s.inside_to_outside)
-		{
-			image[id].x += xyz.x*xyz_lookup1[p.last_hit_triangle].x/nlookup_calls;
-			image[id].y += xyz.y*xyz_lookup1[p.last_hit_triangle].y/nlookup_calls;
-			image[id].z += xyz.z*xyz_lookup1[p.last_hit_triangle].z/nlookup_calls;
-		}
-		else
-		{
-			image[id].x += xyz.x*xyz_lookup2[p.last_hit_triangle].x/nlookup_calls;
-			image[id].y += xyz.y*xyz_lookup2[p.last_hit_triangle].y/nlookup_calls;
-			image[id].z += xyz.z*xyz_lookup2[p.last_hit_triangle].z/nlookup_calls;
-		}
-	}
-
-	rng_states[id] = rng;
-
-} // update_xyz_image
-
-__global__ void process_image(int nthreads, float3 *image, int *pixels, int nimages)
-{
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if (id >= nthreads)
-		return;
-
-	float3 rgb = image[id]/nimages;
-
-	if (rgb.x < 0.0f)
-		rgb.x = 0.0f;
-	if (rgb.y < 0.0f)
-		rgb.y = 0.0f;
-	if (rgb.z < 0.0f)
-		rgb.z = 0.0f;
-
-	if (rgb.x > 1.0f)
-		rgb.x = 1.0f;
-	if (rgb.y > 1.0f)
-		rgb.y = 1.0f;
-	if (rgb.z > 1.0f)
-		rgb.z = 1.0f;
-
-	unsigned int r = floorf(rgb.x*255.0f);
-	unsigned int g = floorf(rgb.y*255.0f);
-	unsigned int b = floorf(rgb.z*255.0f);
-
-	pixels[id] = r << 16 | g << 8 | b;
-
-} // process_image
-
-__global__ void distance_to_mesh(int nthreads, float3 *positions, float3 *directions, float *distances)
-{
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if (id >= nthreads)
-		return;
-
-	float3 position = positions[id];
-	float3 direction = directions[id];
-	direction /= norm(direction);
-
-	float distance;
-
-	int triangle_index = intersect_mesh(position, direction, distance);
-
-	if (triangle_index == -1)
-		distances[id] = 1e9;
-	else
-		distances[id] = distance;
-
-} // distance_to_mesh
-
-__global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, int *pixels)
-{
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if (id >= nthreads)
-		return;
-
-	float3 position = positions[id];
-	float3 direction = directions[id];
-	direction /= norm(direction);
-
-	float distance;
-
-	int triangle_index = intersect_mesh(position, direction, distance);
-
-	if (triangle_index == -1)
-	{
-		pixels[id] = 0;
-	}
-	else
-	{
-		uint4 triangle_data = g_triangles[triangle_index];
-
-		float3 v0 = g_vertices[triangle_data.x];
-		float3 v1 = g_vertices[triangle_data.y];
-		float3 v2 = g_vertices[triangle_data.z];
-
-		pixels[id] = get_color(direction, v0, v1, v2, g_colors[triangle_index]);
-	}
-
-} // ray_trace
-
-/* Trace the rays starting at `positions` traveling in the direction
-   `directions` to their intersection with the global mesh. If the ray
-   intersects the mesh set the pixel associated with the ray to a 32 bit
-   color whose brightness is determined by the cosine of the angle between
-   the ray and the normal of the triangle it intersected, else set the pixel
-   to 0. */
-__global__ void ray_trace_alpha(int nthreads, float3 *positions, float3 *directions, int *pixels)
-{
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if (id >= nthreads)
-		return;
-
-	float3 position = positions[id];
-	float3 direction = directions[id];
-	direction /= norm(direction);
-
-	bool hit;
-	float distance;
-
-	pixels[id] = get_color_alpha(position, direction);
-
-} // ray_trace
-
-__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b)
-{
-  int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-  if (id < nswap) {
-    int a = offset_a[id];
-    int b = offset_b[id];
-
-    int tmp = values[a];
-    values[a] = values[b];
-    values[b] = tmp;
-  }
-}
-
-__global__ void propagate(int first_photon, int nthreads, 
-			  unsigned int *input_queue, 
-			  unsigned int *output_queue,
-			  curandState *rng_states, 
-			  float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, 
-			  unsigned int *histories, int *last_hit_triangles, int max_steps)
-{
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if (id >= nthreads)
-		return;
-
-	curandState rng = rng_states[id];
-
-	int photon_id = input_queue[first_photon + id];
-
-	Photon p;
-	p.position = positions[photon_id];
-	p.direction = directions[photon_id];
-	p.direction /= norm(p.direction);
-	p.polarization = polarizations[photon_id];
-	p.polarization /= norm(p.polarization);
-	p.wavelength = wavelengths[photon_id];
-	p.time = times[photon_id];
-	p.last_hit_triangle = last_hit_triangles[photon_id];
-	p.history = histories[photon_id];
-
-	if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB))
-		return;
-
-	State s;
-
-	int steps = 0;
-	while (steps < max_steps)
-	{
-		steps++;
-
-		int command;
-
-		// check for NaN and fail
-		if (isnan(p.direction.x*p.direction.y*p.direction.z*p.position.x*p.position.y*p.position.z)) 
-		{
-			p.history |= NO_HIT | NAN_ABORT;
-			break;
-		}
-
-		fill_state(s, p);
-
-		if (p.last_hit_triangle == -1)
-			break;
-
-		command = propagate_to_boundary(p, s, rng);
-
-		if (command == BREAK)
-			break;
-
-		if (command == CONTINUE)
-			continue;
-
-		if (s.surface_index != -1)
-		{
-			command = propagate_at_surface(p, s, rng);
-
-			if (command == BREAK)
-				break;
-
-			if (command == CONTINUE)
-				continue;
-		}
-
-		propagate_at_boundary(p, s, rng);
-
-	} // while (steps < max_steps)
-
-	rng_states[id] = rng;
-	positions[photon_id] = p.position;
-	directions[photon_id] = p.direction;
-	polarizations[photon_id] = p.polarization;
-	wavelengths[photon_id] = p.wavelength;
-	times[photon_id] = p.time;
-	histories[photon_id] = p.history;
-	last_hit_triangles[photon_id] = p.last_hit_triangle;
-	
-	// Not done, put photon in output queue
-	if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
-	  int out_idx = atomicAdd(output_queue, 1);
-	  output_queue[out_idx] = photon_id;
-	}
-} // propagate
-
-} // extern "c"
diff --git a/src/linalg.h b/src/linalg.h
index fe6b1b2..35b2423 100644
--- a/src/linalg.h
+++ b/src/linalg.h
@@ -1,121 +1,170 @@
 #ifndef __LINALG_H__
 #define __LINALG_H__
 
-__device__ __host__ float3 operator- (const float3 &a)
+__device__ float3
+operator- (const float3 &a)
 {
-	return make_float3(-a.x, -a.y, -a.z);
+    return make_float3(-a.x, -a.y, -a.z);
 }
 
-__device__ __host__ float3 operator+ (const float3 &a, const float3 &b)
+__device__ float3
+operator* (const float3 &a, const float3 &b)
 {
-	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
 }
 
-__device__ __host__ void operator+= (float3 &a, const float3 &b)
+__device__ float3
+operator/ (const float3 &a, const float3 &b)
 {
-	a.x += b.x;
-	a.y += b.y;
-	a.z += b.z;
+    return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
 }
 
-__device__ __host__ float3 operator- (const float3 &a, const float3 &b)
+__device__ void
+operator*= (float3 &a, const float3 &b)
 {
-	return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
 }
 
-__device__ __host__ void operator-= (float3 &a, const float3 &b)
+__device__ void
+operator/= (float3 &a, const float3 &b)
 {
-	a.x -= b.x;
-	a.y -= b.y;
-	a.z -= b.z;
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
 }
 
-__device__ __host__ float3 operator+ (const float3 &a, const float &c)
+__device__ float3
+operator+ (const float3 &a, const float3 &b)
 {
-	return make_float3(a.x+c, a.y+c, a.z+c);
+    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
 }
 
-__device__ __host__ void operator+= (float3 &a, const float &c)
+__device__ void
+operator+= (float3 &a, const float3 &b)
 {
-	a.x += c;
-	a.y += c;
-	a.z += c;
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
 }
 
-__device__ __host__ float3 operator+ (const float &c, const float3 &a)
+__device__ float3
+operator- (const float3 &a, const float3 &b)
 {
-	return make_float3(c+a.x, c+a.y, c+a.z);
+    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
 }
 
-__device__ __host__ float3 operator- (const float3 &a, const float &c)
+__device__ void
+operator-= (float3 &a, const float3 &b)
 {
-	return make_float3(a.x-c, a.y-c, a.z-c);
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
 }
 
-__device__ __host__ void operator-= (float3 &a, const float &c)
+__device__ float3
+operator+ (const float3 &a, const float &c)
 {
-	a.x -= c;
-	a.y -= c;
-	a.z -= c;
+    return make_float3(a.x+c, a.y+c, a.z+c);
 }
 
-__device__ __host__ float3 operator- (const float &c, const float3& a)
+__device__ void
+operator+= (float3 &a, const float &c)
 {
-	return make_float3(c-a.x, c-a.y, c-a.z);
+    a.x += c;
+    a.y += c;
+    a.z += c;
 }
 
-__device__ __host__ float3 operator* (const float3 &a, const float &c)
+__device__ float3
+operator+ (const float &c, const float3 &a)
 {
-	return make_float3(a.x*c, a.y*c, a.z*c);
+    return make_float3(c+a.x, c+a.y, c+a.z);
 }
 
-__device__ __host__ void operator*= (float3 &a, const float &c)
+__device__ float3
+operator- (const float3 &a, const float &c)
 {
-	a.x *= c;
-	a.y *= c;
-	a.z *= c;
+    return make_float3(a.x-c, a.y-c, a.z-c);
 }
 
-__device__ __host__ float3 operator* (const float &c, const float3& a)
+__device__ void
+operator-= (float3 &a, const float &c)
 {
-	return make_float3(c*a.x, c*a.y, c*a.z);
+    a.x -= c;
+    a.y -= c;
+    a.z -= c;
 }
 
-__device__ __host__ float3 operator/ (const float3 &a, const float &c)
+__device__ float3
+operator- (const float &c, const float3& a)
 {
-	return make_float3(a.x/c, a.y/c, a.z/c);
+    return make_float3(c-a.x, c-a.y, c-a.z);
 }
 
-__device__ __host__ void operator/= (float3 &a, const float &c)
+__device__ float3
+operator* (const float3 &a, const float &c)
 {
-	a.x /= c;
-	a.y /= c;
-	a.z /= c;
+    return make_float3(a.x*c, a.y*c, a.z*c);
 }
 
-__device__ __host__ float3 operator/ (const float &c, const float3 &a)
+__device__ void
+operator*= (float3 &a, const float &c)
 {
-	return make_float3(c/a.x, c/a.y, c/a.z);
+    a.x *= c;
+    a.y *= c;
+    a.z *= c;
 }
 
-__device__ __host__ float dot(const float3 &a, const float3 &b)
+__device__ float3 
+operator* (const float &c, const float3& a)
 {
-	return a.x*b.x + a.y*b.y + a.z*b.z;
+    return make_float3(c*a.x, c*a.y, c*a.z);
 }
 
-__device__ __host__ float3 cross(const float3 &a, const float3 &b)
+__device__ float3
+operator/ (const float3 &a, const float &c)
 {
-	return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+    return make_float3(a.x/c, a.y/c, a.z/c);
 }
 
-__device__ __host__ float norm(const float3 &a)
+__device__ void
+operator/= (float3 &a, const float &c)
 {
-	return sqrtf(dot(a,a));
+    a.x /= c;
+    a.y /= c;
+    a.z /= c;
 }
 
-__device__ __host__ float3 normalize(const float3 &a)
+__device__ float3
+operator/ (const float &c, const float3 &a)
 {
-	return a/norm(a);
+    return make_float3(c/a.x, c/a.y, c/a.z);
+}
+
+__device__ float
+dot(const float3 &a, const float3 &b)
+{
+    return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+__device__ float3
+cross(const float3 &a, const float3 &b)
+{
+    return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+__device__ float
+norm(const float3 &a)
+{
+    return sqrtf(dot(a,a));
+}
+
+__device__ float3
+normalize(const float3 &a)
+{
+    return a/norm(a);
 }
 
 #endif
diff --git a/src/materials.h b/src/materials.h
deleted file mode 100644
index 2355d24..0000000
--- a/src/materials.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef __MATERIALS_H__
-#define __MATERIALS_H__
-
-__device__ float min_wavelength;
-__device__ float max_wavelength;
-__device__ float wavelength_step;
-__device__ unsigned int wavelength_size;
-
-struct Material
-{
-	float *refractive_index;
-	float *absorption_length;
-	float *scattering_length;
-};
-
-struct Surface
-{
-	float *detect;
-	float *absorb;
-	float *reflect_diffuse;
-	float *reflect_specular;
-};
-
-__device__ Material materials[20];
-__device__ Surface surfaces[20];
-
-__device__ float interp_property(const float &x, const float *fp)
-{
-	if (x < min_wavelength)
-		return fp[0];
-
-	if (x > max_wavelength)
-		return fp[wavelength_size-1];
-
-	int jl = (x-min_wavelength)/wavelength_step;
-
-	return fp[jl] + (x-(min_wavelength + jl*wavelength_step))*(fp[jl+1]-fp[jl])/wavelength_step;
-}
-
-extern "C"
-{
-
-__global__ void set_wavelength_range(float _min_wavelength, float _max_wavelength, float _wavelength_step, unsigned int _wavelength_size)
-{
-	min_wavelength = _min_wavelength;
-	max_wavelength = _max_wavelength;
-	wavelength_step = _wavelength_step;
-	wavelength_size = _wavelength_size;
-}
-
-__global__ void set_material(int material_index, float *refractive_index, float *absorption_length, float *scattering_length)
-{
-	materials[material_index].refractive_index = refractive_index;
-	materials[material_index].absorption_length = absorption_length;
-	materials[material_index].scattering_length = scattering_length;
-}
-
-__global__ void set_surface(int surface_index, float *detect, float *absorb, float *reflect_diffuse, float *reflect_specular)
-{
-	surfaces[surface_index].detect = detect;
-	surfaces[surface_index].absorb = absorb;
-	surfaces[surface_index].reflect_diffuse = reflect_diffuse;
-	surfaces[surface_index].reflect_specular = reflect_specular;
-}
-
-} // extern "c"
-
-#endif
diff --git a/src/matrix.h b/src/matrix.h
index a3e480b..0a66e58 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -3,221 +3,247 @@
 
 struct Matrix
 {
-	float a00, a01, a02, a10, a11, a12, a20, a21, a22;
+    float a00, a01, a02, a10, a11, a12, a20, a21, a22;
 };
 
-__device__ __host__ Matrix make_matrix(float a00, float a01, float a02,
-				       float a10, float a11, float a12,
-				       float a20, float a21, float a22)
+__device__ Matrix
+make_matrix(float a00, float a01, float a02,
+	    float a10, float a11, float a12,
+	    float a20, float a21, float a22)
 {
-	Matrix m = {a00, a01, a02, a10, a11, a12, a20, a21, a22};
-	return m;
+    Matrix m = {a00, a01, a02, a10, a11, a12, a20, a21, a22};
+    return m;
 }
 
-__device__ __host__ Matrix make_matrix(const float3 &u1, const float3 &u2, const float3 &u3)
+__device__ Matrix
+make_matrix(const float3 &u1, const float3 &u2, const float3 &u3)
 {
-	Matrix m = {u1.x, u2.x, u3.x, u1.y, u2.y, u3.y, u1.z, u2.z, u3.z};
-	return m;
+    Matrix m = {u1.x, u2.x, u3.x, u1.y, u2.y, u3.y, u1.z, u2.z, u3.z};
+    return m;
 }
 
-__device__ __host__ Matrix operator- (const Matrix &m)
+__device__ Matrix
+operator- (const Matrix &m)
 {
-	return make_matrix(-m.a00, -m.a01, -m.a02,
-			   -m.a10, -m.a11, -m.a12,
-			   -m.a20, -m.a21, -m.a22);
+    return make_matrix(-m.a00, -m.a01, -m.a02,
+		       -m.a10, -m.a11, -m.a12,
+		       -m.a20, -m.a21, -m.a22);
 }
 
-__device__ __host__ float3 operator* (const Matrix &m, const float3 &a)
+__device__ float3
+operator* (const Matrix &m, const float3 &a)
 {
-	return make_float3(m.a00*a.x + m.a01*a.y + m.a02*a.z,
-			   m.a10*a.x + m.a11*a.y + m.a12*a.z,
-			   m.a20*a.x + m.a21*a.y + m.a22*a.z);
+    return make_float3(m.a00*a.x + m.a01*a.y + m.a02*a.z,
+		       m.a10*a.x + m.a11*a.y + m.a12*a.z,
+		       m.a20*a.x + m.a21*a.y + m.a22*a.z);
 }
 
-__device__ __host__ Matrix operator+ (const Matrix &m, const Matrix &n)
+__device__ Matrix
+operator+ (const Matrix &m, const Matrix &n)
 {
-	return make_matrix(m.a00+n.a00, m.a01+n.a01, m.a02+n.a02,
-			   m.a10+n.a10, m.a11+n.a11, m.a12+n.a12,
-			   m.a20+n.a20, m.a21+n.a21, m.a22+n.a22);
+    return make_matrix(m.a00+n.a00, m.a01+n.a01, m.a02+n.a02,
+		       m.a10+n.a10, m.a11+n.a11, m.a12+n.a12,
+		       m.a20+n.a20, m.a21+n.a21, m.a22+n.a22);
 }
 
-__device__ __host__ void operator+= (Matrix &m, const Matrix &n)
+__device__ void
+operator+= (Matrix &m, const Matrix &n)
 {
-	m.a00 += n.a00;
-	m.a01 += n.a01;
-	m.a02 += n.a02;
-	m.a10 += n.a10;
-	m.a11 += n.a11;
-	m.a12 += n.a12;
-	m.a20 += n.a20;
-	m.a21 += n.a21;
-	m.a22 += n.a22;
+    m.a00 += n.a00;
+    m.a01 += n.a01;
+    m.a02 += n.a02;
+    m.a10 += n.a10;
+    m.a11 += n.a11;
+    m.a12 += n.a12;
+    m.a20 += n.a20;
+    m.a21 += n.a21;
+    m.a22 += n.a22;
 }
 
-__device__ __host__ Matrix operator- (const Matrix &m, const Matrix &n)
+__device__ Matrix
+operator- (const Matrix &m, const Matrix &n)
 {
-	return make_matrix(m.a00-n.a00, m.a01-n.a01, m.a02-n.a02,
-			   m.a10-n.a10, m.a11-n.a11, m.a12-n.a12,
-			   m.a20-n.a20, m.a21-n.a21, m.a22-n.a22);
+    return make_matrix(m.a00-n.a00, m.a01-n.a01, m.a02-n.a02,
+		       m.a10-n.a10, m.a11-n.a11, m.a12-n.a12,
+		       m.a20-n.a20, m.a21-n.a21, m.a22-n.a22);
 }
 
-__device__ __host__ void operator-= (Matrix &m, const Matrix& n)
+__device__ void
+operator-= (Matrix &m, const Matrix& n)
 {
-	m.a00 -= n.a00;
-	m.a01 -= n.a01;
-	m.a02 -= n.a02;
-	m.a10 -= n.a10;
-	m.a11 -= n.a11;
-	m.a12 -= n.a12;
-	m.a20 -= n.a20;
-	m.a21 -= n.a21;
-	m.a22 -= n.a22;
+    m.a00 -= n.a00;
+    m.a01 -= n.a01;
+    m.a02 -= n.a02;
+    m.a10 -= n.a10;
+    m.a11 -= n.a11;
+    m.a12 -= n.a12;
+    m.a20 -= n.a20;
+    m.a21 -= n.a21;
+    m.a22 -= n.a22;
 }
 
-__device__ __host__ Matrix operator* (const Matrix &m, const Matrix &n)
+__device__ Matrix
+operator* (const Matrix &m, const Matrix &n)
 {
-	return make_matrix(m.a00*n.a00 + m.a01*n.a10 + m.a02*n.a20,
-			   m.a00*n.a01 + m.a01*n.a11 + m.a02*n.a21,
-			   m.a00*n.a02 + m.a01*n.a12 + m.a02*n.a22,
-			   m.a10*n.a00 + m.a11*n.a10 + m.a12*n.a20,
-			   m.a10*n.a01 + m.a11*n.a11 + m.a12*n.a21,
-			   m.a10*n.a02 + m.a11*n.a12 + m.a12*n.a22,
-			   m.a20*n.a00 + m.a21*n.a10 + m.a22*n.a20,
-			   m.a20*n.a01 + m.a21*n.a11 + m.a22*n.a21,
-			   m.a20*n.a02 + m.a21*n.a12 + m.a22*n.a22);
+    return make_matrix(m.a00*n.a00 + m.a01*n.a10 + m.a02*n.a20,
+		       m.a00*n.a01 + m.a01*n.a11 + m.a02*n.a21,
+		       m.a00*n.a02 + m.a01*n.a12 + m.a02*n.a22,
+		       m.a10*n.a00 + m.a11*n.a10 + m.a12*n.a20,
+		       m.a10*n.a01 + m.a11*n.a11 + m.a12*n.a21,
+		       m.a10*n.a02 + m.a11*n.a12 + m.a12*n.a22,
+		       m.a20*n.a00 + m.a21*n.a10 + m.a22*n.a20,
+		       m.a20*n.a01 + m.a21*n.a11 + m.a22*n.a21,
+		       m.a20*n.a02 + m.a21*n.a12 + m.a22*n.a22);
 }
 
-__device__ __host__ Matrix operator+ (const Matrix &m, const float &c)
+__device__ Matrix
+operator+ (const Matrix &m, const float &c)
 {
-	return make_matrix(m.a00+c, m.a01+c, m.a02+c,
-			   m.a10+c, m.a11+c, m.a12+c,
-			   m.a20+c, m.a21+c, m.a22+c);
+    return make_matrix(m.a00+c, m.a01+c, m.a02+c,
+		       m.a10+c, m.a11+c, m.a12+c,
+		       m.a20+c, m.a21+c, m.a22+c);
 }
 
-__device__ __host__ void operator+= (Matrix &m, const float &c)
+__device__ void
+operator+= (Matrix &m, const float &c)
 {
-	m.a00 += c;
-	m.a01 += c;
-	m.a02 += c;
-	m.a10 += c;
-	m.a11 += c;
-	m.a12 += c;
-	m.a20 += c;
-	m.a21 += c;
-	m.a22 += c;
+    m.a00 += c;
+    m.a01 += c;
+    m.a02 += c;
+    m.a10 += c;
+    m.a11 += c;
+    m.a12 += c;
+    m.a20 += c;
+    m.a21 += c;
+    m.a22 += c;
 }
 
-__device__ __host__ Matrix operator+ (const float &c, const Matrix &m)
+__device__ Matrix
+operator+ (const float &c, const Matrix &m)
 {
-	return make_matrix(c+m.a00, c+m.a01, c+m.a02,
-			   c+m.a10, c+m.a11, c+m.a12,
-			   c+m.a20, c+m.a21, c+m.a22);
+    return make_matrix(c+m.a00, c+m.a01, c+m.a02,
+		       c+m.a10, c+m.a11, c+m.a12,
+		       c+m.a20, c+m.a21, c+m.a22);
 }
 
-__device__ __host__ Matrix operator- (const Matrix &m, const float &c)
+__device__ Matrix
+operator- (const Matrix &m, const float &c)
 {
-	return make_matrix(m.a00-c, m.a01-c, m.a02-c,
-			   m.a10-c, m.a11-c, m.a12-c,
-			   m.a20-c, m.a21-c, m.a22-c);
+    return make_matrix(m.a00-c, m.a01-c, m.a02-c,
+		       m.a10-c, m.a11-c, m.a12-c,
+		       m.a20-c, m.a21-c, m.a22-c);
 }
 
-__device__ __host__ void operator-= (Matrix &m, const float &c)
+__device__ void
+operator-= (Matrix &m, const float &c)
 {
-	m.a00 -= c;
-	m.a01 -= c;
-	m.a02 -= c;
-	m.a10 -= c;
-	m.a11 -= c;
-	m.a12 -= c;
-	m.a20 -= c;
-	m.a21 -= c;
-	m.a22 -= c;
+    m.a00 -= c;
+    m.a01 -= c;
+    m.a02 -= c;
+    m.a10 -= c;
+    m.a11 -= c;
+    m.a12 -= c;
+    m.a20 -= c;
+    m.a21 -= c;
+    m.a22 -= c;
 }
 
-__device__ __host__ Matrix operator- (const float &c, const Matrix &m)
+__device__ Matrix
+operator- (const float &c, const Matrix &m)
 {
-	return make_matrix(c-m.a00, c-m.a01, c-m.a02,
-			   c-m.a10, c-m.a11, c-m.a12,
-			   c-m.a20, c-m.a21, c-m.a22);
+    return make_matrix(c-m.a00, c-m.a01, c-m.a02,
+		       c-m.a10, c-m.a11, c-m.a12,
+		       c-m.a20, c-m.a21, c-m.a22);
 }
 
-__device__ __host__ Matrix operator* (const Matrix &m, const float &c)
+__device__ Matrix
+operator* (const Matrix &m, const float &c)
 {
-	return make_matrix(m.a00*c, m.a01*c, m.a02*c,
-			   m.a10*c, m.a11*c, m.a12*c,
-			   m.a20*c, m.a21*c, m.a22*c);
+    return make_matrix(m.a00*c, m.a01*c, m.a02*c,
+		       m.a10*c, m.a11*c, m.a12*c,
+		       m.a20*c, m.a21*c, m.a22*c);
 }
 
-__device__ __host__ void operator*= (Matrix &m, const float &c)
+__device__ void
+operator*= (Matrix &m, const float &c)
 {
-	m.a00 *= c;
-	m.a01 *= c;
-	m.a02 *= c;
-	m.a10 *= c;
-	m.a11 *= c;
-	m.a12 *= c;
-	m.a20 *= c;
-	m.a21 *= c;
-	m.a22 *= c;
+    m.a00 *= c;
+    m.a01 *= c;
+    m.a02 *= c;
+    m.a10 *= c;
+    m.a11 *= c;
+    m.a12 *= c;
+    m.a20 *= c;
+    m.a21 *= c;
+    m.a22 *= c;
 }
 
-__device__ __host__ Matrix operator* (const float &c, const Matrix &m)
+__device__ Matrix
+operator* (const float &c, const Matrix &m)
 {
-	return make_matrix(c*m.a00, c*m.a01, c*m.a02,
-			   c*m.a10, c*m.a11, c*m.a12,
-			   c*m.a20, c*m.a21, c*m.a22);
+    return make_matrix(c*m.a00, c*m.a01, c*m.a02,
+		       c*m.a10, c*m.a11, c*m.a12,
+		       c*m.a20, c*m.a21, c*m.a22);
 }
 
-__device__ __host__ Matrix operator/ (const Matrix &m, const float &c)
+__device__ Matrix
+operator/ (const Matrix &m, const float &c)
 {
-	return make_matrix(m.a00/c, m.a01/c, m.a02/c,
-			   m.a10/c, m.a11/c, m.a12/c,
-			   m.a20/c, m.a21/c, m.a22/c);
+    return make_matrix(m.a00/c, m.a01/c, m.a02/c,
+		       m.a10/c, m.a11/c, m.a12/c,
+		       m.a20/c, m.a21/c, m.a22/c);
 }
 
-__device__ __host__ void operator/= (Matrix &m, const float &c)
+__device__ void
+operator/= (Matrix &m, const float &c)
 {
-	m.a00 /= c;
-	m.a01 /= c;
-	m.a02 /= c;
-	m.a10 /= c;
-	m.a11 /= c;
-	m.a12 /= c;
-	m.a20 /= c;
-	m.a21 /= c;
-	m.a22 /= c;
+    m.a00 /= c;
+    m.a01 /= c;
+    m.a02 /= c;
+    m.a10 /= c;
+    m.a11 /= c;
+    m.a12 /= c;
+    m.a20 /= c;
+    m.a21 /= c;
+    m.a22 /= c;
 }
 
-__device__ __host__ Matrix operator/ (const float &c, const Matrix &m)
+__device__ Matrix
+operator/ (const float &c, const Matrix &m)
 {
-	return make_matrix(c/m.a00, c/m.a01, c/m.a02,
-			   c/m.a10, c/m.a11, c/m.a12,
-			   c/m.a20, c/m.a21, c/m.a22);
+    return make_matrix(c/m.a00, c/m.a01, c/m.a02,
+		       c/m.a10, c/m.a11, c/m.a12,
+		       c/m.a20, c/m.a21, c/m.a22);
 }
 
-__device__ __host__ float det(const Matrix &m)
+__device__ float
+det(const Matrix &m)
 {
-	return m.a00*(m.a11*m.a22 - m.a12*m.a21) - m.a10*(m.a01*m.a22 - m.a02*m.a21) + m.a20*(m.a01*m.a12 - m.a02*m.a11);
+    return m.a00*(m.a11*m.a22 - m.a12*m.a21) -
+	   m.a10*(m.a01*m.a22 - m.a02*m.a21) +
+	   m.a20*(m.a01*m.a12 - m.a02*m.a11);
 }
 
-__device__ __host__ Matrix inv(const Matrix &m)
+__device__ Matrix
+inv(const Matrix &m)
 {
-	return make_matrix(m.a11*m.a22 - m.a12*m.a21,
-			   m.a02*m.a21 - m.a01*m.a22,
-			   m.a01*m.a12 - m.a02*m.a11,
-			   m.a12*m.a20 - m.a10*m.a22,
-			   m.a00*m.a22 - m.a02*m.a20,
-			   m.a02*m.a10 - m.a00*m.a12,
-			   m.a10*m.a21 - m.a11*m.a20,
-			   m.a01*m.a20 - m.a00*m.a21,
-			   m.a00*m.a11 - m.a01*m.a10)/det(m);
+    return make_matrix(m.a11*m.a22 - m.a12*m.a21,
+		       m.a02*m.a21 - m.a01*m.a22,
+		       m.a01*m.a12 - m.a02*m.a11,
+		       m.a12*m.a20 - m.a10*m.a22,
+		       m.a00*m.a22 - m.a02*m.a20,
+		       m.a02*m.a10 - m.a00*m.a12,
+		       m.a10*m.a21 - m.a11*m.a20,
+		       m.a01*m.a20 - m.a00*m.a21,
+		       m.a00*m.a11 - m.a01*m.a10)/det(m);
 }
 
-__device__ __host__ Matrix outer(const float3 &a, const float3 &b)
+__device__ Matrix
+outer(const float3 &a, const float3 &b)
 {
-	return make_matrix(a.x*b.x, a.x*b.y, a.x*b.z,
-			   a.y*b.x, a.y*b.y, a.y*b.z,
-			   a.z*b.x, a.z*b.y, a.z*b.z);
+    return make_matrix(a.x*b.x, a.x*b.y, a.x*b.z,
+		       a.y*b.x, a.y*b.y, a.y*b.z,
+		       a.z*b.x, a.z*b.y, a.z*b.z);
 }
 
 #endif
diff --git a/src/mesh.h b/src/mesh.h
index f466470..0622144 100644
--- a/src/mesh.h
+++ b/src/mesh.h
@@ -2,187 +2,164 @@
 #define __MESH_H__
 
 #include "intersect.h"
+#include "geometry.h"
 
 #define STACK_SIZE 500
 
-/* flattened triangle mesh */
-__device__ float3 *g_vertices;
-__device__ uint4 *g_triangles;
-__device__ unsigned int *g_colors;
-__device__ unsigned int g_start_node;
-__device__ unsigned int g_first_node;
-
-/* lower/upper bounds for the bounding box associated with each node/leaf */
-__device__ float3 *g_lower_bounds;
-__device__ float3 *g_upper_bounds;
-
-/* map to child node/triangle indices */
-texture<unsigned int, 1, cudaReadModeElementType> node_map;
-texture<unsigned int, 1, cudaReadModeElementType> node_map_end;
-
-__device__ float3 make_float3(const float4 &a)
+/* Tests the intersection between a ray and a node in the bounding volume
+   hierarchy. If the ray intersects the bounding volume and `min_distance`
+   is less than zero or the distance from `origin` to the intersection is
+   less than `min_distance`, return true, else return false. */
+__device__ bool
+intersect_node(const float3 &origin, const float3 &direction,
+	       Geometry *geometry, const int &i, const float &min_distance)
 {
-	return make_float3(a.x, a.y, a.z);
-}
+    /* assigning these to local variables is faster for some reason */
+    float3 lower_bound = geometry->lower_bounds[i];
+    float3 upper_bound = geometry->upper_bounds[i];
 
-__device__ int convert(int c)
-{
-	if (c & 0x80)
-		return (0xFFFFFF00 | c);
-	else
-		return c;
-}
+    float distance_to_box;
 
-/* Test the intersection between a ray starting at `origin` traveling in the
-   direction `direction` and the bounding box around node `i`. If the ray
-   intersects the bounding box return true, else return false. */
-__device__ bool intersect_node(const float3 &origin, const float3 &direction, const int &i, const float &min_distance)
-{
-	float3 lower_bound = g_lower_bounds[i];
-	float3 upper_bound = g_upper_bounds[i];
+    if (intersect_box(origin, direction, lower_bound, upper_bound,
+		      distance_to_box)) {
+	if (min_distance < 0.0f)
+	    return true;
 
-	float distance_to_box;
+	if (distance_to_box > min_distance)
+	    return false;
 
-	if (intersect_box(origin, direction, lower_bound, upper_bound, distance_to_box))
-	{
-		if (min_distance < 0.0f)
-			return true;
+	return true;
+    }
+    else {
+	return false;
+    }
 
-		if (distance_to_box > min_distance)
-			return false;
-
-		return true;
-	}
-	else
-	{
-		return false;
-	}
-
-} // intersect_node
+}
 
-/* Find the intersection between a ray starting at `origin` traveling in the
-   direction `direction` and the global mesh texture. If the ray intersects
-   the texture return the index of the triangle which the ray intersected,
-   else return -1. */
-__device__ int intersect_mesh(const float3 &origin, const float3& direction, float &min_distance, int last_hit_triangle = -1)
+/* Finds the intersection between a ray and `geometry`. If the ray does
+   intersect the mesh and the index of the intersected triangle is not equal
+   to `last_hit_triangle`, set `min_distance` to the distance from `origin` to
+   the intersection and return the index of the triangle which the ray
+   intersected, else return -1. */
+__device__ int
+intersect_mesh(const float3 &origin, const float3& direction,
+	       Geometry *geometry, float &min_distance,
+	       int last_hit_triangle = -1)
 {
-	int triangle_index = -1;
+    int triangle_index = -1;
 
-	float distance;
-	min_distance = -1.0f;
+    float distance;
+    min_distance = -1.0f;
 
-	if (!intersect_node(origin, direction, g_start_node, min_distance))
-		return -1;
+    if (!intersect_node(origin, direction, geometry, geometry->start_node,
+			min_distance))
+	return -1;
 
-	unsigned int stack[STACK_SIZE];
+    unsigned int stack[STACK_SIZE];
 
-	unsigned int *head = &stack[0];
-	unsigned int *node = &stack[1];
-	unsigned int *tail = &stack[STACK_SIZE-1];
-	*node = g_start_node;
+    unsigned int *head = &stack[0];
+    unsigned int *node = &stack[1];
+    unsigned int *tail = &stack[STACK_SIZE-1];
+    *node = geometry->start_node;
 
-	unsigned int i;
+    unsigned int i;
 
-	do
-	{
-		unsigned int first_child = tex1Dfetch(node_map, *node);
-		unsigned int stop = tex1Dfetch(node_map_end, *node);
+    do
+    {
+	unsigned int first_child = geometry->node_map[*node];
+	unsigned int stop = geometry->node_map_end[*node];
 
-		while (*node >= g_first_node && stop == first_child+1)
-		{
-			*node = first_child;
-			first_child = tex1Dfetch(node_map, *node);
-			stop = tex1Dfetch(node_map_end, *node);
-		}
+	while (*node >= geometry->first_node && stop == first_child+1) {
+	    *node = first_child;
+	    first_child = geometry->node_map[*node];
+	    stop = geometry->node_map_end[*node];
+	}
 		
-		if (*node >= g_first_node)
-		{
-			for (i=first_child; i < stop; i++)
-			{
-				if (intersect_node(origin, direction, i, min_distance))
-				{
-					*node = i;
-					node++;
-				}
-			}
-
-			node--;
+	if (*node >= geometry->first_node) {
+	    for (i=first_child; i < stop; i++) {
+		if (intersect_node(origin, direction, geometry, i,
+				   min_distance)) {
+		    *node = i;
+		    node++;
+		}
+	    }
+	    
+	    node--;
+	}
+	else {
+	    // node is a leaf
+	    for (i=first_child; i < stop; i++) {
+		if (last_hit_triangle == i)
+		    continue;
+
+		Triangle triangle = get_triangle(geometry, i);
+
+		if (intersect_triangle(origin, direction, triangle,
+				       distance)) {
+		    if (triangle_index == -1) {
+			triangle_index = i;
+			min_distance = distance;
+			continue;
+		    }
+
+		    if (distance < min_distance) {
+			triangle_index = i;
+			min_distance = distance;
+		    }
 		}
-		else // node is a leaf
-		{
-			for (i=first_child; i < stop; i++)
-			{
-				if (last_hit_triangle == i)
-					continue;
-
-				uint4 triangle_data = g_triangles[i];
-
-				float3 v0 = g_vertices[triangle_data.x];
-				float3 v1 = g_vertices[triangle_data.y];
-				float3 v2 = g_vertices[triangle_data.z];
-
-				if (intersect_triangle(origin, direction, v0, v1, v2, distance))
-				{
-					if (triangle_index == -1)
-					{
-						triangle_index = i;
-						min_distance = distance;
-						continue;
-					}
-
-					if (distance < min_distance)
-					{
-						triangle_index = i;
-						min_distance = distance;
-					}
-				}
-			} // triangle loop
-
-			node--;
-
-		} // node is a leaf
-
-	} // while loop
-	while (node != head);
-
-	return triangle_index;
+	    } // triangle loop
+
+	    node--;
+
+	} // node is a leaf
+
+    } // while loop
+    while (node != head);
+    
+    return triangle_index;
 }
 
 extern "C"
 {
 
-__global__ void set_global_mesh_variables(uint4 *triangles, float3 *vertices, unsigned int *colors, unsigned int start_node, unsigned int first_node, float3 *lower_bounds, float3 *upper_bounds)
+__global__ void
+distance_to_mesh(int nthreads, float3 *positions, float3 *directions,
+		 Geometry *g, float *distances)
 {
-	g_triangles = triangles;
-	g_vertices = vertices;
-	g_colors = colors;
-	g_start_node = start_node;
-	g_first_node = first_node;
-	g_lower_bounds = lower_bounds;
-	g_upper_bounds = upper_bounds;
-}
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-__global__ void set_colors(unsigned int *colors)
-{
-	g_colors = colors;
+    if (id >= nthreads)
+	return;
+
+    float3 position = positions[id];
+    float3 direction = directions[id];
+    direction /= norm(direction);
+
+    float distance;
+
+    int triangle_index = intersect_mesh(position, direction, g, distance);
+
+    if (triangle_index == -1)
+	distances[id] = 1e9;
+    else
+	distances[id] = distance;
 }
 
-__global__ void color_solids(int first_triangle, int nthreads, 
-			     int *solid_id_map, 
-			     bool *solid_hit, 
-			     unsigned int *solid_colors)
+__global__ void
+color_solids(int first_triangle, int nthreads, int *solid_id_map,
+	     bool *solid_hit, unsigned int *solid_colors, Geometry *geometry)
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-	if (id >= nthreads)
-		return;
+    if (id >= nthreads)
+	return;
 
-	int triangle_id = first_triangle + id;
-	int solid_id = solid_id_map[triangle_id];
-	if (solid_hit[solid_id])
-		g_colors[triangle_id] = solid_colors[solid_id];
+    int triangle_id = first_triangle + id;
+    int solid_id = solid_id_map[triangle_id];
+    if (solid_hit[solid_id])
+	geometry->colors[triangle_id] = solid_colors[solid_id];
 }
 
-} // extern "c"
+} // extern "C"
 
 #endif
diff --git a/src/photon.h b/src/photon.h
index e781864..8b7e588 100644
--- a/src/photon.h
+++ b/src/photon.h
@@ -3,309 +3,322 @@
 
 #include "stdio.h"
 #include "linalg.h"
-#include "materials.h"
 #include "rotate.h"
 #include "random.h"
 #include "physical_constants.h"
 #include "mesh.h"
+#include "geometry.h"
 
 struct Photon
 {
-	float3 position;
-	float3 direction;
-	float3 polarization;
-	float wavelength;
-	float time;
+    float3 position;
+    float3 direction;
+    float3 polarization;
+    float wavelength;
+    float time;
 
-	unsigned int history;
+    unsigned int history;
 
-	int last_hit_triangle;
+    int last_hit_triangle;
 };
 
 struct State
 {
-	bool inside_to_outside;
+    bool inside_to_outside;
 
-	float3 surface_normal;
+    float3 surface_normal;
 
-	float refractive_index1, refractive_index2;
-	float absorption_length;
-	float scattering_length;
+    float refractive_index1, refractive_index2;
+    float absorption_length;
+    float scattering_length;
 
-	int surface_index;
+    int surface_index;
 
-	float distance_to_boundary;
+    float distance_to_boundary;
 };
 
 enum
 {
-	NO_HIT           = 0x1 << 0,
-	BULK_ABSORB      = 0x1 << 1,
-	SURFACE_DETECT   = 0x1 << 2,
-	SURFACE_ABSORB   = 0x1 << 3,
-	RAYLEIGH_SCATTER = 0x1 << 4,
-	REFLECT_DIFFUSE  = 0x1 << 5,
-	REFLECT_SPECULAR = 0x1 << 6,
-	NAN_ABORT        = 0x1 << 31
+    NO_HIT           = 0x1 << 0,
+    BULK_ABSORB      = 0x1 << 1,
+    SURFACE_DETECT   = 0x1 << 2,
+    SURFACE_ABSORB   = 0x1 << 3,
+    RAYLEIGH_SCATTER = 0x1 << 4,
+    REFLECT_DIFFUSE  = 0x1 << 5,
+    REFLECT_SPECULAR = 0x1 << 6,
+    NAN_ABORT        = 0x1 << 31
 }; // processes
 
 enum {BREAK, CONTINUE, PASS}; // return value from propagate_to_boundary
 
-__device__ float get_theta(const float3 &a, const float3 &b)
+__device__ int
+convert(int c)
 {
-	return acosf(fmaxf(-1.0f,fminf(1.0f,dot(a,b))));
+    if (c & 0x80)
+	return (0xFFFFFF00 | c);
+    else
+	return c;
 }
 
-__device__ void fill_state(State &s, Photon &p)
+__device__ float
+get_theta(const float3 &a, const float3 &b)
 {
-	p.last_hit_triangle = intersect_mesh(p.position, p.direction, s.distance_to_boundary, p.last_hit_triangle);
-
-	if (p.last_hit_triangle == -1)
-	{
-		p.history |= NO_HIT;
-		return;
-	}
-
-	uint4 triangle_data = g_triangles[p.last_hit_triangle];
-
-	float3 v0 = g_vertices[triangle_data.x];
-	float3 v1 = g_vertices[triangle_data.y];
-	float3 v2 = g_vertices[triangle_data.z];
-
-	int inner_material_index = convert(0xFF & (triangle_data.w >> 24));
-	int outer_material_index = convert(0xFF & (triangle_data.w >> 16));
-	s.surface_index = convert(0xFF & (triangle_data.w >> 8));
-
-	s.surface_normal = cross(v1-v0, v2-v1);
-	s.surface_normal /= norm(s.surface_normal);
-
-	Material material1, material2;
-	if (dot(s.surface_normal,-p.direction) > 0.0f)
-	{
-		// outside to inside
-		material1 = materials[outer_material_index];
-		material2 = materials[inner_material_index];
-
-		s.inside_to_outside = false;
-	}
-	else
-	{
-		// inside to outside
-		material1 = materials[inner_material_index];
-		material2 = materials[outer_material_index];
-		s.surface_normal = -s.surface_normal;
-
-		s.inside_to_outside = true;
-	}
+    return acosf(fmaxf(-1.0f,fminf(1.0f,dot(a,b))));
+}
 
-	s.refractive_index1 = interp_property(p.wavelength, material1.refractive_index);
-	s.refractive_index2 = interp_property(p.wavelength, material2.refractive_index);
-	s.absorption_length = interp_property(p.wavelength, material1.absorption_length);
-	s.scattering_length = interp_property(p.wavelength, material1.scattering_length);
+__device__ void
+fill_state(State &s, Photon &p, Geometry *g)
+{
+    p.last_hit_triangle = intersect_mesh(p.position, p.direction, g,
+					 s.distance_to_boundary,
+					 p.last_hit_triangle);
+
+    if (p.last_hit_triangle == -1) {
+	p.history |= NO_HIT;
+	return;
+    }
+    
+    Triangle t = get_triangle(g, p.last_hit_triangle);
+    
+    unsigned int material_code = g->material_codes[p.last_hit_triangle];
+    
+    int inner_material_index = convert(0xFF & (material_code >> 24));
+    int outer_material_index = convert(0xFF & (material_code >> 16));
+    s.surface_index = convert(0xFF & (material_code >> 8));
+    
+    float3 v01 = t.v1 - t.v0;
+    float3 v12 = t.v2 - t.v1;
+    
+    s.surface_normal = normalize(cross(v01, v12));
+				 
+    Material *material1, *material2;
+    if (dot(s.surface_normal,-p.direction) > 0.0f) {
+	// outside to inside
+	material1 = g->materials[outer_material_index];
+	material2 = g->materials[inner_material_index];
+
+	s.inside_to_outside = false;
+    }
+    else {
+	// inside to outside
+	material1 = g->materials[inner_material_index];
+	material2 = g->materials[outer_material_index];
+	s.surface_normal = -s.surface_normal;
+
+	s.inside_to_outside = true;
+    }
+
+    s.refractive_index1 = interp_property(material1, p.wavelength,
+					  material1->refractive_index);
+    s.refractive_index2 = interp_property(material2, p.wavelength,
+					  material2->refractive_index);
+    s.absorption_length = interp_property(material1, p.wavelength,
+					  material1->absorption_length);
+    s.scattering_length = interp_property(material1, p.wavelength,
+					  material1->scattering_length);
 
 } // fill_state
 
-__device__ float3 pick_new_direction(float3 axis, float theta, float phi)
+__device__ float3
+pick_new_direction(float3 axis, float theta, float phi)
 {
-	// Taken from SNOMAN rayscatter.for
-	float cos_theta = cosf(theta);
-	float sin_theta = sinf(theta);
-	float cos_phi   = cosf(phi);
-	float sin_phi   = sinf(phi);
+    // Taken from SNOMAN rayscatter.for
+    float cos_theta = cosf(theta);
+    float sin_theta = sinf(theta);
+    float cos_phi   = cosf(phi);
+    float sin_phi   = sinf(phi);
 	
-	float sin_axis_theta = sqrt(1.0f - axis.z*axis.z);
-	float cos_axis_phi, sin_axis_phi;
+    float sin_axis_theta = sqrt(1.0f - axis.z*axis.z);
+    float cos_axis_phi, sin_axis_phi;
 	
-	if (isnan(sin_axis_theta) || sin_axis_theta < 0.00001f) {
-		cos_axis_phi = 1.0f;
-		sin_axis_phi = 0.0f;
-	} else {
-		cos_axis_phi = axis.x / sin_axis_theta;
-		sin_axis_phi = axis.y / sin_axis_theta;
-	}
-
-	return make_float3(cos_theta*axis.x + sin_theta*(axis.z*cos_phi*cos_axis_phi - sin_phi*sin_axis_phi),
-			   cos_theta*axis.y + sin_theta*(cos_phi*axis.z*sin_axis_phi - sin_phi*cos_axis_phi),
-			   cos_theta*axis.z - sin_theta*cos_phi*sin_axis_theta);
+    if (isnan(sin_axis_theta) || sin_axis_theta < 0.00001f) {
+	cos_axis_phi = 1.0f;
+	sin_axis_phi = 0.0f;
+    }
+    else {
+	cos_axis_phi = axis.x / sin_axis_theta;
+	sin_axis_phi = axis.y / sin_axis_theta;
+    }
+
+    float dirx = cos_theta*axis.x +
+	sin_theta*(axis.z*cos_phi*cos_axis_phi - sin_phi*sin_axis_phi);
+    float diry = cos_theta*axis.y +
+	sin_theta*(cos_phi*axis.z*sin_axis_phi - sin_phi*cos_axis_phi);
+    float dirz = cos_theta*axis.z - sin_theta*cos_phi*sin_axis_theta;
+
+    return make_float3(dirx, diry, dirz);
 }
 
-__device__ void rayleigh_scatter(Photon &p, curandState &rng)
+__device__ void
+rayleigh_scatter(Photon &p, curandState &rng)
 {
-	float cos_theta = 2.0f*cosf((acosf(1.0f - 2.0f*curand_uniform(&rng))-2*PI)/3.0f);
-	if (cos_theta > 1.0f)
-	  cos_theta = 1.0f;
-	else if (cos_theta < -1.0f)
-	  cos_theta = -1.0f;
-
-	float theta = acosf(cos_theta);
-	float phi = uniform(&rng, 0.0f, 2.0f * PI);
-
-	p.direction = pick_new_direction(p.polarization, theta, phi);
-
-	if (1.0f - fabsf(cos_theta) < 1e-6f) {
-		p.polarization = pick_new_direction(p.polarization, PI/2.0f, phi);
-	} else {
-		// linear combination of old polarization and new direction
-		p.polarization = p.polarization - cos_theta * p.direction;
-	}
-
-	p.direction /= norm(p.direction);
-	p.polarization /= norm(p.polarization);
+    float cos_theta = 2.0f*cosf((acosf(1.0f - 2.0f*curand_uniform(&rng))-2*PI)/3.0f);
+    if (cos_theta > 1.0f)
+	cos_theta = 1.0f;
+    else if (cos_theta < -1.0f)
+	cos_theta = -1.0f;
+
+    float theta = acosf(cos_theta);
+    float phi = uniform(&rng, 0.0f, 2.0f * PI);
+
+    p.direction = pick_new_direction(p.polarization, theta, phi);
+
+    if (1.0f - fabsf(cos_theta) < 1e-6f) {
+	p.polarization = pick_new_direction(p.polarization, PI/2.0f, phi);
+    }
+    else {
+	// linear combination of old polarization and new direction
+	p.polarization = p.polarization - cos_theta * p.direction;
+    }
+
+    p.direction /= norm(p.direction);
+    p.polarization /= norm(p.polarization);
 } // scatter
 
 __device__ int propagate_to_boundary(Photon &p, State &s, curandState &rng)
 {
-	float absorption_distance = -s.absorption_length*logf(curand_uniform(&rng));
-	float scattering_distance = -s.scattering_length*logf(curand_uniform(&rng));
+    float absorption_distance = -s.absorption_length*logf(curand_uniform(&rng));
+    float scattering_distance = -s.scattering_length*logf(curand_uniform(&rng));
 
-	if (absorption_distance <= scattering_distance)
-	{
-		if (absorption_distance <= s.distance_to_boundary)
-		{
-			p.time += absorption_distance/(SPEED_OF_LIGHT/s.refractive_index1);
-			p.position += absorption_distance*p.direction;
-			p.history |= BULK_ABSORB;
+    if (absorption_distance <= scattering_distance) {
+	if (absorption_distance <= s.distance_to_boundary) {
+	    p.time += absorption_distance/(SPEED_OF_LIGHT/s.refractive_index1);
+	    p.position += absorption_distance*p.direction;
+	    p.history |= BULK_ABSORB;
 
-			p.last_hit_triangle = -1;
+	    p.last_hit_triangle = -1;
 
-			return BREAK;
-		} // photon is absorbed in material1
-	}
-	else
-	{
-		if (scattering_distance <= s.distance_to_boundary)
-		{
-			p.time += scattering_distance/(SPEED_OF_LIGHT/s.refractive_index1);
-			p.position += scattering_distance*p.direction;
+	    return BREAK;
+	} // photon is absorbed in material1
+    }
+    else {
+	if (scattering_distance <= s.distance_to_boundary) {
+	    p.time += scattering_distance/(SPEED_OF_LIGHT/s.refractive_index1);
+	    p.position += scattering_distance*p.direction;
 
-			rayleigh_scatter(p, rng);
+	    rayleigh_scatter(p, rng);
 
-			p.history |= RAYLEIGH_SCATTER;
+	    p.history |= RAYLEIGH_SCATTER;
 
-			p.last_hit_triangle = -1;
+	    p.last_hit_triangle = -1;
 
-			return CONTINUE;
-		} // photon is scattered in material1
-	} // if scattering_distance < absorption_distance
+	    return CONTINUE;
+	} // photon is scattered in material1
+    } // if scattering_distance < absorption_distance
 
-	p.position += s.distance_to_boundary*p.direction;
-	p.time += s.distance_to_boundary/(SPEED_OF_LIGHT/s.refractive_index1);
+    p.position += s.distance_to_boundary*p.direction;
+    p.time += s.distance_to_boundary/(SPEED_OF_LIGHT/s.refractive_index1);
 
-	return PASS;
+    return PASS;
 
 } // propagate_to_boundary
 
-__device__ void propagate_at_boundary(Photon &p, State &s, curandState &rng)
+__device__ void
+propagate_at_boundary(Photon &p, State &s, curandState &rng)
 {
-	float incident_angle = get_theta(s.surface_normal,-p.direction);
-	float refracted_angle = asinf(sinf(incident_angle)*s.refractive_index1/s.refractive_index2);
-
-	float3 incident_plane_normal = cross(p.direction, s.surface_normal);
-	float incident_plane_normal_length = norm(incident_plane_normal);
-
-	// Photons at normal incidence do not have a unique plane of incidence,
-	// so we have to pick the plane normal to be the polarization vector
-	// to get the correct logic below
-	if (incident_plane_normal_length < 1e-6f)
-	  incident_plane_normal = p.polarization;
-	else
-	  incident_plane_normal /= incident_plane_normal_length;
-
-	float normal_coefficient = dot(p.polarization, incident_plane_normal);
-	float normal_probability = normal_coefficient*normal_coefficient;
-
-	float reflection_coefficient;
-	if (curand_uniform(&rng) < normal_probability)
-	{
-		reflection_coefficient = -sinf(incident_angle-refracted_angle)/sinf(incident_angle+refracted_angle);
-
-		if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle))
-		{
-			p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
+    float incident_angle = get_theta(s.surface_normal,-p.direction);
+    float refracted_angle = asinf(sinf(incident_angle)*s.refractive_index1/s.refractive_index2);
+
+    float3 incident_plane_normal = cross(p.direction, s.surface_normal);
+    float incident_plane_normal_length = norm(incident_plane_normal);
+
+    // Photons at normal incidence do not have a unique plane of incidence,
+    // so we have to pick the plane normal to be the polarization vector
+    // to get the correct logic below
+    if (incident_plane_normal_length < 1e-6f)
+	incident_plane_normal = p.polarization;
+    else
+	incident_plane_normal /= incident_plane_normal_length;
+
+    float normal_coefficient = dot(p.polarization, incident_plane_normal);
+    float normal_probability = normal_coefficient*normal_coefficient;
+
+    float reflection_coefficient;
+    if (curand_uniform(&rng) < normal_probability) {
+	// photon polarization normal to plane of incidence
+	reflection_coefficient = -sinf(incident_angle-refracted_angle)/sinf(incident_angle+refracted_angle);
+
+	if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle)) {
+	    p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
 			
-			p.history |= REFLECT_SPECULAR;
-		}
-		else
-		{
-			p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
-		}
-
-		p.polarization = incident_plane_normal;
-	} // photon polarization normal to plane of incidence
-	else
-	{
-		reflection_coefficient = tanf(incident_angle-refracted_angle)/tanf(incident_angle+refracted_angle);
-
-		if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle))
-		{
-			p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
-			
-			p.history |= REFLECT_SPECULAR;
-		}
-		else
-		{
-			p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
-		}
-
-		p.polarization = cross(incident_plane_normal, p.direction);
-		p.polarization /= norm(p.polarization);
-	} // photon polarization parallel to plane of incidence
-
-} // propagate_at_boundary
-
-__device__ int propagate_at_surface(Photon &p, State &s, curandState &rng)
-{
-	Surface surface = surfaces[s.surface_index];
-
-	float detect = interp_property(p.wavelength, surface.detect);
-	float absorb = interp_property(p.wavelength, surface.absorb);
-	float reflect_diffuse = interp_property(p.wavelength, surface.reflect_diffuse);
-	float reflect_specular = interp_property(p.wavelength, surface.reflect_specular);
-
-	// since the surface properties are interpolated linearly, we are
-	// guaranteed that they still sum to 1.0.
+	    p.history |= REFLECT_SPECULAR;
+	}
+	else {
+	    p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
+	}
 
-	float uniform_sample = curand_uniform(&rng);
+	p.polarization = incident_plane_normal;
+    }
+    else {
+	// photon polarization parallel to plane of incidence
+	reflection_coefficient = tanf(incident_angle-refracted_angle)/tanf(incident_angle+refracted_angle);
 
-	if (uniform_sample < absorb)
-	{
-		p.history |= SURFACE_ABSORB;
-		return BREAK;
+	if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle)) {
+	    p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
+			
+	    p.history |= REFLECT_SPECULAR;
 	}
-	else if (uniform_sample < absorb + detect)
-	{
-		p.history |= SURFACE_DETECT;
-		return BREAK;
+	else {
+	    p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
 	}
-	else if (uniform_sample < absorb + detect + reflect_diffuse)
-	{
-		// diffusely reflect
-		p.direction = uniform_sphere(&rng);
 
-		if (dot(p.direction, s.surface_normal) < 0.0f)
-			p.direction = -p.direction;
+	p.polarization = cross(incident_plane_normal, p.direction);
+	p.polarization /= norm(p.polarization);
+    }
 
-		// randomize polarization?
-		p.polarization = cross(uniform_sphere(&rng), p.direction);
-		p.polarization /= norm(p.polarization);
+} // propagate_at_boundary
+
+__device__ int
+propagate_at_surface(Photon &p, State &s, curandState &rng, Geometry *geometry)
+{
+    Surface *surface = geometry->surfaces[s.surface_index];
+
+    /* since the surface properties are interpolated linearly, we are
+       guaranteed that they still sum to 1.0. */
+
+    float detect = interp_property(surface, p.wavelength, surface->detect);
+    float absorb = interp_property(surface, p.wavelength, surface->absorb);
+    float reflect_diffuse = interp_property(surface, p.wavelength, surface->reflect_diffuse);
+    float reflect_specular = interp_property(surface, p.wavelength, surface->reflect_specular);
+
+    float uniform_sample = curand_uniform(&rng);
+
+    if (uniform_sample < absorb) {
+	p.history |= SURFACE_ABSORB;
+	return BREAK;
+    }
+    else if (uniform_sample < absorb + detect) {
+	p.history |= SURFACE_DETECT;
+	return BREAK;
+    }
+    else if (uniform_sample < absorb + detect + reflect_diffuse) {
+	// diffusely reflect
+	p.direction = uniform_sphere(&rng);
+
+	if (dot(p.direction, s.surface_normal) < 0.0f)
+	    p.direction = -p.direction;
+
+	// randomize polarization?
+	p.polarization = cross(uniform_sphere(&rng), p.direction);
+	p.polarization /= norm(p.polarization);
 
-		p.history |= REFLECT_DIFFUSE;
+	p.history |= REFLECT_DIFFUSE;
 
-		return CONTINUE;
-	}
-	else
-	{
-		// specularly reflect
-		float incident_angle = get_theta(s.surface_normal,-p.direction);
-		float3 incident_plane_normal = cross(p.direction, s.surface_normal);
-		incident_plane_normal /= norm(incident_plane_normal);
+	return CONTINUE;
+    }
+    else {
+	// specularly reflect
+	float incident_angle = get_theta(s.surface_normal,-p.direction);
+	float3 incident_plane_normal = cross(p.direction, s.surface_normal);
+	incident_plane_normal /= norm(incident_plane_normal);
 
-		p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
+	p.direction = rotate(s.surface_normal, incident_angle,
+			     incident_plane_normal);
 
-		p.history |= REFLECT_SPECULAR;
+	p.history |= REFLECT_SPECULAR;
 
-		return CONTINUE;
-	}
+	return CONTINUE;
+    }
 
 } // propagate_at_surface
 
diff --git a/src/propagate.cu b/src/propagate.cu
new file mode 100644
index 0000000..2d5183e
--- /dev/null
+++ b/src/propagate.cu
@@ -0,0 +1,100 @@
+//-*-c-*-
+
+#include "linalg.h"
+#include "mesh.h"
+#include "geometry.h"
+#include "photon.h"
+
+extern "C"
+{
+
+__global__ void
+propagate(int first_photon, int nthreads, unsigned int *input_queue,
+	  unsigned int *output_queue, curandState *rng_states,
+	  float3 *positions, float3 *directions,
+	  float *wavelengths, float3 *polarizations,
+	  float *times, unsigned int *histories,
+	  int *last_hit_triangles, int max_steps,
+	  Geometry *geometry)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    curandState rng = rng_states[id];
+
+    int photon_id = input_queue[first_photon + id];
+
+    Photon p;
+    p.position = positions[photon_id];
+    p.direction = directions[photon_id];
+    p.direction /= norm(p.direction);
+    p.polarization = polarizations[photon_id];
+    p.polarization /= norm(p.polarization);
+    p.wavelength = wavelengths[photon_id];
+    p.time = times[photon_id];
+    p.last_hit_triangle = last_hit_triangles[photon_id];
+    p.history = histories[photon_id];
+
+    if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB))
+	return;
+
+    State s;
+
+    int steps = 0;
+    while (steps < max_steps) {
+	steps++;
+
+	int command;
+
+	// check for NaN and fail
+	if (isnan(p.direction.x*p.direction.y*p.direction.z*p.position.x*p.position.y*p.position.z)) {
+	    p.history |= NO_HIT | NAN_ABORT;
+	    break;
+	}
+
+	fill_state(s, p, geometry);
+
+	if (p.last_hit_triangle == -1)
+	    break;
+
+	command = propagate_to_boundary(p, s, rng);
+
+	if (command == BREAK)
+	    break;
+
+	if (command == CONTINUE)
+	    continue;
+
+	if (s.surface_index != -1) {
+	    command = propagate_at_surface(p, s, rng, geometry);
+
+	    if (command == BREAK)
+		break;
+
+	    if (command == CONTINUE)
+		continue;
+	}
+
+	propagate_at_boundary(p, s, rng);
+
+    } // while (steps < max_steps)
+
+    rng_states[id] = rng;
+    positions[photon_id] = p.position;
+    directions[photon_id] = p.direction;
+    polarizations[photon_id] = p.polarization;
+    wavelengths[photon_id] = p.wavelength;
+    times[photon_id] = p.time;
+    histories[photon_id] = p.history;
+    last_hit_triangles[photon_id] = p.last_hit_triangle;
+	
+    // Not done, put photon in output queue
+    if ((p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+	int out_idx = atomicAdd(output_queue, 1);
+	output_queue[out_idx] = photon_id;
+    }
+} // propagate
+
+} // extern "C"
diff --git a/src/random.h b/src/random.h
index 6abc415..e9d2e75 100644
--- a/src/random.h
+++ b/src/random.h
@@ -5,76 +5,81 @@
 
 #include "physical_constants.h"
 
-__device__ float uniform(curandState *s, const float &low, const float &high)
+__device__ float
+uniform(curandState *s, const float &low, const float &high)
 {
-	return low + curand_uniform(s)*(high-low);
+    return low + curand_uniform(s)*(high-low);
 }
 
-__device__ float3 uniform_sphere(curandState *s)
+__device__ float3
+uniform_sphere(curandState *s)
 {
-	float theta = uniform(s, 0.0f, 2*PI);
-	float u = uniform(s, -1.0f, 1.0f);
-	float c = sqrtf(1.0f-u*u);
+    float theta = uniform(s, 0.0f, 2*PI);
+    float u = uniform(s, -1.0f, 1.0f);
+    float c = sqrtf(1.0f-u*u);
 
-	return make_float3(c*cosf(theta), c*sinf(theta), u);
+    return make_float3(c*cosf(theta), c*sinf(theta), u);
 }
 
 // Draw a random sample given a cumulative distribution function
 // Assumptions: ncdf >= 2, cdf_y[0] is 0.0, and cdf_y[ncdf-1] is 1.0
-__device__ float sample_cdf(curandState *rng, int ncdf, 
-			    float *cdf_x, float *cdf_y)
+__device__ float
+sample_cdf(curandState *rng, int ncdf, float *cdf_x, float *cdf_y)
 {
-  float u = curand_uniform(rng);
-
-  // Find u in cdf_y with binary search
-  // list must contain at least 2 elements: 0.0 and 1.0
-  int lower=0;
-  int upper=ncdf-1;
-  while(lower < upper-1) {
-    int half = (lower+upper) / 2;
-    if (u < cdf_y[half])
-      upper = half;
-    else
-      lower = half;
-  }
+    float u = curand_uniform(rng);
+
+    // Find u in cdf_y with binary search
+    // list must contain at least 2 elements: 0.0 and 1.0
+    int lower=0;
+    int upper=ncdf-1;
+    while(lower < upper-1) {
+	int half = (lower+upper) / 2;
+	if (u < cdf_y[half])
+	    upper = half;
+	else
+	    lower = half;
+    }
   
-  float frac = (u - cdf_y[lower]) / (cdf_y[upper] - cdf_y[lower]);
-  return cdf_x[lower] * frac + cdf_x[upper] * (1.0f - frac);
+    float frac = (u - cdf_y[lower]) / (cdf_y[upper] - cdf_y[lower]);
+    return cdf_x[lower] * frac + cdf_x[upper] * (1.0f - frac);
 }
 
-
 extern "C"
 {
 
-__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
+__global__ void
+init_rng(int nthreads, curandState *s, unsigned long long seed,
+	 unsigned long long offset)
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-	if (id >= nthreads)
-		return;
+    if (id >= nthreads)
+	return;
 
-	curand_init(seed, id, offset, &s[id]);
+    curand_init(seed, id, offset, &s[id]);
 }
 
-__global__ void fill_uniform(int nthreads, curandState *s, float *a, float low, float high)
+__global__ void
+fill_uniform(int nthreads, curandState *s, float *a, float low, float high)
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-	if (id >= nthreads)
-		return;
+    if (id >= nthreads)
+	return;
 
-	a[id] = uniform(&s[id], low, high);
+    a[id] = uniform(&s[id], low, high);
 
 }
 
-__global__ void fill_uniform_sphere(int nthreads, curandState *s, float3 *a)
+__global__ void
+fill_uniform_sphere(int nthreads, curandState *s, float3 *a)
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-	if (id >= nthreads)
-		return;
+    if (id >= nthreads)
+	return;
 
-	a[id] = uniform_sphere(&s[id]);
+    a[id] = uniform_sphere(&s[id]);
 }
 
 } // extern "c"
diff --git a/src/render.cu b/src/render.cu
new file mode 100644
index 0000000..e5b4ac5
--- /dev/null
+++ b/src/render.cu
@@ -0,0 +1,169 @@
+//-*-c-*-
+
+#include "linalg.h"
+#include "intersect.h"
+#include "mesh.h"
+#include "sorting.h"
+#include "geometry.h"
+
+#include "stdio.h"
+
+__device__ float4
+get_color(const float3 &direction, const Triangle &triangle, unsigned int rgba)
+{
+    float3 v01 = triangle.v1 - triangle.v0;
+    float3 v12 = triangle.v2 - triangle.v1;
+    
+    float3 surface_normal = normalize(cross(v01,v12));
+
+    float cos_theta = dot(surface_normal,-direction);
+
+    if (cos_theta < 0.0f)
+	cos_theta = dot(-surface_normal,-direction);
+
+    unsigned int a0 = 0xff & (rgba >> 24);
+    unsigned int r0 = 0xff & (rgba >> 16);
+    unsigned int g0 = 0xff & (rgba >> 8);
+    unsigned int b0 = 0xff & rgba;
+
+    float alpha = (255 - a0)/255.0f;
+
+    return make_float4(r0*cos_theta, g0*cos_theta, b0*cos_theta, alpha);
+}
+
+extern "C"
+{
+
+__global__ void
+render(int nthreads, float3 *_origin, float3 *_direction, Geometry *geometry,
+       unsigned int alpha_depth, unsigned int *pixels, float *_dx,
+       unsigned int *dxlen, float4 *_color)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+	
+    if (id >= nthreads)
+	return;
+
+    float3 origin = _origin[id];
+    float3 direction = _direction[id];
+    unsigned int n = dxlen[id];
+
+    float distance;
+
+    if (n < 1 && !intersect_node(origin, direction, geometry,
+				 geometry->start_node, -1.0f)) {
+	pixels[id] = 0;
+	return;
+    }
+
+    unsigned int stack[STACK_SIZE];
+
+    unsigned int *head = &stack[0];
+    unsigned int *node = &stack[1];
+    unsigned int *tail = &stack[STACK_SIZE-1];
+    *node = geometry->start_node;
+
+    float *dx = _dx + id*alpha_depth;
+    float4 *color_a = _color + id*alpha_depth;
+
+    unsigned int i;
+
+    do {
+	unsigned int first_child = geometry->node_map[*node];
+	unsigned int stop = geometry->node_map_end[*node];
+	
+	while (*node >= geometry->first_node && stop == first_child+1) {
+	    *node = first_child;
+	    first_child = geometry->node_map[*node];
+	    stop = geometry->node_map_end[*node];
+	}
+		
+	if (*node >= geometry->first_node) {
+	    for (i=first_child; i < stop; i++) {
+		if (intersect_node(origin, direction, geometry, i, -1.0f)) {
+		    *node = i;
+		    node++;
+		}
+	    }
+
+	    node--;
+	}
+	else {
+	    // node is a leaf
+	    for (i=first_child; i < stop; i++) {
+		Triangle triangle = get_triangle(geometry, i);
+		
+		if (intersect_triangle(origin, direction, triangle,
+				       distance)) {
+		    if (n < 1) {
+			dx[0] = distance;
+			    
+			unsigned int rgba = geometry->colors[i];
+			float4 color = get_color(direction, triangle, rgba);
+
+			color_a[0] = color;
+		    }
+		    else {
+			unsigned long j = searchsorted(n, dx, distance);
+
+			if (j <= alpha_depth-1) {
+			    insert(alpha_depth, dx, j, distance);
+
+			    unsigned int rgba = geometry->colors[i];
+			    float4 color = get_color(direction, triangle,
+						     rgba);
+
+			    insert(alpha_depth, color_a, j, color);
+			}
+		    }
+					
+		    if (n < alpha_depth)
+			n++;
+		}
+				
+	    } // triangle loop
+	    
+	    node--;
+	    
+	} // node is a leaf
+	
+    } // while loop
+    while (node != head);
+
+    if (n < 1) {
+	pixels[id] = 0;
+	return;
+    }
+
+    dxlen[id] = n;
+
+    float scale = 1.0f;
+    float fr = 0.0f;
+    float fg = 0.0f;
+    float fb = 0.0f;
+    for (i=0; i < n; i++) {
+	float alpha;
+	if (i < alpha_depth-1)
+	    alpha = color_a[i].w;
+	else
+	    alpha = 1.0;
+
+	fr += scale*color_a[i].x*alpha;
+	fg += scale*color_a[i].y*alpha;
+	fb += scale*color_a[i].z*alpha;
+	
+	scale *= (1.0f-alpha);
+    }
+    unsigned int a;
+    if (n < alpha_depth)
+	a = floorf(255*(1.0f-scale));
+    else
+	a = 255;
+    unsigned int r = floorf(fr);
+    unsigned int g = floorf(fg);
+    unsigned int b = floorf(fb);
+
+    pixels[id] = a << 24 | r << 16 | g << 8 | b;
+}
+
+} // extern "C"
diff --git a/src/rotate.h b/src/rotate.h
index 7fc6f7e..15f8037 100644
--- a/src/rotate.h
+++ b/src/rotate.h
@@ -6,21 +6,22 @@
 
 __device__ const Matrix IDENTITY_MATRIX = {1,0,0,0,1,0,0,0,1};
 
-__device__ __host__ Matrix make_rotation_matrix(float phi, const float3 &n)
+__device__ Matrix
+make_rotation_matrix(float phi, const float3 &n)
 {
-	/* rotate points counterclockwise, when looking towards +infinity,
-	   through an angle `phi` about the axis `n`. */
+    float cos_phi = cosf(phi);
+    float sin_phi = sinf(phi);
 
-	float cos_phi = cosf(phi);
-	float sin_phi = sinf(phi);
-
-	return IDENTITY_MATRIX*cos_phi + (1-cos_phi)*outer(n,n) +
-		sin_phi*make_matrix(0,n.z,-n.y,-n.z,0,n.x,n.y,-n.x,0);
+    return IDENTITY_MATRIX*cos_phi + (1-cos_phi)*outer(n,n) +
+	sin_phi*make_matrix(0,n.z,-n.y,-n.z,0,n.x,n.y,-n.x,0);
 }
 
-__device__ __host__ float3 rotate(const float3 &a, float phi, const float3 &n)
+/* rotate points counterclockwise, when looking towards +infinity,
+   through an angle `phi` about the axis `n`. */
+__device__ float3
+rotate(const float3 &a, float phi, const float3 &n)
 {
-	return make_rotation_matrix(phi,n)*a;
+    return make_rotation_matrix(phi,n)*a;
 }
 
 #endif
diff --git a/src/sorting.h b/src/sorting.h
index ec16bbe..03eabc6 100644
--- a/src/sorting.h
+++ b/src/sorting.h
@@ -1,103 +1,105 @@
 template <class T>
-__device__ void swap(T &a, T &b)
+__device__ void
+swap(T &a, T &b)
 {
-	T tmp = a;
-	a = b;
-	b = tmp;
+    T tmp = a;
+    a = b;
+    b = tmp;
 }
 
 template <class T>
-__device__ void reverse(int n, T *a)
+__device__ void
+reverse(int n, T *a)
 {
-	for (int i=0; i < n/2; i++)
-		swap(a[i],a[n-1-i]);
+    for (int i=0; i < n/2; i++)
+	swap(a[i],a[n-1-i]);
 }
 
 template <class T>
-__device__ void piksrt(int n, T *arr)
+__device__ void
+piksrt(int n, T *arr)
 {
-	int i,j;
-	T a;
+    int i,j;
+    T a;
 
-	for (j=1; j < n; j++)
-	{
-		a = arr[j];
-		i = j-1;
-		while (i >= 0 && arr[i] > a)
-		{
-			arr[i+1] = arr[i];
-			i--;
-		}
-		arr[i+1] = a;
+    for (j=1; j < n; j++) {
+	a = arr[j];
+	i = j-1;
+	while (i >= 0 && arr[i] > a) {
+	    arr[i+1] = arr[i];
+	    i--;
 	}
+	arr[i+1] = a;
+    }
 }
 
 template <class T, class U>
-__device__ void piksrt2(int n, T *arr, U *brr)
+__device__ void
+piksrt2(int n, T *arr, U *brr)
 {
-	int i,j;
-	T a;
-	U b;
+    int i,j;
+    T a;
+    U b;
 
-	for (j=1; j < n; j++)
-	{
-		a = arr[j];
-		b = brr[j];
-		i = j-1;
-		while (i >= 0 && arr[i] > a)
-		{
-			arr[i+1] = arr[i];
-			brr[i+1] = brr[i];
-			i--;
-		}
-		arr[i+1] = a;
-		brr[i+1] = b;
+    for (j=1; j < n; j++) {
+	a = arr[j];
+	b = brr[j];
+	i = j-1;
+	while (i >= 0 && arr[i] > a) {
+	    arr[i+1] = arr[i];
+	    brr[i+1] = brr[i];
+	    i--;
 	}
+	arr[i+1] = a;
+	brr[i+1] = b;
+    }
 }
 
 template <class T>
-__device__ unsigned long searchsorted(unsigned long n, T *arr, const T &x)
+__device__ unsigned long
+searchsorted(unsigned long n, T *arr, const T &x)
 {
-	unsigned long ju,jm,jl;
-	int ascnd;
+    unsigned long ju,jm,jl;
+    int ascnd;
 
-	jl = 0;
-	ju = n;
+    jl = 0;
+    ju = n;
 
-	ascnd = (arr[n-1] > arr[0]);
+    ascnd = (arr[n-1] > arr[0]);
 
-	while (ju-jl > 1)
-	{
-		jm = (ju+jl) >> 1;
+    while (ju-jl > 1) {
+	jm = (ju+jl) >> 1;
 
-		if (x >= arr[jm] == ascnd)
-			jl = jm;
-		else
-			ju = jm;
-	}
-
-	if (x <= arr[0])
-		return 0;
-	else if (x == arr[n-1])
-		return n-1;
+	if (x >= arr[jm] == ascnd)
+	    jl = jm;
 	else
-		return ju;
+	    ju = jm;
+    }
+
+    if (x <= arr[0])
+	return 0;
+    else if (x == arr[n-1])
+	return n-1;
+    else
+	return ju;
 }
 
 template <class T>
-__device__ void insert(unsigned long n, T *arr, unsigned long i, const T &x)
+__device__ void
+insert(unsigned long n, T *arr, unsigned long i, const T &x)
 {
-	unsigned long j;
-	for (j=n-1; j > i; j--)
-		arr[j] = arr[j-1];
-	arr[i] = x;
+    unsigned long j;
+    for (j=n-1; j > i; j--)
+	arr[j] = arr[j-1];
+    arr[i] = x;
 }
 
 template <class T>
-__device__ void add_sorted(unsigned long n, T *arr, const T &x)
+__device__ void
+add_sorted(unsigned long n, T *arr, const T &x)
 {
-	unsigned long i = searchsorted(n, arr, x);
+    unsigned long i = searchsorted(n, arr, x);
 
-	if (i < n)
-		insert(n, arr, i, x);
+    if (i < n)
+	insert(n, arr, i, x);
 }
diff --git a/src/tools.cu b/src/tools.cu
index 3d3fed7..80d06ec 100644
--- a/src/tools.cu
+++ b/src/tools.cu
@@ -3,15 +3,19 @@
 extern "C"
 {
 
-__global__ void interleave(int nthreads, unsigned long long *x, unsigned long long *y, unsigned long long *z, int bits, unsigned long long *dest)
+__global__ void
+interleave(int nthreads, unsigned long long *x, unsigned long long *y,
+	   unsigned long long *z, int bits, unsigned long long *dest)
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-	if (id >= nthreads)
-		return;
+    if (id >= nthreads)
+	return;
 
-	for (int i=0; i < bits; i++)
-		dest[id] |= (x[id] & 1 << i) << (2*i) | (y[id] & 1 << i) << (2*i+1) | (z[id] & 1 << i) << (2*i+2);
+    for (int i=0; i < bits; i++)
+	dest[id] |= (x[id] & 1 << i) << (2*i) |
+	            (y[id] & 1 << i) << (2*i+1) |
+	            (z[id] & 1 << i) << (2*i+2);
 }
 
-}
+} // extern "C"
diff --git a/src/transform.cu b/src/transform.cu
index 57bd509..1f4405e 100644
--- a/src/transform.cu
+++ b/src/transform.cu
@@ -7,41 +7,45 @@ extern "C"
 {
 
 /* Translate the points `a` by the vector `v` */
-__global__ void translate(int nthreads, float3 *a, float3 v)
+__global__ void
+translate(int nthreads, float3 *a, float3 v)
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-	if (id >= nthreads)
-		return;
+    if (id >= nthreads)
+	return;
 
-	a[id] += v;
+    a[id] += v;
 }
 
 /* Rotate the points `a` through an angle `phi` counter-clockwise about the
    axis `axis` (when looking towards +infinity). */
-__global__ void rotate(int nthreads, float3 *a, float phi, float3 axis)
+__global__ void
+rotate(int nthreads, float3 *a, float phi, float3 axis)
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-	if (id >= nthreads)
-		return;
+    if (id >= nthreads)
+	return;
 
-	a[id] = rotate(a[id], phi, axis);
+    a[id] = rotate(a[id], phi, axis);
 }
 
 /* Rotate the points `a` through an angle `phi` counter-clockwise
    (when looking towards +infinity along `axis`) about the axis defined
    by the point `point` and the vector `axis` . */
-__global__ void rotate_around_point(int nthreads, float3 *a, float phi, float3 axis, float3 point)
+__global__ void
+rotate_around_point(int nthreads, float3 *a, float phi, float3 axis,
+		    float3 point)
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
 
-	if (id >= nthreads)
-		return;
+    if (id >= nthreads)
+	return;
 
-	a[id] -= point;
-	a[id] = rotate(a[id], phi, axis);
-	a[id] += point;
+    a[id] -= point;
+    a[id] = rotate(a[id], phi, axis);
+    a[id] += point;
 }
 
 } // extern "c"
diff --git a/stl.py b/stl.py
index 0806c29..abf0cc9 100644
--- a/stl.py
+++ b/stl.py
@@ -5,7 +5,7 @@ from chroma.geometry import Mesh
 import bz2
 
 def mesh_from_stl(filename):
-    "Return a mesh from an stl file."
+    "Returns a `chroma.geometry.Mesh` from an STL file."
     if filename.endswith('.bz2'):
         f = bz2.BZ2File(filename)
     else:
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
index 791f119..c7c9394 100644
--- a/tests/test_pdf.py
+++ b/tests/test_pdf.py
@@ -20,7 +20,7 @@ class TestPDF(unittest.TestCase):
 
         g4generator = G4ParallelGenerator(1, water_wcsim)
         gpu_instance = gpu.GPU(0)
-        gpu_geometry = gpu.GPUGeometry(gpu_instance, self.detector)
+        gpu_geometry = gpu.GPUGeometry(self.detector)
 
         nthreads_per_block, max_blocks = 64, 1024
 
@@ -34,8 +34,10 @@ class TestPDF(unittest.TestCase):
 
         for ev in g4generator.generate_events(itertools.islice(self.vertex_gen, 10)):
             gpu_photons = gpu.GPUPhotons(ev.photons_beg)
-            gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks)
-            gpu_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks)
+            gpu_photons.propagate(gpu_geometry, rng_states, nthreads_per_block,
+                                  max_blocks)
+            gpu_channels = gpu_daq.acquire(gpu_photons, rng_states,
+                                           nthreads_per_block, max_blocks)
             gpu_pdf.add_hits_to_pdf(gpu_channels)
 
         hitcount, pdf = gpu_pdf.get_pdfs()
diff --git a/tests/test_propagation.py b/tests/test_propagation.py
index bf886bd..8f9d084 100644
--- a/tests/test_propagation.py
+++ b/tests/test_propagation.py
@@ -34,10 +34,12 @@ class TestPropagation(unittest.TestCase):
         wavelengths = np.empty(nphotons, np.float32)
         wavelengths.fill(400.0)
 
-        photons = Photons(pos=pos, dir=dir, pol=pol, t=t, wavelengths=wavelengths)
+        photons = Photons(pos=pos, dir=dir, pol=pol, t=t,
+                          wavelengths=wavelengths)
 
         # First make one step to check for strangeness
-        photons_end = sim.simulate([photons], keep_photons_end=True, max_steps=1).next().photons_end
+        photons_end = sim.simulate([photons], keep_photons_end=True,
+                                   max_steps=1).next().photons_end
 
         self.assertFalse(np.isnan(photons_end.pos).any())
         self.assertFalse(np.isnan(photons_end.dir).any())
@@ -46,8 +48,10 @@ class TestPropagation(unittest.TestCase):
         self.assertFalse(np.isnan(photons_end.wavelengths).any())
 
         # Now let it run the usual ten steps
-        photons_end = sim.simulate([photons], keep_photons_end=True, max_steps=10).next().photons_end
+        photons_end = sim.simulate([photons], keep_photons_end=True,
+                                   max_steps=10).next().photons_end
         aborted = (photons_end.flags & (1 << 31)) > 0
-        print 'aborted photons: %1.1f' % (float(np.count_nonzero(aborted)) / nphotons)
+        print 'aborted photons: %1.1f' % \
+            (float(np.count_nonzero(aborted)) / nphotons)
         self.assertFalse(aborted.any())