-rwxr-xr-x  benchmark.py                    |  205
-rwxr-xr-x  camera.py                       |  312
-rw-r--r--  detectors/lbne.py               |   10
-rw-r--r--  detectors/miniclean.py          |    4
-rw-r--r--  event.py                        |  102
-rw-r--r--  fileio/root.C                   |  209
-rw-r--r--  fileio/root.py                  |  186
-rw-r--r--  generator/g4gen.py              |   96
-rw-r--r--  generator/photon.py             |   52
-rw-r--r--  generator/vertex.py             |   75
-rw-r--r--  geometry.py                     |    3
-rw-r--r--  gpu.py                          |  655
-rw-r--r--  gputhread.py                    |   95
-rw-r--r--  itertoolset.py                  |   51
-rw-r--r--  likelihood.py                   |   54
-rw-r--r--  optics.py                       |    2
-rw-r--r--  project.py                      |   28
-rwxr-xr-x  sim.py                          |  283
-rw-r--r--  src/alpha.h                     |   89
-rw-r--r--  src/kernel.cu                   |   60
-rw-r--r--  src/sorting.h                   |  103
-rw-r--r--  src/transform.cu                |   47
-rw-r--r--  tests/test_fileio.py            |   66
-rw-r--r--  tests/test_generator_photon.py  |   21
-rw-r--r--  tests/test_generator_vertex.py  |   16
-rw-r--r--  tests/test_pdf.py               |   40
-rw-r--r--  tests/test_propagation.py       |   42
-rw-r--r--  tests/test_rayleigh.py          |   36
-rw-r--r--  threadtest.py                   |   94
-rw-r--r--  tools.py                        |   18
30 files changed, 1570 insertions, 1484 deletions
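
The benchmark.py diff below replaces the monolithic chroma.gpu.GPU interface (load_geometry, load_photons, propagate methods) with per-component wrappers (GPUGeometry, GPUPhotons, GPURays, GPUDaq, GPUPDF) plus free functions such as gpu.propagate and gpu.get_rng_states. The following minimal sketch of the new calling sequence is assembled only from calls that appear in this diff; the photon count and launch parameters are illustrative values, not project defaults.

    import numpy as np
    from chroma import gpu, event, sample, detectors

    # Geometry is built as before, then uploaded through the new GPUGeometry wrapper.
    lbne = detectors.build_lbne_200kton()
    lbne.build(bits=11)
    gpu_instance = gpu.GPU()
    gpu_geometry = gpu.GPUGeometry(gpu_instance, lbne)

    # Photons are plain arrays wrapped by GPUPhotons; polarization is now required.
    nphotons = 100000  # illustrative
    pos = np.zeros((nphotons, 3))
    dir = sample.uniform_sphere(nphotons)
    pol = np.cross(sample.uniform_sphere(nphotons), dir)
    pol /= np.apply_along_axis(np.linalg.norm, 1, pol)[:, np.newaxis]
    wavelengths = np.random.uniform(400, 800, size=nphotons)
    gpu_photons = gpu.GPUPhotons(event.Photons(pos, dir, pol, wavelengths))

    # Propagation is a free function taking explicit RNG states and launch sizes.
    nthreads_per_block, max_blocks = 64, 1024
    rng_states = gpu.get_rng_states(nthreads_per_block * max_blocks)
    gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks)
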
diff --git a/benchmark.py b/benchmark.py index 4c46a1f..f839f61 100755 --- a/benchmark.py +++ b/benchmark.py @@ -1,95 +1,61 @@ #!/usr/bin/env python import numpy as np +import pycuda.driver as cuda from pycuda import gpuarray as ga import time from uncertainties import ufloat import sys - -from chroma.gpu import GPU, to_float3 -from chroma.camera import get_rays -from chroma.event import Photons -from chroma.sample import uniform_sphere -from chroma.generator.photon import G4ParallelGenerator -from chroma.optics import water_wcsim -from chroma.generator.vertex import constant_particle_gun -from chroma.tools import profile_if_possible +import itertools + +from chroma import gpu +from chroma import camera +from chroma import event +from chroma import sample +from chroma import generator +from chroma import tools +from chroma import optics # Generator processes need to fork BEFORE the GPU context is setup -g4generator = G4ParallelGenerator(4, water_wcsim) - -def progress(seq): - "Print progress while iterating over `seq`." - n = len(seq) - print '[' + ' '*21 + ']\r[', - sys.stdout.flush() - for i, item in enumerate(seq): - if i % (n//10) == 0: - print '.', - sys.stdout.flush() - yield item - print ']' - sys.stdout.flush() - -def ray_trace(gpu, number=1000): - """ - Return the number of mean and standard deviation of the number of ray - intersections per second as a ufloat for the geometry loaded onto `gpu`. - - .. note:: - The rays are thrown from a camera sitting *outside* of the geometry. +g4generator = generator.photon.G4ParallelGenerator(4, optics.water_wcsim) - Args: - - gpu, chroma.gpu.GPU - The GPU object with a geometry already loaded. - - number, int - The number of kernel calls to average. - """ - lb, ub = gpu.geometry.mesh.get_bounds() - scale = np.linalg.norm(ub-lb) - point = [0,scale,(lb[2]+ub[2])/2] +def intersect(gpu_instance, number=100, nphotons=500000, nthreads_per_block=64, max_blocks=1024): + "Returns the average number of ray intersections per second." + distances_gpu = ga.empty(nphotons, dtype=np.float32) - size = (800,600) - width, height = size + run_times = [] + for i in tools.progress(range(number)): + pos = np.zeros((nphotons,3), dtype=np.float32) + dir = sample.uniform_sphere(nphotons) - origins, directions = get_rays(point, size, 0.035, focal_length=0.018) + gpu_rays = gpu.GPURays(pos, dir) - origins_gpu = ga.to_gpu(to_float3(origins)) - directions_gpu = ga.to_gpu(to_float3(directions)) - pixels_gpu = ga.zeros(width*height, dtype=np.int32) + gpu_funcs = gpu.GPUFuncs(gpu_instance.module) - run_times = [] - for i in progress(range(number)): t0 = time.time() - gpu.kernels.ray_trace(np.int32(pixels_gpu.size), origins_gpu, directions_gpu, pixels_gpu, block=(gpu.nthreads_per_block,1,1), grid=(pixels_gpu.size//gpu.nthreads_per_block+1,1)) - gpu.context.synchronize() + gpu_funcs.distance_to_mesh(np.int32(gpu_rays.pos.size), gpu_rays.pos, gpu_rays.dir, distances_gpu, block=(nthreads_per_block,1,1), grid=(gpu_rays.pos.size//nthreads_per_block+1,1)) + cuda.Context.get_current().synchronize() elapsed = time.time() - t0 if i > 0: # first kernel call incurs some driver overhead run_times.append(elapsed) - return pixels_gpu.size/ufloat((np.mean(run_times),np.std(run_times))) + return gpu_rays.pos.size/ufloat((np.mean(run_times),np.std(run_times))) -def load_photons(gpu, number=10, nphotons=500000): - """ - Return the mean and standard deviation of the number of photons loaded - onto `gpu` per second. 
- - Args: - - gpu, chroma.gpu.GPU - The GPU object with a geometry already loaded. - - number, int - The number of loads to average - - nphotons, int - The number of photons to load per trial - """ - gpu.setup_propagate() - photons = Photons(np.zeros((nphotons,3)), uniform_sphere(nphotons), np.random.uniform(400,800,size=nphotons)) +def load_photons(number=100, nphotons=500000): + """Returns the average number of photons moved to the GPU device memory + per second.""" + pos = np.zeros((nphotons,3)) + dir = sample.uniform_sphere(nphotons) + pol = np.cross(sample.uniform_sphere(nphotons), dir) + pol /= np.apply_along_axis(np.linalg.norm,1,pol)[:,np.newaxis] + wavelengths = np.random.uniform(400,800,size=nphotons) + photons = event.Photons(pos, dir, pol, wavelengths) run_times = [] - for i in progress(range(number)): + for i in tools.progress(range(number)): t0 = time.time() - gpu.load_photons(photons) - gpu.context.synchronize() + gpu_photons = gpu.GPUPhotons(photons) + cuda.Context.get_current().synchronize() elapsed = time.time() - t0 if i > 0: @@ -98,28 +64,23 @@ def load_photons(gpu, number=10, nphotons=500000): return nphotons/ufloat((np.mean(run_times),np.std(run_times))) -def propagate(gpu, number=10, nphotons=500000): - """ - Return the mean and standard deviation of the number of photons propagated - per second as a ufloat for the geometry loaded onto `gpu`. - - Args: - - gpu, chroma.gpu.GPU - The GPU object with a geometry already loaded. - - number, int - The number of kernel calls to average. - - nphotons, int - The number of photons to propagate per kernel call. - """ - gpu.setup_propagate() +def propagate(gpu_instance, number=10, nphotons=500000, nthreads_per_block=64, max_blocks=1024): + "Returns the average number of photons propagated on the GPU per second." + rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks) run_times = [] - for i in progress(range(number)): - photons = Photons(np.zeros((nphotons,3)), uniform_sphere(nphotons), np.random.uniform(400,800,size=nphotons)) - gpu.load_photons(photons) + for i in tools.progress(range(number)): + pos = np.zeros((nphotons,3)) + dir = sample.uniform_sphere(nphotons) + pol = np.cross(sample.uniform_sphere(nphotons), dir) + pol /= np.apply_along_axis(np.linalg.norm,1,pol)[:,np.newaxis] + wavelengths = np.random.uniform(400,800,size=nphotons) + photons = event.Photons(pos, dir, pol, wavelengths) + gpu_photons = gpu.GPUPhotons(photons) + t0 = time.time() - gpu.propagate() - gpu.context.synchronize() + gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks) + cuda.Context.get_current().synchronize() elapsed = time.time() - t0 if i > 0: @@ -128,16 +89,15 @@ def propagate(gpu, number=10, nphotons=500000): return nphotons/ufloat((np.mean(run_times),np.std(run_times))) -@profile_if_possible -def pdf(gpu, max_pmt_id, npdfs=10, nevents=100, nreps=1): +@tools.profile_if_possible +def pdf(gpu_instance, gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1, nthreads_per_block=64, max_blocks=1024): """ - Return the mean and standard deviation of the number of 100 MeV - events per second that can be histogrammed a ufloat for the - geometry loaded onto `gpu`. + Returns the average number of 100 MeV events per second that can be + histogrammed per second. Args: - - gpu, chroma.gpu.GPU - The GPU object with a geometry already loaded. + - gpu_instance, chroma.gpu.GPU + The GPU instance passed to the GPUGeometry constructor. 
- max_pmt_id, int The channel number of the highest PMT - npdfs, int @@ -147,27 +107,28 @@ def pdf(gpu, max_pmt_id, npdfs=10, nevents=100, nreps=1): - nreps, int The number of times to propagate each event and add to PDF """ - gpu.setup_propagate() - - run_times = [] + rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks) - gpu.setup_propagate() - gpu.setup_daq(max_pmt_id) - gpu.setup_pdf(max_pmt_id, 100, (-0.5, 999.5), 10, (-0.5, 9.5)) - vertex_gen = constant_particle_gun('e-', (0,0,0), (1,0,0), 100) + gpu_daq = gpu.GPUDaq(gpu_geometry, max_pmt_id) + gpu_pdf = gpu.GPUPDF() + gpu_pdf.setup_pdf(max_pmt_id, 100, (-0.5, 999.5), 10, (-0.5, 9.5)) - for i in progress(range(npdfs)): + run_times = [] + for i in tools.progress(range(npdfs)): t0 = time.time() - gpu.clear_pdf() + gpu_pdf.clear_pdf() + + vertex_gen = generator.vertex.constant_particle_gun('e-', (0,0,0), (1,0,0), 100) + vertex_iter = itertools.islice(vertex_gen, nevents) - for ev in g4generator.generate_events(nevents, vertex_gen): + for ev in g4generator.generate_events(vertex_iter): for j in xrange(nreps): - gpu.load_photons(ev.photon_start) - gpu.propagate() - gpu.run_daq() - gpu.add_hits_to_pdf() + gpu_photons = gpu.GPUPhotons(ev.photons_beg) + gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks) + gpu_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks) + gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block) - hitcount, pdf = gpu.get_pdfs() + hitcount, pdf = gpu_pdf.get_pdfs() elapsed = time.time() - t0 @@ -178,15 +139,23 @@ def pdf(gpu, max_pmt_id, npdfs=10, nevents=100, nreps=1): return nevents*nreps/ufloat((np.mean(run_times),np.std(run_times))) if __name__ == '__main__': - from chroma.detectors import build_lbne_200kton, build_minilbne + from chroma import detectors + import gc - lbne = build_lbne_200kton() + lbne = detectors.build_lbne_200kton() lbne.build(bits=11) - gpu = GPU() - gpu.load_geometry(lbne, print_usage=False) + gpu_instance = gpu.GPU() + gpu_geometry = gpu.GPUGeometry(gpu_instance, lbne) - print '%s track steps/s' % ray_trace(gpu) - print '%s loaded photons/s' % load_photons(gpu) - print '%s propagated photons/s' % propagate(gpu) - print '%s histogrammed 100 MeV events/s' % pdf(gpu, max(lbne.pmtids)) + gpu_instance.print_mem_info() + print '%s ray intersections/sec.' % tools.ufloat_to_str(intersect(gpu_instance)) + gc.collect() + gpu_instance.print_mem_info() + print '%s photons loaded/sec.' % tools.ufloat_to_str(load_photons()) + gc.collect() + gpu_instance.print_mem_info() + print '%s photons propagated/sec.' % tools.ufloat_to_str(propagate(gpu_instance)) + gc.collect() + gpu_instance.print_mem_info() + print '%s 100 MeV events histogrammed/s' % tools.ufloat_to_str(pdf(gpu_instance, gpu_geometry, max(lbne.pmtids))) @@ -9,15 +9,17 @@ from chroma.geometry import Mesh, Solid, Geometry import chroma.make as make import matplotlib.pyplot as plt -from chroma.gpu import GPU, CUDAFuncs +from chroma import gpu from chroma.tools import timeit from chroma.transform import rotate from chroma.optics import vacuum, lambertian_surface +import chroma.project as project +from chroma.fileio.root import RootReader import pygame from pygame.locals import * -from pycuda import gpuarray +from pycuda import gpuarray as ga from pycuda.characterize import sizeof import pycuda.driver as cuda @@ -98,28 +100,14 @@ def encode_movie(dir): print 'movie saved to %s.' 
% filename -def get_rays(position, size = (800, 600), width = 0.035, focal_length=0.018): - height = width*(size[1]/float(size[0])) - - x = np.linspace(-width/2, width/2, size[0]) - z = np.linspace(-height/2, height/2, size[1]) - - grid = np.array(tuple(product(x,[0],z))) - - grid += (0,focal_length,0) - focal_point = np.zeros(3) - - grid += position - focal_point += position - - return grid, focal_point-grid - class Camera(Thread): - def __init__(self, geometry, size=(800,600), device_id=None): + def __init__(self, geometry, size=(800,600), device_id=None, enable3d=False, green_magenta=False): Thread.__init__(self) self.geometry = geometry self.device_id = device_id self.size = size + self.enable3d = enable3d + self.green_magenta = green_magenta self.unique_bvh_layers = np.unique(self.geometry.layers) self.currentlayer = None @@ -133,11 +121,14 @@ class Camera(Thread): self.spnav = False def init_gpu(self): - self.gpu = GPU(self.device_id) - self.gpu.load_geometry(geometry) + self.gpu_instance = gpu.GPU(self.device_id) + self.gpu_geometry = gpu.GPUGeometry(self.gpu_instance, geometry) + self.gpu_funcs = gpu.GPUFuncs(self.gpu_instance.module) self.width, self.height = size + self.npixels = self.width*self.height + pygame.init() self.screen = pygame.display.set_mode(size) pygame.display.set_caption('') @@ -159,15 +150,38 @@ class Camera(Thread): self.nblocks = 64 - self.point = np.array([0, self.scale*1.0, (lower_bound[2]+upper_bound[2])/2]) - self.axis1 = np.array([1,0,0], float) - self.axis2 = np.array([0,0,1], float) + self.point = np.array([0, -self.scale*1.0, (lower_bound[2]+upper_bound[2])/2]) + self.axis1 = np.array([0,0,1], float) + self.axis2 = np.array([1,0,0], float) + + self.film_width = 0.035 + + if self.enable3d: + self.point1 = self.point-(self.scale/60,0,0) + self.point2 = self.point+(self.scale/60,0,0) + + self.viewing_angle = 0.0 - origins, directions = get_rays(self.point, self.size) + pos1, dir1 = project.from_film(self.point1, size=self.size, width=self.film_width) + pos2, dir2 = project.from_film(self.point2, size=self.size, width=self.film_width) - self.origins_gpu = gpuarray.to_gpu(origins.astype(np.float32).view(gpuarray.vec.float3)) - self.directions_gpu = gpuarray.to_gpu(directions.astype(np.float32).view(gpuarray.vec.float3)) - self.pixels_gpu = gpuarray.zeros(self.width*self.height, dtype=np.int32) + self.rays1 = gpu.GPURays(pos1, dir1) + self.rays2 = gpu.GPURays(pos2, dir2) + + scope_pos, scope_dir = project.from_film(self.point, size=np.array(self.size)/4.0, width=self.film_width/4.0) + + self.scope_rays = gpu.GPURays(scope_pos, scope_dir) + + self.pixels1_gpu = ga.empty(self.width*self.height, dtype=np.int32) + self.pixels2_gpu = ga.empty(self.width*self.height, dtype=np.int32) + + self.distances_gpu = ga.empty(self.scope_rays.pos.size, dtype=np.float32) + else: + pos, dir = project.from_film(self.point, size=self.size, width=self.film_width) + + self.rays = gpu.GPURays(pos, dir) + + self.pixels_gpu = ga.empty(self.npixels, dtype=np.int32) self.alpha = True self.movie = False @@ -177,12 +191,15 @@ class Camera(Thread): @timeit def initialize_render(self): - self.rng_states_gpu = cuda.mem_alloc(self.width*self.height*sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - self.gpu.kernels.init_rng(np.int32(self.width*self.height), self.rng_states_gpu, np.int32(0), np.int32(0), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) - self.xyz_lookup1_gpu = gpuarray.zeros(len(self.geometry.mesh.triangles), dtype=gpuarray.vec.float3) - 
self.xyz_lookup2_gpu = gpuarray.zeros(len(self.geometry.mesh.triangles), dtype=gpuarray.vec.float3) - self.image_gpu = gpuarray.zeros(self.width*self.height, dtype=gpuarray.vec.float3) - self.gpu.context.synchronize() + self.rng_states_gpu = gpu.get_rng_states(self.npixels) + self.xyz_lookup1_gpu = ga.zeros(len(self.geometry.mesh.triangles), dtype=ga.vec.float3) + self.xyz_lookup2_gpu = ga.zeros(len(self.geometry.mesh.triangles), dtype=ga.vec.float3) + + if self.enable3d: + self.image1_gpu = ga.zeros(self.npixels, dtype=ga.vec.float3) + self.image2_gpu = ga.zeros(self.npixels, dtype=ga.vec.float3) + else: + self.image_gpu = ga.zeros(self.npixels, dtype=ga.vec.float3) self.source_position = self.point @@ -191,39 +208,48 @@ class Camera(Thread): self.max_steps = 10 def clear_xyz_lookup(self): - self.xyz_lookup1_gpu.fill(gpuarray.vec.make_float3(0.0,0.0,0.0)) - self.xyz_lookup2_gpu.fill(gpuarray.vec.make_float3(0.0,0.0,0.0)) + self.xyz_lookup1_gpu.fill(ga.vec.make_float3(0.0,0.0,0.0)) + self.xyz_lookup2_gpu.fill(ga.vec.make_float3(0.0,0.0,0.0)) self.nlookup_calls = 0 def update_xyz_lookup(self, source_position): - for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1): - self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) - - for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1): - self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) - - for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1): - self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + for wavelength, rgb_tuple in \ + zip([685.0, 545.0, 445.0],[(1,0,0),(0,1,0),(0,0,1)]): + for i in range(self.xyz_lookup1_gpu.size//(self.npixels)+1): + self.gpu_funcs.update_xyz_lookup(np.int32(self.npixels), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.npixels), ga.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(wavelength), ga.vec.make_float3(*rgb_tuple), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.npixels//self.nblocks+1,1)) self.nlookup_calls += 1 def clear_image(self): - self.image_gpu.fill(gpuarray.vec.make_float3(0.0,0.0,0.0)) + if self.enable3d: + self.image1_gpu.fill(ga.vec.make_float3(0.0,0.0,0.0)) + self.image2_gpu.fill(ga.vec.make_float3(0.0,0.0,0.0)) + else: + self.image_gpu.fill(ga.vec.make_float3(0.0,0.0,0.0)) self.nimages = 0 - def update_image(self): - self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, 
self.origins_gpu, self.directions_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) - - self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + def update_image_from_rays(self, image, rays): + for wavelength, rgb_tuple in \ + zip([685.0, 545.0, 445.0],[(1,0,0),(0,1,0),(0,0,1)]): + self.gpu_funcs.update_xyz_image(np.int32(rays.pos.size), self.rng_states_gpu, rays.pos, rays.dir, np.float32(wavelength), ga.vec.make_float3(*rgb_tuple), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, image, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(rays.pos.size//self.nblocks+1,1)) - self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + def update_image(self): + if self.enable3d: + self.update_image_from_rays(self.image1_gpu, self.rays1) + self.update_image_from_rays(self.image2_gpu, self.rays2) + else: + self.update_image_from_rays(self.image_gpu, self.rays) self.nimages += 1 def process_image(self): - self.gpu.kernels.process_image(np.int32(self.width*self.height), self.image_gpu, self.pixels_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.width*self.height)//self.nblocks+1,1)) + if self.enable3d: + self.gpu_funcs.process_image(np.int32(self.pixels1_gpu.size), self.image1_gpu, self.pixels1_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels1_gpu.size)//self.nblocks+1,1)) + self.gpu_funcs.process_image(np.int32(self.pixels2_gpu.size), self.image2_gpu, self.pixels2_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels2_gpu.size)//self.nblocks+1,1)) + else: + self.gpu_funcs.process_image(np.int32(self.pixels_gpu.size), self.image_gpu, self.pixels_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.pixels_gpu.size)//self.nblocks+1,1)) def screenshot(self, dir='', start=0): root, ext = 'screenshot', 'png' @@ -238,8 +264,15 @@ class Camera(Thread): print 'image saved to %s' % filename def rotate(self, phi, n): - self.gpu.kernels.rotate(np.int32(self.pixels_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) - self.gpu.kernels.rotate(np.int32(self.pixels_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) + if self.enable3d: + self.rays1.rotate(phi, n) + self.rays2.rotate(phi, n) + self.scope_rays.rotate(phi, n) + + self.point1 = rotate(self.point1, phi, n) + self.point2 = rotate(self.point2, phi, n) + else: + self.rays.rotate(phi, n) self.point = rotate(self.point, phi, n) self.axis1 = rotate(self.axis1, phi, n) @@ -251,12 +284,16 @@ class Camera(Thread): self.update() 
def rotate_around_point(self, phi, n, point, redraw=True): - self.gpu.kernels.rotate_around_point(np.int32(self.origins_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), gpuarray.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.origins_gpu.size//self.nblocks+1,1)) - self.gpu.kernels.rotate(np.int32(self.directions_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.directions_gpu.size//self.nblocks+1,1)) - self.axis1 = rotate(self.axis1, phi, n) self.axis2 = rotate(self.axis2, phi, n) + if self.enable3d: + self.rays1.rotate_around_point(phi, n, point) + self.rays2.rotate_around_point(phi, n, point) + self.scope_rays.rotate_around_point(phi, n, point) + else: + self.rays.rotate_around_point(phi, n, point) + if redraw: if self.render: self.clear_image() @@ -264,17 +301,25 @@ class Camera(Thread): self.update() def translate(self, v, redraw=True): - self.gpu.kernels.translate(np.int32(self.pixels_gpu.size), self.origins_gpu, gpuarray.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks,1)) - self.point += v + if self.enable3d: + self.rays1.translate(v) + self.rays2.translate(v) + self.scope_rays.translate(v) + + self.point1 += v + self.point2 += v + else: + self.rays.translate(v) + if redraw: if self.render: self.clear_image() self.update() - def update(self): + def update_pixels(self): if self.render: while self.nlookup_calls < 10: self.update_xyz_lookup(self.source_position) @@ -282,11 +327,69 @@ class Camera(Thread): self.process_image() else: if self.alpha: - self.gpu.kernels.ray_trace_alpha(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) + if self.enable3d: + self.gpu_funcs.ray_trace_alpha(np.int32(self.rays1.pos.size), self.rays1.pos, self.rays1.dir, self.pixels1_gpu, block=(self.nblocks,1,1), grid=(self.rays2.pos.size//self.nblocks+1,1)) + self.gpu_funcs.ray_trace_alpha(np.int32(self.rays2.pos.size), self.rays2.pos, self.rays2.dir, self.pixels2_gpu, block=(self.nblocks,1,1), grid=(self.rays2.pos.size//self.nblocks+1,1)) + else: + self.gpu_funcs.ray_trace_alpha(np.int32(self.rays.pos.size), self.rays.pos, self.rays.dir, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.rays.pos.size//self.nblocks+1,1)) + else: + if self.enable3d: + self.gpu_funcs.ray_trace(np.int32(self.rays1.pos.size), self.rays1.pos, self.rays1.dir, self.pixels1_gpu, block=(self.nblocks,1,1), grid=(self.rays1.pos.size//self.nblocks+1,1)) + self.gpu_funcs.ray_trace(np.int32(self.rays2.pos.size), self.rays2.pos, self.rays2.dir, self.pixels2_gpu, block=(self.nblocks,1,1), grid=(self.rays2.pos.size//self.nblocks+1,1)) + else: + self.gpu_funcs.ray_trace(np.int32(self.rays.pos.size), self.rays.pos, self.rays.dir, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.rays.pos.size//self.nblocks+1,1)) + + def update(self): + if self.enable3d: + self.gpu_funcs.distance_to_mesh(np.int32(self.scope_rays.pos.size), self.scope_rays.pos, self.scope_rays.dir, self.distances_gpu, block=(self.nblocks,1,1), grid=(self.scope_rays.pos.size//self.nblocks,1)) + + baseline = ga.min(self.distances_gpu).get().item() + + if baseline < 1e9: + d1 = self.point1 - self.point + v1 = d1/np.linalg.norm(d1) + v1 *= baseline/60 - np.linalg.norm(d1) + + self.rays1.translate(v1) + + self.point1 += v1 + + d2 = self.point2 - self.point + v2 = d2/np.linalg.norm(d2) + v2 *= baseline/60 - np.linalg.norm(d2) + 
+ self.rays2.translate(v2) + + self.point2 += v2 + + direction = np.cross(self.axis1,self.axis2) + direction /= np.linalg.norm(direction) + direction1 = self.point + direction*baseline - self.point1 + direction1 /= np.linalg.norm(direction1) + + new_viewing_angle = np.arccos(direction1.dot(direction)) + + phi = new_viewing_angle - self.viewing_angle + + self.rays1.rotate_around_point(phi, self.axis1, self.point1) + self.rays2.rotate_around_point(-phi, self.axis1, self.point2) + + self.viewing_angle = new_viewing_angle + + self.update_pixels() + + if self.enable3d: + pixels1 = self.pixels1_gpu.get() + pixels2 = self.pixels2_gpu.get() + + if self.green_magenta: + pixels = (pixels1 & 0x00ff00) | (pixels2 & 0xff00ff) else: - self.gpu.kernels.ray_trace(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) + pixels = (pixels1 & 0xff0000) | (pixels2 & 0x00ffff) + else: + pixels = self.pixels_gpu.get() - pygame.surfarray.blit_array(self.screen, self.pixels_gpu.get().reshape(self.size)) + pygame.surfarray.blit_array(self.screen, pixels.reshape(self.size)) if self.doom_mode: self.screen.blit(self.doom_hud, self.doom_rect) pygame.display.flip() @@ -298,11 +401,11 @@ class Camera(Thread): def loadlayer(self, layer): try: layergeometry = self.bvh_layers[layer] + layergeometry.activate() except KeyError: layergeometry = build(bvh_mesh(self.geometry, layer),8) - self.bvh_layers[layer] = layergeometry + self.bvh_layers[layer] = gpu.GPUGeometry(self.gpu, layergeometry) - self.gpu.load_geometry(layergeometry) self.update() def process_event(self, event): @@ -331,15 +434,15 @@ class Camera(Thread): length = np.linalg.norm(movement) - mouse_direction = movement[0]*self.axis1 + movement[1]*self.axis2 + mouse_direction = movement[0]*self.axis2 - movement[1]*self.axis1 mouse_direction /= np.linalg.norm(mouse_direction) if pygame.key.get_mods() & (KMOD_LSHIFT | KMOD_RSHIFT): - v = mouse_direction*self.scale*length/float(self.width) + v = -mouse_direction*self.scale*length/float(self.width) self.translate(v) else: phi = np.float32(2*np.pi*length/float(self.width)) - n = rotate(mouse_direction, np.pi/2, -np.cross(self.axis1,self.axis2)) + n = rotate(mouse_direction, np.pi/2, np.cross(self.axis1,self.axis2)) if pygame.key.get_mods() & KMOD_LCTRL: self.rotate_around_point(phi, n, self.point) @@ -348,11 +451,11 @@ class Camera(Thread): elif event.type == KEYDOWN: if event.key == K_a: - v = self.scale*self.axis1/10.0 + v = self.scale*self.axis2/10.0 self.translate(v) elif event.key == K_d: - v = -self.scale*self.axis1/10.0 + v = -self.scale*self.axis2/10.0 self.translate(v) elif event.key == K_w: @@ -363,10 +466,6 @@ class Camera(Thread): v = -self.scale*np.cross(self.axis1,self.axis2)/10.0 self.translate(v) - elif event.key == K_SPACE: - v = self.scale*self.axis2/10.0 - self.translate(v) - elif event.key == K_F6: self.clear_xyz_lookup() self.clear_image() @@ -443,10 +542,10 @@ class Camera(Thread): else: accelerate_factor = 1.0 #print 'raw:', spnav_event - v1 = self.axis1 - v2 = self.axis2 - v3 = np.cross(self.axis1,self.axis2) - + v1 = self.axis2 + v2 = self.axis1 + v3 = -np.cross(self.axis1,self.axis2) + v = -v1*spnav_event.motion.x + v2*spnav_event.motion.y \ + v3*spnav_event.motion.z v *= self.scale / 5000.0 * accelerate_factor @@ -502,12 +601,10 @@ class Camera(Thread): self.spnav = False self.update() - + self.done = False self.clicked = False - #current_layer = 0 - while not self.done: self.clock.tick(20) 
@@ -525,53 +622,44 @@ class Camera(Thread): if self.spnav: self.spnav_module.spnav_close() - del self.gpu + del self.gpu_instance class EventViewer(Camera): def __init__(self, geometry, filename, **kwargs): Camera.__init__(self, geometry, **kwargs) - - import ROOT - - self.f = ROOT.TFile(filename) - self.T = self.f.Get('T') - self.T.GetEntry(0) - self.nsolids = geometry.solid_id.max() + 1 + self.rr = RootReader(filename) def color_hit_pmts(self): - self.gpu.reset_colors() - - hit = np.empty(self.nsolids, np.int32) - t = np.empty(self.nsolids, np.float32) - q = np.empty(self.nsolids, np.float32) - - # self.nsolids has a weird data type that PyROOT doesn't understand - self.T.ev.get_channels(int(self.nsolids), hit, t, q) + self.gpu_geometry.reset_colors() - # PyROOT prints warnings when we try to pass a bool array directly - # so we convert afterward - hit = hit.astype(np.bool) + hit = self.ev.channels.hit + t = self.ev.channels.t + q = self.ev.channels.q # Important: Compute range only with HIT channels solid_colors = map_to_color(q, range=(q[hit].min(),q[hit].max())) - self.gpu.color_solids(hit, solid_colors) + self.gpu_geometry.color_solids(hit, solid_colors) self.update() def process_event(self, event): if event.type == KEYDOWN: if event.key == K_PAGEUP: - entry = self.T.GetReadEntry() - if entry < self.T.GetEntries() - 1: - self.T.GetEntry(entry+1) + try: + self.ev = self.rr.next() + except StopIteration: + pass + else: self.color_hit_pmts() - return + return elif event.key == K_PAGEDOWN: - entry = self.T.GetReadEntry() - if entry > 0: - self.T.GetEntry(entry-1) + try: + self.ev = self.rr.prev() + except StopIteration: + pass + else: self.color_hit_pmts() - return + return Camera.process_event(self, event) @@ -589,6 +677,10 @@ if __name__ == '__main__': help='bits for z-ordering space axes', default=10) parser.add_option('-r', '--resolution', dest='resolution', help='specify resolution', default='1024,576') + parser.add_option('--3d', action='store_true', dest='enable3d', + help='enable 3d', default=False) + parser.add_option('--green', action='store_true', dest='green_magenta', + help='3d with green and magenta lenses', default=False) parser.add_option('-i', dest='io_file', default=None) options, args = parser.parse_args() @@ -619,9 +711,9 @@ if __name__ == '__main__': geometry = build(obj, options.bits) if options.io_file is not None: - camera = EventViewer(geometry, options.io_file, size=size) + camera = EventViewer(geometry, options.io_file, size=size, enable3d=options.enable3d, green_magenta=options.green_magenta) else: - camera = Camera(geometry, size) + camera = Camera(geometry, size, enable3d=options.enable3d, green_magenta=options.green_magenta) camera.start() camera.join() diff --git a/detectors/lbne.py b/detectors/lbne.py index ca13c2d..97d09e9 100644 --- a/detectors/lbne.py +++ b/detectors/lbne.py @@ -6,18 +6,18 @@ from chroma.transform import rotate, make_rotation_matrix from itertools import product import chroma.make as make -def build_lbne(radius, height, nstrings, pmts_per_string, endcap_spacing, physical_model=True): +def build_lbne(radius, height, nstrings, pmts_per_string, endcap_spacing, detector_material=water_wcsim, physical_model=True): if physical_model: - pmt = solids.build_12inch_pmt_with_lc(outer_material=water_wcsim) + pmt = solids.build_12inch_pmt_with_lc(outer_material=detector_material) else: - pmt = solids.build_12inch_pmt_shell(outer_material=water_wcsim) + pmt = solids.build_12inch_pmt_shell(outer_material=detector_material) - lbne = Geometry() + lbne 
= Geometry(detector_material) # outer cylinder cylinder_mesh = make.segmented_cylinder(radius, height+height/(pmts_per_string-1), nsteps=16*nstrings, nsegments=1200) cylinder_mesh.vertices = rotate(cylinder_mesh.vertices, np.pi/2, (-1,0,0)) - lbne.add_solid(Solid(cylinder_mesh, water_wcsim, vacuum, black_surface, 0xff0000ff)) + lbne.add_solid(Solid(cylinder_mesh, detector_material, vacuum, black_surface, 0xff0000ff)) lbne.pmtids = [] diff --git a/detectors/miniclean.py b/detectors/miniclean.py index b0d4955..92dd019 100644 --- a/detectors/miniclean.py +++ b/detectors/miniclean.py @@ -42,7 +42,7 @@ def build_miniclean(real_av=False): geo = Geometry() simple_iv = sphere(0.818) - geo.add_solid(Solid(simple_iv, liquid_argon, vacuum, color=0x33FF0000)) + geo.add_solid(Solid(simple_iv, liquid_argon, vacuum, color=0xffFF0000)) polygons = read_polygons(os.path.join(dir, 'miniclean_polygons.txt')) @@ -59,7 +59,7 @@ def build_miniclean(real_av=False): triangles = mesh.assemble(group=True) triangle_centroids = triangles.mean(axis=1) face = triangle_centroids[:,2] < 0.1 - colors[face] = 0x1100FF00 + colors[face] = 0xffffff polygon_types[polygon_id] = (mesh, colors) @@ -1,69 +1,73 @@ import numpy as np -from chroma.sample import uniform_sphere +class Vertex(object): + def __init__(self, particle_name, pos, dir, pol, ke, t0=0.0): + self.particle_name = particle_name + self.pos = pos + self.dir = dir + self.pol = pol + self.ke = ke + self.t0 = t0 class Photons(object): - def __init__(self, positions, directions, wavelengths, polarizations=None, times=None, last_hit_triangles=None, histories=None): - self.positions = positions - - nphotons = len(positions) - - assert len(directions) == nphotons - self.directions = directions - - assert len(wavelengths) == nphotons + def __init__(self, pos, dir, pol, wavelengths, t=None, last_hit_triangles=None, flags=None): + self.pos = pos + self.dir = dir + self.pol = pol self.wavelengths = wavelengths - if polarizations is not None: - self.polarizations = polarizations + if t is None: + self.t = np.zeros(len(pos), dtype=np.float32) else: - # polarizations are isotropic in plane perpendicular - # to the direction vectors - polarizations = np.cross(directions, uniform_sphere(nphotons)) - # normalize polarization vectors - polarizations /= np.tile(np.apply_along_axis(np.linalg.norm,1,polarizations),[3,1]).transpose() - self.polarizations = np.asarray(polarizations, order='C') + self.t = t - self.times = times - self.last_hit_triangles = last_hit_triangles - self.histories = histories + if last_hit_triangles is None: + self.last_hit_triangles = np.empty(len(pos), dtype=np.int32) + self.last_hit_triangles.fill(-1) + else: + self.last_hit_triangles = last_hit_triangles + + if flags is None: + self.flags = np.zeros(len(pos), dtype=np.uint32) + else: + self.flags = flags -def concatenate_photons(photons): - '''Merge a list of Photons objects into one long list of photons''' - return Photons(positions=np.concatenate([p.positions for p in photons]), - directions=np.concatenate([p.directions for p in photons]), - polarizations=np.concatenate([p.polarizations for p in photons]), - times=np.concatenate([p.times for p in photons]), - wavelengths=np.concatenate([p.wavelengths for p in photons])) + def __add__(self, other): + pos = np.concatenate((self.pos, other.pos)) + dir = np.concatenate((self.dir, other.dir)) + pol = np.concatenate((self.pol, other.pol)) + wavelengths = np.concatenate((self.wavelengths, other.wavelengths)) + t = np.concatenate((self.t, other.t)) + 
last_hit_triangles = np.concatenate((self.last_hit_triangles, other.last_hit_triangles)) + flags = np.concatenate((self.flags, other.flags)) + return Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) class Channels(object): - def __init__(self, hit, t, q, histories=None): + def __init__(self, hit, t, q, flags=None): self.hit = hit self.t = t self.q = q - self.histories=histories + self.flags = flags def hit_channels(self): return self.hit.nonzero(), self.t[self.hit], self.q[self.hit] -class Subtrack(object): - def __init__(self, particle_name, position, direction, start_time, total_energy): - self.particle_name = particle_name - self.position = position - self.direction = direction - self.start_time = start_time - self.total_energy = total_energy - class Event(object): - def __init__(self, event_id, particle_name=None, gen_position=None, gen_direction=None, gen_total_energy=None, ): - self.event_id = event_id - self.particle_name = particle_name - self.gen_position = gen_position - self.gen_direction = gen_direction - self.gen_total_energy = gen_total_energy + def __init__(self, id=0, primary_vertex=None, vertices=None, photons_beg=None, photons_end=None, channels=None): + self.id = id + + self.nphotons = None + + self.primary_vertex = primary_vertex + + if vertices is not None: + if np.iterable(vertices): + self.vertices = vertices + else: + self.vertices = [vertices] + else: + self.vertices = [] - self.subtracks = [] - self.nphoton = 0 - self.photon_start = None - self.photon_stop = None - self.channels = None + self.photons_beg = photons_beg + self.photons_end = photons_end + self.channels = channels diff --git a/fileio/root.C b/fileio/root.C index 07e50a1..2b39b1b 100644 --- a/fileio/root.C +++ b/fileio/root.C @@ -3,166 +3,143 @@ #include <TTree.h> #include <string> +struct Vertex { + std::string particle_name; + TVector3 pos; + TVector3 dir; + TVector3 pol; + double ke; + double t0; + + ClassDef(Vertex, 1); +}; + struct Photon { - double time; - TVector3 position; - TVector3 direction; - TVector3 polarization; + double t; + TVector3 pos; + TVector3 dir; + TVector3 pol; double wavelength; // nm - unsigned int history; + unsigned int flag; int last_hit_triangle; ClassDef(Photon, 1); }; -struct Track { - std::string particle; - TVector3 position; - TVector3 direction; - double start_time; - double total_energy; +struct Channel { + Channel() : id(-1), t(-1e9), q(-1e9) { }; + int id; + double t; + double q; + unsigned int flag; - ClassDef(Track, 1); + ClassDef(Channel, 1); }; -struct MC { - std::string particle; - TVector3 gen_position; - TVector3 gen_direction; - double gen_total_energy; - - int nphoton; - - std::vector<Track> subtrack; - std::vector<Photon> photon_start; - std::vector<Photon> photon_stop; +struct Event { + int id; + unsigned int nhit; + unsigned int nchannels; - ClassDef(MC, 1); -}; + Vertex primary_vertex; -struct Channel { - Channel() : channel_id(-1), time(-9999.0), charge(-9999.0) { }; - int channel_id; - double time; - double charge; - unsigned int mc_history; + std::vector<Vertex> vertices; + std::vector<Photon> photons_beg; + std::vector<Photon> photons_end; + std::vector<Channel> channels; - ClassDef(Channel, 1); + ClassDef(Event, 1); }; - -struct Event { - int event_id; - MC mc; - int nhit; - int max_channel_id; - std::vector<Channel> channel; - ClassDef(Event, 1); +void fill_channels(Event *ev, unsigned int nhit, unsigned int *ids, float *t, + float *q, unsigned int *flags, unsigned int nchannels) +{ + ev->nhit = 0; + ev->nchannels = nchannels; 
+ ev->channels.resize(0); - // Populate arrays of length nentries with hit, time, and charge - // information, indexed by channel ID - void get_channels(unsigned int nentries, int *hit, float *time, - float *charge, unsigned int *mc_history=0) - { - for (unsigned int i=0; i < nentries; i++) { - hit[i] = 0; - time[i] = -1e9f; - charge[i] = -1e9f; - if (mc_history) - mc_history[i] = 0; - } + Channel ch; + unsigned int id; + for (unsigned int i=0; i < nhit; i++) { + ev->nhit++; + id = ids[i]; + ch.id = id; + ch.t = t[id]; + ch.q = q[id]; + ch.flag = flags[id]; + ev->channels.push_back(ch); + } +} + +void get_channels(Event *ev, int *hit, float *t, float *q, unsigned int *flags) +{ + for (unsigned int i=0; i < ev->nchannels; i++) { + hit[i] = 0; + t[i] = -1e9f; + q[i] = -1e9f; + flags[i] = 0; + } - for (unsigned int i=0; i < channel.size(); i++) { - unsigned int channel_id = channel[i].channel_id; + unsigned int id; + for (unsigned int i=0; i < ev->channels.size(); i++) { + id = ev->channels[i].id; - if (channel_id < nentries) { - hit[channel_id] = 1; - time[channel_id] = channel[i].time; - charge[channel_id] = channel[i].charge; - if (mc_history) - mc_history[channel_id] = channel[i].mc_history; - } + if (id < ev->nchannels) { + hit[id] = 1; + t[id] = ev->channels[i].t; + q[id] = ev->channels[i].q; + flags[id] = ev->channels[i].flag; } } +} -}; - -void get_photons(const std::vector<Photon> &photons, float *positions, - float *directions, float *polarizations, float *wavelengths, - float *times, unsigned int *histories, int *last_hit_triangles) +void get_photons(const std::vector<Photon> &photons, float *pos, float *dir, + float *pol, float *wavelengths, float *t, + int *last_hit_triangles, unsigned int *flags) { for (unsigned int i=0; i < photons.size(); i++) { const Photon &photon = photons[i]; - positions[3*i] = photon.position.X(); - positions[3*i+1] = photon.position.Y(); - positions[3*i+2] = photon.position.Z(); + pos[3*i] = photon.pos.X(); + pos[3*i+1] = photon.pos.Y(); + pos[3*i+2] = photon.pos.Z(); - directions[3*i] = photon.direction.X(); - directions[3*i+1] = photon.direction.Y(); - directions[3*i+2] = photon.direction.Z(); + dir[3*i] = photon.dir.X(); + dir[3*i+1] = photon.dir.Y(); + dir[3*i+2] = photon.dir.Z(); - polarizations[3*i] = photon.polarization.X(); - polarizations[3*i+1] = photon.polarization.Y(); - polarizations[3*i+2] = photon.polarization.Z(); + pol[3*i] = photon.pol.X(); + pol[3*i+1] = photon.pol.Y(); + pol[3*i+2] = photon.pol.Z(); wavelengths[i] = photon.wavelength; - times[i] = photon.time; - histories[i] = photon.history; + t[i] = photon.t; + flags[i] = photon.flag; last_hit_triangles[i] = photon.last_hit_triangle; } } - void fill_photons(std::vector<Photon> &photons, unsigned int nphotons, float *pos, float *dir, - float *pol, float *wavelength, float *t0, - unsigned int *histories=0, int *last_hit_triangle=0) + float *pol, float *wavelengths, float *t, + int *last_hit_triangles, unsigned int *flags) { photons.resize(nphotons); for (unsigned int i=0; i < nphotons; i++) { Photon &photon = photons[i]; - photon.time = t0[i]; - photon.position.SetXYZ(pos[3*i], pos[3*i + 1], pos[3*i + 2]); - photon.direction.SetXYZ(dir[3*i], dir[3*i + 1], dir[3*i + 2]); - photon.polarization.SetXYZ(pol[3*i], pol[3*i + 1], pol[3*i + 2]); - photon.wavelength = wavelength[i]; - if (histories) - photon.history = histories[i]; - else - photon.history = 0; - - if (last_hit_triangle) - photon.last_hit_triangle = last_hit_triangle[i]; - else - photon.last_hit_triangle = -1; - } -} - - -void 
fill_hits(Event *ev, unsigned int nchannels, float *time, - float *charge, unsigned int *history) -{ - ev->channel.resize(0); - ev->nhit = 0; - ev->max_channel_id = nchannels - 1; + photon.t = t[i]; + photon.pos.SetXYZ(pos[3*i], pos[3*i + 1], pos[3*i + 2]); + photon.dir.SetXYZ(dir[3*i], dir[3*i + 1], dir[3*i + 2]); + photon.pol.SetXYZ(pol[3*i], pol[3*i + 1], pol[3*i + 2]); + photon.wavelength = wavelengths[i]; + photon.last_hit_triangle = last_hit_triangles[i]; + photon.flag = flags[i]; - Channel ch; - for (unsigned int i=0; i < nchannels; i++) { - if (time[i] < 1e8) { - ev->nhit++; - ch.channel_id = i; - ch.time = time[i]; - ch.charge = charge[i]; - ch.mc_history = history[i]; - ev->channel.push_back(ch); - } } } - #ifdef __MAKECINT__ -#pragma link C++ class vector<Track>; +#pragma link C++ class vector<Vertex>; #pragma link C++ class vector<Photon>; #pragma link C++ class vector<Channel>; #endif diff --git a/fileio/root.py b/fileio/root.py index 4c9d9bb..507c249 100644 --- a/fileio/root.py +++ b/fileio/root.py @@ -14,61 +14,66 @@ def tvector3_to_ndarray(vec): def make_photon_with_arrays(size): '''Returns a new chroma.event.Photons object for `size` number of photons with empty arrays set for all the photon attributes.''' - return event.Photons(positions=np.empty((size,3), dtype=np.float32), - directions=np.empty((size,3), dtype=np.float32), - polarizations=np.empty((size,3), dtype=np.float32), + return event.Photons(pos=np.empty((size,3), dtype=np.float32), + dir=np.empty((size,3), dtype=np.float32), + pol=np.empty((size,3), dtype=np.float32), wavelengths=np.empty(size, dtype=np.float32), - times=np.empty(size, dtype=np.float32), - histories=np.empty(size, dtype=np.uint32), + t=np.empty(size, dtype=np.float32), + flags=np.empty(size, dtype=np.uint32), last_hit_triangles=np.empty(size, dtype=np.int32)) +def root_vertex_to_python_vertex(vertex): + "Returns a chroma.event.Vertex object from a root Vertex object." 
+ return event.Vertex(str(vertex.particle_name), + tvector3_to_ndarray(vertex.pos), + tvector3_to_ndarray(vertex.dir), + tvector3_to_ndarray(vertex.pol), + vertex.ke, + vertex.t0) def root_event_to_python_event(ev): '''Returns a new chroma.event.Event object created from the contents of the ROOT event `ev`.''' - pyev = event.Event(ev.event_id) - - # MC - pyev.particle_name = str(ev.mc.particle) - pyev.gen_position = tvector3_to_ndarray(ev.mc.gen_position) - pyev.gen_direction = tvector3_to_ndarray(ev.mc.gen_direction) - pyev.gen_total_energy = ev.mc.gen_total_energy - - pyev.nphoton = ev.mc.nphoton - - for subtrack in ev.mc.subtrack: - pysubtrack = event.Subtrack(str(subtrack.particle), - tvector3_to_ndarray(subtrack.position), - tvector3_to_ndarray(subtrack.direction), - subtrack.start_time, - subtrack.total_energy) - pyev.subtracks.append(pysubtrack) - - # photon start - if ev.mc.photon_start.size() > 0: - photons = make_photon_with_arrays(ev.mc.photon_start.size()) - ROOT.get_photons(ev.mc.photon_start, photons.positions.ravel(), photons.directions.ravel(), - photons.polarizations.ravel(), photons.wavelengths, photons.times, - photons.histories, photons.last_hit_triangles) - pyev.photon_start = photons - - # photon stop - if ev.mc.photon_stop.size() > 0: - photons = make_photon_with_arrays(ev.mc.photon_stop.size()) - ROOT.get_photons(ev.mc.photon_stop, photons.positions.ravel(), photons.directions.ravel(), - photons.polarizations.ravel(), photons.wavelengths, photons.times, - photons.histories, photons.last_hit_triangles) - pyev.photon_stop = photons - - # hits - max_channel_id = ev.max_channel_id - hit = np.empty(shape=max_channel_id+1, dtype=np.int32) - t = np.empty(shape=max_channel_id+1, dtype=np.float32) - q = np.empty(shape=max_channel_id+1, dtype=np.float32) - histories = np.empty(shape=max_channel_id+1, dtype=np.uint32) - - ev.get_channels(max_channel_id+1, hit, t, q, histories) - pyev.channels = event.Channels(hit.astype(bool), t, q, histories) + pyev = event.Event(ev.id) + pyev.primary_vertex = root_vertex_to_python_vertex(ev.primary_vertex) + + for vertex in ev.vertices: + pyev.vertices.append(root_vertex_to_python_vertex(vertex)) + + # photon begin + if ev.photons_beg.size() > 0: + photons = make_photon_with_arrays(ev.photons_beg.size()) + ROOT.get_photons(ev.photons_beg, + photons.pos.ravel(), + photons.dir.ravel(), + photons.pol.ravel(), + photons.wavelengths, + photons.t, + photons.last_hit_triangles, + photons.flags) + pyev.photons_beg = photons + + # photon end + if ev.photons_end.size() > 0: + photons = make_photon_with_arrays(ev.photons_end.size()) + ROOT.get_photons(ev.photons_end, + photons.pos.ravel(), + photons.dir.ravel(), + photons.pol.ravel(), + photons.wavelengths, + photons.t, + photons.last_hit_triangles, + photons.flags) + pyev.photons_end = photons + + # channels + hit = np.empty(ev.nchannels, dtype=np.int32) + t = np.empty(ev.nchannels, dtype=np.float32) + q = np.empty(ev.nchannels, dtype=np.float32) + flags = np.empty(ev.nchannels, dtype=np.uint32) + + ROOT.get_channels(ev, hit, t, q, flags) + pyev.channels = event.Channels(hit.astype(bool), t, q, flags) return pyev class RootReader(object): @@ -140,48 +145,53 @@ class RootWriter(object): self.T.Branch('ev', self.ev) def write_event(self, pyev): - '''Write an event.Event object to the ROOT tree as a ROOT.Event object.''' - self.ev.event_id = pyev.event_id - - self.ev.mc.particle = pyev.particle_name - self.ev.mc.gen_position.SetXYZ(*pyev.gen_position) - 
self.ev.mc.gen_direction.SetXYZ(*pyev.gen_direction) - self.ev.mc.gen_total_energy = pyev.gen_total_energy - self.ev.mc.nphoton = pyev.nphoton - - if pyev.photon_start is not None: - photons = pyev.photon_start - ROOT.fill_photons(self.ev.mc.photon_start, - len(photons.positions), - np.ravel(photons.positions), - np.ravel(photons.directions), - np.ravel(photons.polarizations), - photons.wavelengths, photons.times, - photons.histories, photons.last_hit_triangles) - if pyev.photon_stop is not None: - photons = pyev.photon_stop - ROOT.fill_photons(self.ev.mc.photon_stop, - len(photons.positions), - np.ravel(photons.positions), - np.ravel(photons.directions), - np.ravel(photons.polarizations), - photons.wavelengths, photons.times, - photons.histories, photons.last_hit_triangles) - - self.ev.mc.subtrack.resize(0) - if pyev.subtracks is not None: - self.ev.mc.subtrack.resize(len(pyev.subtracks)) - for i, subtrack in enumerate(pyev.subtracks): - self.ev.mc.subtrack[i].name = subtrack.particle_name - self.ev.mc.subtrack[i].position.SetXYZ(*subtrack.position) - self.ev.mc.subtrack[i].direction.SetXYZ(*subtrack.direction) - self.ev.mc.subtrack[i].start_time = subtrack.start_time - self.ev.mc.subtrack[i].total_energy = subtrack.total_energy - - ROOT.fill_hits(self.ev, len(pyev.channels.t), pyev.channels.t, pyev.channels.q, pyev.channels.histories) + "Write an event.Event object to the ROOT tree as a ROOT.Event object." + self.ev.id = pyev.id + + self.ev.primary_vertex.particle_name = pyev.primary_vertex.particle_name + self.ev.primary_vertex.pos.SetXYZ(*pyev.primary_vertex.pos) + self.ev.primary_vertex.dir.SetXYZ(*pyev.primary_vertex.dir) + if pyev.primary_vertex.pol is not None: + self.ev.primary_vertex.pol.SetXYZ(*pyev.primary_vertex.pol) + self.ev.primary_vertex.ke = pyev.primary_vertex.ke + + if pyev.photons_beg is not None: + photons = pyev.photons_beg + ROOT.fill_photons(self.ev.photons_beg, + len(photons.pos), + photons.pos.ravel(), + photons.dir.ravel(), + photons.pol.ravel(), + photons.wavelengths, photons.t, + photons.last_hit_triangles, photons.flags) + + if pyev.photons_end is not None: + photons = pyev.photons_end + ROOT.fill_photons(self.ev.photons_end, + len(photons.pos), + photons.pos.ravel(), + photons.dir.ravel(), + photons.pol.ravel(), + photons.wavelengths, photons.t, + photons.last_hit_triangles, photons.flags) + + self.ev.vertices.resize(0) + if pyev.vertices is not None: + self.ev.vertices.resize(len(pyev.vertices)) + for i, vertex in enumerate(pyev.vertices): + self.ev.vertices[i].particle_name = vertex.particle_name + self.ev.vertices[i].pos.SetXYZ(*vertex.pos) + self.ev.vertices[i].dir.SetXYZ(*vertex.dir) + if vertex.pol is not None: + self.ev.vertices[i].pol.SetXYZ(*vertex.pol) + self.ev.vertices[i].ke = vertex.ke + self.ev.vertices[i].t0 = vertex.t0 + + if pyev.channels is not None: + ROOT.fill_channels(self.ev, np.count_nonzero(pyev.channels.hit), np.arange(len(pyev.channels.t))[pyev.channels.hit].astype(np.int32), pyev.channels.t, pyev.channels.q, pyev.channels.flags, len(pyev.channels.hit)) + self.T.Fill() def close(self): self.T.Write() self.file.Close() - diff --git a/generator/g4gen.py b/generator/g4gen.py index 9650b68..8dca086 100644 --- a/generator/g4gen.py +++ b/generator/g4gen.py @@ -4,7 +4,7 @@ import g4py.NISTmaterials import g4py.ParticleGun import pyublas import numpy as np -import chroma.event as event +from chroma.event import Photons, Vertex try: import G4chroma @@ -20,16 +20,18 @@ except: class G4Generator(object): def __init__(self, material, 
seed=None): - '''Create generator to produce photons inside the specified material. + """Create generator to produce photons inside the specified material. material: chroma.geometry.Material object with density, composition dict and refractive_index. composition dictionary should be { element_symbol : fraction_by_weight, ... }. - seed: Random number generator seed for HepRandom. If None, - generator is not seeded. - ''' + + seed: int, *optional* + Random number generator seed for HepRandom. If None, generator + is not seeded. + """ if seed is not None: HepRandom.setTheSeed(seed) @@ -51,11 +53,8 @@ class G4Generator(object): gRunManager.SetUserAction(self.tracking_action) gRunManager.Initialize() - #preinitialize the process by running a simple event - self.generate_photons(event.Event(event_id=0, particle_name='e-', - gen_position=(0,0,0), - gen_direction=(1,0,0), - gen_total_energy=1.0)) + # preinitialize the process by running a simple event + self.generate_photons([Vertex('e-', (0,0,0), (1,0,0), 0, 1.0)]) def create_g4material(self, material): g4material = G4Material('world_material', material.density * g / cm3, @@ -96,57 +95,38 @@ class G4Generator(object): pol[:,2] = self.tracking_action.GetPolZ() wavelengths = self.tracking_action.GetWavelength().astype(np.float32) - times = self.tracking_action.GetT0().astype(np.float32) - return event.Photons(positions=pos, directions=dir, polarizations=pol, times=times, wavelengths=wavelengths) + t0 = self.tracking_action.GetT0().astype(np.float32) + + return Photons(pos, dir, pol, wavelengths, t0) - def generate_photons(self, ev): - '''Use GEANT4 to generate photons produced by the given particle. + def generate_photons(self, vertices): + """Use GEANT4 to generate photons produced by propagating `vertices`. - ev: a generator.event.Event object with the particle - properties set. If it contains subtracks, those - will be used to create the photon vertices rather - than the main particle. - - Returns an instance of event.Photons containing the - generated photon vertices for the primary particle or - all the subtracks, if present. - ''' - photons = [] - if ev.subtracks: - subtracks = ev.subtracks - else: - # Create temporary subtrack for single primary particle - subtracks = [event.Subtrack(particle_name=ev.particle_name, - position=ev.gen_position, - direction=ev.gen_direction, - start_time=0.0, - total_energy=ev.gen_total_energy)] - - for subtrack in subtracks: - self.particle_gun.SetParticleByName(subtrack.particle_name) - self.particle_gun.SetParticleEnergy(subtrack.total_energy * MeV) - self.particle_gun.SetParticlePosition(G4ThreeVector(*subtrack.position)*m) - self.particle_gun.SetParticleMomentumDirection(G4ThreeVector(*subtrack.direction).unit()) + Args: + vertices: list of event.Vertex objects + List of initial particle vertices. + + Returns: + photons: event.Photons + Photon vertices generated by the propagation of `vertices`. 
+ """ + photons = None + + for vertex in vertices: + self.particle_gun.SetParticleByName(vertex.particle_name) + mass = G4ParticleTable.GetParticleTable().FindParticle(vertex.particle_name).GetPDGMass() + total_energy = vertex.ke*MeV + mass + self.particle_gun.SetParticleEnergy(total_energy) + self.particle_gun.SetParticlePosition(G4ThreeVector(*vertex.pos)*m) + self.particle_gun.SetParticleMomentumDirection(G4ThreeVector(*vertex.dir).unit()) self.tracking_action.Clear() gRunManager.BeamOn(1) - photons.append(self._extract_photons_from_tracking_action()) - - # Merge all photon lists into one big list - return event.concatenate_photons(photons) - -if __name__ == '__main__': - import time - import optics - gen = G4Generator(optics.water) - - start = time.time() - n = 0 - for i in xrange(100): - photons = gen.generate_photons(event.Event('mu-', (0,0,0), (1,0,0), 1.0)) - n += len(photons.times) - print photons.positions[0].min(), photons.positions[0].max() - stop = time.time() - print stop - start, 'sec' - print n / (stop-start), 'photons/sec' + + if photons is None: + photons = self._extract_photons_from_tracking_action() + else: + photons += self._extract_photons_from_tracking_action() + + return photons diff --git a/generator/photon.py b/generator/photon.py index e656428..6c656b2 100644 --- a/generator/photon.py +++ b/generator/photon.py @@ -25,24 +25,29 @@ class G4GeneratorProcess(multiprocessing.Process): photon_socket = context.socket(zmq.PUSH) photon_socket.connect(self.photon_socket_address) - # Signal with the photon socket that we are online and ready for messages + # Signal with the photon socket that we are online + # and ready for messages. photon_socket.send('READY') while True: ev = vertex_socket.recv_pyobj() - ev.photon_start = gen.generate_photons(ev) + ev.photons_beg = gen.generate_photons(ev.vertices) + #print 'type(ev.photons_beg) is %s' % type(ev.photons_beg) photon_socket.send_pyobj(ev) def partition(num, partitions): - '''Generator that returns num//partitions, with the last item including the remainder. - - Useful for partitioning a number into mostly equal parts while preserving the sum. - - >>> list(partition(800, 3)) - [266, 266, 268] - >>> sum(list(partition(800, 3))) - 800 - ''' + """Generator that returns num//partitions, with the last item including + the remainder. + + Useful for partitioning a number into mostly equal parts while preserving + the sum. 
+ + Examples: + >>> list(partition(800, 3)) + [266, 266, 268] + >>> sum(list(partition(800, 3))) + 800 + """ step = num // partitions for i in xrange(partitions): if i < partitions - 1: @@ -51,8 +56,8 @@ def partition(num, partitions): yield step + (num % partitions) def vertex_sender(vertex_iterator, vertex_socket): - for ev in vertex_iterator: - vertex_socket.send_pyobj(ev) + for vertex in vertex_iterator: + vertex_socket.send_pyobj(vertex) def socket_iterator(nelements, socket): for i in xrange(nelements): @@ -66,8 +71,7 @@ class G4ParallelGenerator(object): base_address = 'ipc:///tmp/chroma_'+str(uuid.uuid4()) self.vertex_address = base_address + '.vertex' self.photon_address = base_address + '.photon' - self.processes = [ G4GeneratorProcess(i, material, self.vertex_address, self.photon_address, seed=base_seed + i) - for i in xrange(nprocesses) ] + self.processes = [ G4GeneratorProcess(i, material, self.vertex_address, self.photon_address, seed=base_seed + i) for i in xrange(nprocesses) ] for p in self.processes: p.start() @@ -78,18 +82,16 @@ class G4ParallelGenerator(object): self.photon_socket = self.zmq_context.socket(zmq.PULL) self.photon_socket.bind(self.photon_address) - # Verify everyone is running and connected to avoid sending all the events to one client + # Verify everyone is running and connected to avoid + # sending all the events to one client. for i in xrange(nprocesses): msg = self.photon_socket.recv() assert msg == 'READY' - def generate_events(self, nevents, vertex_iterator): - # Doing this to avoid a deadlock caused by putting to one queue while getting from another - limited_iterator = itertools.islice(vertex_iterator, nevents) - sender_thread = threading.Thread(target=vertex_sender, args=(limited_iterator, - self.vertex_socket)) + def generate_events(self, vertex_iterator): + # Doing this to avoid a deadlock caused by putting to one queue + # while getting from another. + vertex_list = list(vertex_iterator) + sender_thread = threading.Thread(target=vertex_sender, args=(vertex_list, self.vertex_socket)) sender_thread.start() - return socket_iterator(nevents, self.photon_socket) - - - + return socket_iterator(len(vertex_list), self.photon_socket) diff --git a/generator/vertex.py b/generator/vertex.py index 139d675..5521d6b 100644 --- a/generator/vertex.py +++ b/generator/vertex.py @@ -1,26 +1,28 @@ -import itertools import numpy as np -from math import sin, cos, sqrt -import copy +from itertools import izip, count -import chroma.pi0 as pi0 -from chroma.event import Event, Subtrack +from chroma.pi0 import pi0_decay +from chroma import event +from chroma.sample import uniform_sphere +from chroma.itertoolset import repeat_func # generator parts for use with gun() +def from_histogram(h): + "Yield values drawn randomly from the histogram `h` interpreted as a pdf." 
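The generator body that follows draws samples by inverting the cumulative distribution. A vectorized numpy equivalent, assuming only that the histogram exposes `hist` (bin contents) and `bincenters` arrays as used here, looks like this:

    import numpy as np

    def sample_from_histogram(hist, bincenters, size=1):
        # normalize the bin contents to a pdf, then invert the cdf with a
        # binary search to draw `size` samples at once
        cdf = np.cumsum(hist/float(hist.sum()))
        u = np.random.random_sample(size)
        return bincenters[np.searchsorted(cdf, u)]
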
+ pdf = h.hist/h.hist.sum() + cdf = np.cumsum(pdf) + + for x in repeat_func(np.random.random_sample): + yield h.bincenters[np.searchsorted(cdf, x)] + def constant(obj): while True: yield obj def isotropic(): while True: - cos_theta = np.random.uniform(-1.0, 1.0) - sin_theta = sqrt(1.0 - cos_theta**2) - phi = np.random.uniform(0.0, 2*np.pi) - direction = np.array([sin_theta * cos(phi), - sin_theta * sin(phi), - cos_theta]) - yield direction + yield uniform_sphere() def line_segment(point1, point2): while True: @@ -38,43 +40,34 @@ def flat(e_lo, e_hi): # vertex generators -def particle_gun(particle_name, position, direction, total_energy, start_id=0): - for i, ev_particle, ev_position, ev_direction, ev_total_energy in \ - itertools.izip(itertools.count(start_id), - particle_name, position, direction, total_energy): - ev = Event(event_id=i, particle_name=ev_particle, - gen_position=ev_position, gen_direction=ev_direction, gen_total_energy=ev_total_energy) - yield ev - -def pi0_gun(pi0_position, pi0_direction, pi0_total_energy, start_id=0): - for i, ev_position, ev_direction, ev_total_energy in \ - itertools.izip(itertools.count(start_id), pi0_position, pi0_direction, pi0_total_energy): +def particle_gun(particle_name_iter, pos_iter, dir_iter, ke_iter, start_id=0): + for i, particle_name, pos, dir, ke in izip(count(start_id), particle_name_iter, pos_iter, dir_iter, ke_iter): + dir /= np.linalg.norm(dir) + vertex = event.Vertex(particle_name, pos, dir, None, ke) + ev_vertex = event.Event(i, vertex, [vertex]) + yield ev_vertex - ev = Event(event_id=i, particle_name='pi0', - gen_position=ev_position, gen_direction=ev_direction, gen_total_energy=ev_total_energy) +def pi0_gun(pos_iter, dir_iter, ke_iter, start_id=0): + for i, pos, dir, ke in izip(count(start_id), pos_iter, dir_iter, ke_iter): + dir /= np.linalg.norm(dir) + primary_vertex = event.Vertex('pi0', pos, dir, None, ke) - ev_direction = ev_direction / np.linalg.norm(ev_direction) - cos_theta_rest = np.random.random_sample() * 2 - 1 theta_rest = np.arccos(cos_theta_rest) phi_rest = np.random.random_sample() * 2 * np.pi - (gamma1_e, gamma1_dir), (gamma2_e, gamma2_dir) = pi0.pi0_decay(energy=ev_total_energy, - direction=ev_direction, - theta=theta_rest, - phi=phi_rest) + (gamma1_e, gamma1_dir), (gamma2_e, gamma2_dir) = \ + pi0_decay(ke+134.9766, dir, theta_rest, phi_rest) + + gamma1_vertex = event.Vertex('gamma', pos, gamma1_dir, None, gamma1_e) + gamma2_vertex = event.Vertex('gamma', pos, gamma2_dir, None, gamma2_e) + + ev_vertex = event.Event(i, primary_vertex, [gamma1_vertex, gamma2_vertex]) - ev.subtracks = [Subtrack(particle_name='gamma', position=ev_position, direction=gamma1_dir, - start_time=0.0, total_energy=gamma1_e), - Subtrack(particle_name='gamma', position=ev_position, direction=gamma2_dir, - start_time=0.0, total_energy=gamma2_e)] - yield ev + yield ev_vertex -def constant_particle_gun(particle_name, position, direction, total_energy, - start_id=0): +def constant_particle_gun(particle_name, pos, dir, ke, start_id=0): '''Convenience wrapper around particle gun that assumes all arguments are constants, rather than generators.''' - return particle_gun(constant(particle_name), constant(position), - constant(direction), constant(total_energy), - start_id=start_id) + return particle_gun(constant(particle_name), constant(pos), constant(dir), constant(ke), start_id=start_id) diff --git a/geometry.py b/geometry.py index b2cb9c0..cccf516 100644 --- a/geometry.py +++ b/geometry.py @@ -194,7 +194,8 @@ def morton_order(mesh, bits): return 
interleave(mean_positions, bits) class Geometry(object): - def __init__(self): + def __init__(self, detector_material=None): + self.detector_material = detector_material self.solids = [] self.solid_rotations = [] self.solid_displacements = [] @@ -1,67 +1,215 @@ import numpy as np import numpy.ma as ma -from pycuda.tools import make_default_context -from pycuda.compiler import SourceModule -from pycuda.characterize import sizeof -import pycuda.driver as cuda -from pycuda import gpuarray from copy import copy from itertools import izip -from chroma.tools import timeit from os.path import dirname +import sys + +import pytools +import pycuda.tools +from pycuda.compiler import SourceModule +import pycuda.characterize +import pycuda.driver as cuda +from pycuda import gpuarray as ga import chroma.src +from chroma.tools import timeit from chroma.geometry import standard_wavelengths from chroma.color import map_to_color -import sys -import event +from chroma import event cuda.init() +#@pycuda.tools.context_dependent_memoize +def get_cu_module(name, options=None, include_source_directory=True): + """Returns a pycuda.compiler.SourceModule object from a CUDA source file + located in the chroma src directory at src/[name].cu.""" + if options is None: + options = [] + + srcdir = dirname(chroma.src.__file__) + + if include_source_directory: + options += ['-I' + srcdir] + + with open('%s/%s.cu' % (srcdir, name)) as f: + source = f.read() + + return pycuda.compiler.SourceModule(source, options=options, + no_extern_c=True) + +class GPUFuncs(object): + """Simple container class for GPU functions as attributes.""" + def __init__(self, module): + self.module = module + self.funcs = {} + + def __getattr__(self, name): + try: + return self.funcs[name] + except KeyError: + f = self.module.get_function(name) + self.funcs[name] = f + return f + +init_rng_src = """ +#include <curand_kernel.h> + +extern "C" +{ + +__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + curand_init(seed, id, offset, &s[id]); +} + +} // extern "C" +""" + +def get_rng_states(size, seed=1): + "Return `size` number of CUDA random number generator states." + rng_states = cuda.mem_alloc(size*pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) + + module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True) + init_rng = module.get_function('init_rng') + + init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1)) + + return rng_states + def to_float3(arr): - return arr.astype(np.float32).view(gpuarray.vec.float3) - -def boolean_argsort(condition): - '''Returns two arrays of indicies indicating which elements of the - boolean array `condition` should be exchanged so that all of the - True entries occur before the false entries. - - Returns: tuple (a,b, ntrue) which give the indices for elements - to exchange in a and b, and the number of true entries in the array - in ntrue. This function in general requires fewer swaps than - numpy.argsort. 
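Taken together, the helpers above replace the old monolithic GPU setup. A short sketch of how they are meant to be combined (a CUDA device and the chroma source tree are assumed; 64*1024 states matches the default 64 threads/block times 1024 blocks used elsewhere in this patch, and the GPU context wrapper is defined later in this file):

    from chroma import gpu

    gpu_instance = gpu.GPU()                       # creates the context, compiles kernel.cu
    rng_states = gpu.get_rng_states(64*1024, seed=1)
    gpu_funcs = gpu.GPUFuncs(gpu_instance.module)  # kernels fetched lazily by attribute name
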
- ''' - - true_indices = condition.nonzero()[0][::-1] # reverse - false_indices = (~condition).nonzero()[0] - - length = min(len(true_indices), len(false_indices)) - - cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True) - return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices)) - - -def chunk_iterator(nelements, nthreads_per_block, max_blocks): - '''Iterator that yields tuples with the values requried to process + "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array." + if not arr.flags['C_CONTIGUOUS']: + arr = np.asarray(arr, order='c') + return arr.astype(np.float32).view(ga.vec.float3)[:,0] + +def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024): + """Iterator that yields tuples with the values requried to process a long array in multiple kernel passes on the GPU. - Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration) + Each yielded value is of the form, + (first_index, elements_this_iteration, nblocks_this_iteration) - >>> list(chunk_iterator(300, 32, 2)) - [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] - ''' + Example: + >>> list(chunk_iterator(300, 32, 2)) + [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] + """ first = 0 while first < nelements: elements_left = nelements - first blocks = int(elements_left // nthreads_per_block) if elements_left % nthreads_per_block != 0: - blocks += 1 #Round up only if needed + blocks += 1 # Round up only if needed blocks = min(max_blocks, blocks) elements_this_round = min(elements_left, blocks * nthreads_per_block) yield (first, elements_this_round, blocks) first += elements_this_round - + +class GPUPhotons(object): + def __init__(self, photons): + self.pos = ga.to_gpu(to_float3(photons.pos)) + self.dir = ga.to_gpu(to_float3(photons.dir)) + self.pol = ga.to_gpu(to_float3(photons.pol)) + self.wavelengths = ga.to_gpu(photons.wavelengths.astype(np.float32)) + self.t = ga.to_gpu(photons.t.astype(np.float32)) + self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32)) + self.flags = ga.to_gpu(photons.flags.astype(np.uint32)) + + def get(self): + pos = self.pos.get().view(np.float32).reshape((len(self.pos),3)) + dir = self.dir.get().view(np.float32).reshape((len(self.dir),3)) + pol = self.pol.get().view(np.float32).reshape((len(self.pol),3)) + wavelengths = self.wavelengths.get() + t = self.t.get() + last_hit_triangles = self.last_hit_triangles.get() + flags = self.flags.get() + return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) + +class GPUChannels(object): + def __init__(self, t, q, flags): + self.t = t + self.q = q + self.flags = flags + + def get(self): + t = self.t.get() + q = self.q.get().astype(np.float32) + + # For now, assume all channels with small + # enough hit time were hit. + return event.Channels(t<1e8, t, q, self.flags.get()) + +def propagate(gpu, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10): + """Propagate photons on GPU to termination or max_steps, whichever + comes first. + + May be called repeatedly without reloading photon information if + single-stepping through photon history. + + ..warning:: + `rng_states` must have at least `nthreads_per_block`*`max_blocks` + number of curandStates. 
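A sketch of the intended call sequence for the free propagate() function above, reusing gpu_instance and rng_states from the previous sketch, and assuming an event.Photons object `photons` whose flags and last_hit_triangles members are already filled (the GPUPhotons constructor expects both) and a geometry already loaded on the device (see GPUGeometry below):

    gpu_photons = gpu.GPUPhotons(photons)        # copy photon arrays to the device
    gpu.propagate(gpu_instance, gpu_photons, rng_states,
                  nthreads_per_block=64, max_blocks=1024, max_steps=10)
    photons_end = gpu_photons.get()              # back to an event.Photons
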
+ """ + nphotons = gpuphotons.pos.size + step = 0 + input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) + input_queue[1:] = np.arange(nphotons, dtype=np.uint32) + input_queue_gpu = ga.to_gpu(input_queue) + output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) + output_queue[0] = 1 + output_queue_gpu = ga.to_gpu(output_queue) + + propagate = gpu.module.get_function('propagate') + + while step < max_steps: + # Just finish the rest of the steps if the # of photons is low + if nphotons < nthreads_per_block * 16 * 8: + nsteps = max_steps - step + else: + nsteps = 1 + + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, gpuphotons.pos, gpuphotons.dir, gpuphotons.wavelengths, gpuphotons.pol, gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, np.int32(nsteps), block=(nthreads_per_block,1,1), grid=(blocks, 1)) + + step += nsteps + + if step < max_steps: + temp = input_queue_gpu + input_queue_gpu = output_queue_gpu + output_queue_gpu = temp + output_queue_gpu[:1].set(np.uint32(1)) + nphotons = input_queue_gpu[:1].get()[0] - 1 + + if ga.max(gpuphotons.flags).get() & (1 << 31): + print >>sys.stderr, "WARNING: ABORTED PHOTONS" + +class GPURays(object): + def __init__(self, pos, dir, nblocks=64): + self.pos = ga.to_gpu(to_float3(pos)) + self.dir = ga.to_gpu(to_float3(dir)) + + self.nblocks = nblocks + + self.module = get_cu_module('transform') + self.gpu_funcs = GPUFuncs(self.module) + + def rotate(self, phi, n): + self.gpu_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + + def rotate_around_point(self, phi, n, point): + self.gpu_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + + def translate(self, v): + self.gpu_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) def format_size(size): if size < 1e3: @@ -77,317 +225,165 @@ def format_array(name, array): return '%-15s %6s %6s' % \ (name, format_size(len(array)), format_size(array.nbytes)) -class CUDAFuncs(object): - '''simple container class for GPU functions as attributes''' - def __init__(self, cuda_module, func_names): - '''Extract __global__ functions listed in func_names from - the PyCUDA module object. The functions are assigned - to attributes of this object with the same name.''' - for name in func_names: - setattr(self, name, cuda_module.get_function(name)) - -class GPU(object): - def __init__(self, device_id=None, nthreads_per_block=64, max_blocks=1024): - '''Initialize a GPU context on the specified device. 
If device_id==None, - the default device is used.''' - - if device_id is None: - self.context = make_default_context() - else: - device = cuda.Device(device_id) - self.context = device.make_context() - - print 'device %s' % self.context.get_device().name() - - self.nthreads_per_block = nthreads_per_block - self.max_blocks = max_blocks - - self.context.set_cache_config(cuda.func_cache.PREFER_L1) - - cuda_options = ['-I' + dirname(chroma.src.__file__), '--use_fast_math', '--ptxas-options=-v'] - - self.module = SourceModule(chroma.src.kernel, options=cuda_options, no_extern_c=True) - self.kernels = CUDAFuncs(self.module, ['ray_trace', 'ray_trace_alpha', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng']) - - self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids']) - - self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap']) - self.daq_module = SourceModule(chroma.src.daq, options=cuda_options, no_extern_c=True) - self.daq_funcs = CUDAFuncs(self.daq_module, - ['reset_earliest_time_int', 'run_daq', - 'convert_sortable_int_to_float', 'bin_hits', - 'accumulate_pdf_eval']) +class GPUGeometry(object): + def __init__(self, gpu, geometry, load=True, activate=True, print_usage=True): + self.geometry = geometry - def print_device_usage(self): - print 'device usage:' - print format_array('vertices', self.vertices_gpu) - print format_array('triangles', self.triangles_gpu) - print format_array('lower_bounds', self.lower_bounds_gpu) - print format_array('upper_bounds', self.upper_bounds_gpu) - print format_array('node_map', self.node_map_gpu) - print format_array('node_map_end', self.node_map_end_gpu) - print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes)) + self.module = gpu.module + self.gpu_funcs = GPUFuncs(gpu.module) - def load_geometry(self, geometry, print_usage=True): - if not hasattr(geometry, 'mesh'): - print 'WARNING: building geometry with 8-bits' - geometry.build(bits=8) + if load: + self.load(activate, print_usage) - self.geo_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1)) + def load(self, activate=True, print_usage=True): + self.gpu_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1)) self.materials = [] - for i in range(len(geometry.unique_materials)): - material = copy(geometry.unique_materials[i]) + for i in range(len(self.geometry.unique_materials)): + material = copy(self.geometry.unique_materials[i]) if material is None: raise Exception('one or more triangles is missing a material.') - material.refractive_index_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32)) - material.absorption_length_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32)) - material.scattering_length_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, 
material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32)) + material.refractive_index_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32)) + material.absorption_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32)) + material.scattering_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32)) - self.geo_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1)) + self.gpu_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1)) self.materials.append(material) self.surfaces = [] - for i in range(len(geometry.unique_surfaces)): - surface = copy(geometry.unique_surfaces[i]) + for i in range(len(self.geometry.unique_surfaces)): + surface = copy(self.geometry.unique_surfaces[i]) if surface is None: continue - surface.detect_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32)) - surface.absorb_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32)) - surface.reflect_diffuse_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32)) - surface.reflect_specular_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32)) + surface.detect_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32)) + surface.absorb_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32)) + surface.reflect_diffuse_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32)) + surface.reflect_specular_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32)) - self.geo_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1)) + self.gpu_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1)) self.surfaces.append(surface) - self.vertices_gpu = gpuarray.to_gpu(to_float3(geometry.mesh.vertices)) + self.vertices_gpu = ga.to_gpu(to_float3(self.geometry.mesh.vertices)) triangles = \ - np.empty(len(geometry.mesh.triangles), dtype=gpuarray.vec.uint4) - triangles['x'] = geometry.mesh.triangles[:,0] - triangles['y'] = geometry.mesh.triangles[:,1] - triangles['z'] = geometry.mesh.triangles[:,2] - triangles['w'] = ((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8) - self.triangles_gpu = gpuarray.to_gpu(triangles) + np.empty(len(self.geometry.mesh.triangles), dtype=ga.vec.uint4) + triangles['x'] = self.geometry.mesh.triangles[:,0] + triangles['y'] = self.geometry.mesh.triangles[:,1] + triangles['z'] = self.geometry.mesh.triangles[:,2] + triangles['w'] = ((self.geometry.material1_index & 0xff) << 
24) | ((self.geometry.material2_index & 0xff) << 16) | ((self.geometry.surface_index & 0xff) << 8) + self.triangles_gpu = ga.to_gpu(triangles) - self.lower_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.lower_bounds)) + self.lower_bounds_gpu = ga.to_gpu(to_float3(self.geometry.lower_bounds)) - self.upper_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.upper_bounds)) + self.upper_bounds_gpu = ga.to_gpu(to_float3(self.geometry.upper_bounds)) - self.colors_gpu = gpuarray.to_gpu(geometry.colors.astype(np.uint32)) - self.node_map_gpu = gpuarray.to_gpu(geometry.node_map.astype(np.uint32)) - self.node_map_end_gpu = gpuarray.to_gpu(geometry.node_map_end.astype(np.uint32)) - self.solid_id_map_gpu = gpuarray.to_gpu(geometry.solid_id.astype(np.uint32)) - - self.geo_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(geometry.node_map.size-1), np.uint32(geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1)) + self.colors_gpu = ga.to_gpu(self.geometry.colors.astype(np.uint32)) + self.node_map_gpu = ga.to_gpu(self.geometry.node_map.astype(np.uint32)) + self.node_map_end_gpu = ga.to_gpu(self.geometry.node_map_end.astype(np.uint32)) + self.solid_id_map_gpu = ga.to_gpu(self.geometry.solid_id.astype(np.uint32)) self.node_map_tex = self.module.get_texref('node_map') self.node_map_end_tex = self.module.get_texref('node_map_end') + if print_usage: + self.print_device_usage() + + if activate: + self.activate() + + def activate(self): + self.gpu_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(self.geometry.node_map.size-1), np.uint32(self.geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1)) + self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes) self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes) self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1) self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1) - self.geometry = geometry - - if print_usage: - print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:])) - print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1)) - print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1)) - self.print_device_usage() + def print_device_usage(self): + print 'device usage:' + print '-'*10 + print format_array('vertices', self.vertices_gpu) + print format_array('triangles', self.triangles_gpu) + print format_array('lower_bounds', self.lower_bounds_gpu) + print format_array('upper_bounds', self.upper_bounds_gpu) + print format_array('node_map', self.node_map_gpu) + print format_array('node_map_end', self.node_map_end_gpu) + print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes)) + print '-'*10 + free, total = cuda.mem_get_info() + print '%-15s %6s %6s' % ('device total', '', format_size(total)) + print '%-15s %6s %6s' % ('device used', '', format_size(total-free)) + print '%-15s %6s %6s' % ('device free', '', format_size(free)) + print def reset_colors(self): 
self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32)) def color_solids(self, solid_hit, colors): - solid_hit_gpu = gpuarray.to_gpu(np.array(solid_hit, dtype=np.bool)) - solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32)) + solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool)) + solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32)) - for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthreads_per_block, self.max_blocks): - self.geo_funcs.color_solids(np.int32(first_triangle), + for first_triangle, triangles_this_round, blocks in \ + chunk_iterator(self.triangles_gpu.size): + self.gpu_funcs.color_solids(np.int32(first_triangle), np.int32(triangles_this_round), self.solid_id_map_gpu, solid_hit_gpu, solid_colors_gpu, - block=(self.nthreads_per_block,1,1), + block=(64,1,1), grid=(blocks,1)) - self.context.synchronize() - - def setup_propagate(self, seed=1): - self.rng_states_gpu = cuda.mem_alloc(self.nthreads_per_block * self.max_blocks - * sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthreads_per_block), self.rng_states_gpu, - np.uint64(seed), np.uint64(0), - block=(self.nthreads_per_block,1,1), - grid=(self.max_blocks,1)) - #self.context.synchronize() - - def load_photons(self, photons): - '''Load photons onto the GPU from an event.Photons object. - - If photon.histories or photon.last_hit_triangles are set to none, - they will be initialized to 0 and -1 on the GPU, respectively. - ''' - self.nphotons = len(photons.positions) - assert len(photons.directions) == self.nphotons - assert len(photons.polarizations) == self.nphotons - assert len(photons.wavelengths) == self.nphotons - - self.positions_gpu = gpuarray.to_gpu(to_float3(photons.positions)) - self.directions_gpu = gpuarray.to_gpu(to_float3(photons.directions)) - self.polarizations_gpu = gpuarray.to_gpu(to_float3(photons.polarizations)) - self.wavelengths_gpu = gpuarray.to_gpu(photons.wavelengths.astype(np.float32)) - - if photons.times is not None: - self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32)) - else: - self.times_gpu = gpuarray.zeros(self.nphotons, dtype=np.float32) - if photons.histories is not None: - self.histories_gpu = gpuarray.to_gpu(photons.histories.astype(np.uint32)) - else: - self.histories_gpu = gpuarray.zeros(self.nphotons, dtype=np.uint32) - - if photons.last_hit_triangles is not None: - self.last_hit_triangles_gpu = gpuarray.to_gpu(photons.last_hit_triangles.astype(np.int32)) - else: - self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32) - self.last_hit_triangles_gpu.fill(-1) - - - def propagate(self, max_steps=10): - '''Propagate photons on GPU to termination or max_steps, whichever comes first. - - May be called repeatedly without reloading photon information if single-stepping - through photon history. 
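Geometry handling is now its own class; a brief sketch of the new flow, assuming `detector` is a chroma Geometry that has already been built with detector.build() (the second geometry name is purely hypothetical):

    # default: load onto the device, print usage, and activate immediately
    gpu_geometry = gpu.GPUGeometry(gpu_instance, detector)

    # or defer activation, presumably for juggling more than one resident geometry
    other_geometry = gpu.GPUGeometry(gpu_instance, other_detector, activate=False)
    other_geometry.activate()
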
- ''' - nphotons = self.nphotons - step = 0 - input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) - input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32) - input_queue_gpu = gpuarray.to_gpu(input_queue) - output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) - output_queue[0] = 1 - output_queue_gpu = gpuarray.to_gpu(output_queue) - - - while step < max_steps: - ## Just finish the rest of the steps if the # of photons is low - if nphotons < self.nthreads_per_block * 16 * 8: - nsteps = max_steps - step - else: - nsteps = 1 - - for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthreads_per_block, self.max_blocks): - self.prop_funcs.propagate(np.int32(first_photon), - np.int32(photons_this_round), - input_queue_gpu[1:], - output_queue_gpu, - self.rng_states_gpu, self.positions_gpu, - self.directions_gpu, - self.wavelengths_gpu, self.polarizations_gpu, - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - np.int32(nsteps), - block=(self.nthreads_per_block,1,1), - grid=(blocks, 1)) - - step += nsteps - - if step < max_steps: - temp = input_queue_gpu - input_queue_gpu = output_queue_gpu - output_queue_gpu = temp - output_queue_gpu[:1].set(np.uint32(1)) - nphotons = input_queue_gpu[:1].get()[0] - 1 - - if gpuarray.max(self.histories_gpu).get() & (1 << 31): - print >>sys.stderr, "WARNING: ABORTED PHOTONS IN THIS EVENT" - - if 'profile' in __builtins__: - self.context.synchronize() - - def get_photons(self): - '''Returns a dictionary of current photon state information. - - Contents of dictionary have the same names as the parameters to load_photons(). - ''' - return event.Photons(positions=self.positions_gpu.get().view(np.float32).reshape(self.positions_gpu.size, 3), - directions=self.directions_gpu.get().view(np.float32).reshape(self.directions_gpu.size, 3), - polarizations=self.polarizations_gpu.get().view(np.float32).reshape(self.polarizations_gpu.size, 3), - wavelengths=self.wavelengths_gpu.get(), - times=self.times_gpu.get(), - histories=self.histories_gpu.get(), - last_hit_triangles=self.last_hit_triangles_gpu.get()) - - def setup_daq(self, max_pmt_id, pmt_rms=1.2e-9): - self.earliest_time_gpu = gpuarray.GPUArray(shape=(max_pmt_id+1,), dtype=np.float32) - self.earliest_time_int_gpu = gpuarray.GPUArray(shape=self.earliest_time_gpu.shape, - dtype=np.uint32) - self.channel_history_gpu = gpuarray.zeros_like(self.earliest_time_int_gpu) - self.channel_q_gpu = gpuarray.zeros_like(self.earliest_time_int_gpu) +class GPUDaq(object): + def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9): + self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32) + self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32) + self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) + self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.daq_pmt_rms = pmt_rms + self.solid_id_map_gpu = gpu_geometry.solid_id_map_gpu + + self.module = get_cu_module('daq', include_source_directory=False) + self.gpu_funcs = GPUFuncs(self.module) - def run_daq(self): - self.daq_funcs.reset_earliest_time_int(np.float32(1e9), - np.int32(len(self.earliest_time_int_gpu)), - self.earliest_time_int_gpu, - block=(self.nthreads_per_block,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1)) + def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): + self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, 
block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) self.channel_q_gpu.fill(0) self.channel_history_gpu.fill(0) - #self.context.synchronize() - for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthreads_per_block, self.max_blocks): - self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), - np.float32(self.daq_pmt_rms), - np.int32(first_photon), - np.int32(photons_this_round), - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - self.solid_id_map_gpu, - np.int32(len(self.earliest_time_int_gpu)), - self.earliest_time_int_gpu, - self.channel_q_gpu, - self.channel_history_gpu, - block=(self.nthreads_per_block,1,1), grid=(blocks,1)) - #self.context.synchronize() - self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), - self.earliest_time_int_gpu, - self.earliest_time_gpu, - block=(self.nthreads_per_block,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1)) - if 'profile' in __builtins__: - self.context.synchronize() - + n = len(gpuphotons.pos) - def get_hits(self): - t = self.earliest_time_gpu.get() - # For now, assume all channels with small enough hit time were hit - return event.Channels(hit=t<1e8, t=t, - q=self.channel_q_gpu.get().astype(np.float32), - histories=self.channel_history_gpu.get()) + for first_photon, photons_this_round, blocks in \ + chunk_iterator(n, nthreads_per_block, max_blocks): + self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1)) + + self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) + + return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu) + +class GPUPDF(object): + def __init__(self): + self.module = get_cu_module('daq') + self.gpu_funcs = GPUFuncs(self.module) def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange): - '''Setup GPU arrays to hold PDF information. + """Setup GPU arrays to hold PDF information. 
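A sketch of a single DAQ pass with the split-out class, reusing gpu_geometry, gpu_photons and rng_states from the sketches above; using max(detector.pmtids) as the channel count mirrors sim.py below and assumes a detector-style geometry:

    import numpy as np

    gpu_daq = gpu.GPUDaq(gpu_geometry, max(detector.pmtids))
    gpu_channels = gpu_daq.acquire(gpu_photons, rng_states)
    channels = gpu_channels.get()                # event.Channels(hit, t, q, flags)
    print '%i channels hit' % np.count_nonzero(channels.hit)
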
max_pmt_id: int, largest PMT id # tbins: number of time bins trange: tuple of (min, max) time in PDF qbins: number of charge bins qrange: tuple of (min, max) charge in PDF - ''' + """ self.events_in_histogram = 0 - self.hitcount_gpu = gpuarray.zeros(max_pmt_id+1, dtype=np.uint32) - self.pdf_gpu = gpuarray.zeros(shape=(max_pmt_id+1, tbins, qbins), + self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32) + self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins), dtype=np.uint32) self.tbins = tbins self.trange = trange @@ -395,16 +391,14 @@ class GPU(object): self.qrange = qrange def clear_pdf(self): - '''Rezero the PDF counters.''' + """Rezero the PDF counters.""" self.hitcount_gpu.fill(0) self.pdf_gpu.fill(0) - def add_hits_to_pdf(self): - '''Put the most recent results of run_daq() into the PDF.''' - - self.daq_funcs.bin_hits(np.int32(len(self.hitcount_gpu)), - self.channel_q_gpu, - self.earliest_time_gpu, + def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64): + self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)), + gpuchannels.q, + gpuchannels.t, self.hitcount_gpu, np.int32(self.tbins), np.float32(self.trange[0]), @@ -413,20 +407,21 @@ class GPU(object): np.float32(self.qrange[0]), np.float32(self.qrange[1]), self.pdf_gpu, - block=(self.nthreads_per_block,1,1), - grid=(len(self.earliest_time_gpu)//self.nthreads_per_block+1,1)) + block=(nthreads_per_block,1,1), + grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) self.events_in_histogram += 1 def get_pdfs(self): - '''Returns the 1D hitcount array and the 3D [channel, time, charge] histogram''' + """Returns the 1D hitcount array and the 3D [channel, time, charge] + histogram.""" return self.hitcount_gpu.get(), self.pdf_gpu.get() - def setup_pdf_eval(self, event_hit, event_time, event_charge, - min_twidth, trange, min_qwidth, qrange, min_bin_content=10, + def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth, + trange, min_qwidth, qrange, min_bin_content=10, time_only=True): - '''Setup GPU arrays to compute PDF values for the given event. + """Setup GPU arrays to compute PDF values for the given event. The pdf_eval calculation allows the PDF to be evaluated at a single point for each channel as the Monte Carlo is run. The effective bin size will be as small as (`min_twidth`, @@ -455,14 +450,14 @@ class GPU(object): The bin will be expanded to include at least this many events time_only: bool If True, only the time observable will be used in the PDF. 
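And the corresponding binned-PDF accumulation; the bin counts and ranges here are arbitrary placeholders:

    gpu_pdf = gpu.GPUPDF()
    gpu_pdf.setup_pdf(max(detector.pmtids), tbins=100, trange=(-0.5e-9, 999.5e-9),
                      qbins=10, qrange=(-0.5, 9.5))
    gpu_pdf.add_hits_to_pdf(gpu_channels)        # call once per simulated event
    hitcount, pdf = gpu_pdf.get_pdfs()           # 1D hit counts, 3D (channel, t, q) histogram
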
- ''' - self.event_hit_gpu = gpuarray.to_gpu(event_hit.astype(np.uint32)) - self.event_time_gpu = gpuarray.to_gpu(event_time.astype(np.float32)) - self.event_charge_gpu = gpuarray.to_gpu(event_charge.astype(np.float32)) - - self.eval_hitcount_gpu = gpuarray.zeros(len(event_hit), dtype=np.uint32) - self.eval_bincount_gpu = gpuarray.zeros(len(event_hit), dtype=np.uint32) - self.nearest_mc_gpu = gpuarray.empty(shape=len(event_hit) * min_bin_content, + """ + self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) + self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) + self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) + + self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) + self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) + self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, dtype=np.float32) self.nearest_mc_gpu.fill(1e9) @@ -474,21 +469,20 @@ class GPU(object): self.time_only = time_only def clear_pdf_eval(self): - '''Reset PDF evaluation counters to start accumulating new Monte Carlo.''' + "Reset PDF evaluation counters to start accumulating new Monte Carlo." self.eval_hitcount_gpu.fill(0) self.eval_bincount_gpu.fill(0) self.nearest_mc_gpu.fill(1e9) - def accumulate_pdf_eval(self): - '''Add the most recent results of run_daq() to the PDF evaluation.''' - - self.daq_funcs.accumulate_pdf_eval(np.int32(self.time_only), + def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64): + "Add the most recent results of run_daq() to the PDF evaluation." + self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only), np.int32(len(self.event_hit_gpu)), self.event_hit_gpu, self.event_time_gpu, self.event_charge_gpu, - self.earliest_time_gpu, - self.channel_q_gpu, + gpuchannels.t, + gpuchannels.q, self.eval_hitcount_gpu, self.eval_bincount_gpu, np.float32(self.min_twidth), @@ -499,8 +493,8 @@ class GPU(object): np.float32(self.qrange[1]), self.nearest_mc_gpu, np.int32(self.min_bin_content), - block=(self.nthreads_per_block,1,1), - grid=(len(self.earliest_time_gpu)//self.nthreads_per_block+1,1)) + block=(nthreads_per_block,1,1), + grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) def get_pdf_eval(self): evhit = self.event_hit_gpu.get().astype(bool) @@ -525,8 +519,9 @@ class GPU(object): nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content)) - # Deal with the case where we did not even get min_bin_content events in the PDF - # but also clamp the lower range to ensure we don't index by a negative number in 2 lines + # Deal with the case where we did not even get min_bin_content events + # in the PDF but also clamp the lower range to ensure we don't index + # by a negative number in 2 lines last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry] @@ -544,5 +539,31 @@ class GPU(object): return hitcount, pdf_value, pdf_value * pdf_frac_uncert +class GPU(object): + def __init__(self, device_id=None): + """Initialize a GPU context on the specified device. 
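For the event-evaluation path, the analogous sketch (event_channels is an observed event.Channels object; the widths and ranges are again placeholders):

    gpu_pdf.setup_pdf_eval(event_channels.hit, event_channels.t, event_channels.q,
                           min_twidth=2.0e-9, trange=(-0.5e-9, 999.5e-9),
                           min_qwidth=1, qrange=(-0.5, 9.5), time_only=True)
    gpu_pdf.accumulate_pdf_eval(gpu_channels)    # call once per simulated event
    hitcount, pdf_value, pdf_uncert = gpu_pdf.get_pdf_eval()
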
+ If device_id is None, the default device is used.""" + + if device_id is None: + self.context = pycuda.tools.make_default_context() + else: + device = cuda.Device(device_id) + self.context = device.make_context() + + print 'device %s' % self.context.get_device().name() + + self.print_mem_info() + + self.context.set_cache_config(cuda.func_cache.PREFER_L1) + + cuda_options = ['--use_fast_math']#, '--ptxas-options=-v'] + + self.module = get_cu_module('kernel', options=cuda_options) + + def print_mem_info(self): + free, total = cuda.mem_get_info() + + print '%.1f %% of device memory is free.' % ((free/float(total))*100) + def __del__(self): self.context.pop() diff --git a/gputhread.py b/gputhread.py deleted file mode 100644 index 63124e9..0000000 --- a/gputhread.py +++ /dev/null @@ -1,95 +0,0 @@ -import numpy as np -from copy import copy -import time -import pycuda.driver as cuda -from pycuda.characterize import sizeof -from pycuda.compiler import SourceModule -from pycuda import gpuarray -import threading -import Queue -import src - -class GPUThread(threading.Thread): - def __init__(self, device_id, geometry, jobs, output, nblocks=64, max_nthreads=100000): - threading.Thread.__init__(self) - - self.device_id = device_id - self.geometry = copy(geometry) - self.jobs = jobs - self.output = output - self.nblocks = nblocks - self.max_nthreads = max_nthreads - self._stop = threading.Event() - - def stop(self): - self._stop.set() - - def stopped(self): - return self._stop.is_set() - - def run(self): - device = cuda.Device(self.device_id) - context = device.make_context() - module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) - - propagate = module.get_function('propagate') - fill_uniform = module.get_function('fill_uniform') - fill_uniform_sphere = module.get_function('fill_uniform_sphere') - - init_rng = module.get_function('init_rng') - rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - init_rng(np.int32(self.max_nthreads), rng_states_gpu, np.int32(self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(self.max_nthreads//self.nblocks+1,1)) - - self.geometry.load(module) - - daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) - reset_earliest_time_int = daq_module.get_function('reset_earliest_time_int') - run_daq = daq_module.get_function('run_daq') - convert_sortable_int_to_float = daq_module.get_function('convert_sortable_int_to_float') - - earliest_time_gpu = gpuarray.GPUArray(shape=(max(self.geometry.pmtids)+1,), dtype=np.float32) - earliest_time_int_gpu = gpuarray.GPUArray(shape=earliest_time_gpu.shape, dtype=np.uint32) - - solid_map_gpu = gpuarray.to_gpu(self.geometry.solid_id.astype(np.int32)) - - while not self.stopped(): - try: - position, nphotons = self.jobs.get(block=False, timeout=0.1) - except Queue.Empty: - continue - - t0 = time.time() - - gpu_kwargs = {'block': (self.nblocks,1,1), 'grid': (nphotons//self.nblocks+1,1)} - - positions_gpu = gpuarray.empty(nphotons, dtype=gpuarray.vec.float3) - positions_gpu.fill(position) - directions_gpu = gpuarray.empty(nphotons, dtype=gpuarray.vec.float3) - fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, directions_gpu, **gpu_kwargs) - wavelengths_gpu = gpuarray.empty(nphotons, dtype=np.float32) - fill_uniform(np.int32(nphotons), rng_states_gpu, wavelengths_gpu, np.float32(200), np.float32(800), **gpu_kwargs) - polarizations_gpu = gpuarray.empty(nphotons, dtype=gpuarray.vec.float3) - 
fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, polarizations_gpu, **gpu_kwargs) - times_gpu = gpuarray.zeros(nphotons, dtype=np.float32) - histories_gpu = gpuarray.zeros(nphotons, dtype=np.uint32) - last_hit_triangles_gpu = gpuarray.empty(nphotons, dtype=np.int32) - last_hit_triangles_gpu.fill(-1) - - max_steps = 10 - - propagate(np.int32(nphotons), rng_states_gpu, positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, histories_gpu, last_hit_triangles_gpu, np.int32(max_steps), **gpu_kwargs) - - reset_earliest_time_int(np.float32(1e9), np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) - - run_daq(rng_states_gpu, np.uint32(0x1 << 2), np.float32(1.2e-9), np.int32(nphotons), times_gpu, histories_gpu, last_hit_triangles_gpu, solid_map_gpu, np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) - - convert_sortable_int_to_float(np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, earliest_time_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) - cuda.Context.synchronize() - elapsed = time.time() - t0 - - #print 'device %i; elapsed %f sec' % (self.device_id, elapsed) - - self.output.put(earliest_time_gpu.get()) - self.jobs.task_done() - - context.pop() diff --git a/itertoolset.py b/itertoolset.py index eb96e74..498b940 100644 --- a/itertoolset.py +++ b/itertoolset.py @@ -2,21 +2,54 @@ from itertools import * import collections from copy import deepcopy +def peek(iterable): + """Peek at the first element of `iterable`. + + Returns a tuple of the form (first_element, iterable). + + Once peek() has been called, the original iterable will be modified if it + was an iterator (it will be advanced by 1); use the returned iterator for + an equivalent copy of the original iterable. + """ + it = iter(iterable) + first_element = next(it) + return first_element, chain([first_element], it) + +def repeat_func(func, times=None, args=()): + "equivalent to (func(*args) for i in xrange(times))." + if times is None: + while True: + yield func(*args) + else: + for i in xrange(times): + yield func(*args) + +def repeat_copy(object, times=None): + """Returns deep copies of `object` over and over again. Runs indefinitely + unless the `times` argument is specified.""" + if times is None: + while True: + yield deepcopy(object) + else: + for i in xrange(times): + yield object + def repeating_iterator(i, nreps): - '''Returns an iterator that emits each element of `i` multiple times specified by `nreps`. - The length of this iterator is the lenght of `i` times `nreps`. This iterator - is safe even if the item consumer modifies the items. + """Returns an iterator that emits each element of `i` multiple times + specified by `nreps`. The length of this iterator is the lenght of `i` + times `nreps`. This iterator is safe even if the item consumer modifies + the items. 
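Quick examples of the two most heavily used new helpers, peek() and repeat_func(), both defined above:

    import numpy as np
    from chroma.itertoolset import peek, repeat_func

    first, it = peek(iter('ABC'))        # first == 'A'; `it` still yields A, B, C
    draws = list(repeat_func(np.random.random_sample, times=3))  # three random floats
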
- >>> list(repeating_iterator('ABCD', 3) - ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'] - >>> list(repeating_iterator('ABCD', 1) - ['A', 'B', 'C', 'D'] - ''' + Examples: + >>> list(repeating_iterator('ABCD', 3) + ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'] + >>> list(repeating_iterator('ABCD', 1) + ['A', 'B', 'C', 'D'] + """ for item in i: for counter in xrange(nreps): yield deepcopy(item) - def grouper(n, iterable, fillvalue=None): "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx" args = [iter(iterable)] * n diff --git a/likelihood.py b/likelihood.py index c797afe..51d7ef9 100644 --- a/likelihood.py +++ b/likelihood.py @@ -2,7 +2,7 @@ import numpy as np from histogram import HistogramDD from uncertainties import ufloat, umath, unumpy from math import sqrt -from itertools import izip, compress +from itertools import islice, izip, compress, repeat from chroma.tools import profile_if_possible class Likelihood(object): @@ -47,16 +47,19 @@ class Likelihood(object): """ Return the negative log likelihood that the detector event set in the constructor or by set_event() was the result of a particle generated - by `vertex_generator`. If `nreps` set to > 1, each set of photon vertices - will be propagated `nreps` times. + by `vertex_generator`. If `nreps` set to > 1, each set of photon + vertices will be propagated `nreps` times. """ - hitcount, pdf_prob, pdf_prob_uncert = self.sim.eval_pdf(self.event.channels, - nevals, vertex_generator, - 2.0e-9, self.trange, - 1, self.qrange, - nreps=nreps, - time_only=self.time_only) + vertex_generator = islice(vertex_generator, nevals) + + hitcount, pdf_prob, pdf_prob_uncert = \ + self.sim.eval_pdf(self.event.channels, + vertex_generator, + 2.0e-9, self.trange, + 1, self.qrange, + nreps=nreps, + time_only=self.time_only) # Normalize probabilities and put a floor to keep the log finite hit_prob = hitcount.astype(np.float32) / (nreps * nevals) @@ -79,7 +82,8 @@ class Likelihood(object): hit_channel_prob_uncert = ( (nevals * nreps * hit_prob * (1.0 - hit_prob)) / hit_prob**2 ).sum()**0.5 log_likelihood = ufloat((hit_channel_prob, 0.0)) - # Then include the probability densities of the observed charges and times + # Then include the probability densities of the observed + # charges and times. 
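For context, the call pattern the new eval() signature is aimed at (it mirrors the __main__ block below; the positional order of the nevals argument and the nreps keyword is inferred from the calls in this file):

    from chroma.generator import constant_particle_gun

    likelihood = Likelihood(sim, ev)
    vertex_iter = constant_particle_gun('e-', (0,0,0), (1,0,0), 100.0)
    nll = likelihood.eval(vertex_iter, 100, nreps=1)   # ufloat: value +/- uncertainty
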
hit_pdf_ufloat = unumpy.uarray((pdf_prob[self.event.channels.hit], pdf_prob_uncert[self.event.channels.hit])) log_likelihood += unumpy.log(hit_pdf_ufloat).sum() @@ -91,21 +95,35 @@ if __name__ == '__main__': from chroma.sim import Simulation from chroma.optics import water from chroma.generator import constant_particle_gun - from chroma.tools import enable_debug_on_crash + from chroma import tools import time - enable_debug_on_crash() + tools.enable_debug_on_crash() detector = detectors.find('lbne') - sim = Simulation(detector, water) + sim = Simulation(detector, seed=0) - event = sim.simulate(1, constant_particle_gun('e-',(0,0,0),(1,0,0),100.0)).next() + event = sim.simulate(islice(constant_particle_gun('e-',(0,0,0),(1,0,0),100.0), 1)).next() print 'nhit = %i' % np.count_nonzero(event.channels.hit) likelihood = Likelihood(sim, event) - for x in np.linspace(-10.0, 10.0, 2*10*5+1): - start = time.time() - l = likelihood.eval(constant_particle_gun('e-',(0,0,0),(1,0,0),100.0),100, nreps=200) - print 'x = %5.1f, %s (%1.1f sec)' % (x, l, time.time() - start) + x = np.linspace(-10.0, 10.0, 100) + l = [] + + for pos in izip(x, repeat(0), repeat(0)): + t0 = time.time() + ev_vertex_iter = constant_particle_gun('e-',pos,(1,0,0),100.0) + l.append(likelihood.eval(ev_vertex_iter, 1000)) + elapsed = time.time() - t0 + + print '(%.1f, %.1f, %.1f), %s (%1.1f sec)' % \ + (pos[0], pos[1], pos[2], tools.ufloat_to_str(l[-1]), elapsed) + + import matplotlib.pyplot as plt + + plt.errorbar(x, [v.nominal_value for v in l], [v.std_dev() for v in l]) + plt.xlabel('X Position (m)') + plt.ylabel('Negative Log Likelihood') + plt.show() @@ -172,7 +172,7 @@ labppo_scintillator.set('scattering_length', wavelengths=[ 200.0, 215.0, 225.0, ################### WCSim materials ##################### -water_wcsim = Material('water') +water_wcsim = Material('water_wcsim') water_wcsim.density = 1.0 # g/cm^3 water_wcsim.composition = { 'H' : 0.1119, 'O' : 0.8881 } # fraction by mass hc_over_GeV = 1.2398424468024265e-06 # h_Planck * c_light / GeV / nanometer diff --git a/project.py b/project.py new file mode 100644 index 0000000..0e3626f --- /dev/null +++ b/project.py @@ -0,0 +1,28 @@ +import numpy as np +from itertools import repeat +from chroma.transform import rotate + +def from_film(position, axis1=(0,0,1), axis2=(1,0,0), size=(800,600), width=0.035, focal_length=0.018): + height = width*(size[1]/float(size[0])) + + x = np.linspace(width/2, -width/2, size[0]) + z = np.linspace(-height/2, height/2, size[1]) + + zz, xx = np.meshgrid(z,x) + + grid = np.array(zip(xx.flatten(),np.zeros(xx.size),zz.flatten())) + + axis1 = np.asarray(axis1)/np.linalg.norm(axis1) + axis2 = np.asarray(axis2)/np.linalg.norm(axis2) + + assert axis1.dot(axis2) == 0.0 + + if (axis1 != (0,0,1)).any(): + grid = rotate(grid, np.arccos(axis1.dot((0,0,1))), np.cross(axis1,(0,0,1))) + + if (axis2 != (1,0,0)).any(): + grid = rotate(grid, np.arccos(axis2.dot((1,0,0))), np.cross(axis2,(1,0,0))) + + grid -= np.cross(axis1,axis2)*focal_length + + return grid+position, -grid/np.apply_along_axis(np.linalg.norm,1,grid)[:,np.newaxis] @@ -3,148 +3,147 @@ import sys import time import os import numpy as np - -import detectors -import optics -import generator -from generator import constant import itertools import threading -import gpu -from fileio import root -from chroma.itertoolset import repeating_iterator -from tools import profile_if_possible, enable_debug_on_crash + +from chroma import detectors +from chroma import optics +from chroma import generator +from chroma 
import gpu +from chroma.fileio import root +from chroma import tools +from chroma import event +from chroma.itertoolset import peek, repeat_copy, repeating_iterator def pick_seed(): - '''Returns a seed for a random number generator selected using - a mixture of the current time and the current process ID.''' + """Returns a seed for a random number generator selected using + a mixture of the current time and the current process ID.""" return int(time.time()) ^ (os.getpid() << 16) class Simulation(object): - def __init__(self, detector, detector_material, - seed=None, cuda_device=None, geant4_processes=4, bvh_bits=11, - use_cache=True): + def __init__(self, detector, seed=None, cuda_device=None, geant4_processes=4, bvh_bits=11, use_cache=True, nthreads_per_block=64, max_blocks=1024): self.detector = detector + self.nthreads_per_block = nthreads_per_block + self.max_blocks = max_blocks + if seed is None: self.seed = pick_seed() else: self.seed = seed - print >>sys.stderr, 'RNG seed:', self.seed + + print 'RNG seed: %i' % self.seed # We have three generators to seed: numpy.random, GEANT4, and CURAND. - # The latter two are done below + # The latter two are done below. np.random.seed(self.seed) - self.detector_material = detector_material if geant4_processes > 0: - self.photon_generator = generator.photon.G4ParallelGenerator(geant4_processes, - detector_material, - base_seed=self.seed) + self.photon_generator = generator.photon.G4ParallelGenerator(geant4_processes, detector.detector_material, base_seed=self.seed) else: self.photon_generator = None - print >>sys.stderr, 'Creating BVH with %d bits...' % (bvh_bits) - detector.build(bits=bvh_bits, use_cache=use_cache) + if not hasattr(detector, 'mesh'): + # need to build geometry + print 'Creating BVH with %i bits...' % bvh_bits + detector.build(bits=bvh_bits, use_cache=use_cache) - print >>sys.stderr, 'Initializing GPU...' - self.gpu_worker = gpu.GPU(cuda_device) + self.gpu = gpu.GPU(cuda_device) + + # geometry is loaded onto gpu by default + self.gpu_geometry = gpu.GPUGeometry(self.gpu, detector) - print >>sys.stderr, 'Loading detector onto GPU...' - self.gpu_worker.load_geometry(detector) + self.gpu_daq = gpu.GPUDaq(self.gpu_geometry, max(self.detector.pmtids)) + self.gpu_pdf = gpu.GPUPDF() + + print 'Initializing random numbers generators...' + self.rng_states = gpu.get_rng_states(self.nthreads_per_block*self.max_blocks, seed=self.seed) - print >>sys.stderr, 'Initializing random numbers generators...' 
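The new project.from_film() helper added above returns film-plane ray origins and unit view directions; a small sketch (the camera position is arbitrary, and the defaults correspond to an 800x600 grid on a 35 mm-wide film at 18 mm focal length):

    from chroma.project import from_film

    origins, directions = from_film((0.0, -1.0, 0.0), size=(160, 120))
    # origins.shape == (160*120, 3); each row of `directions` is a unit vector
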
- self.gpu_worker.setup_propagate(seed=self.seed) - self.gpu_worker.setup_daq(max(self.detector.pmtids)) self.pdf_config = None - def simulate(self, nevents, vertex_generator, keep_photon_start=False, keep_photon_stop=False, - run_daq=True, nreps=1): - photon_gen = repeating_iterator(self.photon_generator.generate_events(nevents, vertex_generator), - nreps) - return self.simulate_photons(nevents*nreps, - photon_gen, - keep_photon_start=keep_photon_start, keep_photon_stop=keep_photon_stop, - run_daq=run_daq) - - def simulate_photons(self, nevents, photon_generator, keep_photon_start=False, keep_photon_stop=False, - run_daq=True): - for ev in itertools.islice(photon_generator, nevents): - self.gpu_worker.load_photons(ev.photon_start) - self.gpu_worker.propagate() - self.gpu_worker.run_daq() - - ev.nphoton = len(ev.photon_start.positions) - - if not keep_photon_start: - ev.photon_start = None - - if keep_photon_stop: - ev.photon_stop = self.gpu_worker.get_photons() + def simulate(self, iterable, keep_photons_beg=False, keep_photons_end=False, run_daq=True, max_steps=10): + first_element, iterable = peek(iterable) + + if isinstance(first_element, event.Event): + iterable = self.photon_generator.generate_events(iterable) + elif isinstance(first_element, event.Photons): + iterable = (event.Event(photons_beg=x) for x in iterable) + + for ev in iterable: + gpu_photons = gpu.GPUPhotons(ev.photons_beg) + + gpu.propagate(self.gpu, gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks, max_steps=max_steps) + + ev.nphotons = len(ev.photons_beg.pos) + + if not keep_photons_beg: + ev.photons_beg = None + + if keep_photons_end: + ev.photons_end = gpu_photons.get() if run_daq: - ev.channels = self.gpu_worker.get_hits() + gpu_channels = self.gpu_daq.acquire(gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) + ev.channels = gpu_channels.get() yield ev - def propagate_photons(self, photons, max_steps=10): - self.gpu_worker.load_photons(photons) - self.gpu_worker.propagate(max_steps=max_steps) - return self.gpu_worker.get_photons() - - def create_pdf(self, nevents, vertex_generator, tbins, trange, - qbins, qrange, nreps=1): - photon_gen = repeating_iterator(self.photon_generator.generate_events(nevents, vertex_generator), - nreps) - return self.create_pdf_from_photons(nevents*nreps, photon_gen, - tbins, trange, qbins, qrange) - - def create_pdf_from_photons(self, nevents, photon_generator, - tbins, trange, qbins, qrange): - '''Returns tuple: 1D array of channel hit counts, 3D array of (channel, time, charge) pdfs''' + def create_pdf(self, iterable, tbins, trange, qbins, qrange, nreps=1): + """Returns tuple: 1D array of channel hit counts, 3D array of + (channel, time, charge) pdfs.""" + first_element, iterable = peek(iterable) + + if isinstance(first_element, event.Event): + iterable = self.photon_generator.generate_events(iterable) + pdf_config = (tbins, trange, qbins, qrange) if pdf_config != self.pdf_config: self.pdf_config = pdf_config - self.gpu_worker.setup_pdf(max(self.detector.pmtids), tbins, trange, + self.gpu_pdf.setup_pdf(max(self.detector.pmtids), tbins, trange, qbins, qrange) else: - self.gpu_worker.clear_pdf() + self.gpu_pdf.clear_pdf() - for ev in itertools.islice(photon_generator, nevents): - self.gpu_worker.load_photons(ev.photon_start) - self.gpu_worker.propagate() - self.gpu_worker.run_daq() - self.gpu_worker.add_hits_to_pdf() - - return self.gpu_worker.get_pdfs() - - - def eval_pdf(self, 
event_channels, nevents, vertex_generator, - min_twidth, trange, min_qwidth, qrange, - min_bin_content=20, nreps=1, time_only=True): - photon_gen = repeating_iterator(self.photon_generator.generate_events(nevents, vertex_generator), - nreps) - return self.eval_pdf_from_photons(event_channels, nevents*nreps, photon_gen, - min_twidth, trange, min_qwidth, qrange, - min_bin_content=min_bin_content, time_only=time_only) - - def eval_pdf_from_photons(self, event_channels, nevents, photon_generator, - min_twidth, trange, min_qwidth, qrange, - min_bin_content=20, time_only=True): - '''Returns tuple: 1D array of channel hit counts, 1D array of PDF probability densities''' - self.gpu_worker.setup_pdf_eval(event_channels.hit, event_channels.t, event_channels.q, - min_twidth, trange, min_qwidth, qrange, - min_bin_content=min_bin_content, time_only=True) - - for ev in itertools.islice(photon_generator, nevents): - self.gpu_worker.load_photons(ev.photon_start) - self.gpu_worker.propagate() - self.gpu_worker.run_daq() - self.gpu_worker.accumulate_pdf_eval() + if nreps > 1: + iterable = repeating_iterator(iterable) + + for ev in iterable: + gpu_photons = gpu.GPUPhotons(ev.photons_beg) + gpu.propagate(self.gpu, gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) + gpu_channels = self.gpu_daq.acquire(gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) + self.gpu_pdf.add_hits_to_pdf(gpu_channels) - return self.gpu_worker.get_pdf_eval() + return self.gpu_pdf.get_pdfs() + + def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=20, nreps=1, time_only=True): + """Returns tuple: 1D array of channel hit counts, 1D array of PDF + probability densities.""" + self.gpu_pdf.setup_pdf_eval(event_channels.hit, + event_channels.t, + event_channels.q, + min_twidth, + trange, + min_qwidth, + qrange, + min_bin_content=min_bin_content, + time_only=True) + + first_element, iterable = peek(iterable) + + if isinstance(first_element, event.Event): + iterable = self.photon_generator.generate_events(iterable) + + if nreps > 1: + iterable = repeating_iterator(iterable) + + for ev in iterable: + gpu_photons = gpu.GPUPhotons(ev.photons_beg) + gpu.propagate(self.gpu, gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) + gpu_channels = self.gpu_daq.acquire(gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) + self.gpu_pdf.accumulate_pdf_eval(gpu_channels) + return self.gpu_pdf.get_pdf_eval() -@profile_if_possible +@tools.profile_if_possible def main(): import optparse @@ -155,17 +154,22 @@ def main(): help='Set random number generator seed') parser.add_option('-g', type='int', dest='ngenerators', default=4, help='Number of GEANT4 generator processes') - parser.add_option('--detector', type='string', dest='detector', default='microlbne') + parser.add_option('--detector', type='string', dest='detector', + default='microlbne') parser.add_option('--nevents', type='int', dest='nevents', default=100) - parser.add_option('--particle', type='string', dest='particle', default='e-') - parser.add_option('--energy', type='float', dest='energy', default=100.0) - parser.add_option('--pos', type='string', dest='pos', default='(0,0,0)') - parser.add_option('--dir', type='string', dest='dir', default='(1,0,0)') - parser.add_option('--save-photon-start', action='store_true', - dest='save_photon_start', 
default=False, + parser.add_option('--particle', type='string', dest='particle', + help='particle name', default='e-') + parser.add_option('--ke', type='float', dest='ke', + help='particle kinetic energy in MeV', default=100.0) + parser.add_option('--pos', type='string', dest='pos', + help='particle vertex origin.', default='(0,0,0)') + parser.add_option('--dir', type='string', dest='dir', + help='particle vertex direction.', default='(1,0,0)') + parser.add_option('--save-photon-beg', action='store_true', + dest='save_photons_beg', default=False, help='Save initial photon vertices to disk') - parser.add_option('--save-photon-stop', action='store_true', - dest='save_photon_stop', default=False, + parser.add_option('--save-photon-end', action='store_true', + dest='save_photons_end', default=False, help='Save final photon vertices to disk') options, args = parser.parse_args() @@ -178,57 +182,52 @@ def main(): if options.nevents <= 0: sys.exit('--nevents must be greater than 0!') - position = np.array(eval(options.pos), dtype=float) - direction = np.array(eval(options.dir), dtype=float) - print >>sys.stderr, 'Loading detector %s...' % options.detector + pos = np.array(eval(options.pos), dtype=float) + dir = np.array(eval(options.dir), dtype=float) + + print 'Loading detector %s...' % options.detector + sys.stdout.flush() detector = detectors.find(options.detector) - print >>sys.stderr, 'Creating generator...' + print 'Creating particle vertex generator...' + sys.stdout.flush() if options.particle == 'pi0': - vertex_generator = generator.vertex.pi0_gun(pi0_position=constant(position), - pi0_direction=constant(direction), - pi0_total_energy=constant(options.energy)) + ev_vertex_iter = itertools.islice(generator.vertex.pi0_gun(itertools.repeat(pos), itertools.repeat(dir), itertools.repeat(options.ke)), options.nevents) else: - vertex_generator = generator.vertex.particle_gun(particle_name=constant(options.particle), - position=constant(position), - direction=constant(direction), - total_energy=constant(options.energy)) + vertex = event.Vertex(options.particle, pos, dir, None, options.ke) + ev_vertex_iter = (event.Event(i, vertex, [vertex]) for i, vertex in zip(range(options.nevents), repeat_copy(vertex))) # Initializing simulation - print >>sys.stderr, 'WARNING: ASSUMING DETECTOR IS WCSIM WATER!!' - simulation = Simulation(detector=detector, detector_material=optics.water_wcsim, - seed=options.seed, cuda_device=options.device, - geant4_processes=options.ngenerators, bvh_bits=options.nbits) + simulation = Simulation(detector=detector, seed=options.seed, cuda_device=options.device, geant4_processes=options.ngenerators, bvh_bits=options.nbits) # Create output file writer = root.RootWriter(output_filename) # Preheat generator - event_iterator = simulation.simulate(options.nevents, vertex_generator, - keep_photon_start=options.save_photon_start, - keep_photon_stop=options.save_photon_stop) + ev_iter = simulation.simulate(ev_vertex_iter, keep_photons_beg=options.save_photons_beg, keep_photons_end=options.save_photons_end) + print 'Starting simulation...' - print >>sys.stderr, 'Starting simulation...' - start_sim = time.time() nphotons = 0 + t0 = time.time() - for i, ev in enumerate(event_iterator): - assert ev.nphoton > 0, 'GEANT4 generated event with no photons!' - nphotons += ev.nphoton + for i, ev in enumerate(ev_iter): + print "\rEvent: %i" % (i+1), + sys.stdout.flush() - writer.write_event(ev) + assert ev.nphotons > 0, 'Geant4 generated event with no photons!' 
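# Illustration only (not part of this patch): a condensed, hedged sketch of
# the driver logic above, showing how the rewritten Simulation.simulate() is
# fed.  It now consumes an iterable of event.Event (or event.Photons) objects
# and yields simulated events; the 'microlbne' detector name and the 100 MeV
# e- vertex are example values, and a working GPU + GEANT4 setup is assumed.
import numpy as np
from chroma import detectors, event
from chroma.itertoolset import repeat_copy
from chroma.sim import Simulation

detector = detectors.find('microlbne')
sim = Simulation(detector)

vertex = event.Vertex('e-', np.zeros(3), np.array([1.0, 0.0, 0.0]), None, 100.0)
events_in = (event.Event(i, v, [v])
             for i, v in zip(range(10), repeat_copy(vertex)))

for ev in sim.simulate(events_in, keep_photons_end=True):
    # run_daq defaults to True, so ev.channels is filled in as well
    print ev.nphotons, ev.channels.hit.sum()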
+ nphotons += ev.nphotons - if i % 10 == 0: - print >>sys.stderr, "\rEvent:", i, + writer.write_event(ev) + print - end_sim = time.time() - print >>sys.stderr, "\rEvent:", options.nevents - 1 + elapsed = time.time() - t0 writer.close() - print >>sys.stderr, 'Done. %1.1f events/sec, %1.0f photons/sec.' % (options.nevents/(end_sim - start_sim), nphotons/(end_sim - start_sim)) + print '%f elapsed, %1.1f events/sec, %1.0f photons/sec.' % \ + (elapsed, options.nevents/elapsed, nphotons/elapsed) if __name__ == '__main__': - enable_debug_on_crash() + tools.enable_debug_on_crash() main() diff --git a/src/alpha.h b/src/alpha.h index e4a3a40..ac75834 100644 --- a/src/alpha.h +++ b/src/alpha.h @@ -4,57 +4,14 @@ #include "linalg.h" #include "intersect.h" #include "mesh.h" +#include "sorting.h" -template <class T> -__device__ void swap(T &a, T &b) -{ - T temp = a; - a = b; - b = temp; -} +#include "stdio.h" #define ALPHA_DEPTH 10 -struct HitList -{ - unsigned int size; - unsigned int indices[ALPHA_DEPTH]; - float distances[ALPHA_DEPTH]; -}; - -__device__ void add_to_hit_list(const float &distance, const int &index, HitList &h) -{ - unsigned int i; - if (h.size >= ALPHA_DEPTH) - { - if (distance > h.distances[ALPHA_DEPTH-1]) - return; - - i = ALPHA_DEPTH-1; - } - else - { - i = h.size; - h.size += 1; - } - - h.indices[i] = index; - h.distances[i] = distance; - - while (i > 0 && h.distances[i-1] > h.distances[i]) - { - swap(h.distances[i-1], h.distances[i]); - swap(h.indices[i-1], h.indices[i]); - - i -= 1; - } -} - __device__ int get_color_alpha(const float3 &origin, const float3& direction) { - HitList h; - h.size = 0; - float distance; if (!intersect_node(origin, direction, g_start_node, -1.0f)) @@ -69,6 +26,10 @@ __device__ int get_color_alpha(const float3 &origin, const float3& direction) unsigned int i; + float distances[ALPHA_DEPTH]; + unsigned int indices[ALPHA_DEPTH]; + unsigned int n=0; + do { unsigned int first_child = tex1Dfetch(node_map, *node); @@ -106,9 +67,26 @@ __device__ int get_color_alpha(const float3 &origin, const float3& direction) if (intersect_triangle(origin, direction, v0, v1, v2, distance)) { - add_to_hit_list(distance, i, h); + if (n < 1) + { + distances[0] = distance; + indices[0] = i; + } + else + { + unsigned long j = searchsorted(n, distances, distance); + + if (j <= ALPHA_DEPTH-1) + { + insert(ALPHA_DEPTH, distances, j, distance); + insert(ALPHA_DEPTH, indices, j, i); + } + } + + if (n < ALPHA_DEPTH) + n++; } - + } // triangle loop node--; @@ -118,16 +96,19 @@ __device__ int get_color_alpha(const float3 &origin, const float3& direction) } // while loop while (node != head); - if (h.size < 1) + if (n < 1) return 0; float scale = 1.0f; float fr = 0.0f; float fg = 0.0f; float fb = 0.0f; - for (i=0; i < h.size; i++) + unsigned int index; + for (i=0; i < n; i++) { - uint4 triangle_data = g_triangles[h.indices[i]]; + index = indices[i]; + + uint4 triangle_data = g_triangles[index]; float3 v0 = g_vertices[triangle_data.x]; float3 v1 = g_vertices[triangle_data.y]; @@ -138,11 +119,11 @@ __device__ int get_color_alpha(const float3 &origin, const float3& direction) if (cos_theta < 0.0f) cos_theta = dot(-normalize(cross(v1-v0,v2-v1)),-direction); - unsigned int r0 = 0xff & (g_colors[h.indices[i]] >> 16); - unsigned int g0 = 0xff & (g_colors[h.indices[i]] >> 8); - unsigned int b0 = 0xff & g_colors[h.indices[i]]; + unsigned int r0 = 0xff & (g_colors[index] >> 16); + unsigned int g0 = 0xff & (g_colors[index] >> 8); + unsigned int b0 = 0xff & g_colors[index]; - float alpha = (255 - 
(0xff & (g_colors[h.indices[i]] >> 24)))/255.0f; + float alpha = (255 - (0xff & (g_colors[index] >> 24)))/255.0f; fr += r0*scale*cos_theta*alpha; fg += g0*scale*cos_theta*alpha; diff --git a/src/kernel.cu b/src/kernel.cu index 307412e..4418305 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -62,41 +62,6 @@ __device__ void to_diffuse(Photon &p, State &s, curandState &rng, const int &max extern "C" { -/* Translate the points `a` by the vector `v` */ -__global__ void translate(int nthreads, float3 *a, float3 v) -{ - int id = blockIdx.x*blockDim.x + threadIdx.x; - - if (id >= nthreads) - return; - - a[id] += v; -} - -/* Rotate the points `a` through an angle `phi` counter-clockwise about the - axis `axis` (when looking towards +infinity). */ -__global__ void rotate(int nthreads, float3 *a, float phi, float3 axis) -{ - int id = blockIdx.x*blockDim.x + threadIdx.x; - - if (id >= nthreads) - return; - - a[id] = rotate(a[id], phi, axis); -} - -__global__ void rotate_around_point(int nthreads, float3 *a, float phi, float3 axis, float3 point) -{ - int id = blockIdx.x*blockDim.x + threadIdx.x; - - if (id >= nthreads) - return; - - a[id] -= point; - a[id] = rotate(a[id], phi, axis); - a[id] += point; -} - __global__ void update_xyz_lookup(int nthreads, int total_threads, int offset, float3 position, curandState *rng_states, float wavelength, float3 xyz, float3 *xyz_lookup1, float3 *xyz_lookup2, int max_steps) { int kernel_id = blockIdx.x*blockDim.x + threadIdx.x; @@ -239,6 +204,28 @@ __global__ void process_image(int nthreads, float3 *image, int *pixels, int nima } // process_image +__global__ void distance_to_mesh(int nthreads, float3 *positions, float3 *directions, float *distances) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + float3 position = positions[id]; + float3 direction = directions[id]; + direction /= norm(direction); + + float distance; + + int triangle_index = intersect_mesh(position, direction, distance); + + if (triangle_index == -1) + distances[id] = 1e9; + else + distances[id] = distance; + +} // distance_to_mesh + __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, int *pixels) { int id = blockIdx.x*blockDim.x + threadIdx.x; @@ -288,6 +275,9 @@ __global__ void ray_trace_alpha(int nthreads, float3 *positions, float3 *directi float3 direction = directions[id]; direction /= norm(direction); + bool hit; + float distance; + pixels[id] = get_color_alpha(position, direction); } // ray_trace diff --git a/src/sorting.h b/src/sorting.h new file mode 100644 index 0000000..ec16bbe --- /dev/null +++ b/src/sorting.h @@ -0,0 +1,103 @@ +template <class T> +__device__ void swap(T &a, T &b) +{ + T tmp = a; + a = b; + b = tmp; +} + +template <class T> +__device__ void reverse(int n, T *a) +{ + for (int i=0; i < n/2; i++) + swap(a[i],a[n-1-i]); +} + +template <class T> +__device__ void piksrt(int n, T *arr) +{ + int i,j; + T a; + + for (j=1; j < n; j++) + { + a = arr[j]; + i = j-1; + while (i >= 0 && arr[i] > a) + { + arr[i+1] = arr[i]; + i--; + } + arr[i+1] = a; + } +} + +template <class T, class U> +__device__ void piksrt2(int n, T *arr, U *brr) +{ + int i,j; + T a; + U b; + + for (j=1; j < n; j++) + { + a = arr[j]; + b = brr[j]; + i = j-1; + while (i >= 0 && arr[i] > a) + { + arr[i+1] = arr[i]; + brr[i+1] = brr[i]; + i--; + } + arr[i+1] = a; + brr[i+1] = b; + } +} + +template <class T> +__device__ unsigned long searchsorted(unsigned long n, T *arr, const T &x) +{ + unsigned long ju,jm,jl; + int ascnd; + + jl = 0; + ju = n; + + 
ascnd = (arr[n-1] > arr[0]); + + while (ju-jl > 1) + { + jm = (ju+jl) >> 1; + + if (x >= arr[jm] == ascnd) + jl = jm; + else + ju = jm; + } + + if (x <= arr[0]) + return 0; + else if (x == arr[n-1]) + return n-1; + else + return ju; +} + +template <class T> +__device__ void insert(unsigned long n, T *arr, unsigned long i, const T &x) +{ + unsigned long j; + for (j=n-1; j > i; j--) + arr[j] = arr[j-1]; + arr[i] = x; +} + +template <class T> +__device__ void add_sorted(unsigned long n, T *arr, const T &x) +{ + unsigned long i = searchsorted(n, arr, x); + + if (i < n) + insert(n, arr, i, x); +} diff --git a/src/transform.cu b/src/transform.cu new file mode 100644 index 0000000..57bd509 --- /dev/null +++ b/src/transform.cu @@ -0,0 +1,47 @@ +//-*-c-*- + +#include "linalg.h" +#include "rotate.h" + +extern "C" +{ + +/* Translate the points `a` by the vector `v` */ +__global__ void translate(int nthreads, float3 *a, float3 v) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + a[id] += v; +} + +/* Rotate the points `a` through an angle `phi` counter-clockwise about the + axis `axis` (when looking towards +infinity). */ +__global__ void rotate(int nthreads, float3 *a, float phi, float3 axis) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + a[id] = rotate(a[id], phi, axis); +} + +/* Rotate the points `a` through an angle `phi` counter-clockwise + (when looking towards +infinity along `axis`) about the axis defined + by the point `point` and the vector `axis` . */ +__global__ void rotate_around_point(int nthreads, float3 *a, float phi, float3 axis, float3 point) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + a[id] -= point; + a[id] = rotate(a[id], phi, axis); + a[id] += point; +} + +} // extern "c" diff --git a/tests/test_fileio.py b/tests/test_fileio.py index d21c9ff..2911976 100644 --- a/tests/test_fileio.py +++ b/tests/test_fileio.py @@ -5,36 +5,34 @@ import numpy as np class TestFileIO(unittest.TestCase): def test_file_write_and_read(self): - ev = event.Event(1, 'e-', (0,0,1), (1,0,0), 15) + ev = event.Event(1, event.Vertex('e-', (0,0,1), (1,0,0), (0,1,0), 15)) - photon_start = root.make_photon_with_arrays(1) - photon_start.positions[0] = (1,2,3) - photon_start.directions[0] = (4,5,6) - photon_start.polarizations[0] = (7,8,9) - photon_start.wavelengths[0] = 400.0 - photon_start.times[0] = 100.0 - photon_start.histories[0] = 20 - photon_start.last_hit_triangles[0] = 5 - ev.photon_start = photon_start + photons_beg = root.make_photon_with_arrays(1) + photons_beg.pos[0] = (1,2,3) + photons_beg.dir[0] = (4,5,6) + photons_beg.pol[0] = (7,8,9) + photons_beg.wavelengths[0] = 400.0 + photons_beg.t[0] = 100.0 + photons_beg.last_hit_triangles[0] = 5 + photons_beg.flags[0] = 20 + ev.photons_beg = photons_beg - photon_stop = root.make_photon_with_arrays(1) - photon_stop.positions[0] = (1,2,3) - photon_stop.directions[0] = (4,5,6) - photon_stop.polarizations[0] = (7,8,9) - photon_stop.wavelengths[0] = 400.0 - photon_stop.times[0] = 100.0 - photon_stop.histories[0] = 20 - photon_stop.last_hit_triangles[0] = 5 - ev.photon_stop = photon_stop + photons_end = root.make_photon_with_arrays(1) + photons_end.pos[0] = (1,2,3) + photons_end.dir[0] = (4,5,6) + photons_end.pol[0] = (7,8,9) + photons_end.wavelengths[0] = 400.0 + photons_end.t[0] = 100.0 + photons_end.last_hit_triangles[0] = 5 + photons_end.flags[0] = 20 + ev.photons_end = photons_end - ev.nphoton = 1 - - 
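# Illustration only (this aside refers to the new src/transform.cu above, not
# to the tests/test_fileio.py hunk it sits next to): a rough sketch of how the
# stand-alone transform kernels might be driven with PyCUDA.  Compiling the
# file directly with SourceModule and an include path for linalg.h/rotate.h is
# an assumption made for this example; chroma's own gpu module presumably
# handles compilation itself.
import os
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray as ga
from pycuda.compiler import SourceModule

module = SourceModule(open('src/transform.cu').read(),
                      include_dirs=[os.path.abspath('src')],
                      no_extern_c=True)
translate = module.get_function('translate')

n = 1024
pts = np.zeros(n, dtype=ga.vec.float3)
pts['x'], pts['y'], pts['z'] = 1.0, 2.0, 3.0   # start all points at (1,2,3)
points = ga.to_gpu(pts)

# shift every point by (0, 0, 100); matches the kernel signature
# translate(int nthreads, float3 *a, float3 v)
translate(np.int32(n), points, ga.vec.make_float3(0.0, 0.0, 100.0),
          block=(64, 1, 1), grid=(n//64 + 1, 1))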
ev.subtracks.append(event.Subtrack('e-', (40,30,20), (-1, -2, -3), 400, 800)) + ev.vertices = [ev.primary_vertex] channels = event.Channels(hit=np.array([True, False]), t=np.array([20.0, 1e9], dtype=np.float32), q=np.array([2.0, 0.0], dtype=np.float32), - histories=np.array([8, 32], dtype=np.uint32)) + flags=np.array([8, 32], dtype=np.uint32)) ev.channels = channels filename = '/tmp/chroma-filewritertest.root' @@ -58,16 +56,18 @@ class TestFileIO(unittest.TestCase): # Enough screwing around, let's get the one event in the file newev = reader.current() - # Now check if everything is correct in the event - for attribute in ['event_id', 'particle_name','gen_total_energy']: + for attribute in ['id']: self.assertEqual(getattr(ev, attribute), getattr(newev, attribute), 'compare %s' % attribute) - for attribute in ['gen_position', 'gen_direction']: - self.assertTrue(np.allclose(getattr(ev, attribute), getattr(newev, attribute)), 'compare %s' % attribute) - for attribute in ['positions', 'directions', 'wavelengths', 'polarizations', 'times', - 'histories', 'last_hit_triangles']: - self.assertTrue(np.allclose(getattr(ev.photon_start, attribute), - getattr(newev.photon_start, attribute)), 'compare %s' % attribute) - self.assertTrue(np.allclose(getattr(ev.photon_stop, attribute), - getattr(newev.photon_stop, attribute)), 'compare %s' % attribute) + for attribute in ['pos', 'dir', 'pol', 'ke', 't0']: + self.assertTrue(np.allclose(getattr(ev.primary_vertex, attribute), getattr(newev.primary_vertex, attribute)), 'compare %s' % attribute) + + for i in range(len(ev.vertices)): + self.assertTrue(np.allclose(getattr(ev.vertices[i], attribute), getattr(newev.vertices[i], attribute)), 'compare %s' % attribute) + + for attribute in ['pos', 'dir', 'pol', 'wavelengths', 't', 'last_hit_triangles', 'flags']: + self.assertTrue(np.allclose(getattr(ev.photons_beg, attribute), + getattr(newev.photons_beg, attribute)), 'compare %s' % attribute) + self.assertTrue(np.allclose(getattr(ev.photons_end, attribute), + getattr(newev.photons_end, attribute)), 'compare %s' % attribute) diff --git a/tests/test_generator_photon.py b/tests/test_generator_photon.py index 9684126..13501fe 100644 --- a/tests/test_generator_photon.py +++ b/tests/test_generator_photon.py @@ -1,20 +1,21 @@ import unittest +import itertools -import chroma.generator.photon +from chroma import generator from chroma.generator.vertex import constant_particle_gun -from chroma.optics import water_wcsim +from chroma import optics class TestG4ParallelGenerator(unittest.TestCase): def test_center(self): '''Generate Cherenkov light at the center of the world volume''' - gen = chroma.generator.photon.G4ParallelGenerator(1, water_wcsim) - vertex = constant_particle_gun('e-', (0,0,0), (1,0,0), 100) - for event in gen.generate_events(10, vertex): - self.assertGreater(len(event.photon_start.positions), 0) + gen = generator.photon.G4ParallelGenerator(1, optics.water_wcsim) + vertex = itertools.islice(constant_particle_gun('e-', (0,0,0), (1,0,0), 100), 10) + for event in gen.generate_events(vertex): + self.assertGreater(len(event.photons_beg.pos), 0) def test_off_center(self): '''Generate Cherenkov light at (1 m, 0 m, 0 m)''' - gen = chroma.generator.photon.G4ParallelGenerator(1, water_wcsim) - vertex = constant_particle_gun('e-', (1,0,0), (1,0,0), 100) - for event in gen.generate_events(10, vertex): - self.assertGreater(len(event.photon_start.positions), 0) + gen = generator.photon.G4ParallelGenerator(1, optics.water_wcsim) + vertex = 
itertools.islice(constant_particle_gun('e-', (1,0,0), (1,0,0), 100), 10) + for event in gen.generate_events(vertex): + self.assertGreater(len(event.photons_beg.pos), 0) diff --git a/tests/test_generator_vertex.py b/tests/test_generator_vertex.py index d773579..cec363d 100644 --- a/tests/test_generator_vertex.py +++ b/tests/test_generator_vertex.py @@ -8,16 +8,16 @@ class TestParticleGun(unittest.TestCase): '''Generate electron vertices at the center of the world volume.''' vertex = chroma.generator.vertex.constant_particle_gun('e-', (0,0,0), (1,0,0), 100) for ev in itertools.islice(vertex, 100): - self.assertEquals(ev.particle_name, 'e-') - self.assertTrue(np.allclose(ev.gen_position, [0,0,0])) - self.assertTrue(np.allclose(ev.gen_direction, [1,0,0])) - self.assertTrue(np.allclose(ev.gen_total_energy, 100)) + self.assertEquals(ev.primary_vertex.particle_name, 'e-') + self.assertTrue(np.allclose(ev.primary_vertex.pos, [0,0,0])) + self.assertTrue(np.allclose(ev.primary_vertex.dir, [1,0,0])) + self.assertTrue(np.allclose(ev.primary_vertex.ke, 100)) def test_off_center(self): '''Generate electron vertices at (1,0,0) in the world volume.''' vertex = chroma.generator.vertex.constant_particle_gun('e-', (1,0,0), (1,0,0), 100) for ev in itertools.islice(vertex, 100): - self.assertEquals(ev.particle_name, 'e-') - self.assertTrue(np.allclose(ev.gen_position, [1,0,0])) - self.assertTrue(np.allclose(ev.gen_direction, [1,0,0])) - self.assertTrue(np.allclose(ev.gen_total_energy, 100)) + self.assertEquals(ev.primary_vertex.particle_name, 'e-') + self.assertTrue(np.allclose(ev.primary_vertex.pos, [1,0,0])) + self.assertTrue(np.allclose(ev.primary_vertex.dir, [1,0,0])) + self.assertTrue(np.allclose(ev.primary_vertex.ke, 100)) diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 9a08b53..791f119 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -1,11 +1,12 @@ import unittest import numpy as np +import itertools import chroma.detectors from chroma.generator.photon import G4ParallelGenerator from chroma.generator.vertex import constant_particle_gun from chroma.optics import water_wcsim -from chroma.gpu import GPU +from chroma import gpu from chroma.sim import Simulation class TestPDF(unittest.TestCase): @@ -18,21 +19,26 @@ class TestPDF(unittest.TestCase): '''Create a hit count and (q,t) PDF for 10 MeV events in MicroLBNE''' g4generator = G4ParallelGenerator(1, water_wcsim) - gpu = GPU(0) - gpu.load_geometry(self.detector) - gpu.setup_propagate() - gpu.setup_daq(max(self.detector.pmtids)) - gpu.setup_pdf(max(self.detector.pmtids), 100, (-0.5, 999.5), 10, (-0.5, 9.5)) + gpu_instance = gpu.GPU(0) + gpu_geometry = gpu.GPUGeometry(gpu_instance, self.detector) + + nthreads_per_block, max_blocks = 64, 1024 + + rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks) + + gpu_daq = gpu.GPUDaq(gpu_geometry, max(self.detector.pmtids)) + gpu_pdf = gpu.GPUPDF() + gpu_pdf.setup_pdf(max(self.detector.pmtids), 100, (-0.5, 999.5), 10, (-0.5, 9.5)) - gpu.clear_pdf() + gpu_pdf.clear_pdf() - for ev in g4generator.generate_events(10, self.vertex_gen): - gpu.load_photons(ev.photon_start) - gpu.propagate() - gpu.run_daq() - gpu.add_hits_to_pdf() + for ev in g4generator.generate_events(itertools.islice(self.vertex_gen, 10)): + gpu_photons = gpu.GPUPhotons(ev.photons_beg) + gpu.propagate(gpu_instance, gpu_photons, rng_states, nthreads_per_block, max_blocks) + gpu_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks) + gpu_pdf.add_hits_to_pdf(gpu_channels) - hitcount, pdf = 
gpu.get_pdfs() + hitcount, pdf = gpu_pdf.get_pdfs() self.assertTrue( (hitcount > 0).any() ) self.assertTrue( (pdf > 0).any() ) @@ -41,9 +47,11 @@ class TestPDF(unittest.TestCase): self.assertEqual(nhits, pdf[i].sum()) def testSimPDF(self): - sim = Simulation(self.detector, water_wcsim) - hitcount, pdf = sim.create_pdf(100, self.vertex_gen, - 100, (-0.5, 999.5), 10, (-0.5, 9.5)) + sim = Simulation(self.detector) + + vertex_iter = itertools.islice(self.vertex_gen, 10) + + hitcount, pdf = sim.create_pdf(vertex_iter, 100, (-0.5, 999.5), 10, (-0.5, 9.5)) self.assertTrue( (hitcount > 0).any() ) self.assertTrue( (pdf > 0).any() ) diff --git a/tests/test_propagation.py b/tests/test_propagation.py index 331242b..bf886bd 100644 --- a/tests/test_propagation.py +++ b/tests/test_propagation.py @@ -11,43 +11,43 @@ class TestPropagation(unittest.TestCase): def testAbort(self): '''Photons that hit a triangle at normal incidence should not abort. - Photons that hit a triangle at exactly normal incidence can sometimes produce a dot product - that is outside the range allowed by acos(). Trigger these with axis aligned photons in a box + Photons that hit a triangle at exactly normal incidence can sometimes + produce a dot product that is outside the range allowed by acos(). + Trigger these with axis aligned photons in a box. ''' # Setup geometry - cube = Geometry() + cube = Geometry(vacuum) cube.add_solid(Solid(box(100,100,100), vacuum, vacuum)) cube.pmtids = [0] - sim = Simulation(cube, vacuum, bvh_bits=4, geant4_processes=0, - use_cache=False) + sim = Simulation(cube, bvh_bits=4, geant4_processes=0, use_cache=False) # Create initial photons nphotons = 10000 - positions = np.tile([0,0,0], (nphotons,1)).astype(np.float32) - directions = np.tile([0,0,1], (nphotons,1)).astype(np.float32) - polarizations = np.zeros_like(positions) + pos = np.tile([0,0,0], (nphotons,1)).astype(np.float32) + dir = np.tile([0,0,1], (nphotons,1)).astype(np.float32) + pol = np.zeros_like(pos) phi = np.random.uniform(0, 2*np.pi, nphotons).astype(np.float32) - polarizations[:,0] = np.cos(phi) - polarizations[:,1] = np.sin(phi) - times = np.zeros(nphotons, dtype=np.float32) + pol[:,0] = np.cos(phi) + pol[:,1] = np.sin(phi) + t = np.zeros(nphotons, dtype=np.float32) wavelengths = np.empty(nphotons, np.float32) wavelengths.fill(400.0) - photons = Photons(positions=positions, directions=directions, polarizations=polarizations, - times=times, wavelengths=wavelengths) + photons = Photons(pos=pos, dir=dir, pol=pol, t=t, wavelengths=wavelengths) # First make one step to check for strangeness - photon_stop = sim.propagate_photons(photons, max_steps=1) - self.assertFalse(np.isnan(photon_stop.positions).any()) - self.assertFalse(np.isnan(photon_stop.directions).any()) - self.assertFalse(np.isnan(photon_stop.polarizations).any()) - self.assertFalse(np.isnan(photon_stop.times).any()) - self.assertFalse(np.isnan(photon_stop.wavelengths).any()) + photons_end = sim.simulate([photons], keep_photons_end=True, max_steps=1).next().photons_end + + self.assertFalse(np.isnan(photons_end.pos).any()) + self.assertFalse(np.isnan(photons_end.dir).any()) + self.assertFalse(np.isnan(photons_end.pol).any()) + self.assertFalse(np.isnan(photons_end.t).any()) + self.assertFalse(np.isnan(photons_end.wavelengths).any()) # Now let it run the usual ten steps - photon_stop = sim.propagate_photons(photons, max_steps=10) - aborted = (photon_stop.histories & (1 << 31)) > 0 + photons_end = sim.simulate([photons], keep_photons_end=True, max_steps=10).next().photons_end + 
aborted = (photons_end.flags & (1 << 31)) > 0 print 'aborted photons: %1.1f' % (float(np.count_nonzero(aborted)) / nphotons) self.assertFalse(aborted.any()) diff --git a/tests/test_rayleigh.py b/tests/test_rayleigh.py index 4394ada..7e5fc92 100644 --- a/tests/test_rayleigh.py +++ b/tests/test_rayleigh.py @@ -13,43 +13,43 @@ ROOT.gROOT.SetBatch(1) class TestRayleigh(unittest.TestCase): def setUp(self): - self.cube = Geometry() + self.cube = Geometry(water_wcsim) self.cube.add_solid(Solid(box(100,100,100), water_wcsim, water_wcsim)) self.cube.pmtids = [0] - self.sim = Simulation(self.cube, water_wcsim, bvh_bits=4, geant4_processes=0, + self.sim = Simulation(self.cube, bvh_bits=4, geant4_processes=0, use_cache=False) nphotons = 100000 - positions = np.tile([0,0,0], (nphotons,1)).astype(np.float32) - directions = np.tile([0,0,1], (nphotons,1)).astype(np.float32) - polarizations = np.zeros_like(positions) + pos = np.tile([0,0,0], (nphotons,1)).astype(np.float32) + dir = np.tile([0,0,1], (nphotons,1)).astype(np.float32) + pol = np.zeros_like(pos) phi = np.random.uniform(0, 2*np.pi, nphotons).astype(np.float32) - polarizations[:,0] = np.cos(phi) - polarizations[:,1] = np.sin(phi) - times = np.zeros(nphotons, dtype=np.float32) + pol[:,0] = np.cos(phi) + pol[:,1] = np.sin(phi) + t = np.zeros(nphotons, dtype=np.float32) wavelengths = np.empty(nphotons, np.float32) wavelengths.fill(400.0) - self.photons = Photons(positions=positions, directions=directions, polarizations=polarizations, - times=times, wavelengths=wavelengths) + self.photons = Photons(pos=pos, dir=dir, pol=pol, t=t, wavelengths=wavelengths) def testAngularDistributionPolarized(self): # Fully polarized photons - self.photons.polarizations[:] = [1.0, 0.0, 0.0] + self.photons.pol[:] = [1.0, 0.0, 0.0] - photon_stop = self.sim.propagate_photons(self.photons, max_steps=1) - aborted = (photon_stop.histories & (1 << 31)) > 0 + photons_end = self.sim.simulate([self.photons], keep_photons_end=True, max_steps=1).next().photons_end + aborted = (photons_end.flags & (1 << 31)) > 0 self.assertFalse(aborted.any()) - # Compute the dot product between initial and final directions - rayleigh_scatters = (photon_stop.histories & (1 << 4)) > 0 - cos_scatter = (self.photons.directions[rayleigh_scatters] * photon_stop.directions[rayleigh_scatters]).sum(axis=1) + # Compute the dot product between initial and final dir + rayleigh_scatters = (photons_end.flags & (1 << 4)) > 0 + cos_scatter = (self.photons.dir[rayleigh_scatters] * photons_end.dir[rayleigh_scatters]).sum(axis=1) theta_scatter = np.arccos(cos_scatter) h = histogram.Histogram(bins=100, range=(0, np.pi)) h.fill(theta_scatter) h = rootify(h) - # The functional form for polarized light should be (1 + \cos^2 \theta)\sin \theta - # according to GEANT4 physics reference manual + # The functional form for polarized light should be + # (1 + \cos^2 \theta)\sin \theta according to GEANT4 physics + # reference manual. 
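# Illustration only (not part of this patch): the expected shape quoted in the
# comment above, p(theta) = (1 + cos^2 theta) sin(theta) on [0, pi], has the
# normalized CDF  F(theta) = (3*(1 - cos theta) + (1 - cos^3 theta)) / 8,
# so an equivalent cross-check could be run without ROOT as a
# Kolmogorov-Smirnov test (scipy is an assumption here, not a chroma
# dependency).  In the test above one would pass theta_scatter in place of
# the rejection-sampled toy data below.
import numpy as np
from scipy import stats

def rayleigh_pol_cdf(theta):
    c = np.cos(theta)
    return (3.0*(1.0 - c) + (1.0 - c**3)) / 8.0

# toy sample drawn from p(theta) by rejection (envelope 2 > max p ~ 1.09)
theta = np.random.uniform(0.0, np.pi, 200000)
keep = np.random.uniform(0.0, 2.0, theta.size) < (1.0 + np.cos(theta)**2)*np.sin(theta)
samples = theta[keep]

D, p = stats.kstest(samples, rayleigh_pol_cdf)
print 'KS distance %f, p-value %f' % (D, p)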
f = ROOT.TF1("pol_func", "[0]*(1+cos(x)**2)*sin(x)", 0, np.pi) h.Fit(f) self.assertGreater(f.GetProb(), 1e-3) diff --git a/threadtest.py b/threadtest.py deleted file mode 100644 index f03f920..0000000 --- a/threadtest.py +++ /dev/null @@ -1,94 +0,0 @@ -from gputhread import * -from Queue import Queue -from sample import uniform_sphere -import numpy as np -from pycuda import gpuarray -import pycuda.driver as cuda -from uncertainties import ufloat, umath -from histogram import Histogram -import math - -jobs = Queue() -output = Queue() - -def generate_event(detector, position=(0,0,0), nphotons=5000): - jobs.put((gpuarray.vec.make_float3(*position), nphotons)) - jobs.join() - - earliest_times = output.get() - - pmt_times = earliest_times[detector.pmtids] - event_times = [ (i, t) for i, t in zip(detector.pmtids, pmt_times) if t < 1e8 ] - print '%i hit pmts' % len(event_times) - return event_times - -def likelihood(detector, event_times, position=(0,0,0), nphotons=5000, neval=100): - for i in range(neval): - jobs.put((gpuarray.vec.make_float3(*position), nphotons)) - jobs.join() - - t = np.zeros(shape=(neval, max(detector.pmtids)+1), dtype=np.float32) - for i in range(neval): - t[i] = output.get() - - log_likelihood = 0.0 - log_likelihood_variance = 0.0 - for i, time in event_times: - h = Histogram(500, (-0.5e-9, 99.5e-9)) - h.fill(t[:,i]) - - if h.nentries > 0: - h.normalize() - - probability = h.ueval(time) - - if probability.nominal_value == 0.0: - if h.nentries > 0: - probability = ufloat((0.5/h.nentries, 0.5/h.nentries)) - else: - probability = \ - ufloat((1.0/len(h.bincenters), 1.0/len(h.bincenters))) - - log_likelihood += math.log(probability.nominal_value) - log_likelihood_variance += (probability.std_dev()/probability.nominal_value)**2 - - return ufloat((-log_likelihood, log_likelihood_variance**0.5)) - -if __name__ == '__main__': - import sys - import optparse - import time - - parser = optparse.OptionParser('%prog') - parser.add_option('-b', type='int', dest='nbits', default=8) - parser.add_option('-j', type='string', dest='devices', default='0') - parser.add_option('-n', type='int', dest='nblocks', default=64) - options, args = parser.parse_args() - - from detectors import build_minilbne - - detector = build_minilbne() - - detector.build(bits=options.nbits) - - cuda.init() - - gputhreads = [] - for i in [int(s) for s in options.devices.split(',')]: - gputhreads.append(GPUThread(i, detector, jobs, output, options.nblocks)) - gputhreads[-1].start() - - try: - event_times = generate_event(detector) - - for z in np.linspace(-1.0, 1.0, 100): - t0 = time.time() - log_likelihood = likelihood(detector, event_times, (z,0,0)) - elapsed = time.time() - t0 - print 'z = %5.2f, likelihood = %s, elapsed %.2f sec' % (z, log_likelihood, elapsed) - finally: - for gputhread in gputhreads: - gputhread.stop() - - for gputhread in gputhreads: - gputhread.join() @@ -2,6 +2,24 @@ import numpy as np import time import datetime import sys +import math + +def ufloat_to_str(x): + msd = -int(math.floor(math.log10(x.std_dev()))) + return '%.*f +/- %.*f' % (msd, x.nominal_value, msd, x.std_dev()) + +def progress(seq): + "Print progress while iterating over `seq`." + n = len(seq) + print '[' + ' '*21 + ']\r[', + sys.stdout.flush() + for i, item in enumerate(seq): + if i % (n//10) == 0: + print '.', + sys.stdout.flush() + yield item + print ']' + sys.stdout.flush() def debugger_hook(type, value, tb): if hasattr(sys, 'ps1') or not sys.stderr.isatty(): |
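# Illustration only (not part of this patch): example usage of the two helpers
# added to tools.py above.  Note that progress() needs a sequence that
# supports len() and has at least 10 items, since its print step is n//10,
# and ufloat_to_str() expects the tuple-style ufloat constructor used
# elsewhere in this changeset.
from uncertainties import ufloat
from chroma import tools

total = 0
for i in tools.progress(range(1000)):   # prints a 10-dot progress bar
    total += i

print tools.ufloat_to_str(ufloat((1.234, 0.056)))   # prints '1.23 +/- 0.06'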