From 142b3c3caff164deb9bc7b2848e58e52387723ff Mon Sep 17 00:00:00 2001
From: Stan Seibert <stan@mtrr.org>
Date: Fri, 16 Sep 2011 15:02:02 -0400
Subject: Move CUDA source inside chroma package, rename tests directory to
 test

---
 chroma/cuda/__init__.py          |   0
 chroma/cuda/daq.cu               | 198 ++++++++++++++++
 chroma/cuda/geometry.h           |  74 ++++++
 chroma/cuda/hybrid_render.cu     | 202 ++++++++++++++++
 chroma/cuda/intersect.h          | 141 ++++++++++++
 chroma/cuda/linalg.h             | 170 ++++++++++++++
 chroma/cuda/matrix.h             | 249 ++++++++++++++++++++
 chroma/cuda/mesh.h               | 175 ++++++++++++++
 chroma/cuda/photon.h             | 325 ++++++++++++++++++++++++++
 chroma/cuda/physical_constants.h |   7 +
 chroma/cuda/propagate.cu         | 147 ++++++++++++
 chroma/cuda/random.h             |  87 +++++++
 chroma/cuda/render.cu            | 171 ++++++++++++++
 chroma/cuda/rotate.h             |  27 +++
 chroma/cuda/sorting.h            | 107 +++++++++
 chroma/cuda/tools.cu             |  21 ++
 chroma/cuda/transform.cu         |  51 ++++
 distribute_setup.py              | 485 +++++++++++++++++++++++++++++++++++++++
 src/__init__.py                  |   0
 src/daq.cu                       | 198 ----------------
 src/geometry.h                   |  74 ------
 src/hybrid_render.cu             | 202 ----------------
 src/intersect.h                  | 141 ------------
 src/linalg.h                     | 170 --------------
 src/matrix.h                     | 249 --------------------
 src/mesh.h                       | 175 --------------
 src/photon.h                     | 325 --------------------------
 src/physical_constants.h         |   7 -
 src/propagate.cu                 | 147 ------------
 src/random.h                     |  87 -------
 src/render.cu                    | 171 --------------
 src/rotate.h                     |  27 ---
 src/sorting.h                    | 107 ---------
 src/tools.cu                     |  21 --
 src/transform.cu                 |  51 ----
 test/__init__.py                 |   0
 test/data/ray_intersection.npz   | Bin 0 -> 1920080 bytes
 test/linalg_test.cu              | 128 +++++++++++
 test/linalg_test.py              | 214 +++++++++++++++++
 test/matrix_test.cu              | 152 ++++++++++++
 test/matrix_test.py              | 327 ++++++++++++++++++++++++++
 test/rotate_test.cu              |  14 ++
 test/rotate_test.py              |  67 ++++++
 test/test_fileio.py              |  74 ++++++
 test/test_generator_photon.py    |  21 ++
 test/test_generator_vertex.py    |  23 ++
 test/test_pdf.py                 |  67 ++++++
 test/test_propagation.py         |  58 +++++
 test/test_ray_intersection.py    |  27 +++
 test/test_rayleigh.py            |  57 +++++
 test/test_sample_cdf.cu          |  16 ++
 test/test_sample_cdf.py          |  64 ++++++
 tests/__init__.py                |   0
 tests/data/ray_intersection.npz  | Bin 1920080 -> 0 bytes
 tests/linalg_test.cu             | 128 -----------
 tests/linalg_test.py             | 214 -----------------
 tests/matrix_test.cu             | 152 ------------
 tests/matrix_test.py             | 327 --------------------------
 tests/rotate_test.cu             |  14 --
 tests/rotate_test.py             |  67 ------
 tests/test_fileio.py             |  74 ------
 tests/test_generator_photon.py   |  21 --
 tests/test_generator_vertex.py   |  23 --
 tests/test_pdf.py                |  67 ------
 tests/test_propagation.py        |  58 -----
 tests/test_ray_intersection.py   |  27 ---
 tests/test_rayleigh.py           |  57 -----
 tests/test_sample_cdf.cu         |  16 --
 tests/test_sample_cdf.py         |  64 ------
 69 files changed, 3946 insertions(+), 3461 deletions(-)
 create mode 100644 chroma/cuda/__init__.py
 create mode 100644 chroma/cuda/daq.cu
 create mode 100644 chroma/cuda/geometry.h
 create mode 100644 chroma/cuda/hybrid_render.cu
 create mode 100644 chroma/cuda/intersect.h
 create mode 100644 chroma/cuda/linalg.h
 create mode 100644 chroma/cuda/matrix.h
 create mode 100644 chroma/cuda/mesh.h
 create mode 100644 chroma/cuda/photon.h
 create mode 100644 chroma/cuda/physical_constants.h
 create mode 100644 chroma/cuda/propagate.cu
 create mode 100644 chroma/cuda/random.h
 create mode 100644 chroma/cuda/render.cu
 create mode 100644 chroma/cuda/rotate.h
 create mode 100644 chroma/cuda/sorting.h
 create mode 100644 chroma/cuda/tools.cu
 create mode 100644 chroma/cuda/transform.cu
 create mode 100644 distribute_setup.py
 delete mode 100644 src/__init__.py
 delete mode 100644 src/daq.cu
 delete mode 100644 src/geometry.h
 delete mode 100644 src/hybrid_render.cu
 delete mode 100644 src/intersect.h
 delete mode 100644 src/linalg.h
 delete mode 100644 src/matrix.h
 delete mode 100644 src/mesh.h
 delete mode 100644 src/photon.h
 delete mode 100644 src/physical_constants.h
 delete mode 100644 src/propagate.cu
 delete mode 100644 src/random.h
 delete mode 100644 src/render.cu
 delete mode 100644 src/rotate.h
 delete mode 100644 src/sorting.h
 delete mode 100644 src/tools.cu
 delete mode 100644 src/transform.cu
 create mode 100644 test/__init__.py
 create mode 100644 test/data/ray_intersection.npz
 create mode 100644 test/linalg_test.cu
 create mode 100644 test/linalg_test.py
 create mode 100644 test/matrix_test.cu
 create mode 100644 test/matrix_test.py
 create mode 100644 test/rotate_test.cu
 create mode 100644 test/rotate_test.py
 create mode 100644 test/test_fileio.py
 create mode 100644 test/test_generator_photon.py
 create mode 100644 test/test_generator_vertex.py
 create mode 100644 test/test_pdf.py
 create mode 100644 test/test_propagation.py
 create mode 100644 test/test_ray_intersection.py
 create mode 100644 test/test_rayleigh.py
 create mode 100644 test/test_sample_cdf.cu
 create mode 100644 test/test_sample_cdf.py
 delete mode 100644 tests/__init__.py
 delete mode 100644 tests/data/ray_intersection.npz
 delete mode 100644 tests/linalg_test.cu
 delete mode 100644 tests/linalg_test.py
 delete mode 100644 tests/matrix_test.cu
 delete mode 100644 tests/matrix_test.py
 delete mode 100644 tests/rotate_test.cu
 delete mode 100644 tests/rotate_test.py
 delete mode 100644 tests/test_fileio.py
 delete mode 100644 tests/test_generator_photon.py
 delete mode 100644 tests/test_generator_vertex.py
 delete mode 100644 tests/test_pdf.py
 delete mode 100644 tests/test_propagation.py
 delete mode 100644 tests/test_ray_intersection.py
 delete mode 100644 tests/test_rayleigh.py
 delete mode 100644 tests/test_sample_cdf.cu
 delete mode 100644 tests/test_sample_cdf.py

diff --git a/chroma/cuda/__init__.py b/chroma/cuda/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/chroma/cuda/daq.cu b/chroma/cuda/daq.cu
new file mode 100644
index 0000000..5a68846
--- /dev/null
+++ b/chroma/cuda/daq.cu
@@ -0,0 +1,198 @@
+// -*-c++-*-
+#include <curand_kernel.h>
+
+__device__ unsigned int
+float_to_sortable_int(float f)
+{
+    return __float_as_int(f);
+    //int i = __float_as_int(f);
+    //unsigned int mask = -(int)(i >> 31) | 0x80000000;
+    //return i ^ mask;
+}
+
+__device__ float
+sortable_int_to_float(unsigned int i)
+{
+    return __int_as_float(i);
+    //unsigned int mask = ((i >> 31) - 1) | 0x80000000;
+    //return __int_as_float(i ^ mask);
+}
+
+extern "C"
+{
+
+__global__ void
+reset_earliest_time_int(float maxtime, int ntime_ints, unsigned int *time_ints)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    if (id < ntime_ints) {
+	unsigned int maxtime_int = float_to_sortable_int(maxtime);
+	time_ints[id] = maxtime_int;
+    }
+}
+
+__global__ void
+run_daq(curandState *s, unsigned int detection_state, float time_rms,
+	int first_photon, int nphotons, float *photon_times,
+	unsigned int *photon_histories, int *last_hit_triangles,
+	int *solid_map, int nsolids, unsigned int *earliest_time_int,
+	unsigned int *channel_q, unsigned int *channel_histories)
+{
+
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+
+    if (id < nphotons) {
+	curandState rng = s[id];
+	int photon_id = id + first_photon;
+	int triangle_id = last_hit_triangles[photon_id];
+		
+	if (triangle_id > -1) {
+	    int solid_id = solid_map[triangle_id];
+	    unsigned int history = photon_histories[photon_id];
+
+	    if (solid_id < nsolids && (history & detection_state)) {
+		float time = photon_times[photon_id] + curand_normal(&rng) * time_rms;
+		unsigned int time_int = float_to_sortable_int(time);
+			  
+		atomicMin(earliest_time_int + solid_id, time_int);
+		atomicAdd(channel_q + solid_id, 1);
+		atomicOr(channel_histories + solid_id, history);
+	    }
+
+	}
+
+	s[id] = rng;
+	
+    }
+    
+}
+
+__global__ void
+convert_sortable_int_to_float(int n, unsigned int *sortable_ints,
+			      float *float_output)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+	
+    if (id < n)
+	float_output[id] = sortable_int_to_float(sortable_ints[id]);
+}
+
+__global__ void
+bin_hits(int nchannels, unsigned int *channel_q, float *channel_time,
+	 unsigned int *hitcount, int tbins, float tmin, float tmax, int qbins,
+	 float qmin, float qmax, unsigned int *pdf)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+
+    if (id >= nchannels)
+	return;
+
+    unsigned int q = channel_q[id];
+    float t = channel_time[id];
+	
+    if (t < 1e8 && t >= tmin && t < tmax && q >= qmin && q < qmax) {
+	hitcount[id] += 1;
+		
+	int tbin = (t - tmin) / (tmax - tmin) * tbins;
+	int qbin = (q - qmin) / (qmax - qmin) * qbins;
+	    
+	// row major order (channel, t, q)
+	int bin = id * (tbins * qbins) + tbin * qbins + qbin;
+	pdf[bin] += 1;
+    }
+}
+	
+__global__ void
+accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
+		    float *event_time, float *event_charge, float *mc_time,
+		    unsigned int *mc_charge, // quirk of DAQ!
+		    unsigned int *hitcount, unsigned int *bincount,
+		    float min_twidth, float tmin, float tmax,
+		    float min_qwidth, float qmin, float qmax,
+		    float *nearest_mc, int min_bin_content)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+	
+    if (id >= nchannels)
+	return;
+	
+    // Was this channel hit in the Monte Carlo?
+    float channel_mc_time = mc_time[id];
+    if (channel_mc_time >= 1e8)
+	return; // Nothing else to do
+	
+    // Is this channel inside the range of the PDF?
+    float distance;
+    int channel_bincount = bincount[id];
+    if (time_only) {
+	if (channel_mc_time < tmin || channel_mc_time > tmax)
+	    return;  // Nothing else to do
+		
+	// This MC information is contained in the PDF
+	hitcount[id] += 1;
+	  
+	// Was this channel hit in the event-of-interest?
+	int channel_event_hit = event_hit[id];
+	if (!channel_event_hit)
+	    return; // No need to update PDF value for unhit channel
+		
+	// Are we inside the minimum size bin?
+	float channel_event_time = event_time[id];
+	distance = fabsf(channel_mc_time - channel_event_time);
+	if (distance < min_twidth/2.0)
+	    bincount[id] = channel_bincount + 1;
+
+    }
+    else { // time and charge PDF
+	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+	
+	if (channel_mc_time < tmin || channel_mc_time > tmax ||
+	    channel_mc_charge < qmin || channel_mc_charge > qmax)
+	    return;  // Nothing else to do
+		
+	// This MC information is contained in the PDF
+	hitcount[id] += 1;
+	  
+	// Was this channel hit in the event-of-interest?
+	int channel_event_hit = event_hit[id];
+	if (!channel_event_hit)
+	    return; // No need to update PDF value for unhit channel
+		
+	// Are we inside the minimum size bin?
+	float channel_event_time = event_time[id];
+	float channel_event_charge = event_charge[id];
+	float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
+	float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
+	distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
+
+	if (distance < 1.0f)
+	    bincount[id] = channel_bincount + 1;
+    }
+
+    // Do we need to keep updating the nearest_mc list?
+    if (channel_bincount + 1 >= min_bin_content)
+	return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
+
+    // insertion sort the distance into the array of nearest MC points
+    int offset = min_bin_content * id;
+    int entry = min_bin_content - 1;
+	
+    // If last entry less than new entry, nothing to update
+    if (distance > nearest_mc[offset + entry])
+	return;
+
+    // Find where to insert the new entry while sliding the rest
+    // to the right
+    entry--;
+    while (entry >= 0) {
+	if (nearest_mc[offset+entry] >= distance)
+	    nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
+	else
+	    break;
+	entry--;
+    }
+
+    nearest_mc[offset+entry+1] = distance;
+}
+
+} // extern "C"
diff --git a/chroma/cuda/geometry.h b/chroma/cuda/geometry.h
new file mode 100644
index 0000000..2b5eacb
--- /dev/null
+++ b/chroma/cuda/geometry.h
@@ -0,0 +1,74 @@
+#ifndef __GEOMETRY_H__
+#define __GEOMETRY_H__
+
+struct Material
+{
+    float *refractive_index;
+    float *absorption_length;
+    float *scattering_length;
+    unsigned int n;
+    float step;
+    float wavelength0;
+};
+
+struct Surface
+{
+    float *detect;
+    float *absorb;
+    float *reflect_diffuse;
+    float *reflect_specular;
+    unsigned int n;
+    float step;
+    float wavelength0;
+};
+
+struct Triangle
+{
+    float3 v0, v1, v2;
+};
+
+struct Geometry
+{
+    float3 *vertices;
+    uint3 *triangles;
+    unsigned int *material_codes;
+    unsigned int *colors;
+    float3 *lower_bounds;
+    float3 *upper_bounds;
+    unsigned int *node_map;
+    unsigned int *node_map_end;
+    Material **materials;
+    Surface **surfaces;
+    unsigned int start_node;
+    unsigned int first_node;
+};
+
+__device__ Triangle
+get_triangle(Geometry *geometry, const unsigned int &i)
+{
+    uint3 triangle_data = geometry->triangles[i];
+
+    Triangle triangle;
+    triangle.v0 = geometry->vertices[triangle_data.x];
+    triangle.v1 = geometry->vertices[triangle_data.y];
+    triangle.v2 = geometry->vertices[triangle_data.z];
+
+    return triangle;
+}
+
+template <class T>
+__device__ float
+interp_property(T *m, const float &x, const float *fp)
+{
+    if (x < m->wavelength0)
+	return fp[0];
+
+    if (x > (m->wavelength0 + (m->n-1)*m->step))
+	return fp[m->n-1];
+
+    int jl = (x-m->wavelength0)/m->step;
+
+    return fp[jl] + (x-(m->wavelength0 + jl*m->step))*(fp[jl+1]-fp[jl])/m->step;
+}
+
+#endif
diff --git a/chroma/cuda/hybrid_render.cu b/chroma/cuda/hybrid_render.cu
new file mode 100644
index 0000000..29edefa
--- /dev/null
+++ b/chroma/cuda/hybrid_render.cu
@@ -0,0 +1,202 @@
+//-*-c-*-
+#include <math_constants.h>
+#include <curand_kernel.h>
+
+#include "linalg.h"
+#include "matrix.h"
+#include "rotate.h"
+#include "mesh.h"
+#include "geometry.h"
+#include "photon.h"
+
+__device__ void
+fAtomicAdd(float *addr, float data)
+{
+    while (data)
+	data = atomicExch(addr, data+atomicExch(addr, 0.0f));
+}
+
+__device__ void
+to_diffuse(Photon &p, State &s, Geometry *g, curandState &rng, int max_steps)
+{
+    int steps = 0;
+    while (steps < max_steps) {
+	steps++;
+
+	int command;
+
+	fill_state(s, p, g);
+
+	if (p.last_hit_triangle == -1)
+	    break;
+
+	command = propagate_to_boundary(p, s, rng);
+
+	if (command == BREAK)
+	    break;
+
+	if (command == CONTINUE)
+	    continue;
+
+	if (s.surface_index != -1) {
+	    command = propagate_at_surface(p, s, rng, g);
+
+	    if (p.history & REFLECT_DIFFUSE)
+		break;
+
+	    if (command == BREAK)
+		break;
+
+	    if (command == CONTINUE)
+		continue;
+	}
+
+	propagate_at_boundary(p, s, rng);
+
+    } // while (steps < max_steps)
+
+} // to_diffuse
+
+extern "C"
+{
+
+__global__ void
+update_xyz_lookup(int nthreads, int total_threads, int offset, float3 position,
+		  curandState *rng_states, float wavelength, float3 xyz,
+		  float3 *xyz_lookup1, float3 *xyz_lookup2, int max_steps,
+		  Geometry *g)
+{
+    int kernel_id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = kernel_id + offset;
+
+    if (kernel_id >= nthreads || id >= total_threads)
+	return;
+
+    curandState rng = rng_states[kernel_id];
+
+    Triangle t = get_triangle(g, id);
+
+    float a = curand_uniform(&rng);
+    float b = uniform(&rng, 0.0f, (1.0f - a));
+    float c = 1.0f - a - b;
+
+    float3 direction = a*t.v0 + b*t.v1 + c*t.v2 - position;
+    direction /= norm(direction);
+
+    float distance;
+    int triangle_index = intersect_mesh(position, direction, g, distance);
+
+    if (triangle_index != id) {
+	rng_states[kernel_id] = rng;
+	return;
+    }
+
+    float3 v01 = t.v1 - t.v0;
+    float3 v12 = t.v2 - t.v1;
+    
+    float3 surface_normal = normalize(cross(v01,v12));
+
+    float cos_theta = dot(surface_normal,-direction);
+
+    if (cos_theta < 0.0f)
+	cos_theta = dot(-surface_normal,-direction);
+
+    Photon p;
+    p.position = position;
+    p.direction = direction;
+    p.wavelength = wavelength;
+    p.polarization = uniform_sphere(&rng);
+    p.last_hit_triangle = -1;
+    p.time = 0;
+    p.history = 0;
+
+    State s;
+    to_diffuse(p, s, g, rng, max_steps);
+
+    if (p.history & REFLECT_DIFFUSE) {
+	if (s.inside_to_outside) {
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].x, cos_theta*xyz.x);
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].y, cos_theta*xyz.y);
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].z, cos_theta*xyz.z);
+	}
+	else {
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].x, cos_theta*xyz.x);
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].y, cos_theta*xyz.y);
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].z, cos_theta*xyz.z);
+	}
+    }
+
+    rng_states[kernel_id] = rng;
+
+} // update_xyz_lookup
+
+__global__ void
+update_xyz_image(int nthreads, curandState *rng_states, float3 *positions,
+		 float3 *directions, float wavelength, float3 xyz,
+		 float3 *xyz_lookup1, float3 *xyz_lookup2, float3 *image,
+		 int nlookup_calls, int max_steps, Geometry *g)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    curandState rng = rng_states[id];
+
+    Photon p;
+    p.position = positions[id];
+    p.direction = directions[id];
+    p.direction /= norm(p.direction);
+    p.wavelength = wavelength;
+    p.polarization = uniform_sphere(&rng);
+    p.last_hit_triangle = -1;
+    p.time = 0;
+    p.history = 0;
+
+    State s;
+    to_diffuse(p, s, g, rng, max_steps);
+
+    if (p.history & REFLECT_DIFFUSE) {
+	if (s.inside_to_outside)
+	    image[id] += xyz*xyz_lookup1[p.last_hit_triangle]/nlookup_calls;
+	else
+	    image[id] += xyz*xyz_lookup2[p.last_hit_triangle]/nlookup_calls;
+    }
+
+    rng_states[id] = rng;
+
+} // update_xyz_image
+
+__global__ void
+process_image(int nthreads, float3 *image, unsigned int *pixels, int nimages)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    float3 rgb = image[id]/nimages;
+
+    if (rgb.x < 0.0f)
+	rgb.x = 0.0f;
+    if (rgb.y < 0.0f)
+	rgb.y = 0.0f;
+    if (rgb.z < 0.0f)
+	rgb.z = 0.0f;
+
+    if (rgb.x > 1.0f)
+	rgb.x = 1.0f;
+    if (rgb.y > 1.0f)
+	rgb.y = 1.0f;
+    if (rgb.z > 1.0f)
+	rgb.z = 1.0f;
+
+    unsigned int r = floorf(rgb.x*255.0f);
+    unsigned int g = floorf(rgb.y*255.0f);
+    unsigned int b = floorf(rgb.z*255.0f);
+
+    pixels[id] = 255 << 24 | r << 16 | g << 8 | b;
+
+} // process_image
+
+} // extern "c"
diff --git a/chroma/cuda/intersect.h b/chroma/cuda/intersect.h
new file mode 100644
index 0000000..978bde8
--- /dev/null
+++ b/chroma/cuda/intersect.h
@@ -0,0 +1,141 @@
+//-*-c-*-
+
+#ifndef __INTERSECT_H__
+#define __INTERSECT_H__
+
+#include "linalg.h"
+#include "matrix.h"
+#include "geometry.h"
+
+#define EPSILON 0.0f
+
+/* Tests the intersection between a ray and a triangle.
+   If the ray intersects the triangle, set `distance` to the distance from
+   `origin` to the intersection and return true, else return false.
+   `direction` must be normalized to one. */
+__device__ bool
+intersect_triangle(const float3 &origin, const float3 &direction,
+		   const Triangle &triangle, float &distance)
+{
+	float3 m1 = triangle.v1-triangle.v0;
+	float3 m2 = triangle.v2-triangle.v0;
+	float3 m3 = -direction;
+
+	Matrix m = make_matrix(m1, m2, m3);
+	
+	float determinant = det(m);
+
+	if (determinant == 0.0f)
+		return false;
+
+	float3 b = origin-triangle.v0;
+
+	float u1 = ((m.a11*m.a22 - m.a12*m.a21)*b.x +
+		    (m.a02*m.a21 - m.a01*m.a22)*b.y +
+		    (m.a01*m.a12 - m.a02*m.a11)*b.z)/determinant;
+
+	if (u1 < -EPSILON || u1 > 1.0f)
+		return false;
+
+	float u2 = ((m.a12*m.a20 - m.a10*m.a22)*b.x +
+		    (m.a00*m.a22 - m.a02*m.a20)*b.y +
+		    (m.a02*m.a10 - m.a00*m.a12)*b.z)/determinant;
+
+	if (u2 < -EPSILON || u2 > 1.0f)
+		return false;
+
+	float u3 = ((m.a10*m.a21 - m.a11*m.a20)*b.x +
+		    (m.a01*m.a20 - m.a00*m.a21)*b.y +
+		    (m.a00*m.a11 - m.a01*m.a10)*b.z)/determinant;
+
+	if (u3 <= 0.0f || (1.0f-u1-u2) < -EPSILON)
+		return false;
+
+	distance = u3;
+
+	return true;
+}
+
+/* Tests the intersection between a ray and an axis-aligned box defined by
+   an upper and lower bound. If the ray intersects the box, set
+   `distance_to_box` to the distance from `origin` to the intersection and
+   return true, else return false. `direction` must be normalized to one.
+
+   Source: "An Efficient and Robust Ray-Box Intersection Algorithm."
+   by Williams, et. al. */
+__device__ bool
+intersect_box(const float3 &origin, const float3 &direction,
+	      const float3 &lower_bound, const float3 &upper_bound,
+	      float& distance_to_box)
+{
+	float kmin, kmax, kymin, kymax, kzmin, kzmax;
+
+	float divx = 1.0f/direction.x;
+	if (divx >= 0.0f)
+	{
+		kmin = (lower_bound.x - origin.x)*divx;
+		kmax = (upper_bound.x - origin.x)*divx;
+	}
+	else
+	{
+		kmin = (upper_bound.x - origin.x)*divx;
+		kmax = (lower_bound.x - origin.x)*divx;
+	}
+
+	if (kmax < 0.0f)
+		return false;
+
+	float divy = 1.0f/direction.y;
+	if (divy >= 0.0f)
+	{
+		kymin = (lower_bound.y - origin.y)*divy;
+		kymax = (upper_bound.y - origin.y)*divy;
+	}
+	else
+	{
+		kymin = (upper_bound.y - origin.y)*divy;
+		kymax = (lower_bound.y - origin.y)*divy;
+	}
+
+	if (kymax < 0.0f)
+		return false;
+
+	if (kymin > kmin)
+		kmin = kymin;
+
+	if (kymax < kmax)
+		kmax = kymax;
+
+	if (kmin > kmax)
+		return false;
+
+	float divz = 1.0f/direction.z;
+	if (divz >= 0.0f)
+	{
+		kzmin = (lower_bound.z - origin.z)*divz;
+		kzmax = (upper_bound.z - origin.z)*divz;
+	}
+	else
+	{
+		kzmin = (upper_bound.z - origin.z)*divz;
+		kzmax = (lower_bound.z - origin.z)*divz;
+	}
+
+	if (kzmax < 0.0f)
+		return false;
+
+	if (kzmin > kmin)
+		kmin = kzmin;
+
+	if (kzmax < kmax)
+		kmax = kzmax;
+
+	if (kmin > kmax)
+		return false;
+
+	distance_to_box = kmin;
+
+	return true;
+}
+
+#endif
diff --git a/chroma/cuda/linalg.h b/chroma/cuda/linalg.h
new file mode 100644
index 0000000..35b2423
--- /dev/null
+++ b/chroma/cuda/linalg.h
@@ -0,0 +1,170 @@
+#ifndef __LINALG_H__
+#define __LINALG_H__
+
+__device__ float3
+operator- (const float3 &a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+
+__device__ float3
+operator* (const float3 &a, const float3 &b)
+{
+    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+
+__device__ float3
+operator/ (const float3 &a, const float3 &b)
+{
+    return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+}
+
+__device__ void
+operator*= (float3 &a, const float3 &b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+
+__device__ void
+operator/= (float3 &a, const float3 &b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
+}
+
+__device__ float3
+operator+ (const float3 &a, const float3 &b)
+{
+    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+
+__device__ void
+operator+= (float3 &a, const float3 &b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+
+__device__ float3
+operator- (const float3 &a, const float3 &b)
+{
+    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+
+__device__ void
+operator-= (float3 &a, const float3 &b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+
+__device__ float3
+operator+ (const float3 &a, const float &c)
+{
+    return make_float3(a.x+c, a.y+c, a.z+c);
+}
+
+__device__ void
+operator+= (float3 &a, const float &c)
+{
+    a.x += c;
+    a.y += c;
+    a.z += c;
+}
+
+__device__ float3
+operator+ (const float &c, const float3 &a)
+{
+    return make_float3(c+a.x, c+a.y, c+a.z);
+}
+
+__device__ float3
+operator- (const float3 &a, const float &c)
+{
+    return make_float3(a.x-c, a.y-c, a.z-c);
+}
+
+__device__ void
+operator-= (float3 &a, const float &c)
+{
+    a.x -= c;
+    a.y -= c;
+    a.z -= c;
+}
+
+__device__ float3
+operator- (const float &c, const float3& a)
+{
+    return make_float3(c-a.x, c-a.y, c-a.z);
+}
+
+__device__ float3
+operator* (const float3 &a, const float &c)
+{
+    return make_float3(a.x*c, a.y*c, a.z*c);
+}
+
+__device__ void
+operator*= (float3 &a, const float &c)
+{
+    a.x *= c;
+    a.y *= c;
+    a.z *= c;
+}
+
+__device__ float3 
+operator* (const float &c, const float3& a)
+{
+    return make_float3(c*a.x, c*a.y, c*a.z);
+}
+
+__device__ float3
+operator/ (const float3 &a, const float &c)
+{
+    return make_float3(a.x/c, a.y/c, a.z/c);
+}
+
+__device__ void
+operator/= (float3 &a, const float &c)
+{
+    a.x /= c;
+    a.y /= c;
+    a.z /= c;
+}
+
+__device__ float3
+operator/ (const float &c, const float3 &a)
+{
+    return make_float3(c/a.x, c/a.y, c/a.z);
+}
+
+__device__ float
+dot(const float3 &a, const float3 &b)
+{
+    return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+__device__ float3
+cross(const float3 &a, const float3 &b)
+{
+    return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+__device__ float
+norm(const float3 &a)
+{
+    return sqrtf(dot(a,a));
+}
+
+__device__ float3
+normalize(const float3 &a)
+{
+    return a/norm(a);
+}
+
+#endif
diff --git a/chroma/cuda/matrix.h b/chroma/cuda/matrix.h
new file mode 100644
index 0000000..0a66e58
--- /dev/null
+++ b/chroma/cuda/matrix.h
@@ -0,0 +1,249 @@
+#ifndef __MATRIX_H__
+#define __MATRIX_H__
+
+struct Matrix
+{
+    float a00, a01, a02, a10, a11, a12, a20, a21, a22;
+};
+
+__device__ Matrix
+make_matrix(float a00, float a01, float a02,
+	    float a10, float a11, float a12,
+	    float a20, float a21, float a22)
+{
+    Matrix m = {a00, a01, a02, a10, a11, a12, a20, a21, a22};
+    return m;
+}
+
+__device__ Matrix
+make_matrix(const float3 &u1, const float3 &u2, const float3 &u3)
+{
+    Matrix m = {u1.x, u2.x, u3.x, u1.y, u2.y, u3.y, u1.z, u2.z, u3.z};
+    return m;
+}
+
+__device__ Matrix
+operator- (const Matrix &m)
+{
+    return make_matrix(-m.a00, -m.a01, -m.a02,
+		       -m.a10, -m.a11, -m.a12,
+		       -m.a20, -m.a21, -m.a22);
+}
+
+__device__ float3
+operator* (const Matrix &m, const float3 &a)
+{
+    return make_float3(m.a00*a.x + m.a01*a.y + m.a02*a.z,
+		       m.a10*a.x + m.a11*a.y + m.a12*a.z,
+		       m.a20*a.x + m.a21*a.y + m.a22*a.z);
+}
+
+__device__ Matrix
+operator+ (const Matrix &m, const Matrix &n)
+{
+    return make_matrix(m.a00+n.a00, m.a01+n.a01, m.a02+n.a02,
+		       m.a10+n.a10, m.a11+n.a11, m.a12+n.a12,
+		       m.a20+n.a20, m.a21+n.a21, m.a22+n.a22);
+}
+
+__device__ void
+operator+= (Matrix &m, const Matrix &n)
+{
+    m.a00 += n.a00;
+    m.a01 += n.a01;
+    m.a02 += n.a02;
+    m.a10 += n.a10;
+    m.a11 += n.a11;
+    m.a12 += n.a12;
+    m.a20 += n.a20;
+    m.a21 += n.a21;
+    m.a22 += n.a22;
+}
+
+__device__ Matrix
+operator- (const Matrix &m, const Matrix &n)
+{
+    return make_matrix(m.a00-n.a00, m.a01-n.a01, m.a02-n.a02,
+		       m.a10-n.a10, m.a11-n.a11, m.a12-n.a12,
+		       m.a20-n.a20, m.a21-n.a21, m.a22-n.a22);
+}
+
+__device__ void
+operator-= (Matrix &m, const Matrix& n)
+{
+    m.a00 -= n.a00;
+    m.a01 -= n.a01;
+    m.a02 -= n.a02;
+    m.a10 -= n.a10;
+    m.a11 -= n.a11;
+    m.a12 -= n.a12;
+    m.a20 -= n.a20;
+    m.a21 -= n.a21;
+    m.a22 -= n.a22;
+}
+
+__device__ Matrix
+operator* (const Matrix &m, const Matrix &n)
+{
+    return make_matrix(m.a00*n.a00 + m.a01*n.a10 + m.a02*n.a20,
+		       m.a00*n.a01 + m.a01*n.a11 + m.a02*n.a21,
+		       m.a00*n.a02 + m.a01*n.a12 + m.a02*n.a22,
+		       m.a10*n.a00 + m.a11*n.a10 + m.a12*n.a20,
+		       m.a10*n.a01 + m.a11*n.a11 + m.a12*n.a21,
+		       m.a10*n.a02 + m.a11*n.a12 + m.a12*n.a22,
+		       m.a20*n.a00 + m.a21*n.a10 + m.a22*n.a20,
+		       m.a20*n.a01 + m.a21*n.a11 + m.a22*n.a21,
+		       m.a20*n.a02 + m.a21*n.a12 + m.a22*n.a22);
+}
+
+__device__ Matrix
+operator+ (const Matrix &m, const float &c)
+{
+    return make_matrix(m.a00+c, m.a01+c, m.a02+c,
+		       m.a10+c, m.a11+c, m.a12+c,
+		       m.a20+c, m.a21+c, m.a22+c);
+}
+
+__device__ void
+operator+= (Matrix &m, const float &c)
+{
+    m.a00 += c;
+    m.a01 += c;
+    m.a02 += c;
+    m.a10 += c;
+    m.a11 += c;
+    m.a12 += c;
+    m.a20 += c;
+    m.a21 += c;
+    m.a22 += c;
+}
+
+__device__ Matrix
+operator+ (const float &c, const Matrix &m)
+{
+    return make_matrix(c+m.a00, c+m.a01, c+m.a02,
+		       c+m.a10, c+m.a11, c+m.a12,
+		       c+m.a20, c+m.a21, c+m.a22);
+}
+
+__device__ Matrix
+operator- (const Matrix &m, const float &c)
+{
+    return make_matrix(m.a00-c, m.a01-c, m.a02-c,
+		       m.a10-c, m.a11-c, m.a12-c,
+		       m.a20-c, m.a21-c, m.a22-c);
+}
+
+__device__ void
+operator-= (Matrix &m, const float &c)
+{
+    m.a00 -= c;
+    m.a01 -= c;
+    m.a02 -= c;
+    m.a10 -= c;
+    m.a11 -= c;
+    m.a12 -= c;
+    m.a20 -= c;
+    m.a21 -= c;
+    m.a22 -= c;
+}
+
+__device__ Matrix
+operator- (const float &c, const Matrix &m)
+{
+    return make_matrix(c-m.a00, c-m.a01, c-m.a02,
+		       c-m.a10, c-m.a11, c-m.a12,
+		       c-m.a20, c-m.a21, c-m.a22);
+}
+
+__device__ Matrix
+operator* (const Matrix &m, const float &c)
+{
+    return make_matrix(m.a00*c, m.a01*c, m.a02*c,
+		       m.a10*c, m.a11*c, m.a12*c,
+		       m.a20*c, m.a21*c, m.a22*c);
+}
+
+__device__ void
+operator*= (Matrix &m, const float &c)
+{
+    m.a00 *= c;
+    m.a01 *= c;
+    m.a02 *= c;
+    m.a10 *= c;
+    m.a11 *= c;
+    m.a12 *= c;
+    m.a20 *= c;
+    m.a21 *= c;
+    m.a22 *= c;
+}
+
+__device__ Matrix
+operator* (const float &c, const Matrix &m)
+{
+    return make_matrix(c*m.a00, c*m.a01, c*m.a02,
+		       c*m.a10, c*m.a11, c*m.a12,
+		       c*m.a20, c*m.a21, c*m.a22);
+}
+
+__device__ Matrix
+operator/ (const Matrix &m, const float &c)
+{
+    return make_matrix(m.a00/c, m.a01/c, m.a02/c,
+		       m.a10/c, m.a11/c, m.a12/c,
+		       m.a20/c, m.a21/c, m.a22/c);
+}
+
+__device__ void
+operator/= (Matrix &m, const float &c)
+{
+    m.a00 /= c;
+    m.a01 /= c;
+    m.a02 /= c;
+    m.a10 /= c;
+    m.a11 /= c;
+    m.a12 /= c;
+    m.a20 /= c;
+    m.a21 /= c;
+    m.a22 /= c;
+}
+
+__device__ Matrix
+operator/ (const float &c, const Matrix &m)
+{
+    return make_matrix(c/m.a00, c/m.a01, c/m.a02,
+		       c/m.a10, c/m.a11, c/m.a12,
+		       c/m.a20, c/m.a21, c/m.a22);
+}
+
+__device__ float
+det(const Matrix &m)
+{
+    return m.a00*(m.a11*m.a22 - m.a12*m.a21) -
+	   m.a10*(m.a01*m.a22 - m.a02*m.a21) +
+	   m.a20*(m.a01*m.a12 - m.a02*m.a11);
+}
+
+__device__ Matrix
+inv(const Matrix &m)
+{
+    return make_matrix(m.a11*m.a22 - m.a12*m.a21,
+		       m.a02*m.a21 - m.a01*m.a22,
+		       m.a01*m.a12 - m.a02*m.a11,
+		       m.a12*m.a20 - m.a10*m.a22,
+		       m.a00*m.a22 - m.a02*m.a20,
+		       m.a02*m.a10 - m.a00*m.a12,
+		       m.a10*m.a21 - m.a11*m.a20,
+		       m.a01*m.a20 - m.a00*m.a21,
+		       m.a00*m.a11 - m.a01*m.a10)/det(m);
+}
+
+__device__ Matrix
+outer(const float3 &a, const float3 &b)
+{
+    return make_matrix(a.x*b.x, a.x*b.y, a.x*b.z,
+		       a.y*b.x, a.y*b.y, a.y*b.z,
+		       a.z*b.x, a.z*b.y, a.z*b.z);
+}
+
+#endif
diff --git a/chroma/cuda/mesh.h b/chroma/cuda/mesh.h
new file mode 100644
index 0000000..d60d801
--- /dev/null
+++ b/chroma/cuda/mesh.h
@@ -0,0 +1,175 @@
+#ifndef __MESH_H__
+#define __MESH_H__
+
+#include "intersect.h"
+#include "geometry.h"
+
+#include "stdio.h"
+
+#define STACK_SIZE 100
+
+/* Tests the intersection between a ray and a node in the bounding volume
+   hierarchy. If the ray intersects the bounding volume and `min_distance`
+   is less than zero or the distance from `origin` to the intersection is
+   less than `min_distance`, return true, else return false. */
+__device__ bool
+intersect_node(const float3 &origin, const float3 &direction,
+	       Geometry *g, int i, float min_distance=-1.0f)
+{
+    /* assigning these to local variables is faster for some reason */
+    float3 lower_bound = g->lower_bounds[i];
+    float3 upper_bound = g->upper_bounds[i];
+
+    float distance_to_box;
+
+    if (intersect_box(origin, direction, lower_bound, upper_bound,
+		      distance_to_box)) {
+	if (min_distance < 0.0f)
+	    return true;
+
+	if (distance_to_box > min_distance)
+	    return false;
+
+	return true;
+    }
+    else {
+	return false;
+    }
+
+}
+
+/* Finds the intersection between a ray and `geometry`. If the ray does
+   intersect the mesh and the index of the intersected triangle is not equal
+   to `last_hit_triangle`, set `min_distance` to the distance from `origin` to
+   the intersection and return the index of the triangle which the ray
+   intersected, else return -1. */
+__device__ int
+intersect_mesh(const float3 &origin, const float3& direction, Geometry *g,
+	       float &min_distance, int last_hit_triangle = -1)
+{
+    int triangle_index = -1;
+
+    float distance;
+    min_distance = -1.0f;
+
+    if (!intersect_node(origin, direction, g, g->start_node, min_distance))
+	return -1;
+
+    unsigned int stack[STACK_SIZE];
+
+    unsigned int *head = &stack[0];
+    unsigned int *node = &stack[1];
+    unsigned int *tail = &stack[STACK_SIZE-1];
+    *node = g->start_node;
+
+    unsigned int i;
+
+    do
+    {
+	unsigned int first_child = g->node_map[*node];
+	unsigned int stop = g->node_map_end[*node];
+
+	while (*node >= g->first_node && stop == first_child+1) {
+	    *node = first_child;
+	    first_child = g->node_map[*node];
+	    stop = g->node_map_end[*node];
+	}
+		
+	if (*node >= g->first_node) {
+	    for (i=first_child; i < stop; i++) {
+		if (intersect_node(origin, direction, g, i, min_distance)) {
+		    *node = i;
+		    node++;
+		}
+	    }
+	    
+	    node--;
+	}
+	else {
+	    // node is a leaf
+	    for (i=first_child; i < stop; i++) {
+		if (last_hit_triangle == i)
+		    continue;
+
+		Triangle t = get_triangle(g, i);
+
+		if (intersect_triangle(origin, direction, t, distance)) {
+		    if (triangle_index == -1) {
+			triangle_index = i;
+			min_distance = distance;
+			continue;
+		    }
+
+		    if (distance < min_distance) {
+			triangle_index = i;
+			min_distance = distance;
+		    }
+		}
+	    } // triangle loop
+
+	    node--;
+
+	} // node is a leaf
+
+	if (node > tail) {
+	    printf("warning: intersect_mesh() aborted; node > tail\n");
+	    break;
+	}
+
+    } // while loop
+    while (node != head);
+    
+    return triangle_index;
+}
+
+extern "C"
+{
+
+__global__ void
+distance_to_mesh(int nthreads, float3 *_origin, float3 *_direction,
+		 Geometry *g, float *_distance)
+{
+    __shared__ Geometry sg;
+
+    if (threadIdx.x == 0)
+	sg = *g;
+
+    __syncthreads();
+
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    g = &sg;
+
+    float3 origin = _origin[id];
+    float3 direction = _direction[id];
+    direction /= norm(direction);
+
+    float distance;
+
+    int triangle_index = intersect_mesh(origin, direction, g, distance);
+
+    if (triangle_index != -1)
+	_distance[id] = distance;
+}
+
+__global__ void
+color_solids(int first_triangle, int nthreads, int *solid_id_map,
+	     bool *solid_hit, unsigned int *solid_colors, Geometry *g)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    int triangle_id = first_triangle + id;
+    int solid_id = solid_id_map[triangle_id];
+    if (solid_hit[solid_id])
+	g->colors[triangle_id] = solid_colors[solid_id];
+}
+
+} // extern "C"
+
+#endif
diff --git a/chroma/cuda/photon.h b/chroma/cuda/photon.h
new file mode 100644
index 0000000..8b7e588
--- /dev/null
+++ b/chroma/cuda/photon.h
@@ -0,0 +1,325 @@
+#ifndef __PHOTON_H__
+#define __PHOTON_H__
+
+#include "stdio.h"
+#include "linalg.h"
+#include "rotate.h"
+#include "random.h"
+#include "physical_constants.h"
+#include "mesh.h"
+#include "geometry.h"
+
+struct Photon
+{
+    float3 position;
+    float3 direction;
+    float3 polarization;
+    float wavelength;
+    float time;
+
+    unsigned int history;
+
+    int last_hit_triangle;
+};
+
+struct State
+{
+    bool inside_to_outside;
+
+    float3 surface_normal;
+
+    float refractive_index1, refractive_index2;
+    float absorption_length;
+    float scattering_length;
+
+    int surface_index;
+
+    float distance_to_boundary;
+};
+
+enum
+{
+    NO_HIT           = 0x1 << 0,
+    BULK_ABSORB      = 0x1 << 1,
+    SURFACE_DETECT   = 0x1 << 2,
+    SURFACE_ABSORB   = 0x1 << 3,
+    RAYLEIGH_SCATTER = 0x1 << 4,
+    REFLECT_DIFFUSE  = 0x1 << 5,
+    REFLECT_SPECULAR = 0x1 << 6,
+    NAN_ABORT        = 0x1 << 31
+}; // processes
+
+enum {BREAK, CONTINUE, PASS}; // return value from propagate_to_boundary
+
+__device__ int
+convert(int c)
+{
+    if (c & 0x80)
+	return (0xFFFFFF00 | c);
+    else
+	return c;
+}
+
+__device__ float
+get_theta(const float3 &a, const float3 &b)
+{
+    return acosf(fmaxf(-1.0f,fminf(1.0f,dot(a,b))));
+}
+
+__device__ void
+fill_state(State &s, Photon &p, Geometry *g)
+{
+    p.last_hit_triangle = intersect_mesh(p.position, p.direction, g,
+					 s.distance_to_boundary,
+					 p.last_hit_triangle);
+
+    if (p.last_hit_triangle == -1) {
+	p.history |= NO_HIT;
+	return;
+    }
+    
+    Triangle t = get_triangle(g, p.last_hit_triangle);
+    
+    unsigned int material_code = g->material_codes[p.last_hit_triangle];
+    
+    int inner_material_index = convert(0xFF & (material_code >> 24));
+    int outer_material_index = convert(0xFF & (material_code >> 16));
+    s.surface_index = convert(0xFF & (material_code >> 8));
+    
+    float3 v01 = t.v1 - t.v0;
+    float3 v12 = t.v2 - t.v1;
+    
+    s.surface_normal = normalize(cross(v01, v12));
+				 
+    Material *material1, *material2;
+    if (dot(s.surface_normal,-p.direction) > 0.0f) {
+	// outside to inside
+	material1 = g->materials[outer_material_index];
+	material2 = g->materials[inner_material_index];
+
+	s.inside_to_outside = false;
+    }
+    else {
+	// inside to outside
+	material1 = g->materials[inner_material_index];
+	material2 = g->materials[outer_material_index];
+	s.surface_normal = -s.surface_normal;
+
+	s.inside_to_outside = true;
+    }
+
+    s.refractive_index1 = interp_property(material1, p.wavelength,
+					  material1->refractive_index);
+    s.refractive_index2 = interp_property(material2, p.wavelength,
+					  material2->refractive_index);
+    s.absorption_length = interp_property(material1, p.wavelength,
+					  material1->absorption_length);
+    s.scattering_length = interp_property(material1, p.wavelength,
+					  material1->scattering_length);
+
+} // fill_state
+
+__device__ float3
+pick_new_direction(float3 axis, float theta, float phi)
+{
+    // Taken from SNOMAN rayscatter.for
+    float cos_theta = cosf(theta);
+    float sin_theta = sinf(theta);
+    float cos_phi   = cosf(phi);
+    float sin_phi   = sinf(phi);
+	
+    float sin_axis_theta = sqrt(1.0f - axis.z*axis.z);
+    float cos_axis_phi, sin_axis_phi;
+	
+    if (isnan(sin_axis_theta) || sin_axis_theta < 0.00001f) {
+	cos_axis_phi = 1.0f;
+	sin_axis_phi = 0.0f;
+    }
+    else {
+	cos_axis_phi = axis.x / sin_axis_theta;
+	sin_axis_phi = axis.y / sin_axis_theta;
+    }
+
+    float dirx = cos_theta*axis.x +
+	sin_theta*(axis.z*cos_phi*cos_axis_phi - sin_phi*sin_axis_phi);
+    float diry = cos_theta*axis.y +
+	sin_theta*(cos_phi*axis.z*sin_axis_phi - sin_phi*cos_axis_phi);
+    float dirz = cos_theta*axis.z - sin_theta*cos_phi*sin_axis_theta;
+
+    return make_float3(dirx, diry, dirz);
+}
+
+__device__ void
+rayleigh_scatter(Photon &p, curandState &rng)
+{
+    float cos_theta = 2.0f*cosf((acosf(1.0f - 2.0f*curand_uniform(&rng))-2*PI)/3.0f);
+    if (cos_theta > 1.0f)
+	cos_theta = 1.0f;
+    else if (cos_theta < -1.0f)
+	cos_theta = -1.0f;
+
+    float theta = acosf(cos_theta);
+    float phi = uniform(&rng, 0.0f, 2.0f * PI);
+
+    p.direction = pick_new_direction(p.polarization, theta, phi);
+
+    if (1.0f - fabsf(cos_theta) < 1e-6f) {
+	p.polarization = pick_new_direction(p.polarization, PI/2.0f, phi);
+    }
+    else {
+	// linear combination of old polarization and new direction
+	p.polarization = p.polarization - cos_theta * p.direction;
+    }
+
+    p.direction /= norm(p.direction);
+    p.polarization /= norm(p.polarization);
+} // scatter
+
+__device__ int propagate_to_boundary(Photon &p, State &s, curandState &rng)
+{
+    float absorption_distance = -s.absorption_length*logf(curand_uniform(&rng));
+    float scattering_distance = -s.scattering_length*logf(curand_uniform(&rng));
+
+    if (absorption_distance <= scattering_distance) {
+	if (absorption_distance <= s.distance_to_boundary) {
+	    p.time += absorption_distance/(SPEED_OF_LIGHT/s.refractive_index1);
+	    p.position += absorption_distance*p.direction;
+	    p.history |= BULK_ABSORB;
+
+	    p.last_hit_triangle = -1;
+
+	    return BREAK;
+	} // photon is absorbed in material1
+    }
+    else {
+	if (scattering_distance <= s.distance_to_boundary) {
+	    p.time += scattering_distance/(SPEED_OF_LIGHT/s.refractive_index1);
+	    p.position += scattering_distance*p.direction;
+
+	    rayleigh_scatter(p, rng);
+
+	    p.history |= RAYLEIGH_SCATTER;
+
+	    p.last_hit_triangle = -1;
+
+	    return CONTINUE;
+	} // photon is scattered in material1
+    } // if scattering_distance < absorption_distance
+
+    p.position += s.distance_to_boundary*p.direction;
+    p.time += s.distance_to_boundary/(SPEED_OF_LIGHT/s.refractive_index1);
+
+    return PASS;
+
+} // propagate_to_boundary
+
+__device__ void
+propagate_at_boundary(Photon &p, State &s, curandState &rng)
+{
+    float incident_angle = get_theta(s.surface_normal,-p.direction);
+    float refracted_angle = asinf(sinf(incident_angle)*s.refractive_index1/s.refractive_index2);
+
+    float3 incident_plane_normal = cross(p.direction, s.surface_normal);
+    float incident_plane_normal_length = norm(incident_plane_normal);
+
+    // Photons at normal incidence do not have a unique plane of incidence,
+    // so we have to pick the plane normal to be the polarization vector
+    // to get the correct logic below
+    if (incident_plane_normal_length < 1e-6f)
+	incident_plane_normal = p.polarization;
+    else
+	incident_plane_normal /= incident_plane_normal_length;
+
+    float normal_coefficient = dot(p.polarization, incident_plane_normal);
+    float normal_probability = normal_coefficient*normal_coefficient;
+
+    float reflection_coefficient;
+    if (curand_uniform(&rng) < normal_probability) {
+	// photon polarization normal to plane of incidence
+	reflection_coefficient = -sinf(incident_angle-refracted_angle)/sinf(incident_angle+refracted_angle);
+
+	if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle)) {
+	    p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
+			
+	    p.history |= REFLECT_SPECULAR;
+	}
+	else {
+	    p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
+	}
+
+	p.polarization = incident_plane_normal;
+    }
+    else {
+	// photon polarization parallel to plane of incidence
+	reflection_coefficient = tanf(incident_angle-refracted_angle)/tanf(incident_angle+refracted_angle);
+
+	if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle)) {
+	    p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
+			
+	    p.history |= REFLECT_SPECULAR;
+	}
+	else {
+	    p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
+	}
+
+	p.polarization = cross(incident_plane_normal, p.direction);
+	p.polarization /= norm(p.polarization);
+    }
+
+} // propagate_at_boundary
+
+__device__ int
+propagate_at_surface(Photon &p, State &s, curandState &rng, Geometry *geometry)
+{
+    Surface *surface = geometry->surfaces[s.surface_index];
+
+    /* since the surface properties are interpolated linearly, we are
+       guaranteed that they still sum to 1.0. */
+
+    float detect = interp_property(surface, p.wavelength, surface->detect);
+    float absorb = interp_property(surface, p.wavelength, surface->absorb);
+    float reflect_diffuse = interp_property(surface, p.wavelength, surface->reflect_diffuse);
+    float reflect_specular = interp_property(surface, p.wavelength, surface->reflect_specular);
+
+    float uniform_sample = curand_uniform(&rng);
+
+    if (uniform_sample < absorb) {
+	p.history |= SURFACE_ABSORB;
+	return BREAK;
+    }
+    else if (uniform_sample < absorb + detect) {
+	p.history |= SURFACE_DETECT;
+	return BREAK;
+    }
+    else if (uniform_sample < absorb + detect + reflect_diffuse) {
+	// diffusely reflect
+	p.direction = uniform_sphere(&rng);
+
+	if (dot(p.direction, s.surface_normal) < 0.0f)
+	    p.direction = -p.direction;
+
+	// randomize polarization?
+	p.polarization = cross(uniform_sphere(&rng), p.direction);
+	p.polarization /= norm(p.polarization);
+
+	p.history |= REFLECT_DIFFUSE;
+
+	return CONTINUE;
+    }
+    else {
+	// specularly reflect
+	float incident_angle = get_theta(s.surface_normal,-p.direction);
+	float3 incident_plane_normal = cross(p.direction, s.surface_normal);
+	incident_plane_normal /= norm(incident_plane_normal);
+
+	p.direction = rotate(s.surface_normal, incident_angle,
+			     incident_plane_normal);
+
+	p.history |= REFLECT_SPECULAR;
+
+	return CONTINUE;
+    }
+
+} // propagate_at_surface
+
+#endif
diff --git a/chroma/cuda/physical_constants.h b/chroma/cuda/physical_constants.h
new file mode 100644
index 0000000..2ff87cd
--- /dev/null
+++ b/chroma/cuda/physical_constants.h
@@ -0,0 +1,7 @@
+#ifndef __PHYSICAL_CONSTANTS_H__
+#define __PHYSICAL_CONSTANTS_H__
+
+#define SPEED_OF_LIGHT 2.99792458e8
+#define PI 3.141592653589793
+
+#endif
diff --git a/chroma/cuda/propagate.cu b/chroma/cuda/propagate.cu
new file mode 100644
index 0000000..fdcf532
--- /dev/null
+++ b/chroma/cuda/propagate.cu
@@ -0,0 +1,147 @@
+//-*-c-*-
+
+#include "linalg.h"
+#include "geometry.h"
+#include "photon.h"
+
+#include "stdio.h"
+
+extern "C"
+{
+
+__global__ void
+photon_duplicate(int first_photon, int nthreads,
+		 float3 *positions, float3 *directions,
+		 float *wavelengths, float3 *polarizations,
+		 float *times, unsigned int *histories,
+		 int *last_hit_triangles, 
+		 int copies, int stride)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    int photon_id = first_photon + id;
+
+    Photon p;
+    p.position = positions[photon_id];
+    p.direction = directions[photon_id];
+    p.polarization = polarizations[photon_id];
+    p.wavelength = wavelengths[photon_id];
+    p.time = times[photon_id];
+    p.last_hit_triangle = last_hit_triangles[photon_id];
+    p.history = histories[photon_id];
+
+    for (int i=1; i <= copies; i++) {
+      int target_photon_id = photon_id + stride * i;
+
+      positions[target_photon_id] = p.position;
+      directions[target_photon_id] = p.direction;
+      polarizations[target_photon_id] = p.polarization;
+      wavelengths[target_photon_id] = p.wavelength;
+      times[target_photon_id] = p.time;
+      last_hit_triangles[target_photon_id] = p.last_hit_triangle;
+      histories[target_photon_id] = p.history;
+    }
+}
+		 
+__global__ void
+propagate(int first_photon, int nthreads, unsigned int *input_queue,
+	  unsigned int *output_queue, curandState *rng_states,
+	  float3 *positions, float3 *directions,
+	  float *wavelengths, float3 *polarizations,
+	  float *times, unsigned int *histories,
+	  int *last_hit_triangles, int max_steps,
+	  Geometry *g)
+{
+    __shared__ Geometry sg;
+
+    if (threadIdx.x == 0)
+	sg = *g;
+
+    __syncthreads();
+
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    g = &sg;
+
+    curandState rng = rng_states[id];
+
+    int photon_id = input_queue[first_photon + id];
+
+    Photon p;
+    p.position = positions[photon_id];
+    p.direction = directions[photon_id];
+    p.direction /= norm(p.direction);
+    p.polarization = polarizations[photon_id];
+    p.polarization /= norm(p.polarization);
+    p.wavelength = wavelengths[photon_id];
+    p.time = times[photon_id];
+    p.last_hit_triangle = last_hit_triangles[photon_id];
+    p.history = histories[photon_id];
+
+    if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB))
+	return;
+
+    State s;
+
+    int steps = 0;
+    while (steps < max_steps) {
+	steps++;
+
+	int command;
+
+	// check for NaN and fail
+	if (isnan(p.direction.x*p.direction.y*p.direction.z*p.position.x*p.position.y*p.position.z)) {
+	    p.history |= NO_HIT | NAN_ABORT;
+	    break;
+	}
+
+	fill_state(s, p, g);
+
+	if (p.last_hit_triangle == -1)
+	    break;
+
+	command = propagate_to_boundary(p, s, rng);
+
+	if (command == BREAK)
+	    break;
+
+	if (command == CONTINUE)
+	    continue;
+
+	if (s.surface_index != -1) {
+	    command = propagate_at_surface(p, s, rng, g);
+
+	    if (command == BREAK)
+		break;
+
+	    if (command == CONTINUE)
+		continue;
+	}
+
+	propagate_at_boundary(p, s, rng);
+
+    } // while (steps < max_steps)
+
+    rng_states[id] = rng;
+    positions[photon_id] = p.position;
+    directions[photon_id] = p.direction;
+    polarizations[photon_id] = p.polarization;
+    wavelengths[photon_id] = p.wavelength;
+    times[photon_id] = p.time;
+    histories[photon_id] = p.history;
+    last_hit_triangles[photon_id] = p.last_hit_triangle;
+	
+    // Not done, put photon in output queue
+    if ((p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+	int out_idx = atomicAdd(output_queue, 1);
+	output_queue[out_idx] = photon_id;
+    }
+} // propagate
+
+} // extern "C"
diff --git a/chroma/cuda/random.h b/chroma/cuda/random.h
new file mode 100644
index 0000000..e9d2e75
--- /dev/null
+++ b/chroma/cuda/random.h
@@ -0,0 +1,87 @@
+#ifndef __RANDOM_H__
+#define __RANDOM_H__
+
+#include <curand_kernel.h>
+
+#include "physical_constants.h"
+
+__device__ float
+uniform(curandState *s, const float &low, const float &high)
+{
+    return low + curand_uniform(s)*(high-low);
+}
+
+__device__ float3
+uniform_sphere(curandState *s)
+{
+    float theta = uniform(s, 0.0f, 2*PI);
+    float u = uniform(s, -1.0f, 1.0f);
+    float c = sqrtf(1.0f-u*u);
+
+    return make_float3(c*cosf(theta), c*sinf(theta), u);
+}
+
+// Draw a random sample given a cumulative distribution function
+// Assumptions: ncdf >= 2, cdf_y[0] is 0.0, and cdf_y[ncdf-1] is 1.0
+__device__ float
+sample_cdf(curandState *rng, int ncdf, float *cdf_x, float *cdf_y)
+{
+    float u = curand_uniform(rng);
+
+    // Find u in cdf_y with binary search
+    // list must contain at least 2 elements: 0.0 and 1.0
+    int lower=0;
+    int upper=ncdf-1;
+    while(lower < upper-1) {
+	int half = (lower+upper) / 2;
+	if (u < cdf_y[half])
+	    upper = half;
+	else
+	    lower = half;
+    }
+  
+    float frac = (u - cdf_y[lower]) / (cdf_y[upper] - cdf_y[lower]);
+    return cdf_x[lower] * frac + cdf_x[upper] * (1.0f - frac);
+}
+
+extern "C"
+{
+
+__global__ void
+init_rng(int nthreads, curandState *s, unsigned long long seed,
+	 unsigned long long offset)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    curand_init(seed, id, offset, &s[id]);
+}
+
+__global__ void
+fill_uniform(int nthreads, curandState *s, float *a, float low, float high)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] = uniform(&s[id], low, high);
+
+}
+
+__global__ void
+fill_uniform_sphere(int nthreads, curandState *s, float3 *a)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] = uniform_sphere(&s[id]);
+}
+
+} // extern "c"
+
+#endif
diff --git a/chroma/cuda/render.cu b/chroma/cuda/render.cu
new file mode 100644
index 0000000..d4ed443
--- /dev/null
+++ b/chroma/cuda/render.cu
@@ -0,0 +1,171 @@
+//-*-c-*-
+
+#include "linalg.h"
+#include "intersect.h"
+#include "mesh.h"
+#include "sorting.h"
+#include "geometry.h"
+
+#include "stdio.h"
+
+__device__ float4
+get_color(const float3 &direction, const Triangle &t, unsigned int rgba)
+{
+    float3 v01 = t.v1 - t.v0;
+    float3 v12 = t.v2 - t.v1;
+    
+    float3 surface_normal = normalize(cross(v01,v12));
+
+    float cos_theta = dot(surface_normal,-direction);
+
+    if (cos_theta < 0.0f)
+	cos_theta = -cos_theta;
+
+    unsigned int a0 = 0xff & (rgba >> 24);
+    unsigned int r0 = 0xff & (rgba >> 16);
+    unsigned int g0 = 0xff & (rgba >> 8);
+    unsigned int b0 = 0xff & rgba;
+
+    float alpha = (255 - a0)/255.0f;
+
+    return make_float4(r0*cos_theta, g0*cos_theta, b0*cos_theta, alpha);
+}
+
+extern "C"
+{
+
+__global__ void
+render(int nthreads, float3 *_origin, float3 *_direction, Geometry *g,
+       unsigned int alpha_depth, unsigned int *pixels, float *_dx,
+       unsigned int *dxlen, float4 *_color)
+{
+    __shared__ Geometry sg;
+
+    if (threadIdx.x == 0)
+	sg = *g;
+
+    __syncthreads();
+
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+	
+    if (id >= nthreads)
+	return;
+
+    g = &sg;
+
+    float3 origin = _origin[id];
+    float3 direction = _direction[id];
+    unsigned int n = dxlen[id];
+
+    float distance;
+
+    if (n < 1 && !intersect_node(origin, direction, g, g->start_node)) {
+	pixels[id] = 0;
+	return;
+    }
+
+    unsigned int stack[STACK_SIZE];
+
+    unsigned int *head = &stack[0];
+    unsigned int *node = &stack[1];
+    unsigned int *tail = &stack[STACK_SIZE-1];
+    *node = g->start_node;
+
+    float *dx = _dx + id*alpha_depth;
+    float4 *color_a = _color + id*alpha_depth;
+
+    unsigned int i;
+
+    do {
+	unsigned int first_child = g->node_map[*node];
+	unsigned int stop = g->node_map_end[*node];
+	
+	while (*node >= g->first_node && stop == first_child+1) {
+	    *node = first_child;
+	    first_child = g->node_map[*node];
+	    stop = g->node_map_end[*node];
+	}
+		
+	if (*node >= g->first_node) {
+	    for (i=first_child; i < stop; i++) {
+		if (intersect_node(origin, direction, g, i)) {
+		    *node = i;
+		    node++;
+		}
+	    }
+
+	    node--;
+	}
+	else {
+	    // node is a leaf
+	    for (i=first_child; i < stop; i++) {
+		Triangle t = get_triangle(g, i);
+		
+		if (intersect_triangle(origin, direction, t, distance)) {
+		    if (n < 1) {
+			dx[0] = distance;
+			    
+			unsigned int rgba = g->colors[i];
+			float4 color = get_color(direction, t, rgba);
+
+			color_a[0] = color;
+		    }
+		    else {
+			unsigned long j = searchsorted(n, dx, distance);
+
+			if (j <= alpha_depth-1) {
+			    insert(alpha_depth, dx, j, distance);
+
+			    unsigned int rgba = g->colors[i];
+			    float4 color = get_color(direction, t, rgba);
+
+			    insert(alpha_depth, color_a, j, color);
+			}
+		    }
+					
+		    if (n < alpha_depth)
+			n++;
+		}
+				
+	    } // triangle loop
+	    
+	    node--;
+	    
+	} // node is a leaf
+	
+    } // while loop
+    while (node != head);
+
+    if (n < 1) {
+	pixels[id] = 0;
+	return;
+    }
+
+    dxlen[id] = n;
+
+    float scale = 1.0f;
+    float fr = 0.0f;
+    float fg = 0.0f;
+    float fb = 0.0f;
+    for (i=0; i < n; i++) {
+	float alpha = color_a[i].w;
+
+	fr += scale*color_a[i].x*alpha;
+	fg += scale*color_a[i].y*alpha;
+	fb += scale*color_a[i].z*alpha;
+	
+	scale *= (1.0f-alpha);
+    }
+    unsigned int a;
+    if (n < alpha_depth)
+	a = floorf(255*(1.0f-scale));
+    else
+    	a = 255;
+    unsigned int red = floorf(fr/(1.0f-scale));
+    unsigned int green = floorf(fg/(1.0f-scale));
+    unsigned int blue = floorf(fb/(1.0f-scale));
+
+    pixels[id] = a << 24 | red << 16 | green << 8 | blue;
+}
+
+} // extern "C"
diff --git a/chroma/cuda/rotate.h b/chroma/cuda/rotate.h
new file mode 100644
index 0000000..15f8037
--- /dev/null
+++ b/chroma/cuda/rotate.h
@@ -0,0 +1,27 @@
+#ifndef __ROTATE_H__
+#define __ROTATE_H__
+
+#include "linalg.h"
+#include "matrix.h"
+
+__device__ const Matrix IDENTITY_MATRIX = {1,0,0,0,1,0,0,0,1};
+
+__device__ Matrix
+make_rotation_matrix(float phi, const float3 &n)
+{
+    float cos_phi = cosf(phi);
+    float sin_phi = sinf(phi);
+
+    return IDENTITY_MATRIX*cos_phi + (1-cos_phi)*outer(n,n) +
+	sin_phi*make_matrix(0,n.z,-n.y,-n.z,0,n.x,n.y,-n.x,0);
+}
+
+/* rotate points counterclockwise, when looking towards +infinity,
+   through an angle `phi` about the axis `n`. */
+__device__ float3
+rotate(const float3 &a, float phi, const float3 &n)
+{
+    return make_rotation_matrix(phi,n)*a;
+}
+
+#endif
diff --git a/chroma/cuda/sorting.h b/chroma/cuda/sorting.h
new file mode 100644
index 0000000..2bf1a5a
--- /dev/null
+++ b/chroma/cuda/sorting.h
@@ -0,0 +1,107 @@
+template <class T>
+__device__ void
+swap(T &a, T &b)
+{
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template <class T>
+__device__ void
+reverse(int n, T *a)
+{
+    for (int i=0; i < n/2; i++)
+	swap(a[i],a[n-1-i]);
+}
+
+template <class T>
+__device__ void
+piksrt(int n, T *arr)
+{
+    int i,j;
+    T a;
+
+    for (j=1; j < n; j++) {
+	a = arr[j];
+	i = j-1;
+	while (i >= 0 && arr[i] > a) {
+	    arr[i+1] = arr[i];
+	    i--;
+	}
+	arr[i+1] = a;
+    }
+}
+
+template <class T, class U>
+__device__ void
+piksrt2(int n, T *arr, U *brr)
+{
+    int i,j;
+    T a;
+    U b;
+
+    for (j=1; j < n; j++) {
+	a = arr[j];
+	b = brr[j];
+	i = j-1;
+	while (i >= 0 && arr[i] > a) {
+	    arr[i+1] = arr[i];
+	    brr[i+1] = brr[i];
+	    i--;
+	}
+	arr[i+1] = a;
+	brr[i+1] = b;
+    }
+}
+
+/* Returns the index in `arr` where `x` should be inserted in order to
+   maintain order. If `n` equals one, return the index such that, when
+   `x` is inserted, `arr` will be in ascending order.
+*/
+template <class T>
+__device__ unsigned long
+searchsorted(unsigned long n, T *arr, const T &x)
+{
+    unsigned long ju,jm,jl;
+    int ascnd;
+
+    jl = 0;
+    ju = n;
+
+    ascnd = (arr[n-1] >= arr[0]);
+
+    while (ju-jl > 1) {
+	jm = (ju+jl) >> 1;
+
+	if ((x > arr[jm]) == ascnd)
+	    jl = jm;
+	else
+	    ju = jm;
+    }
+
+    if ((x <= arr[0]) == ascnd)
+	return 0;
+    else
+	return ju;
+}
+
+template <class T>
+__device__ void
+insert(unsigned long n, T *arr, unsigned long i, const T &x)
+{
+    unsigned long j;
+    for (j=n-1; j > i; j--)
+	arr[j] = arr[j-1];
+    arr[i] = x;
+}
+
+template <class T>
+__device__ void
+add_sorted(unsigned long n, T *arr, const T &x)
+{
+    unsigned long i = searchsorted(n, arr, x);
+
+    if (i < n)
+	insert(n, arr, i, x);
+}
diff --git a/chroma/cuda/tools.cu b/chroma/cuda/tools.cu
new file mode 100644
index 0000000..80d06ec
--- /dev/null
+++ b/chroma/cuda/tools.cu
@@ -0,0 +1,21 @@
+//--*-c-*-
+
+extern "C"
+{
+
+__global__ void
+interleave(int nthreads, unsigned long long *x, unsigned long long *y,
+	   unsigned long long *z, int bits, unsigned long long *dest)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    for (int i=0; i < bits; i++)
+	dest[id] |= (x[id] & 1 << i) << (2*i) |
+	            (y[id] & 1 << i) << (2*i+1) |
+	            (z[id] & 1 << i) << (2*i+2);
+}
+
+} // extern "C"
diff --git a/chroma/cuda/transform.cu b/chroma/cuda/transform.cu
new file mode 100644
index 0000000..1f4405e
--- /dev/null
+++ b/chroma/cuda/transform.cu
@@ -0,0 +1,51 @@
+//-*-c-*-
+
+#include "linalg.h"
+#include "rotate.h"
+
+extern "C"
+{
+
+/* Translate the points `a` by the vector `v` */
+__global__ void
+translate(int nthreads, float3 *a, float3 v)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] += v;
+}
+
+/* Rotate the points `a` through an angle `phi` counter-clockwise about the
+   axis `axis` (when looking towards +infinity). */
+__global__ void
+rotate(int nthreads, float3 *a, float phi, float3 axis)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] = rotate(a[id], phi, axis);
+}
+
+/* Rotate the points `a` through an angle `phi` counter-clockwise
+   (when looking towards +infinity along `axis`) about the axis defined
+   by the point `point` and the vector `axis` . */
+__global__ void
+rotate_around_point(int nthreads, float3 *a, float phi, float3 axis,
+		    float3 point)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] -= point;
+    a[id] = rotate(a[id], phi, axis);
+    a[id] += point;
+}
+
+} // extern "c"
diff --git a/distribute_setup.py b/distribute_setup.py
new file mode 100644
index 0000000..5a95c92
--- /dev/null
+++ b/distribute_setup.py
@@ -0,0 +1,485 @@
+#!python
+"""Bootstrap distribute installation
+
+If you want to use setuptools in your package's setup.py, just include this
+file in the same directory with it, and add this to the top of your setup.py::
+
+    from distribute_setup import use_setuptools
+    use_setuptools()
+
+If you want to require a specific version of setuptools, set a download
+mirror, or use an alternate download directory, you can do so by supplying
+the appropriate options to ``use_setuptools()``.
+
+This file can also be run as a script to install or upgrade setuptools.
+"""
+import os
+import sys
+import time
+import fnmatch
+import tempfile
+import tarfile
+from distutils import log
+
+try:
+    from site import USER_SITE
+except ImportError:
+    USER_SITE = None
+
+try:
+    import subprocess
+
+    def _python_cmd(*args):
+        args = (sys.executable,) + args
+        return subprocess.call(args) == 0
+
+except ImportError:
+    # will be used for python 2.3
+    def _python_cmd(*args):
+        args = (sys.executable,) + args
+        # quoting arguments if windows
+        if sys.platform == 'win32':
+            def quote(arg):
+                if ' ' in arg:
+                    return '"%s"' % arg
+                return arg
+            args = [quote(arg) for arg in args]
+        return os.spawnl(os.P_WAIT, sys.executable, *args) == 0
+
+DEFAULT_VERSION = "0.6.21"
+DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/"
+SETUPTOOLS_FAKED_VERSION = "0.6c11"
+
+SETUPTOOLS_PKG_INFO = """\
+Metadata-Version: 1.0
+Name: setuptools
+Version: %s
+Summary: xxxx
+Home-page: xxx
+Author: xxx
+Author-email: xxx
+License: xxx
+Description: xxx
+""" % SETUPTOOLS_FAKED_VERSION
+
+
+def _install(tarball):
+    # extracting the tarball
+    tmpdir = tempfile.mkdtemp()
+    log.warn('Extracting in %s', tmpdir)
+    old_wd = os.getcwd()
+    try:
+        os.chdir(tmpdir)
+        tar = tarfile.open(tarball)
+        _extractall(tar)
+        tar.close()
+
+        # going in the directory
+        subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
+        os.chdir(subdir)
+        log.warn('Now working in %s', subdir)
+
+        # installing
+        log.warn('Installing Distribute')
+        if not _python_cmd('setup.py', 'install'):
+            log.warn('Something went wrong during the installation.')
+            log.warn('See the error message above.')
+    finally:
+        os.chdir(old_wd)
+
+
+def _build_egg(egg, tarball, to_dir):
+    # extracting the tarball
+    tmpdir = tempfile.mkdtemp()
+    log.warn('Extracting in %s', tmpdir)
+    old_wd = os.getcwd()
+    try:
+        os.chdir(tmpdir)
+        tar = tarfile.open(tarball)
+        _extractall(tar)
+        tar.close()
+
+        # going in the directory
+        subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
+        os.chdir(subdir)
+        log.warn('Now working in %s', subdir)
+
+        # building an egg
+        log.warn('Building a Distribute egg in %s', to_dir)
+        _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)
+
+    finally:
+        os.chdir(old_wd)
+    # returning the result
+    log.warn(egg)
+    if not os.path.exists(egg):
+        raise IOError('Could not build the egg.')
+
+
+def _do_download(version, download_base, to_dir, download_delay):
+    egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg'
+                       % (version, sys.version_info[0], sys.version_info[1]))
+    if not os.path.exists(egg):
+        tarball = download_setuptools(version, download_base,
+                                      to_dir, download_delay)
+        _build_egg(egg, tarball, to_dir)
+    sys.path.insert(0, egg)
+    import setuptools
+    setuptools.bootstrap_install_from = egg
+
+
+def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
+                   to_dir=os.curdir, download_delay=15, no_fake=True):
+    # making sure we use the absolute path
+    to_dir = os.path.abspath(to_dir)
+    was_imported = 'pkg_resources' in sys.modules or \
+        'setuptools' in sys.modules
+    try:
+        try:
+            import pkg_resources
+            if not hasattr(pkg_resources, '_distribute'):
+                if not no_fake:
+                    _fake_setuptools()
+                raise ImportError
+        except ImportError:
+            return _do_download(version, download_base, to_dir, download_delay)
+        try:
+            pkg_resources.require("distribute>="+version)
+            return
+        except pkg_resources.VersionConflict:
+            e = sys.exc_info()[1]
+            if was_imported:
+                sys.stderr.write(
+                "The required version of distribute (>=%s) is not available,\n"
+                "and can't be installed while this script is running. Please\n"
+                "install a more recent version first, using\n"
+                "'easy_install -U distribute'."
+                "\n\n(Currently using %r)\n" % (version, e.args[0]))
+                sys.exit(2)
+            else:
+                del pkg_resources, sys.modules['pkg_resources']    # reload ok
+                return _do_download(version, download_base, to_dir,
+                                    download_delay)
+        except pkg_resources.DistributionNotFound:
+            return _do_download(version, download_base, to_dir,
+                                download_delay)
+    finally:
+        if not no_fake:
+            _create_fake_setuptools_pkg_info(to_dir)
+
+def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
+                        to_dir=os.curdir, delay=15):
+    """Download distribute from a specified location and return its filename
+
+    `version` should be a valid distribute version number that is available
+    as an egg for download under the `download_base` URL (which should end
+    with a '/'). `to_dir` is the directory where the egg will be downloaded.
+    `delay` is the number of seconds to pause before an actual download
+    attempt.
+    """
+    # making sure we use the absolute path
+    to_dir = os.path.abspath(to_dir)
+    try:
+        from urllib.request import urlopen
+    except ImportError:
+        from urllib2 import urlopen
+    tgz_name = "distribute-%s.tar.gz" % version
+    url = download_base + tgz_name
+    saveto = os.path.join(to_dir, tgz_name)
+    src = dst = None
+    if not os.path.exists(saveto):  # Avoid repeated downloads
+        try:
+            log.warn("Downloading %s", url)
+            src = urlopen(url)
+            # Read/write all in one block, so we don't create a corrupt file
+            # if the download is interrupted.
+            data = src.read()
+            dst = open(saveto, "wb")
+            dst.write(data)
+        finally:
+            if src:
+                src.close()
+            if dst:
+                dst.close()
+    return os.path.realpath(saveto)
+
+def _no_sandbox(function):
+    def __no_sandbox(*args, **kw):
+        try:
+            from setuptools.sandbox import DirectorySandbox
+            if not hasattr(DirectorySandbox, '_old'):
+                def violation(*args):
+                    pass
+                DirectorySandbox._old = DirectorySandbox._violation
+                DirectorySandbox._violation = violation
+                patched = True
+            else:
+                patched = False
+        except ImportError:
+            patched = False
+
+        try:
+            return function(*args, **kw)
+        finally:
+            if patched:
+                DirectorySandbox._violation = DirectorySandbox._old
+                del DirectorySandbox._old
+
+    return __no_sandbox
+
+def _patch_file(path, content):
+    """Will backup the file then patch it"""
+    existing_content = open(path).read()
+    if existing_content == content:
+        # already patched
+        log.warn('Already patched.')
+        return False
+    log.warn('Patching...')
+    _rename_path(path)
+    f = open(path, 'w')
+    try:
+        f.write(content)
+    finally:
+        f.close()
+    return True
+
+_patch_file = _no_sandbox(_patch_file)
+
+def _same_content(path, content):
+    return open(path).read() == content
+
+def _rename_path(path):
+    new_name = path + '.OLD.%s' % time.time()
+    log.warn('Renaming %s into %s', path, new_name)
+    os.rename(path, new_name)
+    return new_name
+
+def _remove_flat_installation(placeholder):
+    if not os.path.isdir(placeholder):
+        log.warn('Unkown installation at %s', placeholder)
+        return False
+    found = False
+    for file in os.listdir(placeholder):
+        if fnmatch.fnmatch(file, 'setuptools*.egg-info'):
+            found = True
+            break
+    if not found:
+        log.warn('Could not locate setuptools*.egg-info')
+        return
+
+    log.warn('Removing elements out of the way...')
+    pkg_info = os.path.join(placeholder, file)
+    if os.path.isdir(pkg_info):
+        patched = _patch_egg_dir(pkg_info)
+    else:
+        patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO)
+
+    if not patched:
+        log.warn('%s already patched.', pkg_info)
+        return False
+    # now let's move the files out of the way
+    for element in ('setuptools', 'pkg_resources.py', 'site.py'):
+        element = os.path.join(placeholder, element)
+        if os.path.exists(element):
+            _rename_path(element)
+        else:
+            log.warn('Could not find the %s element of the '
+                     'Setuptools distribution', element)
+    return True
+
+_remove_flat_installation = _no_sandbox(_remove_flat_installation)
+
+def _after_install(dist):
+    log.warn('After install bootstrap.')
+    placeholder = dist.get_command_obj('install').install_purelib
+    _create_fake_setuptools_pkg_info(placeholder)
+
+def _create_fake_setuptools_pkg_info(placeholder):
+    if not placeholder or not os.path.exists(placeholder):
+        log.warn('Could not find the install location')
+        return
+    pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1])
+    setuptools_file = 'setuptools-%s-py%s.egg-info' % \
+            (SETUPTOOLS_FAKED_VERSION, pyver)
+    pkg_info = os.path.join(placeholder, setuptools_file)
+    if os.path.exists(pkg_info):
+        log.warn('%s already exists', pkg_info)
+        return
+
+    log.warn('Creating %s', pkg_info)
+    f = open(pkg_info, 'w')
+    try:
+        f.write(SETUPTOOLS_PKG_INFO)
+    finally:
+        f.close()
+
+    pth_file = os.path.join(placeholder, 'setuptools.pth')
+    log.warn('Creating %s', pth_file)
+    f = open(pth_file, 'w')
+    try:
+        f.write(os.path.join(os.curdir, setuptools_file))
+    finally:
+        f.close()
+
+_create_fake_setuptools_pkg_info = _no_sandbox(_create_fake_setuptools_pkg_info)
+
+def _patch_egg_dir(path):
+    # let's check if it's already patched
+    pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
+    if os.path.exists(pkg_info):
+        if _same_content(pkg_info, SETUPTOOLS_PKG_INFO):
+            log.warn('%s already patched.', pkg_info)
+            return False
+    _rename_path(path)
+    os.mkdir(path)
+    os.mkdir(os.path.join(path, 'EGG-INFO'))
+    pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
+    f = open(pkg_info, 'w')
+    try:
+        f.write(SETUPTOOLS_PKG_INFO)
+    finally:
+        f.close()
+    return True
+
+_patch_egg_dir = _no_sandbox(_patch_egg_dir)
+
+def _before_install():
+    log.warn('Before install bootstrap.')
+    _fake_setuptools()
+
+
+def _under_prefix(location):
+    if 'install' not in sys.argv:
+        return True
+    args = sys.argv[sys.argv.index('install')+1:]
+    for index, arg in enumerate(args):
+        for option in ('--root', '--prefix'):
+            if arg.startswith('%s=' % option):
+                top_dir = arg.split('root=')[-1]
+                return location.startswith(top_dir)
+            elif arg == option:
+                if len(args) > index:
+                    top_dir = args[index+1]
+                    return location.startswith(top_dir)
+        if arg == '--user' and USER_SITE is not None:
+            return location.startswith(USER_SITE)
+    return True
+
+
+def _fake_setuptools():
+    log.warn('Scanning installed packages')
+    try:
+        import pkg_resources
+    except ImportError:
+        # we're cool
+        log.warn('Setuptools or Distribute does not seem to be installed.')
+        return
+    ws = pkg_resources.working_set
+    try:
+        setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools',
+                                  replacement=False))
+    except TypeError:
+        # old distribute API
+        setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools'))
+
+    if setuptools_dist is None:
+        log.warn('No setuptools distribution found')
+        return
+    # detecting if it was already faked
+    setuptools_location = setuptools_dist.location
+    log.warn('Setuptools installation detected at %s', setuptools_location)
+
+    # if --root or --preix was provided, and if
+    # setuptools is not located in them, we don't patch it
+    if not _under_prefix(setuptools_location):
+        log.warn('Not patching, --root or --prefix is installing Distribute'
+                 ' in another location')
+        return
+
+    # let's see if its an egg
+    if not setuptools_location.endswith('.egg'):
+        log.warn('Non-egg installation')
+        res = _remove_flat_installation(setuptools_location)
+        if not res:
+            return
+    else:
+        log.warn('Egg installation')
+        pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO')
+        if (os.path.exists(pkg_info) and
+            _same_content(pkg_info, SETUPTOOLS_PKG_INFO)):
+            log.warn('Already patched.')
+            return
+        log.warn('Patching...')
+        # let's create a fake egg replacing setuptools one
+        res = _patch_egg_dir(setuptools_location)
+        if not res:
+            return
+    log.warn('Patched done.')
+    _relaunch()
+
+
+def _relaunch():
+    log.warn('Relaunching...')
+    # we have to relaunch the process
+    # pip marker to avoid a relaunch bug
+    if sys.argv[:3] == ['-c', 'install', '--single-version-externally-managed']:
+        sys.argv[0] = 'setup.py'
+    args = [sys.executable] + sys.argv
+    sys.exit(subprocess.call(args))
+
+
+def _extractall(self, path=".", members=None):
+    """Extract all members from the archive to the current working
+       directory and set owner, modification time and permissions on
+       directories afterwards. `path' specifies a different directory
+       to extract to. `members' is optional and must be a subset of the
+       list returned by getmembers().
+    """
+    import copy
+    import operator
+    from tarfile import ExtractError
+    directories = []
+
+    if members is None:
+        members = self
+
+    for tarinfo in members:
+        if tarinfo.isdir():
+            # Extract directories with a safe mode.
+            directories.append(tarinfo)
+            tarinfo = copy.copy(tarinfo)
+            tarinfo.mode = 448 # decimal for oct 0700
+        self.extract(tarinfo, path)
+
+    # Reverse sort directories.
+    if sys.version_info < (2, 4):
+        def sorter(dir1, dir2):
+            return cmp(dir1.name, dir2.name)
+        directories.sort(sorter)
+        directories.reverse()
+    else:
+        directories.sort(key=operator.attrgetter('name'), reverse=True)
+
+    # Set correct owner, mtime and filemode on directories.
+    for tarinfo in directories:
+        dirpath = os.path.join(path, tarinfo.name)
+        try:
+            self.chown(tarinfo, dirpath)
+            self.utime(tarinfo, dirpath)
+            self.chmod(tarinfo, dirpath)
+        except ExtractError:
+            e = sys.exc_info()[1]
+            if self.errorlevel > 1:
+                raise
+            else:
+                self._dbg(1, "tarfile: %s" % e)
+
+
+def main(argv, version=DEFAULT_VERSION):
+    """Install or upgrade setuptools and EasyInstall"""
+    tarball = download_setuptools()
+    _install(tarball)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/src/__init__.py b/src/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/daq.cu b/src/daq.cu
deleted file mode 100644
index 5a68846..0000000
--- a/src/daq.cu
+++ /dev/null
@@ -1,198 +0,0 @@
-// -*-c++-*-
-#include <curand_kernel.h>
-
-__device__ unsigned int
-float_to_sortable_int(float f)
-{
-    return __float_as_int(f);
-    //int i = __float_as_int(f);
-    //unsigned int mask = -(int)(i >> 31) | 0x80000000;
-    //return i ^ mask;
-}
-
-__device__ float
-sortable_int_to_float(unsigned int i)
-{
-    return __int_as_float(i);
-    //unsigned int mask = ((i >> 31) - 1) | 0x80000000;
-    //return __int_as_float(i ^ mask);
-}
-
-extern "C"
-{
-
-__global__ void
-reset_earliest_time_int(float maxtime, int ntime_ints, unsigned int *time_ints)
-{
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    if (id < ntime_ints) {
-	unsigned int maxtime_int = float_to_sortable_int(maxtime);
-	time_ints[id] = maxtime_int;
-    }
-}
-
-__global__ void
-run_daq(curandState *s, unsigned int detection_state, float time_rms,
-	int first_photon, int nphotons, float *photon_times,
-	unsigned int *photon_histories, int *last_hit_triangles,
-	int *solid_map, int nsolids, unsigned int *earliest_time_int,
-	unsigned int *channel_q, unsigned int *channel_histories)
-{
-
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-
-    if (id < nphotons) {
-	curandState rng = s[id];
-	int photon_id = id + first_photon;
-	int triangle_id = last_hit_triangles[photon_id];
-		
-	if (triangle_id > -1) {
-	    int solid_id = solid_map[triangle_id];
-	    unsigned int history = photon_histories[photon_id];
-
-	    if (solid_id < nsolids && (history & detection_state)) {
-		float time = photon_times[photon_id] + curand_normal(&rng) * time_rms;
-		unsigned int time_int = float_to_sortable_int(time);
-			  
-		atomicMin(earliest_time_int + solid_id, time_int);
-		atomicAdd(channel_q + solid_id, 1);
-		atomicOr(channel_histories + solid_id, history);
-	    }
-
-	}
-
-	s[id] = rng;
-	
-    }
-    
-}
-
-__global__ void
-convert_sortable_int_to_float(int n, unsigned int *sortable_ints,
-			      float *float_output)
-{
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-	
-    if (id < n)
-	float_output[id] = sortable_int_to_float(sortable_ints[id]);
-}
-
-__global__ void
-bin_hits(int nchannels, unsigned int *channel_q, float *channel_time,
-	 unsigned int *hitcount, int tbins, float tmin, float tmax, int qbins,
-	 float qmin, float qmax, unsigned int *pdf)
-{
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-
-    if (id >= nchannels)
-	return;
-
-    unsigned int q = channel_q[id];
-    float t = channel_time[id];
-	
-    if (t < 1e8 && t >= tmin && t < tmax && q >= qmin && q < qmax) {
-	hitcount[id] += 1;
-		
-	int tbin = (t - tmin) / (tmax - tmin) * tbins;
-	int qbin = (q - qmin) / (qmax - qmin) * qbins;
-	    
-	// row major order (channel, t, q)
-	int bin = id * (tbins * qbins) + tbin * qbins + qbin;
-	pdf[bin] += 1;
-    }
-}
-	
-__global__ void
-accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
-		    float *event_time, float *event_charge, float *mc_time,
-		    unsigned int *mc_charge, // quirk of DAQ!
-		    unsigned int *hitcount, unsigned int *bincount,
-		    float min_twidth, float tmin, float tmax,
-		    float min_qwidth, float qmin, float qmax,
-		    float *nearest_mc, int min_bin_content)
-{
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-	
-    if (id >= nchannels)
-	return;
-	
-    // Was this channel hit in the Monte Carlo?
-    float channel_mc_time = mc_time[id];
-    if (channel_mc_time >= 1e8)
-	return; // Nothing else to do
-	
-    // Is this channel inside the range of the PDF?
-    float distance;
-    int channel_bincount = bincount[id];
-    if (time_only) {
-	if (channel_mc_time < tmin || channel_mc_time > tmax)
-	    return;  // Nothing else to do
-		
-	// This MC information is contained in the PDF
-	hitcount[id] += 1;
-	  
-	// Was this channel hit in the event-of-interest?
-	int channel_event_hit = event_hit[id];
-	if (!channel_event_hit)
-	    return; // No need to update PDF value for unhit channel
-		
-	// Are we inside the minimum size bin?
-	float channel_event_time = event_time[id];
-	distance = fabsf(channel_mc_time - channel_event_time);
-	if (distance < min_twidth/2.0)
-	    bincount[id] = channel_bincount + 1;
-
-    }
-    else { // time and charge PDF
-	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
-	
-	if (channel_mc_time < tmin || channel_mc_time > tmax ||
-	    channel_mc_charge < qmin || channel_mc_charge > qmax)
-	    return;  // Nothing else to do
-		
-	// This MC information is contained in the PDF
-	hitcount[id] += 1;
-	  
-	// Was this channel hit in the event-of-interest?
-	int channel_event_hit = event_hit[id];
-	if (!channel_event_hit)
-	    return; // No need to update PDF value for unhit channel
-		
-	// Are we inside the minimum size bin?
-	float channel_event_time = event_time[id];
-	float channel_event_charge = event_charge[id];
-	float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
-	float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
-	distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
-
-	if (distance < 1.0f)
-	    bincount[id] = channel_bincount + 1;
-    }
-
-    // Do we need to keep updating the nearest_mc list?
-    if (channel_bincount + 1 >= min_bin_content)
-	return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
-
-    // insertion sort the distance into the array of nearest MC points
-    int offset = min_bin_content * id;
-    int entry = min_bin_content - 1;
-	
-    // If last entry less than new entry, nothing to update
-    if (distance > nearest_mc[offset + entry])
-	return;
-
-    // Find where to insert the new entry while sliding the rest
-    // to the right
-    entry--;
-    while (entry >= 0) {
-	if (nearest_mc[offset+entry] >= distance)
-	    nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
-	else
-	    break;
-	entry--;
-    }
-
-    nearest_mc[offset+entry+1] = distance;
-}
-
-} // extern "C"
diff --git a/src/geometry.h b/src/geometry.h
deleted file mode 100644
index 2b5eacb..0000000
--- a/src/geometry.h
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifndef __GEOMETRY_H__
-#define __GEOMETRY_H__
-
-struct Material
-{
-    float *refractive_index;
-    float *absorption_length;
-    float *scattering_length;
-    unsigned int n;
-    float step;
-    float wavelength0;
-};
-
-struct Surface
-{
-    float *detect;
-    float *absorb;
-    float *reflect_diffuse;
-    float *reflect_specular;
-    unsigned int n;
-    float step;
-    float wavelength0;
-};
-
-struct Triangle
-{
-    float3 v0, v1, v2;
-};
-
-struct Geometry
-{
-    float3 *vertices;
-    uint3 *triangles;
-    unsigned int *material_codes;
-    unsigned int *colors;
-    float3 *lower_bounds;
-    float3 *upper_bounds;
-    unsigned int *node_map;
-    unsigned int *node_map_end;
-    Material **materials;
-    Surface **surfaces;
-    unsigned int start_node;
-    unsigned int first_node;
-};
-
-__device__ Triangle
-get_triangle(Geometry *geometry, const unsigned int &i)
-{
-    uint3 triangle_data = geometry->triangles[i];
-
-    Triangle triangle;
-    triangle.v0 = geometry->vertices[triangle_data.x];
-    triangle.v1 = geometry->vertices[triangle_data.y];
-    triangle.v2 = geometry->vertices[triangle_data.z];
-
-    return triangle;
-}
-
-template <class T>
-__device__ float
-interp_property(T *m, const float &x, const float *fp)
-{
-    if (x < m->wavelength0)
-	return fp[0];
-
-    if (x > (m->wavelength0 + (m->n-1)*m->step))
-	return fp[m->n-1];
-
-    int jl = (x-m->wavelength0)/m->step;
-
-    return fp[jl] + (x-(m->wavelength0 + jl*m->step))*(fp[jl+1]-fp[jl])/m->step;
-}
-
-#endif
diff --git a/src/hybrid_render.cu b/src/hybrid_render.cu
deleted file mode 100644
index 29edefa..0000000
--- a/src/hybrid_render.cu
+++ /dev/null
@@ -1,202 +0,0 @@
-//-*-c-*-
-#include <math_constants.h>
-#include <curand_kernel.h>
-
-#include "linalg.h"
-#include "matrix.h"
-#include "rotate.h"
-#include "mesh.h"
-#include "geometry.h"
-#include "photon.h"
-
-__device__ void
-fAtomicAdd(float *addr, float data)
-{
-    while (data)
-	data = atomicExch(addr, data+atomicExch(addr, 0.0f));
-}
-
-__device__ void
-to_diffuse(Photon &p, State &s, Geometry *g, curandState &rng, int max_steps)
-{
-    int steps = 0;
-    while (steps < max_steps) {
-	steps++;
-
-	int command;
-
-	fill_state(s, p, g);
-
-	if (p.last_hit_triangle == -1)
-	    break;
-
-	command = propagate_to_boundary(p, s, rng);
-
-	if (command == BREAK)
-	    break;
-
-	if (command == CONTINUE)
-	    continue;
-
-	if (s.surface_index != -1) {
-	    command = propagate_at_surface(p, s, rng, g);
-
-	    if (p.history & REFLECT_DIFFUSE)
-		break;
-
-	    if (command == BREAK)
-		break;
-
-	    if (command == CONTINUE)
-		continue;
-	}
-
-	propagate_at_boundary(p, s, rng);
-
-    } // while (steps < max_steps)
-
-} // to_diffuse
-
-extern "C"
-{
-
-__global__ void
-update_xyz_lookup(int nthreads, int total_threads, int offset, float3 position,
-		  curandState *rng_states, float wavelength, float3 xyz,
-		  float3 *xyz_lookup1, float3 *xyz_lookup2, int max_steps,
-		  Geometry *g)
-{
-    int kernel_id = blockIdx.x*blockDim.x + threadIdx.x;
-    int id = kernel_id + offset;
-
-    if (kernel_id >= nthreads || id >= total_threads)
-	return;
-
-    curandState rng = rng_states[kernel_id];
-
-    Triangle t = get_triangle(g, id);
-
-    float a = curand_uniform(&rng);
-    float b = uniform(&rng, 0.0f, (1.0f - a));
-    float c = 1.0f - a - b;
-
-    float3 direction = a*t.v0 + b*t.v1 + c*t.v2 - position;
-    direction /= norm(direction);
-
-    float distance;
-    int triangle_index = intersect_mesh(position, direction, g, distance);
-
-    if (triangle_index != id) {
-	rng_states[kernel_id] = rng;
-	return;
-    }
-
-    float3 v01 = t.v1 - t.v0;
-    float3 v12 = t.v2 - t.v1;
-    
-    float3 surface_normal = normalize(cross(v01,v12));
-
-    float cos_theta = dot(surface_normal,-direction);
-
-    if (cos_theta < 0.0f)
-	cos_theta = dot(-surface_normal,-direction);
-
-    Photon p;
-    p.position = position;
-    p.direction = direction;
-    p.wavelength = wavelength;
-    p.polarization = uniform_sphere(&rng);
-    p.last_hit_triangle = -1;
-    p.time = 0;
-    p.history = 0;
-
-    State s;
-    to_diffuse(p, s, g, rng, max_steps);
-
-    if (p.history & REFLECT_DIFFUSE) {
-	if (s.inside_to_outside) {
-	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].x, cos_theta*xyz.x);
-	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].y, cos_theta*xyz.y);
-	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].z, cos_theta*xyz.z);
-	}
-	else {
-	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].x, cos_theta*xyz.x);
-	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].y, cos_theta*xyz.y);
-	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].z, cos_theta*xyz.z);
-	}
-    }
-
-    rng_states[kernel_id] = rng;
-
-} // update_xyz_lookup
-
-__global__ void
-update_xyz_image(int nthreads, curandState *rng_states, float3 *positions,
-		 float3 *directions, float wavelength, float3 xyz,
-		 float3 *xyz_lookup1, float3 *xyz_lookup2, float3 *image,
-		 int nlookup_calls, int max_steps, Geometry *g)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    curandState rng = rng_states[id];
-
-    Photon p;
-    p.position = positions[id];
-    p.direction = directions[id];
-    p.direction /= norm(p.direction);
-    p.wavelength = wavelength;
-    p.polarization = uniform_sphere(&rng);
-    p.last_hit_triangle = -1;
-    p.time = 0;
-    p.history = 0;
-
-    State s;
-    to_diffuse(p, s, g, rng, max_steps);
-
-    if (p.history & REFLECT_DIFFUSE) {
-	if (s.inside_to_outside)
-	    image[id] += xyz*xyz_lookup1[p.last_hit_triangle]/nlookup_calls;
-	else
-	    image[id] += xyz*xyz_lookup2[p.last_hit_triangle]/nlookup_calls;
-    }
-
-    rng_states[id] = rng;
-
-} // update_xyz_image
-
-__global__ void
-process_image(int nthreads, float3 *image, unsigned int *pixels, int nimages)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    float3 rgb = image[id]/nimages;
-
-    if (rgb.x < 0.0f)
-	rgb.x = 0.0f;
-    if (rgb.y < 0.0f)
-	rgb.y = 0.0f;
-    if (rgb.z < 0.0f)
-	rgb.z = 0.0f;
-
-    if (rgb.x > 1.0f)
-	rgb.x = 1.0f;
-    if (rgb.y > 1.0f)
-	rgb.y = 1.0f;
-    if (rgb.z > 1.0f)
-	rgb.z = 1.0f;
-
-    unsigned int r = floorf(rgb.x*255.0f);
-    unsigned int g = floorf(rgb.y*255.0f);
-    unsigned int b = floorf(rgb.z*255.0f);
-
-    pixels[id] = 255 << 24 | r << 16 | g << 8 | b;
-
-} // process_image
-
-} // extern "c"
diff --git a/src/intersect.h b/src/intersect.h
deleted file mode 100644
index 978bde8..0000000
--- a/src/intersect.h
+++ /dev/null
@@ -1,141 +0,0 @@
-//-*-c-*-
-
-#ifndef __INTERSECT_H__
-#define __INTERSECT_H__
-
-#include "linalg.h"
-#include "matrix.h"
-#include "geometry.h"
-
-#define EPSILON 0.0f
-
-/* Tests the intersection between a ray and a triangle.
-   If the ray intersects the triangle, set `distance` to the distance from
-   `origin` to the intersection and return true, else return false.
-   `direction` must be normalized to one. */
-__device__ bool
-intersect_triangle(const float3 &origin, const float3 &direction,
-		   const Triangle &triangle, float &distance)
-{
-	float3 m1 = triangle.v1-triangle.v0;
-	float3 m2 = triangle.v2-triangle.v0;
-	float3 m3 = -direction;
-
-	Matrix m = make_matrix(m1, m2, m3);
-	
-	float determinant = det(m);
-
-	if (determinant == 0.0f)
-		return false;
-
-	float3 b = origin-triangle.v0;
-
-	float u1 = ((m.a11*m.a22 - m.a12*m.a21)*b.x +
-		    (m.a02*m.a21 - m.a01*m.a22)*b.y +
-		    (m.a01*m.a12 - m.a02*m.a11)*b.z)/determinant;
-
-	if (u1 < -EPSILON || u1 > 1.0f)
-		return false;
-
-	float u2 = ((m.a12*m.a20 - m.a10*m.a22)*b.x +
-		    (m.a00*m.a22 - m.a02*m.a20)*b.y +
-		    (m.a02*m.a10 - m.a00*m.a12)*b.z)/determinant;
-
-	if (u2 < -EPSILON || u2 > 1.0f)
-		return false;
-
-	float u3 = ((m.a10*m.a21 - m.a11*m.a20)*b.x +
-		    (m.a01*m.a20 - m.a00*m.a21)*b.y +
-		    (m.a00*m.a11 - m.a01*m.a10)*b.z)/determinant;
-
-	if (u3 <= 0.0f || (1.0f-u1-u2) < -EPSILON)
-		return false;
-
-	distance = u3;
-
-	return true;
-}
-
-/* Tests the intersection between a ray and an axis-aligned box defined by
-   an upper and lower bound. If the ray intersects the box, set
-   `distance_to_box` to the distance from `origin` to the intersection and
-   return true, else return false. `direction` must be normalized to one.
-
-   Source: "An Efficient and Robust Ray-Box Intersection Algorithm."
-   by Williams, et. al. */
-__device__ bool
-intersect_box(const float3 &origin, const float3 &direction,
-	      const float3 &lower_bound, const float3 &upper_bound,
-	      float& distance_to_box)
-{
-	float kmin, kmax, kymin, kymax, kzmin, kzmax;
-
-	float divx = 1.0f/direction.x;
-	if (divx >= 0.0f)
-	{
-		kmin = (lower_bound.x - origin.x)*divx;
-		kmax = (upper_bound.x - origin.x)*divx;
-	}
-	else
-	{
-		kmin = (upper_bound.x - origin.x)*divx;
-		kmax = (lower_bound.x - origin.x)*divx;
-	}
-
-	if (kmax < 0.0f)
-		return false;
-
-	float divy = 1.0f/direction.y;
-	if (divy >= 0.0f)
-	{
-		kymin = (lower_bound.y - origin.y)*divy;
-		kymax = (upper_bound.y - origin.y)*divy;
-	}
-	else
-	{
-		kymin = (upper_bound.y - origin.y)*divy;
-		kymax = (lower_bound.y - origin.y)*divy;
-	}
-
-	if (kymax < 0.0f)
-		return false;
-
-	if (kymin > kmin)
-		kmin = kymin;
-
-	if (kymax < kmax)
-		kmax = kymax;
-
-	if (kmin > kmax)
-		return false;
-
-	float divz = 1.0f/direction.z;
-	if (divz >= 0.0f)
-	{
-		kzmin = (lower_bound.z - origin.z)*divz;
-		kzmax = (upper_bound.z - origin.z)*divz;
-	}
-	else
-	{
-		kzmin = (upper_bound.z - origin.z)*divz;
-		kzmax = (lower_bound.z - origin.z)*divz;
-	}
-
-	if (kzmax < 0.0f)
-		return false;
-
-	if (kzmin > kmin)
-		kmin = kzmin;
-
-	if (kzmax < kmax)
-		kmax = kzmax;
-
-	if (kmin > kmax)
-		return false;
-
-	distance_to_box = kmin;
-
-	return true;
-}
-
-#endif
diff --git a/src/linalg.h b/src/linalg.h
deleted file mode 100644
index 35b2423..0000000
--- a/src/linalg.h
+++ /dev/null
@@ -1,170 +0,0 @@
-#ifndef __LINALG_H__
-#define __LINALG_H__
-
-__device__ float3
-operator- (const float3 &a)
-{
-    return make_float3(-a.x, -a.y, -a.z);
-}
-
-__device__ float3
-operator* (const float3 &a, const float3 &b)
-{
-    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
-}
-
-__device__ float3
-operator/ (const float3 &a, const float3 &b)
-{
-    return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
-}
-
-__device__ void
-operator*= (float3 &a, const float3 &b)
-{
-    a.x *= b.x;
-    a.y *= b.y;
-    a.z *= b.z;
-}
-
-__device__ void
-operator/= (float3 &a, const float3 &b)
-{
-    a.x /= b.x;
-    a.y /= b.y;
-    a.z /= b.z;
-}
-
-__device__ float3
-operator+ (const float3 &a, const float3 &b)
-{
-    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
-}
-
-__device__ void
-operator+= (float3 &a, const float3 &b)
-{
-    a.x += b.x;
-    a.y += b.y;
-    a.z += b.z;
-}
-
-__device__ float3
-operator- (const float3 &a, const float3 &b)
-{
-    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
-}
-
-__device__ void
-operator-= (float3 &a, const float3 &b)
-{
-    a.x -= b.x;
-    a.y -= b.y;
-    a.z -= b.z;
-}
-
-__device__ float3
-operator+ (const float3 &a, const float &c)
-{
-    return make_float3(a.x+c, a.y+c, a.z+c);
-}
-
-__device__ void
-operator+= (float3 &a, const float &c)
-{
-    a.x += c;
-    a.y += c;
-    a.z += c;
-}
-
-__device__ float3
-operator+ (const float &c, const float3 &a)
-{
-    return make_float3(c+a.x, c+a.y, c+a.z);
-}
-
-__device__ float3
-operator- (const float3 &a, const float &c)
-{
-    return make_float3(a.x-c, a.y-c, a.z-c);
-}
-
-__device__ void
-operator-= (float3 &a, const float &c)
-{
-    a.x -= c;
-    a.y -= c;
-    a.z -= c;
-}
-
-__device__ float3
-operator- (const float &c, const float3& a)
-{
-    return make_float3(c-a.x, c-a.y, c-a.z);
-}
-
-__device__ float3
-operator* (const float3 &a, const float &c)
-{
-    return make_float3(a.x*c, a.y*c, a.z*c);
-}
-
-__device__ void
-operator*= (float3 &a, const float &c)
-{
-    a.x *= c;
-    a.y *= c;
-    a.z *= c;
-}
-
-__device__ float3 
-operator* (const float &c, const float3& a)
-{
-    return make_float3(c*a.x, c*a.y, c*a.z);
-}
-
-__device__ float3
-operator/ (const float3 &a, const float &c)
-{
-    return make_float3(a.x/c, a.y/c, a.z/c);
-}
-
-__device__ void
-operator/= (float3 &a, const float &c)
-{
-    a.x /= c;
-    a.y /= c;
-    a.z /= c;
-}
-
-__device__ float3
-operator/ (const float &c, const float3 &a)
-{
-    return make_float3(c/a.x, c/a.y, c/a.z);
-}
-
-__device__ float
-dot(const float3 &a, const float3 &b)
-{
-    return a.x*b.x + a.y*b.y + a.z*b.z;
-}
-
-__device__ float3
-cross(const float3 &a, const float3 &b)
-{
-    return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
-}
-
-__device__ float
-norm(const float3 &a)
-{
-    return sqrtf(dot(a,a));
-}
-
-__device__ float3
-normalize(const float3 &a)
-{
-    return a/norm(a);
-}
-
-#endif
diff --git a/src/matrix.h b/src/matrix.h
deleted file mode 100644
index 0a66e58..0000000
--- a/src/matrix.h
+++ /dev/null
@@ -1,249 +0,0 @@
-#ifndef __MATRIX_H__
-#define __MATRIX_H__
-
-struct Matrix
-{
-    float a00, a01, a02, a10, a11, a12, a20, a21, a22;
-};
-
-__device__ Matrix
-make_matrix(float a00, float a01, float a02,
-	    float a10, float a11, float a12,
-	    float a20, float a21, float a22)
-{
-    Matrix m = {a00, a01, a02, a10, a11, a12, a20, a21, a22};
-    return m;
-}
-
-__device__ Matrix
-make_matrix(const float3 &u1, const float3 &u2, const float3 &u3)
-{
-    Matrix m = {u1.x, u2.x, u3.x, u1.y, u2.y, u3.y, u1.z, u2.z, u3.z};
-    return m;
-}
-
-__device__ Matrix
-operator- (const Matrix &m)
-{
-    return make_matrix(-m.a00, -m.a01, -m.a02,
-		       -m.a10, -m.a11, -m.a12,
-		       -m.a20, -m.a21, -m.a22);
-}
-
-__device__ float3
-operator* (const Matrix &m, const float3 &a)
-{
-    return make_float3(m.a00*a.x + m.a01*a.y + m.a02*a.z,
-		       m.a10*a.x + m.a11*a.y + m.a12*a.z,
-		       m.a20*a.x + m.a21*a.y + m.a22*a.z);
-}
-
-__device__ Matrix
-operator+ (const Matrix &m, const Matrix &n)
-{
-    return make_matrix(m.a00+n.a00, m.a01+n.a01, m.a02+n.a02,
-		       m.a10+n.a10, m.a11+n.a11, m.a12+n.a12,
-		       m.a20+n.a20, m.a21+n.a21, m.a22+n.a22);
-}
-
-__device__ void
-operator+= (Matrix &m, const Matrix &n)
-{
-    m.a00 += n.a00;
-    m.a01 += n.a01;
-    m.a02 += n.a02;
-    m.a10 += n.a10;
-    m.a11 += n.a11;
-    m.a12 += n.a12;
-    m.a20 += n.a20;
-    m.a21 += n.a21;
-    m.a22 += n.a22;
-}
-
-__device__ Matrix
-operator- (const Matrix &m, const Matrix &n)
-{
-    return make_matrix(m.a00-n.a00, m.a01-n.a01, m.a02-n.a02,
-		       m.a10-n.a10, m.a11-n.a11, m.a12-n.a12,
-		       m.a20-n.a20, m.a21-n.a21, m.a22-n.a22);
-}
-
-__device__ void
-operator-= (Matrix &m, const Matrix& n)
-{
-    m.a00 -= n.a00;
-    m.a01 -= n.a01;
-    m.a02 -= n.a02;
-    m.a10 -= n.a10;
-    m.a11 -= n.a11;
-    m.a12 -= n.a12;
-    m.a20 -= n.a20;
-    m.a21 -= n.a21;
-    m.a22 -= n.a22;
-}
-
-__device__ Matrix
-operator* (const Matrix &m, const Matrix &n)
-{
-    return make_matrix(m.a00*n.a00 + m.a01*n.a10 + m.a02*n.a20,
-		       m.a00*n.a01 + m.a01*n.a11 + m.a02*n.a21,
-		       m.a00*n.a02 + m.a01*n.a12 + m.a02*n.a22,
-		       m.a10*n.a00 + m.a11*n.a10 + m.a12*n.a20,
-		       m.a10*n.a01 + m.a11*n.a11 + m.a12*n.a21,
-		       m.a10*n.a02 + m.a11*n.a12 + m.a12*n.a22,
-		       m.a20*n.a00 + m.a21*n.a10 + m.a22*n.a20,
-		       m.a20*n.a01 + m.a21*n.a11 + m.a22*n.a21,
-		       m.a20*n.a02 + m.a21*n.a12 + m.a22*n.a22);
-}
-
-__device__ Matrix
-operator+ (const Matrix &m, const float &c)
-{
-    return make_matrix(m.a00+c, m.a01+c, m.a02+c,
-		       m.a10+c, m.a11+c, m.a12+c,
-		       m.a20+c, m.a21+c, m.a22+c);
-}
-
-__device__ void
-operator+= (Matrix &m, const float &c)
-{
-    m.a00 += c;
-    m.a01 += c;
-    m.a02 += c;
-    m.a10 += c;
-    m.a11 += c;
-    m.a12 += c;
-    m.a20 += c;
-    m.a21 += c;
-    m.a22 += c;
-}
-
-__device__ Matrix
-operator+ (const float &c, const Matrix &m)
-{
-    return make_matrix(c+m.a00, c+m.a01, c+m.a02,
-		       c+m.a10, c+m.a11, c+m.a12,
-		       c+m.a20, c+m.a21, c+m.a22);
-}
-
-__device__ Matrix
-operator- (const Matrix &m, const float &c)
-{
-    return make_matrix(m.a00-c, m.a01-c, m.a02-c,
-		       m.a10-c, m.a11-c, m.a12-c,
-		       m.a20-c, m.a21-c, m.a22-c);
-}
-
-__device__ void
-operator-= (Matrix &m, const float &c)
-{
-    m.a00 -= c;
-    m.a01 -= c;
-    m.a02 -= c;
-    m.a10 -= c;
-    m.a11 -= c;
-    m.a12 -= c;
-    m.a20 -= c;
-    m.a21 -= c;
-    m.a22 -= c;
-}
-
-__device__ Matrix
-operator- (const float &c, const Matrix &m)
-{
-    return make_matrix(c-m.a00, c-m.a01, c-m.a02,
-		       c-m.a10, c-m.a11, c-m.a12,
-		       c-m.a20, c-m.a21, c-m.a22);
-}
-
-__device__ Matrix
-operator* (const Matrix &m, const float &c)
-{
-    return make_matrix(m.a00*c, m.a01*c, m.a02*c,
-		       m.a10*c, m.a11*c, m.a12*c,
-		       m.a20*c, m.a21*c, m.a22*c);
-}
-
-__device__ void
-operator*= (Matrix &m, const float &c)
-{
-    m.a00 *= c;
-    m.a01 *= c;
-    m.a02 *= c;
-    m.a10 *= c;
-    m.a11 *= c;
-    m.a12 *= c;
-    m.a20 *= c;
-    m.a21 *= c;
-    m.a22 *= c;
-}
-
-__device__ Matrix
-operator* (const float &c, const Matrix &m)
-{
-    return make_matrix(c*m.a00, c*m.a01, c*m.a02,
-		       c*m.a10, c*m.a11, c*m.a12,
-		       c*m.a20, c*m.a21, c*m.a22);
-}
-
-__device__ Matrix
-operator/ (const Matrix &m, const float &c)
-{
-    return make_matrix(m.a00/c, m.a01/c, m.a02/c,
-		       m.a10/c, m.a11/c, m.a12/c,
-		       m.a20/c, m.a21/c, m.a22/c);
-}
-
-__device__ void
-operator/= (Matrix &m, const float &c)
-{
-    m.a00 /= c;
-    m.a01 /= c;
-    m.a02 /= c;
-    m.a10 /= c;
-    m.a11 /= c;
-    m.a12 /= c;
-    m.a20 /= c;
-    m.a21 /= c;
-    m.a22 /= c;
-}
-
-__device__ Matrix
-operator/ (const float &c, const Matrix &m)
-{
-    return make_matrix(c/m.a00, c/m.a01, c/m.a02,
-		       c/m.a10, c/m.a11, c/m.a12,
-		       c/m.a20, c/m.a21, c/m.a22);
-}
-
-__device__ float
-det(const Matrix &m)
-{
-    return m.a00*(m.a11*m.a22 - m.a12*m.a21) -
-	   m.a10*(m.a01*m.a22 - m.a02*m.a21) +
-	   m.a20*(m.a01*m.a12 - m.a02*m.a11);
-}
-
-__device__ Matrix
-inv(const Matrix &m)
-{
-    return make_matrix(m.a11*m.a22 - m.a12*m.a21,
-		       m.a02*m.a21 - m.a01*m.a22,
-		       m.a01*m.a12 - m.a02*m.a11,
-		       m.a12*m.a20 - m.a10*m.a22,
-		       m.a00*m.a22 - m.a02*m.a20,
-		       m.a02*m.a10 - m.a00*m.a12,
-		       m.a10*m.a21 - m.a11*m.a20,
-		       m.a01*m.a20 - m.a00*m.a21,
-		       m.a00*m.a11 - m.a01*m.a10)/det(m);
-}
-
-__device__ Matrix
-outer(const float3 &a, const float3 &b)
-{
-    return make_matrix(a.x*b.x, a.x*b.y, a.x*b.z,
-		       a.y*b.x, a.y*b.y, a.y*b.z,
-		       a.z*b.x, a.z*b.y, a.z*b.z);
-}
-
-#endif
diff --git a/src/mesh.h b/src/mesh.h
deleted file mode 100644
index d60d801..0000000
--- a/src/mesh.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef __MESH_H__
-#define __MESH_H__
-
-#include "intersect.h"
-#include "geometry.h"
-
-#include "stdio.h"
-
-#define STACK_SIZE 100
-
-/* Tests the intersection between a ray and a node in the bounding volume
-   hierarchy. If the ray intersects the bounding volume and `min_distance`
-   is less than zero or the distance from `origin` to the intersection is
-   less than `min_distance`, return true, else return false. */
-__device__ bool
-intersect_node(const float3 &origin, const float3 &direction,
-	       Geometry *g, int i, float min_distance=-1.0f)
-{
-    /* assigning these to local variables is faster for some reason */
-    float3 lower_bound = g->lower_bounds[i];
-    float3 upper_bound = g->upper_bounds[i];
-
-    float distance_to_box;
-
-    if (intersect_box(origin, direction, lower_bound, upper_bound,
-		      distance_to_box)) {
-	if (min_distance < 0.0f)
-	    return true;
-
-	if (distance_to_box > min_distance)
-	    return false;
-
-	return true;
-    }
-    else {
-	return false;
-    }
-
-}
-
-/* Finds the intersection between a ray and `geometry`. If the ray does
-   intersect the mesh and the index of the intersected triangle is not equal
-   to `last_hit_triangle`, set `min_distance` to the distance from `origin` to
-   the intersection and return the index of the triangle which the ray
-   intersected, else return -1. */
-__device__ int
-intersect_mesh(const float3 &origin, const float3& direction, Geometry *g,
-	       float &min_distance, int last_hit_triangle = -1)
-{
-    int triangle_index = -1;
-
-    float distance;
-    min_distance = -1.0f;
-
-    if (!intersect_node(origin, direction, g, g->start_node, min_distance))
-	return -1;
-
-    unsigned int stack[STACK_SIZE];
-
-    unsigned int *head = &stack[0];
-    unsigned int *node = &stack[1];
-    unsigned int *tail = &stack[STACK_SIZE-1];
-    *node = g->start_node;
-
-    unsigned int i;
-
-    do
-    {
-	unsigned int first_child = g->node_map[*node];
-	unsigned int stop = g->node_map_end[*node];
-
-	while (*node >= g->first_node && stop == first_child+1) {
-	    *node = first_child;
-	    first_child = g->node_map[*node];
-	    stop = g->node_map_end[*node];
-	}
-		
-	if (*node >= g->first_node) {
-	    for (i=first_child; i < stop; i++) {
-		if (intersect_node(origin, direction, g, i, min_distance)) {
-		    *node = i;
-		    node++;
-		}
-	    }
-	    
-	    node--;
-	}
-	else {
-	    // node is a leaf
-	    for (i=first_child; i < stop; i++) {
-		if (last_hit_triangle == i)
-		    continue;
-
-		Triangle t = get_triangle(g, i);
-
-		if (intersect_triangle(origin, direction, t, distance)) {
-		    if (triangle_index == -1) {
-			triangle_index = i;
-			min_distance = distance;
-			continue;
-		    }
-
-		    if (distance < min_distance) {
-			triangle_index = i;
-			min_distance = distance;
-		    }
-		}
-	    } // triangle loop
-
-	    node--;
-
-	} // node is a leaf
-
-	if (node > tail) {
-	    printf("warning: intersect_mesh() aborted; node > tail\n");
-	    break;
-	}
-
-    } // while loop
-    while (node != head);
-    
-    return triangle_index;
-}
-
-extern "C"
-{
-
-__global__ void
-distance_to_mesh(int nthreads, float3 *_origin, float3 *_direction,
-		 Geometry *g, float *_distance)
-{
-    __shared__ Geometry sg;
-
-    if (threadIdx.x == 0)
-	sg = *g;
-
-    __syncthreads();
-
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    g = &sg;
-
-    float3 origin = _origin[id];
-    float3 direction = _direction[id];
-    direction /= norm(direction);
-
-    float distance;
-
-    int triangle_index = intersect_mesh(origin, direction, g, distance);
-
-    if (triangle_index != -1)
-	_distance[id] = distance;
-}
-
-__global__ void
-color_solids(int first_triangle, int nthreads, int *solid_id_map,
-	     bool *solid_hit, unsigned int *solid_colors, Geometry *g)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    int triangle_id = first_triangle + id;
-    int solid_id = solid_id_map[triangle_id];
-    if (solid_hit[solid_id])
-	g->colors[triangle_id] = solid_colors[solid_id];
-}
-
-} // extern "C"
-
-#endif
diff --git a/src/photon.h b/src/photon.h
deleted file mode 100644
index 8b7e588..0000000
--- a/src/photon.h
+++ /dev/null
@@ -1,325 +0,0 @@
-#ifndef __PHOTON_H__
-#define __PHOTON_H__
-
-#include "stdio.h"
-#include "linalg.h"
-#include "rotate.h"
-#include "random.h"
-#include "physical_constants.h"
-#include "mesh.h"
-#include "geometry.h"
-
-struct Photon
-{
-    float3 position;
-    float3 direction;
-    float3 polarization;
-    float wavelength;
-    float time;
-
-    unsigned int history;
-
-    int last_hit_triangle;
-};
-
-struct State
-{
-    bool inside_to_outside;
-
-    float3 surface_normal;
-
-    float refractive_index1, refractive_index2;
-    float absorption_length;
-    float scattering_length;
-
-    int surface_index;
-
-    float distance_to_boundary;
-};
-
-enum
-{
-    NO_HIT           = 0x1 << 0,
-    BULK_ABSORB      = 0x1 << 1,
-    SURFACE_DETECT   = 0x1 << 2,
-    SURFACE_ABSORB   = 0x1 << 3,
-    RAYLEIGH_SCATTER = 0x1 << 4,
-    REFLECT_DIFFUSE  = 0x1 << 5,
-    REFLECT_SPECULAR = 0x1 << 6,
-    NAN_ABORT        = 0x1 << 31
-}; // processes
-
-enum {BREAK, CONTINUE, PASS}; // return value from propagate_to_boundary
-
-__device__ int
-convert(int c)
-{
-    if (c & 0x80)
-	return (0xFFFFFF00 | c);
-    else
-	return c;
-}
-
-__device__ float
-get_theta(const float3 &a, const float3 &b)
-{
-    return acosf(fmaxf(-1.0f,fminf(1.0f,dot(a,b))));
-}
-
-__device__ void
-fill_state(State &s, Photon &p, Geometry *g)
-{
-    p.last_hit_triangle = intersect_mesh(p.position, p.direction, g,
-					 s.distance_to_boundary,
-					 p.last_hit_triangle);
-
-    if (p.last_hit_triangle == -1) {
-	p.history |= NO_HIT;
-	return;
-    }
-    
-    Triangle t = get_triangle(g, p.last_hit_triangle);
-    
-    unsigned int material_code = g->material_codes[p.last_hit_triangle];
-    
-    int inner_material_index = convert(0xFF & (material_code >> 24));
-    int outer_material_index = convert(0xFF & (material_code >> 16));
-    s.surface_index = convert(0xFF & (material_code >> 8));
-    
-    float3 v01 = t.v1 - t.v0;
-    float3 v12 = t.v2 - t.v1;
-    
-    s.surface_normal = normalize(cross(v01, v12));
-				 
-    Material *material1, *material2;
-    if (dot(s.surface_normal,-p.direction) > 0.0f) {
-	// outside to inside
-	material1 = g->materials[outer_material_index];
-	material2 = g->materials[inner_material_index];
-
-	s.inside_to_outside = false;
-    }
-    else {
-	// inside to outside
-	material1 = g->materials[inner_material_index];
-	material2 = g->materials[outer_material_index];
-	s.surface_normal = -s.surface_normal;
-
-	s.inside_to_outside = true;
-    }
-
-    s.refractive_index1 = interp_property(material1, p.wavelength,
-					  material1->refractive_index);
-    s.refractive_index2 = interp_property(material2, p.wavelength,
-					  material2->refractive_index);
-    s.absorption_length = interp_property(material1, p.wavelength,
-					  material1->absorption_length);
-    s.scattering_length = interp_property(material1, p.wavelength,
-					  material1->scattering_length);
-
-} // fill_state
-
-__device__ float3
-pick_new_direction(float3 axis, float theta, float phi)
-{
-    // Taken from SNOMAN rayscatter.for
-    float cos_theta = cosf(theta);
-    float sin_theta = sinf(theta);
-    float cos_phi   = cosf(phi);
-    float sin_phi   = sinf(phi);
-	
-    float sin_axis_theta = sqrt(1.0f - axis.z*axis.z);
-    float cos_axis_phi, sin_axis_phi;
-	
-    if (isnan(sin_axis_theta) || sin_axis_theta < 0.00001f) {
-	cos_axis_phi = 1.0f;
-	sin_axis_phi = 0.0f;
-    }
-    else {
-	cos_axis_phi = axis.x / sin_axis_theta;
-	sin_axis_phi = axis.y / sin_axis_theta;
-    }
-
-    float dirx = cos_theta*axis.x +
-	sin_theta*(axis.z*cos_phi*cos_axis_phi - sin_phi*sin_axis_phi);
-    float diry = cos_theta*axis.y +
-	sin_theta*(cos_phi*axis.z*sin_axis_phi - sin_phi*cos_axis_phi);
-    float dirz = cos_theta*axis.z - sin_theta*cos_phi*sin_axis_theta;
-
-    return make_float3(dirx, diry, dirz);
-}
-
-__device__ void
-rayleigh_scatter(Photon &p, curandState &rng)
-{
-    float cos_theta = 2.0f*cosf((acosf(1.0f - 2.0f*curand_uniform(&rng))-2*PI)/3.0f);
-    if (cos_theta > 1.0f)
-	cos_theta = 1.0f;
-    else if (cos_theta < -1.0f)
-	cos_theta = -1.0f;
-
-    float theta = acosf(cos_theta);
-    float phi = uniform(&rng, 0.0f, 2.0f * PI);
-
-    p.direction = pick_new_direction(p.polarization, theta, phi);
-
-    if (1.0f - fabsf(cos_theta) < 1e-6f) {
-	p.polarization = pick_new_direction(p.polarization, PI/2.0f, phi);
-    }
-    else {
-	// linear combination of old polarization and new direction
-	p.polarization = p.polarization - cos_theta * p.direction;
-    }
-
-    p.direction /= norm(p.direction);
-    p.polarization /= norm(p.polarization);
-} // scatter
-
-__device__ int propagate_to_boundary(Photon &p, State &s, curandState &rng)
-{
-    float absorption_distance = -s.absorption_length*logf(curand_uniform(&rng));
-    float scattering_distance = -s.scattering_length*logf(curand_uniform(&rng));
-
-    if (absorption_distance <= scattering_distance) {
-	if (absorption_distance <= s.distance_to_boundary) {
-	    p.time += absorption_distance/(SPEED_OF_LIGHT/s.refractive_index1);
-	    p.position += absorption_distance*p.direction;
-	    p.history |= BULK_ABSORB;
-
-	    p.last_hit_triangle = -1;
-
-	    return BREAK;
-	} // photon is absorbed in material1
-    }
-    else {
-	if (scattering_distance <= s.distance_to_boundary) {
-	    p.time += scattering_distance/(SPEED_OF_LIGHT/s.refractive_index1);
-	    p.position += scattering_distance*p.direction;
-
-	    rayleigh_scatter(p, rng);
-
-	    p.history |= RAYLEIGH_SCATTER;
-
-	    p.last_hit_triangle = -1;
-
-	    return CONTINUE;
-	} // photon is scattered in material1
-    } // if scattering_distance < absorption_distance
-
-    p.position += s.distance_to_boundary*p.direction;
-    p.time += s.distance_to_boundary/(SPEED_OF_LIGHT/s.refractive_index1);
-
-    return PASS;
-
-} // propagate_to_boundary
-
-__device__ void
-propagate_at_boundary(Photon &p, State &s, curandState &rng)
-{
-    float incident_angle = get_theta(s.surface_normal,-p.direction);
-    float refracted_angle = asinf(sinf(incident_angle)*s.refractive_index1/s.refractive_index2);
-
-    float3 incident_plane_normal = cross(p.direction, s.surface_normal);
-    float incident_plane_normal_length = norm(incident_plane_normal);
-
-    // Photons at normal incidence do not have a unique plane of incidence,
-    // so we have to pick the plane normal to be the polarization vector
-    // to get the correct logic below
-    if (incident_plane_normal_length < 1e-6f)
-	incident_plane_normal = p.polarization;
-    else
-	incident_plane_normal /= incident_plane_normal_length;
-
-    float normal_coefficient = dot(p.polarization, incident_plane_normal);
-    float normal_probability = normal_coefficient*normal_coefficient;
-
-    float reflection_coefficient;
-    if (curand_uniform(&rng) < normal_probability) {
-	// photon polarization normal to plane of incidence
-	reflection_coefficient = -sinf(incident_angle-refracted_angle)/sinf(incident_angle+refracted_angle);
-
-	if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle)) {
-	    p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
-			
-	    p.history |= REFLECT_SPECULAR;
-	}
-	else {
-	    p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
-	}
-
-	p.polarization = incident_plane_normal;
-    }
-    else {
-	// photon polarization parallel to plane of incidence
-	reflection_coefficient = tanf(incident_angle-refracted_angle)/tanf(incident_angle+refracted_angle);
-
-	if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle)) {
-	    p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
-			
-	    p.history |= REFLECT_SPECULAR;
-	}
-	else {
-	    p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
-	}
-
-	p.polarization = cross(incident_plane_normal, p.direction);
-	p.polarization /= norm(p.polarization);
-    }
-
-} // propagate_at_boundary
-
-__device__ int
-propagate_at_surface(Photon &p, State &s, curandState &rng, Geometry *geometry)
-{
-    Surface *surface = geometry->surfaces[s.surface_index];
-
-    /* since the surface properties are interpolated linearly, we are
-       guaranteed that they still sum to 1.0. */
-
-    float detect = interp_property(surface, p.wavelength, surface->detect);
-    float absorb = interp_property(surface, p.wavelength, surface->absorb);
-    float reflect_diffuse = interp_property(surface, p.wavelength, surface->reflect_diffuse);
-    float reflect_specular = interp_property(surface, p.wavelength, surface->reflect_specular);
-
-    float uniform_sample = curand_uniform(&rng);
-
-    if (uniform_sample < absorb) {
-	p.history |= SURFACE_ABSORB;
-	return BREAK;
-    }
-    else if (uniform_sample < absorb + detect) {
-	p.history |= SURFACE_DETECT;
-	return BREAK;
-    }
-    else if (uniform_sample < absorb + detect + reflect_diffuse) {
-	// diffusely reflect
-	p.direction = uniform_sphere(&rng);
-
-	if (dot(p.direction, s.surface_normal) < 0.0f)
-	    p.direction = -p.direction;
-
-	// randomize polarization?
-	p.polarization = cross(uniform_sphere(&rng), p.direction);
-	p.polarization /= norm(p.polarization);
-
-	p.history |= REFLECT_DIFFUSE;
-
-	return CONTINUE;
-    }
-    else {
-	// specularly reflect
-	float incident_angle = get_theta(s.surface_normal,-p.direction);
-	float3 incident_plane_normal = cross(p.direction, s.surface_normal);
-	incident_plane_normal /= norm(incident_plane_normal);
-
-	p.direction = rotate(s.surface_normal, incident_angle,
-			     incident_plane_normal);
-
-	p.history |= REFLECT_SPECULAR;
-
-	return CONTINUE;
-    }
-
-} // propagate_at_surface
-
-#endif
diff --git a/src/physical_constants.h b/src/physical_constants.h
deleted file mode 100644
index 2ff87cd..0000000
--- a/src/physical_constants.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __PHYSICAL_CONSTANTS_H__
-#define __PHYSICAL_CONSTANTS_H__
-
-#define SPEED_OF_LIGHT 2.99792458e8
-#define PI 3.141592653589793
-
-#endif
diff --git a/src/propagate.cu b/src/propagate.cu
deleted file mode 100644
index fdcf532..0000000
--- a/src/propagate.cu
+++ /dev/null
@@ -1,147 +0,0 @@
-//-*-c-*-
-
-#include "linalg.h"
-#include "geometry.h"
-#include "photon.h"
-
-#include "stdio.h"
-
-extern "C"
-{
-
-__global__ void
-photon_duplicate(int first_photon, int nthreads,
-		 float3 *positions, float3 *directions,
-		 float *wavelengths, float3 *polarizations,
-		 float *times, unsigned int *histories,
-		 int *last_hit_triangles, 
-		 int copies, int stride)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    int photon_id = first_photon + id;
-
-    Photon p;
-    p.position = positions[photon_id];
-    p.direction = directions[photon_id];
-    p.polarization = polarizations[photon_id];
-    p.wavelength = wavelengths[photon_id];
-    p.time = times[photon_id];
-    p.last_hit_triangle = last_hit_triangles[photon_id];
-    p.history = histories[photon_id];
-
-    for (int i=1; i <= copies; i++) {
-      int target_photon_id = photon_id + stride * i;
-
-      positions[target_photon_id] = p.position;
-      directions[target_photon_id] = p.direction;
-      polarizations[target_photon_id] = p.polarization;
-      wavelengths[target_photon_id] = p.wavelength;
-      times[target_photon_id] = p.time;
-      last_hit_triangles[target_photon_id] = p.last_hit_triangle;
-      histories[target_photon_id] = p.history;
-    }
-}
-		 
-__global__ void
-propagate(int first_photon, int nthreads, unsigned int *input_queue,
-	  unsigned int *output_queue, curandState *rng_states,
-	  float3 *positions, float3 *directions,
-	  float *wavelengths, float3 *polarizations,
-	  float *times, unsigned int *histories,
-	  int *last_hit_triangles, int max_steps,
-	  Geometry *g)
-{
-    __shared__ Geometry sg;
-
-    if (threadIdx.x == 0)
-	sg = *g;
-
-    __syncthreads();
-
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    g = &sg;
-
-    curandState rng = rng_states[id];
-
-    int photon_id = input_queue[first_photon + id];
-
-    Photon p;
-    p.position = positions[photon_id];
-    p.direction = directions[photon_id];
-    p.direction /= norm(p.direction);
-    p.polarization = polarizations[photon_id];
-    p.polarization /= norm(p.polarization);
-    p.wavelength = wavelengths[photon_id];
-    p.time = times[photon_id];
-    p.last_hit_triangle = last_hit_triangles[photon_id];
-    p.history = histories[photon_id];
-
-    if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB))
-	return;
-
-    State s;
-
-    int steps = 0;
-    while (steps < max_steps) {
-	steps++;
-
-	int command;
-
-	// check for NaN and fail
-	if (isnan(p.direction.x*p.direction.y*p.direction.z*p.position.x*p.position.y*p.position.z)) {
-	    p.history |= NO_HIT | NAN_ABORT;
-	    break;
-	}
-
-	fill_state(s, p, g);
-
-	if (p.last_hit_triangle == -1)
-	    break;
-
-	command = propagate_to_boundary(p, s, rng);
-
-	if (command == BREAK)
-	    break;
-
-	if (command == CONTINUE)
-	    continue;
-
-	if (s.surface_index != -1) {
-	    command = propagate_at_surface(p, s, rng, g);
-
-	    if (command == BREAK)
-		break;
-
-	    if (command == CONTINUE)
-		continue;
-	}
-
-	propagate_at_boundary(p, s, rng);
-
-    } // while (steps < max_steps)
-
-    rng_states[id] = rng;
-    positions[photon_id] = p.position;
-    directions[photon_id] = p.direction;
-    polarizations[photon_id] = p.polarization;
-    wavelengths[photon_id] = p.wavelength;
-    times[photon_id] = p.time;
-    histories[photon_id] = p.history;
-    last_hit_triangles[photon_id] = p.last_hit_triangle;
-	
-    // Not done, put photon in output queue
-    if ((p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
-	int out_idx = atomicAdd(output_queue, 1);
-	output_queue[out_idx] = photon_id;
-    }
-} // propagate
-
-} // extern "C"
diff --git a/src/random.h b/src/random.h
deleted file mode 100644
index e9d2e75..0000000
--- a/src/random.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef __RANDOM_H__
-#define __RANDOM_H__
-
-#include <curand_kernel.h>
-
-#include "physical_constants.h"
-
-__device__ float
-uniform(curandState *s, const float &low, const float &high)
-{
-    return low + curand_uniform(s)*(high-low);
-}
-
-__device__ float3
-uniform_sphere(curandState *s)
-{
-    float theta = uniform(s, 0.0f, 2*PI);
-    float u = uniform(s, -1.0f, 1.0f);
-    float c = sqrtf(1.0f-u*u);
-
-    return make_float3(c*cosf(theta), c*sinf(theta), u);
-}
-
-// Draw a random sample given a cumulative distribution function
-// Assumptions: ncdf >= 2, cdf_y[0] is 0.0, and cdf_y[ncdf-1] is 1.0
-__device__ float
-sample_cdf(curandState *rng, int ncdf, float *cdf_x, float *cdf_y)
-{
-    float u = curand_uniform(rng);
-
-    // Find u in cdf_y with binary search
-    // list must contain at least 2 elements: 0.0 and 1.0
-    int lower=0;
-    int upper=ncdf-1;
-    while(lower < upper-1) {
-	int half = (lower+upper) / 2;
-	if (u < cdf_y[half])
-	    upper = half;
-	else
-	    lower = half;
-    }
-  
-    float frac = (u - cdf_y[lower]) / (cdf_y[upper] - cdf_y[lower]);
-    return cdf_x[lower] * frac + cdf_x[upper] * (1.0f - frac);
-}
-
-extern "C"
-{
-
-__global__ void
-init_rng(int nthreads, curandState *s, unsigned long long seed,
-	 unsigned long long offset)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    curand_init(seed, id, offset, &s[id]);
-}
-
-__global__ void
-fill_uniform(int nthreads, curandState *s, float *a, float low, float high)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    a[id] = uniform(&s[id], low, high);
-
-}
-
-__global__ void
-fill_uniform_sphere(int nthreads, curandState *s, float3 *a)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    a[id] = uniform_sphere(&s[id]);
-}
-
-} // extern "c"
-
-#endif
diff --git a/src/render.cu b/src/render.cu
deleted file mode 100644
index d4ed443..0000000
--- a/src/render.cu
+++ /dev/null
@@ -1,171 +0,0 @@
-//-*-c-*-
-
-#include "linalg.h"
-#include "intersect.h"
-#include "mesh.h"
-#include "sorting.h"
-#include "geometry.h"
-
-#include "stdio.h"
-
-__device__ float4
-get_color(const float3 &direction, const Triangle &t, unsigned int rgba)
-{
-    float3 v01 = t.v1 - t.v0;
-    float3 v12 = t.v2 - t.v1;
-    
-    float3 surface_normal = normalize(cross(v01,v12));
-
-    float cos_theta = dot(surface_normal,-direction);
-
-    if (cos_theta < 0.0f)
-	cos_theta = -cos_theta;
-
-    unsigned int a0 = 0xff & (rgba >> 24);
-    unsigned int r0 = 0xff & (rgba >> 16);
-    unsigned int g0 = 0xff & (rgba >> 8);
-    unsigned int b0 = 0xff & rgba;
-
-    float alpha = (255 - a0)/255.0f;
-
-    return make_float4(r0*cos_theta, g0*cos_theta, b0*cos_theta, alpha);
-}
-
-extern "C"
-{
-
-__global__ void
-render(int nthreads, float3 *_origin, float3 *_direction, Geometry *g,
-       unsigned int alpha_depth, unsigned int *pixels, float *_dx,
-       unsigned int *dxlen, float4 *_color)
-{
-    __shared__ Geometry sg;
-
-    if (threadIdx.x == 0)
-	sg = *g;
-
-    __syncthreads();
-
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-	
-    if (id >= nthreads)
-	return;
-
-    g = &sg;
-
-    float3 origin = _origin[id];
-    float3 direction = _direction[id];
-    unsigned int n = dxlen[id];
-
-    float distance;
-
-    if (n < 1 && !intersect_node(origin, direction, g, g->start_node)) {
-	pixels[id] = 0;
-	return;
-    }
-
-    unsigned int stack[STACK_SIZE];
-
-    unsigned int *head = &stack[0];
-    unsigned int *node = &stack[1];
-    unsigned int *tail = &stack[STACK_SIZE-1];
-    *node = g->start_node;
-
-    float *dx = _dx + id*alpha_depth;
-    float4 *color_a = _color + id*alpha_depth;
-
-    unsigned int i;
-
-    do {
-	unsigned int first_child = g->node_map[*node];
-	unsigned int stop = g->node_map_end[*node];
-	
-	while (*node >= g->first_node && stop == first_child+1) {
-	    *node = first_child;
-	    first_child = g->node_map[*node];
-	    stop = g->node_map_end[*node];
-	}
-		
-	if (*node >= g->first_node) {
-	    for (i=first_child; i < stop; i++) {
-		if (intersect_node(origin, direction, g, i)) {
-		    *node = i;
-		    node++;
-		}
-	    }
-
-	    node--;
-	}
-	else {
-	    // node is a leaf
-	    for (i=first_child; i < stop; i++) {
-		Triangle t = get_triangle(g, i);
-		
-		if (intersect_triangle(origin, direction, t, distance)) {
-		    if (n < 1) {
-			dx[0] = distance;
-			    
-			unsigned int rgba = g->colors[i];
-			float4 color = get_color(direction, t, rgba);
-
-			color_a[0] = color;
-		    }
-		    else {
-			unsigned long j = searchsorted(n, dx, distance);
-
-			if (j <= alpha_depth-1) {
-			    insert(alpha_depth, dx, j, distance);
-
-			    unsigned int rgba = g->colors[i];
-			    float4 color = get_color(direction, t, rgba);
-
-			    insert(alpha_depth, color_a, j, color);
-			}
-		    }
-					
-		    if (n < alpha_depth)
-			n++;
-		}
-				
-	    } // triangle loop
-	    
-	    node--;
-	    
-	} // node is a leaf
-	
-    } // while loop
-    while (node != head);
-
-    if (n < 1) {
-	pixels[id] = 0;
-	return;
-    }
-
-    dxlen[id] = n;
-
-    float scale = 1.0f;
-    float fr = 0.0f;
-    float fg = 0.0f;
-    float fb = 0.0f;
-    for (i=0; i < n; i++) {
-	float alpha = color_a[i].w;
-
-	fr += scale*color_a[i].x*alpha;
-	fg += scale*color_a[i].y*alpha;
-	fb += scale*color_a[i].z*alpha;
-	
-	scale *= (1.0f-alpha);
-    }
-    unsigned int a;
-    if (n < alpha_depth)
-	a = floorf(255*(1.0f-scale));
-    else
-    	a = 255;
-    unsigned int red = floorf(fr/(1.0f-scale));
-    unsigned int green = floorf(fg/(1.0f-scale));
-    unsigned int blue = floorf(fb/(1.0f-scale));
-
-    pixels[id] = a << 24 | red << 16 | green << 8 | blue;
-}
-
-} // extern "C"
diff --git a/src/rotate.h b/src/rotate.h
deleted file mode 100644
index 15f8037..0000000
--- a/src/rotate.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef __ROTATE_H__
-#define __ROTATE_H__
-
-#include "linalg.h"
-#include "matrix.h"
-
-__device__ const Matrix IDENTITY_MATRIX = {1,0,0,0,1,0,0,0,1};
-
-__device__ Matrix
-make_rotation_matrix(float phi, const float3 &n)
-{
-    float cos_phi = cosf(phi);
-    float sin_phi = sinf(phi);
-
-    return IDENTITY_MATRIX*cos_phi + (1-cos_phi)*outer(n,n) +
-	sin_phi*make_matrix(0,n.z,-n.y,-n.z,0,n.x,n.y,-n.x,0);
-}
-
-/* rotate points counterclockwise, when looking towards +infinity,
-   through an angle `phi` about the axis `n`. */
-__device__ float3
-rotate(const float3 &a, float phi, const float3 &n)
-{
-    return make_rotation_matrix(phi,n)*a;
-}
-
-#endif
diff --git a/src/sorting.h b/src/sorting.h
deleted file mode 100644
index 2bf1a5a..0000000
--- a/src/sorting.h
+++ /dev/null
@@ -1,107 +0,0 @@
-template <class T>
-__device__ void
-swap(T &a, T &b)
-{
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template <class T>
-__device__ void
-reverse(int n, T *a)
-{
-    for (int i=0; i < n/2; i++)
-	swap(a[i],a[n-1-i]);
-}
-
-template <class T>
-__device__ void
-piksrt(int n, T *arr)
-{
-    int i,j;
-    T a;
-
-    for (j=1; j < n; j++) {
-	a = arr[j];
-	i = j-1;
-	while (i >= 0 && arr[i] > a) {
-	    arr[i+1] = arr[i];
-	    i--;
-	}
-	arr[i+1] = a;
-    }
-}
-
-template <class T, class U>
-__device__ void
-piksrt2(int n, T *arr, U *brr)
-{
-    int i,j;
-    T a;
-    U b;
-
-    for (j=1; j < n; j++) {
-	a = arr[j];
-	b = brr[j];
-	i = j-1;
-	while (i >= 0 && arr[i] > a) {
-	    arr[i+1] = arr[i];
-	    brr[i+1] = brr[i];
-	    i--;
-	}
-	arr[i+1] = a;
-	brr[i+1] = b;
-    }
-}
-
-/* Returns the index in `arr` where `x` should be inserted in order to
-   maintain order. If `n` equals one, return the index such that, when
-   `x` is inserted, `arr` will be in ascending order.
-*/
-template <class T>
-__device__ unsigned long
-searchsorted(unsigned long n, T *arr, const T &x)
-{
-    unsigned long ju,jm,jl;
-    int ascnd;
-
-    jl = 0;
-    ju = n;
-
-    ascnd = (arr[n-1] >= arr[0]);
-
-    while (ju-jl > 1) {
-	jm = (ju+jl) >> 1;
-
-	if ((x > arr[jm]) == ascnd)
-	    jl = jm;
-	else
-	    ju = jm;
-    }
-
-    if ((x <= arr[0]) == ascnd)
-	return 0;
-    else
-	return ju;
-}
-
-template <class T>
-__device__ void
-insert(unsigned long n, T *arr, unsigned long i, const T &x)
-{
-    unsigned long j;
-    for (j=n-1; j > i; j--)
-	arr[j] = arr[j-1];
-    arr[i] = x;
-}
-
-template <class T>
-__device__ void
-add_sorted(unsigned long n, T *arr, const T &x)
-{
-    unsigned long i = searchsorted(n, arr, x);
-
-    if (i < n)
-	insert(n, arr, i, x);
-}
diff --git a/src/tools.cu b/src/tools.cu
deleted file mode 100644
index 80d06ec..0000000
--- a/src/tools.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-//--*-c-*-
-
-extern "C"
-{
-
-__global__ void
-interleave(int nthreads, unsigned long long *x, unsigned long long *y,
-	   unsigned long long *z, int bits, unsigned long long *dest)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    for (int i=0; i < bits; i++)
-	dest[id] |= (x[id] & 1 << i) << (2*i) |
-	            (y[id] & 1 << i) << (2*i+1) |
-	            (z[id] & 1 << i) << (2*i+2);
-}
-
-} // extern "C"
diff --git a/src/transform.cu b/src/transform.cu
deleted file mode 100644
index 1f4405e..0000000
--- a/src/transform.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-//-*-c-*-
-
-#include "linalg.h"
-#include "rotate.h"
-
-extern "C"
-{
-
-/* Translate the points `a` by the vector `v` */
-__global__ void
-translate(int nthreads, float3 *a, float3 v)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    a[id] += v;
-}
-
-/* Rotate the points `a` through an angle `phi` counter-clockwise about the
-   axis `axis` (when looking towards +infinity). */
-__global__ void
-rotate(int nthreads, float3 *a, float phi, float3 axis)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    a[id] = rotate(a[id], phi, axis);
-}
-
-/* Rotate the points `a` through an angle `phi` counter-clockwise
-   (when looking towards +infinity along `axis`) about the axis defined
-   by the point `point` and the vector `axis` . */
-__global__ void
-rotate_around_point(int nthreads, float3 *a, float phi, float3 axis,
-		    float3 point)
-{
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (id >= nthreads)
-	return;
-
-    a[id] -= point;
-    a[id] = rotate(a[id], phi, axis);
-    a[id] += point;
-}
-
-} // extern "c"
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/data/ray_intersection.npz b/test/data/ray_intersection.npz
new file mode 100644
index 0000000..fba0e40
Binary files /dev/null and b/test/data/ray_intersection.npz differ
diff --git a/test/linalg_test.cu b/test/linalg_test.cu
new file mode 100644
index 0000000..4e9c983
--- /dev/null
+++ b/test/linalg_test.cu
@@ -0,0 +1,128 @@
+//-*-c-*-
+
+#include "linalg.h"
+
+extern "C"
+{
+
+__global__ void float3add(float3 *a, float3 *b, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = a[idx] + b[idx];
+}
+
+__global__ void float3addequal(float3 *a, float3 *b)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	a[idx] += b[idx];
+}
+
+__global__ void float3sub(float3 *a, float3 *b, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = a[idx] - b[idx];
+}
+
+__global__ void float3subequal(float3 *a, float3 *b)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	a[idx] -= b[idx];
+}
+
+__global__ void float3addfloat(float3 *a, float c, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = a[idx] + c;
+}
+
+__global__ void float3addfloatequal(float3 *a, float c)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	a[idx] += c;
+}
+
+__global__ void floataddfloat3(float3 *a, float c, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = c + a[idx];
+}
+
+__global__ void float3subfloat(float3 *a, float c, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = a[idx] - c;
+}
+
+__global__ void float3subfloatequal(float3 *a, float c)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	a[idx] -= c;
+}
+
+__global__ void floatsubfloat3(float3 *a, float c, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = c - a[idx];
+}
+
+__global__ void float3mulfloat(float3 *a, float c, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = a[idx]*c;
+}
+
+__global__ void float3mulfloatequal(float3 *a, float c)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	a[idx] *= c;
+}
+
+__global__ void floatmulfloat3(float3 *a, float c, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = c*a[idx];
+}
+
+__global__ void float3divfloat(float3 *a, float c, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = a[idx]/c;
+}
+
+__global__ void float3divfloatequal(float3 *a, float c)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	a[idx] /= c;
+}
+
+__global__ void floatdivfloat3(float3 *a, float c, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = c/a[idx];
+}
+
+__global__ void dot(float3 *a, float3 *b, float *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = dot(a[idx],b[idx]);
+}
+
+__global__ void cross(float3 *a, float3 *b, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = cross(a[idx],b[idx]);
+}
+
+__global__ void norm(float3 *a, float *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = norm(a[idx]);
+}
+
+__global__ void minusfloat3(float3 *a, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = -a[idx];
+}
+
+} // extern "c"
diff --git a/test/linalg_test.py b/test/linalg_test.py
new file mode 100644
index 0000000..31688d9
--- /dev/null
+++ b/test/linalg_test.py
@@ -0,0 +1,214 @@
+import os
+import numpy as np
+from pycuda import autoinit
+from pycuda.compiler import SourceModule
+import pycuda.driver as cuda
+from pycuda import gpuarray
+
+float3 = gpuarray.vec.float3
+
+print 'device %s' % autoinit.device.name()
+
+current_directory = os.path.split(os.path.realpath(__file__))[0]
+source_directory = current_directory + '/../src'
+
+source = open(current_directory + '/linalg_test.cu').read()
+
+mod = SourceModule(source, options=['-I' + source_directory], no_extern_c=True, cache_dir=False)
+
+float3add = mod.get_function('float3add')
+float3addequal = mod.get_function('float3addequal')
+float3sub = mod.get_function('float3sub')
+float3subequal = mod.get_function('float3subequal')
+float3addfloat = mod.get_function('float3addfloat')
+float3addfloatequal = mod.get_function('float3addfloatequal')
+floataddfloat3 = mod.get_function('floataddfloat3')
+float3subfloat = mod.get_function('float3subfloat')
+float3subfloatequal = mod.get_function('float3subfloatequal')
+floatsubfloat3 = mod.get_function('floatsubfloat3')
+float3mulfloat = mod.get_function('float3mulfloat')
+float3mulfloatequal = mod.get_function('float3mulfloatequal')
+floatmulfloat3 = mod.get_function('floatmulfloat3')
+float3divfloat = mod.get_function('float3divfloat')
+float3divfloatequal = mod.get_function('float3divfloatequal')
+floatdivfloat3 = mod.get_function('floatdivfloat3')
+dot = mod.get_function('dot')
+cross = mod.get_function('cross')
+norm = mod.get_function('norm')
+minusfloat3 = mod.get_function('minusfloat3')
+
+size = {'block': (256,1,1), 'grid': (1,1)}
+
+a = np.empty(size['block'][0], dtype=float3)
+b = np.empty(size['block'][0], dtype=float3)
+c = np.float32(np.random.random_sample())
+
+a['x'] = np.random.random_sample(size=a.size)
+a['y'] = np.random.random_sample(size=a.size)
+a['z'] = np.random.random_sample(size=a.size)
+
+b['x'] = np.random.random_sample(size=b.size)
+b['y'] = np.random.random_sample(size=b.size)
+b['z'] = np.random.random_sample(size=b.size)
+
+def testfloat3add():
+    dest = np.empty(a.size, dtype=float3)
+    float3add(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
+    if not np.allclose(a['x']+b['x'], dest['x']) or \
+            not np.allclose(a['y']+b['y'], dest['y']) or \
+            not np.allclose(a['z']+b['z'], dest['z']):
+        assert False
+
+def testfloat3sub():
+    dest = np.empty(a.size, dtype=float3)
+    float3sub(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
+    if not np.allclose(a['x']-b['x'], dest['x']) or \
+            not np.allclose(a['y']-b['y'], dest['y']) or \
+            not np.allclose(a['z']-b['z'], dest['z']):
+        assert False
+
+def testfloat3addequal():
+    dest = np.copy(a)
+    float3addequal(cuda.InOut(dest), cuda.In(b), **size)
+    if not np.allclose(a['x']+b['x'], dest['x']) or \
+            not np.allclose(a['y']+b['y'], dest['y']) or \
+            not np.allclose(a['z']+b['z'], dest['z']):
+        assert False
+
+def testfloat3subequal():
+    dest = np.copy(a)
+    float3subequal(cuda.InOut(dest), cuda.In(b), **size)
+    if not np.allclose(a['x']-b['x'], dest['x']) or \
+            not np.allclose(a['y']-b['y'], dest['y']) or \
+            not np.allclose(a['z']-b['z'], dest['z']):
+        assert False
+
+def testfloat3addfloat():
+    dest = np.empty(a.size, dtype=float3)
+    float3addfloat(cuda.In(a), c, cuda.Out(dest), **size)
+    if not np.allclose(a['x']+c, dest['x']) or \
+            not np.allclose(a['y']+c, dest['y']) or \
+            not np.allclose(a['z']+c, dest['z']):
+        assert False
+
+def testfloat3addfloatequal():
+    dest = np.copy(a)
+    float3addfloatequal(cuda.InOut(dest), c, **size)
+    if not np.allclose(a['x']+c, dest['x']) or \
+            not np.allclose(a['y']+c, dest['y']) or \
+            not np.allclose(a['z']+c, dest['z']):
+        assert False
+
+def testfloataddfloat3():
+    dest = np.empty(a.size, dtype=float3)
+    floataddfloat3(cuda.In(a), c, cuda.Out(dest), **size)
+    if not np.allclose(c+a['x'], dest['x']) or \
+            not np.allclose(c+a['y'], dest['y']) or \
+            not np.allclose(c+a['z'], dest['z']):
+        assert False
+
+def testfloat3subfloat():
+    dest = np.empty(a.size, dtype=float3)
+    float3subfloat(cuda.In(a), c, cuda.Out(dest), **size)
+    if not np.allclose(a['x']-c, dest['x']) or \
+            not np.allclose(a['y']-c, dest['y']) or \
+            not np.allclose(a['z']-c, dest['z']):
+        assert False
+
+def testfloat3subfloatequal():
+    dest = np.copy(a)
+    float3subfloatequal(cuda.InOut(dest), c, **size)
+    if not np.allclose(a['x']-c, dest['x']) or \
+            not np.allclose(a['y']-c, dest['y']) or \
+            not np.allclose(a['z']-c, dest['z']):
+        assert False
+
+def testfloatsubfloat3():
+    dest = np.empty(a.size, dtype=float3)
+    floatsubfloat3(cuda.In(a), c, cuda.Out(dest), **size)
+    if not np.allclose(c-a['x'], dest['x']) or \
+            not np.allclose(c-a['y'], dest['y']) or \
+            not np.allclose(c-a['z'], dest['z']):
+        assert False
+
+def testfloat3mulfloat():
+    dest = np.empty(a.size, dtype=float3)
+    float3mulfloat(cuda.In(a), c, cuda.Out(dest), **size)
+    if not np.allclose(a['x']*c, dest['x']) or \
+            not np.allclose(a['y']*c, dest['y']) or \
+            not np.allclose(a['z']*c, dest['z']):
+        assert False
+
+def testfloat3mulfloatequal():
+    dest = np.copy(a)
+    float3mulfloatequal(cuda.InOut(dest), c, **size)
+    if not np.allclose(a['x']*c, dest['x']) or \
+            not np.allclose(a['y']*c, dest['y']) or \
+            not np.allclose(a['z']*c, dest['z']):
+        assert False
+
+def testfloatmulfloat3():
+    dest = np.empty(a.size, dtype=float3)
+    floatmulfloat3(cuda.In(a), c, cuda.Out(dest), **size)
+    if not np.allclose(c*a['x'], dest['x']) or \
+            not np.allclose(c*a['y'], dest['y']) or \
+            not np.allclose(c*a['z'], dest['z']):
+        assert False
+
+def testfloat3divfloat():
+    dest = np.empty(a.size, dtype=float3)
+    float3divfloat(cuda.In(a), c, cuda.Out(dest), **size)
+    if not np.allclose(a['x']/c, dest['x']) or \
+            not np.allclose(a['y']/c, dest['y']) or \
+            not np.allclose(a['z']/c, dest['z']):
+        assert False
+
+def testfloat3divfloatequal():
+    dest = np.copy(a)
+    float3divfloatequal(cuda.InOut(dest), c, **size)
+    if not np.allclose(a['x']/c, dest['x']) or \
+            not np.allclose(a['y']/c, dest['y']) or \
+            not np.allclose(a['z']/c, dest['z']):
+        assert False
+
+def testfloatdivfloat3():
+    dest = np.empty(a.size, dtype=float3)
+    floatdivfloat3(cuda.In(a), c, cuda.Out(dest), **size)
+    if not np.allclose(c/a['x'], dest['x']) or \
+            not np.allclose(c/a['y'], dest['y']) or \
+            not np.allclose(c/a['z'], dest['z']):
+        assert false
+
+def testdot():
+    dest = np.empty(a.size, dtype=np.float32)
+    dot(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
+    if not np.allclose(a['x']*b['x'] + a['y']*b['y'] + a['z']*b['z'], dest):
+        assert False
+
+def testcross():
+    dest = np.empty(a.size, dtype=float3)
+    cross(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
+    for u, v, wdest in zip(a,b,dest):
+        w = np.cross((u['x'], u['y'], u['z']),(v['x'],v['y'],v['z']))
+        if not np.allclose(wdest['x'], w[0]) or \
+                not np.allclose(wdest['y'], w[1]) or \
+                not np.allclose(wdest['z'], w[2]):
+            print w
+            print wdest
+            assert False
+
+def testnorm():
+    dest = np.empty(a.size, dtype=np.float32)
+    norm(cuda.In(a), cuda.Out(dest), **size)
+
+    for i in range(len(dest)):
+        if not np.allclose(np.linalg.norm((a['x'][i],a['y'][i],a['z'][i])), dest[i]):
+            assert False
+
+def testminusfloat3():
+    dest = np.empty(a.size, dtype=float3)
+    minusfloat3(cuda.In(a), cuda.Out(dest), **size)
+    if not np.allclose(-a['x'], dest['x']) or \
+            not np.allclose(-a['y'], dest['y']) or \
+            not np.allclose(-a['z'], dest['z']):
+        assert False
diff --git a/test/matrix_test.cu b/test/matrix_test.cu
new file mode 100644
index 0000000..d64cb34
--- /dev/null
+++ b/test/matrix_test.cu
@@ -0,0 +1,152 @@
+//-*-c-*-
+
+#include "matrix.h"
+
+__device__ Matrix array2matrix(float *a)
+{
+	return make_matrix(a[0], a[1], a[2],
+			   a[3], a[4], a[5],
+			   a[6], a[7], a[8]);
+}
+
+__device__ void matrix2array(const Matrix &m, float *a)
+{
+	a[0] = m.a00;
+	a[1] = m.a01;
+	a[2] = m.a02;
+	a[3] = m.a10;
+	a[4] = m.a11;
+	a[5] = m.a12;
+	a[6] = m.a20;
+	a[7] = m.a21;
+	a[8] = m.a22;
+}
+
+extern "C"
+{
+
+__global__ void det(float *a, float *dest)
+{
+	Matrix m = array2matrix(a);
+	dest[0] = det(m);
+}
+
+__global__ void inv(float *a, float *dest)
+{
+	Matrix m = array2matrix(a);
+	matrix2array(inv(m), dest);
+}
+
+__global__ void minusmatrix(float *a, float *dest)
+{
+	matrix2array(-array2matrix(a), dest);
+}
+
+__global__ void matrixadd(float *a, float *b, float *dest)
+{
+	matrix2array(array2matrix(a)+array2matrix(b), dest);
+}
+
+__global__ void matrixsub(float *a, float *b, float *dest)
+{
+	matrix2array(array2matrix(a)-array2matrix(b), dest);
+}
+
+__global__ void matrixmul(float *a, float *b, float *dest)
+{
+	matrix2array(array2matrix(a)*array2matrix(b), dest);
+}
+
+__global__ void multiply(float *a, float3 *x, float3 *dest)
+{
+	dest[0] = array2matrix(a)*x[0];
+}
+
+__global__ void matrixaddfloat(float *a, float c, float *dest)
+{
+	matrix2array(array2matrix(a)+c, dest);
+}
+
+__global__ void matrixsubfloat(float *a, float c, float *dest)
+{
+	matrix2array(array2matrix(a)-c, dest);
+}
+
+__global__ void matrixmulfloat(float *a, float c, float *dest)
+{
+	matrix2array(array2matrix(a)*c, dest);
+}
+
+__global__ void matrixdivfloat(float *a, float c, float *dest)
+{
+	matrix2array(array2matrix(a)/c, dest);
+}
+
+__global__ void floataddmatrix(float *a, float c, float *dest)
+{
+	matrix2array(c+array2matrix(a), dest);
+}
+
+__global__ void floatsubmatrix(float *a, float c, float *dest)
+{
+	matrix2array(c-array2matrix(a), dest);
+}
+
+__global__ void floatmulmatrix(float *a, float c, float *dest)
+{
+	matrix2array(c*array2matrix(a), dest);
+}
+
+__global__ void floatdivmatrix(float *a, float c, float *dest)
+{
+	matrix2array(c/array2matrix(a), dest);
+}
+
+__global__ void matrixaddequals(float *a, float *b)
+{
+	Matrix m = array2matrix(a);
+	m += array2matrix(b);
+	matrix2array(m,a);
+}
+
+__global__ void matrixsubequals(float *a, float *b)
+{
+	Matrix m = array2matrix(a);
+	m -= array2matrix(b);
+	matrix2array(m,a);
+}
+
+__global__ void matrixaddequalsfloat(float *a, float c)
+{
+	Matrix m = array2matrix(a);
+	m += c;
+	matrix2array(m,a);
+}
+
+__global__ void matrixsubequalsfloat(float *a, float c)
+{
+	Matrix m = array2matrix(a);
+	m -= c;
+	matrix2array(m,a);
+}
+
+__global__ void matrixmulequalsfloat(float *a, float c)
+{
+	Matrix m = array2matrix(a);
+	m *= c;
+	matrix2array(m,a);
+}
+
+__global__ void matrixdivequalsfloat(float *a, float c)
+{
+	Matrix m = array2matrix(a);
+	m /= c;
+	matrix2array(m,a);
+}
+
+__global__ void outer(float3 a, float3 b, float* dest)
+{
+	matrix2array(outer(a,b), dest);
+}
+
+} // extern "c"
diff --git a/test/matrix_test.py b/test/matrix_test.py
new file mode 100644
index 0000000..c843025
--- /dev/null
+++ b/test/matrix_test.py
@@ -0,0 +1,327 @@
+import os
+import numpy as np
+from pycuda import autoinit
+from pycuda.compiler import SourceModule
+import pycuda.driver as cuda
+from pycuda import gpuarray
+
+float3 = gpuarray.vec.float3
+
+print 'device %s' % autoinit.device.name()
+
+current_directory = os.path.split(os.path.realpath(__file__))[0]
+source_directory = current_directory + '/../src'
+
+source = open(current_directory + '/matrix_test.cu').read()
+
+mod = SourceModule(source, options=['-I' + source_directory], no_extern_c=True, cache_dir=False)
+
+det = mod.get_function('det')
+inv = mod.get_function('inv')
+matrixadd = mod.get_function('matrixadd')
+matrixsub = mod.get_function('matrixsub')
+matrixmul = mod.get_function('matrixmul')
+multiply = mod.get_function('multiply')
+matrixaddfloat = mod.get_function('matrixaddfloat')
+matrixsubfloat = mod.get_function('matrixsubfloat')
+matrixmulfloat = mod.get_function('matrixmulfloat')
+matrixdivfloat = mod.get_function('matrixdivfloat')
+floataddmatrix = mod.get_function('floataddmatrix')
+floatsubmatrix = mod.get_function('floatsubmatrix')
+floatmulmatrix = mod.get_function('floatmulmatrix')
+floatdivmatrix = mod.get_function('floatdivmatrix')
+matrixaddequals = mod.get_function('matrixaddequals')
+matrixsubequals = mod.get_function('matrixsubequals')
+matrixaddequalsfloat = mod.get_function('matrixaddequalsfloat')
+matrixsubequalsfloat = mod.get_function('matrixsubequalsfloat')
+matrixmulequalsfloat = mod.get_function('matrixmulequalsfloat')
+matrixdivequalsfloat = mod.get_function('matrixdivequalsfloat')
+outer = mod.get_function('outer')
+minusmatrix = mod.get_function('minusmatrix')
+
+size = {'block': (1,1,1), 'grid': (1,1)}
+
+for i in range(1):
+    a = np.random.random_sample(size=9).astype(np.float32)
+    b = np.random.random_sample(size=9).astype(np.float32)
+    dest = np.empty(1, dtype=np.float32)
+    c = np.int32(np.random.random_sample())
+
+    print 'testing det...',
+
+    det(cuda.In(a), cuda.Out(dest), **size)
+
+    if not np.allclose(np.float32(np.linalg.det(a.reshape(3,3))), dest[0]):
+        print 'fail'
+        print np.float32(np.linalg.det(a.reshape(3,3)))
+        print dest[0]
+    else:
+        print 'success'
+
+    print 'testing inv...',
+
+    dest = np.empty(9, dtype=np.float32)
+
+    inv(cuda.In(a), cuda.Out(dest), **size)
+
+    if not np.allclose(np.linalg.inv(a.reshape(3,3)).flatten().astype(np.float32), dest):
+        print 'fail'
+        print np.linalg.inv(a.reshape(3,3)).flatten().astype(np.float32)
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixadd...',
+
+    matrixadd(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
+
+    if not np.allclose(a+b, dest):
+        print 'fail'
+        print a+b
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixsub...',
+
+    matrixsub(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
+
+    if not np.allclose(a-b, dest):
+        print 'fail'
+        print a-b
+        print dest
+    else:
+        print 'sucess'
+
+    print 'testing matrixmul...',
+
+    matrixmul(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
+
+    if not np.allclose(np.dot(a.reshape(3,3),b.reshape(3,3)).flatten(), dest):
+        print 'fail'
+        print np.dot(a.reshape(3,3),b.reshape(3,3)).flatten()
+        print dest
+    else:
+        print 'success'
+
+    print 'testing multiply...',
+
+    x_cpu = np.random.random_sample(size=3).astype(np.float32)
+    x_gpu = np.array((x_cpu[0], x_cpu[1], x_cpu[2]), dtype=float3)
+
+    dest = np.empty(1, dtype=float3)
+    
+    multiply(cuda.In(a), cuda.In(x_gpu), cuda.Out(dest), **size)
+
+    m = a.reshape(3,3)
+
+    if not np.allclose(np.dot(x_cpu,m[0]), dest[0]['x']) or \
+            not np.allclose(np.dot(x_cpu,m[1]), dest[0]['y']) or \
+            not np.allclose(np.dot(x_cpu,m[2]), dest[0]['z']):
+        print 'fail'
+        print np.dot(x_cpu,m[0])
+        print np.dot(x_cpu,m[1])
+        print np.dot(x_cpu,m[2])
+        print dest[0]['x']
+        print dest[0]['y']
+        print dest[0]['z']
+    else:
+        print 'success'
+
+    print 'testing matrixaddfloat...',
+
+    dest = np.empty(9, dtype=np.float32)
+
+    matrixaddfloat(cuda.In(a), c, cuda.Out(dest), **size)
+
+    if not np.allclose(a+c, dest):
+        print 'fail'
+        print a+c
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixsubfloat...',
+
+    matrixsubfloat(cuda.In(a), c, cuda.Out(dest), **size)
+
+    if not np.allclose(a-c, dest):
+        print 'fail'
+        print a-c
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixmulfloat...',
+
+    matrixmulfloat(cuda.In(a), c, cuda.Out(dest), **size)
+
+    if not np.allclose(a*c, dest):
+        print 'fail'
+        print a-c
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixdivfloat...',
+
+    matrixdivfloat(cuda.In(a), c, cuda.Out(dest), **size)
+
+    if not np.allclose(a/c, dest):
+        print 'fail'
+        print a/c
+        print dest
+    else:
+        print 'success'
+
+    print 'testing floataddmatrix...',
+
+    floataddmatrix(cuda.In(a), c, cuda.Out(dest), **size)
+
+    if not np.allclose(c+a, dest):
+        print 'fail'
+        print c+a
+        print dest
+    else:
+        print 'success'
+
+    print 'testing floatsubmatrix...',
+
+    floatsubmatrix(cuda.In(a), c, cuda.Out(dest), **size)
+
+    if not np.allclose(c-a, dest):
+        print 'fail'
+        print c-a
+        print dest
+    else:
+        print 'success'
+
+    print 'testing floatmulmatrix...',
+
+    floatmulmatrix(cuda.In(a), c, cuda.Out(dest), **size)
+
+    if not np.allclose(c*a, dest):
+        print 'fail'
+        print c*a
+        print dest
+    else:
+        print 'success'
+
+    print 'testing floatdivmatrix...',
+
+    floatdivmatrix(cuda.In(a), c, cuda.Out(dest), **size)
+
+    if not np.allclose(c/a, dest):
+        print 'fail'
+        print c/a
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixaddequals...',
+
+    dest = np.copy(a)
+
+    matrixaddequals(cuda.InOut(dest), cuda.In(b), **size)
+
+    if not np.allclose(a+b, dest):
+        print 'fail'
+        print a+b
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixsubequals...',
+
+    dest = np.copy(a)
+
+    matrixsubequals(cuda.InOut(dest), cuda.In(b), **size)
+
+    if not np.allclose(a-b, dest):
+        print 'fail'
+        print a-b
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixaddequalsfloat...',
+
+    dest = np.copy(a)
+
+    matrixaddequalsfloat(cuda.InOut(dest), c, **size)
+
+    if not np.allclose(a+c, dest):
+        print 'fail'
+        print a+c
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixsubequalsfloat...',
+
+    dest = np.copy(a)
+
+    matrixsubequalsfloat(cuda.InOut(dest), c, **size)
+
+    if not np.allclose(a-c, dest):
+        print 'fail'
+        print a-c
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixmulequalsfloat...',
+
+    dest = np.copy(a)
+
+    matrixmulequalsfloat(cuda.InOut(dest), c, **size)
+
+    if not np.allclose(a*c, dest):
+        print 'fail'
+        print a*c
+        print dest
+    else:
+        print 'success'
+
+    print 'testing matrixdivequalsfloat...',
+
+    dest = np.copy(a)
+
+    matrixdivequalsfloat(cuda.InOut(dest), c, **size)
+
+    if not np.allclose(a/c, dest):
+        print 'fail'
+        print a*c
+        print dest
+    else:
+        print 'success'
+
+    print 'testing outer...',
+
+    x1_cpu = np.random.random_sample(size=3).astype(np.float32)
+    x2_cpu = np.random.random_sample(size=3).astype(np.float32)
+
+    x1_gpu = np.array((x1_cpu[0], x1_cpu[1], x1_cpu[2]), dtype=float3)
+    x2_gpu = np.array((x2_cpu[0], x2_cpu[1], x2_cpu[2]), dtype=float3)
+
+    outer(x1_gpu, x2_gpu, cuda.Out(dest), **size)
+
+    if not np.allclose(np.outer(x1_cpu, x2_cpu).flatten(), dest):
+        print 'fail'
+        print np.outer(x1_cpu, x2_cpu).flatten()
+        print dest
+    else:
+        print 'success'
+
+    print 'testing minus matrix...',
+
+    dest = np.copy(a)
+
+    minusmatrix(cuda.In(a), cuda.Out(dest), **size)
+
+    if not np.allclose(-a, dest):
+        print 'fail'
+        print -a
+        print dest
+    else:
+        print 'success'
diff --git a/test/rotate_test.cu b/test/rotate_test.cu
new file mode 100644
index 0000000..6cafc12
--- /dev/null
+++ b/test/rotate_test.cu
@@ -0,0 +1,14 @@
+//-*-c-*-
+
+#include "rotate.h"
+
+extern "C"
+{
+
+__global__ void rotate(float3 *a, float *phi, float3 *n, float3 *dest)
+{
+	int idx = blockIdx.x*blockDim.x + threadIdx.x;
+	dest[idx] = rotate(a[idx], phi[idx], n[idx]);
+}
+
+} // extern "c"
diff --git a/test/rotate_test.py b/test/rotate_test.py
new file mode 100644
index 0000000..92eff84
--- /dev/null
+++ b/test/rotate_test.py
@@ -0,0 +1,67 @@
+import os
+import numpy as np
+from pycuda import autoinit
+from pycuda.compiler import SourceModule
+import pycuda.driver as cuda
+from pycuda import gpuarray
+float3 = gpuarray.vec.float3
+
+def rotate(x, phi, n):
+    x = np.asarray(x)
+    n = np.asarray(n)
+
+    r = np.cos(phi)*np.identity(3) + (1-np.cos(phi))*np.outer(n,n) + \
+        np.sin(phi)*np.array([[0,n[2],-n[1]],[-n[2],0,n[0]],[n[1],-n[0],0]])
+
+    return np.inner(x,r)
+
+print 'device %s' % autoinit.device.name()
+
+current_directory = os.path.split(os.path.realpath(__file__))[0]
+source_directory = current_directory + '/../src'
+
+source = open(current_directory + '/rotate_test.cu').read()
+
+mod = SourceModule(source, options=['-I' + source_directory], no_extern_c=True, cache_dir=False)
+
+rotate_gpu = mod.get_function('rotate')
+
+size = {'block': (100,1,1), 'grid': (1,1)}
+
+a = np.empty(size['block'][0], dtype=float3)
+n = np.empty(size['block'][0], dtype=float3)
+phi = np.random.random_sample(size=a.size).astype(np.float32)
+
+a['x'] = np.random.random_sample(size=a.size)
+a['y'] = np.random.random_sample(size=a.size)
+a['z'] = np.random.random_sample(size=a.size)
+
+n['x'] = np.random.random_sample(size=n.size)
+n['y'] = np.random.random_sample(size=n.size)
+n['z'] = np.random.random_sample(size=n.size)
+
+a['x'] = np.ones(a.size)
+a['y'] = np.zeros(a.size)
+a['z'] = np.zeros(a.size)
+
+n['x'] = np.zeros(n.size)
+n['y'] = np.zeros(n.size)
+n['z'] = np.ones(n.size)
+
+phi = np.array([np.pi/2]*a.size).astype(np.float32)
+
+def testrotate():
+    dest = np.empty(a.size, dtype=float3)
+    rotate_gpu(cuda.In(a), cuda.In(phi), cuda.In(n), cuda.Out(dest), **size)
+    for v, theta, w, rdest in zip(a,phi,n,dest):
+        r = rotate((v['x'], v['y'], v['z']), theta, (w['x'], w['y'], w['z']))
+        if not np.allclose(rdest['x'], r[0]) or \
+                not np.allclose(rdest['y'], r[1]) or \
+                not np.allclose(rdest['z'], r[2]):
+            print v
+            print theta
+            print w
+            print r
+            print rdest
+            assert False
+
diff --git a/test/test_fileio.py b/test/test_fileio.py
new file mode 100644
index 0000000..3869a9f
--- /dev/null
+++ b/test/test_fileio.py
@@ -0,0 +1,74 @@
+import unittest
+from chroma.fileio import root
+from chroma import event
+import numpy as np
+
+class TestFileIO(unittest.TestCase):
+    def test_file_write_and_read(self):
+        ev = event.Event(1, event.Vertex('e-', pos=(0,0,1), dir=(1,0,0),
+                                         ke=15.0, pol=(0,1,0)))
+
+        photons_beg = root.make_photon_with_arrays(1)
+        photons_beg.pos[0] = (1,2,3)
+        photons_beg.dir[0] = (4,5,6)
+        photons_beg.pol[0] = (7,8,9)
+        photons_beg.wavelengths[0] = 400.0
+        photons_beg.t[0] = 100.0
+        photons_beg.last_hit_triangles[0] = 5
+        photons_beg.flags[0] = 20
+        ev.photons_beg = photons_beg
+
+        photons_end = root.make_photon_with_arrays(1)
+        photons_end.pos[0] = (1,2,3)
+        photons_end.dir[0] = (4,5,6)
+        photons_end.pol[0] = (7,8,9)
+        photons_end.wavelengths[0] = 400.0
+        photons_end.t[0] = 100.0
+        photons_end.last_hit_triangles[0] = 5
+        photons_end.flags[0] = 20
+        ev.photons_end = photons_end
+
+        ev.vertices = [ev.primary_vertex]
+
+        channels = event.Channels(hit=np.array([True, False]), 
+                                  t=np.array([20.0, 1e9], dtype=np.float32),
+                                  q=np.array([2.0, 0.0], dtype=np.float32),
+                                  flags=np.array([8, 32], dtype=np.uint32))
+        ev.channels = channels
+
+        filename = '/tmp/chroma-filewritertest.root'
+        writer = root.RootWriter(filename)
+        writer.write_event(ev)
+        writer.close()
+
+        # Exercise the RootReader methods
+        reader = root.RootReader(filename)
+        self.assertEquals(len(reader), 1)
+
+        self.assertRaises(StopIteration, reader.prev)
+
+        reader.next()
+
+        self.assertEqual(reader.index(), 0)
+        self.assertRaises(StopIteration, reader.next)
+        
+        reader.jump_to(0)
+
+        # Enough screwing around, let's get the one event in the file
+        newev = reader.current()
+
+        # Now check if everything is correct in the event
+        for attribute in ['id']:
+            self.assertEqual(getattr(ev, attribute), getattr(newev, attribute), 'compare %s' % attribute)
+
+        for attribute in ['pos', 'dir', 'pol', 'ke', 't0']:
+            self.assertTrue(np.allclose(getattr(ev.primary_vertex, attribute), getattr(newev.primary_vertex, attribute)), 'compare %s' % attribute)
+
+            for i in range(len(ev.vertices)):
+                self.assertTrue(np.allclose(getattr(ev.vertices[i], attribute), getattr(newev.vertices[i], attribute)), 'compare %s' % attribute)
+
+        for attribute in ['pos', 'dir', 'pol', 'wavelengths', 't', 'last_hit_triangles', 'flags']:    
+            self.assertTrue(np.allclose(getattr(ev.photons_beg, attribute), 
+                                        getattr(newev.photons_beg, attribute)), 'compare %s' % attribute)
+            self.assertTrue(np.allclose(getattr(ev.photons_end, attribute), 
+                                        getattr(newev.photons_end, attribute)), 'compare %s' % attribute)
diff --git a/test/test_generator_photon.py b/test/test_generator_photon.py
new file mode 100644
index 0000000..13501fe
--- /dev/null
+++ b/test/test_generator_photon.py
@@ -0,0 +1,21 @@
+import unittest
+import itertools
+
+from chroma import generator
+from chroma.generator.vertex import constant_particle_gun
+from chroma import optics
+
+class TestG4ParallelGenerator(unittest.TestCase):
+    def test_center(self):
+        '''Generate Cherenkov light at the center of the world volume'''
+        gen = generator.photon.G4ParallelGenerator(1, optics.water_wcsim)
+        vertex = itertools.islice(constant_particle_gun('e-', (0,0,0), (1,0,0), 100), 10)
+        for event in gen.generate_events(vertex):
+            self.assertGreater(len(event.photons_beg.pos), 0)
+
+    def test_off_center(self):
+        '''Generate Cherenkov light at (1 m, 0 m, 0 m)'''
+        gen = generator.photon.G4ParallelGenerator(1, optics.water_wcsim)
+        vertex = itertools.islice(constant_particle_gun('e-', (1,0,0), (1,0,0), 100), 10)
+        for event in gen.generate_events(vertex):
+            self.assertGreater(len(event.photons_beg.pos), 0)
diff --git a/test/test_generator_vertex.py b/test/test_generator_vertex.py
new file mode 100644
index 0000000..cec363d
--- /dev/null
+++ b/test/test_generator_vertex.py
@@ -0,0 +1,23 @@
+import unittest
+import itertools
+import numpy as np
+import chroma.generator.vertex
+
+class TestParticleGun(unittest.TestCase):
+    def test_constant_particle_gun_center(self):
+        '''Generate electron vertices at the center of the world volume.'''
+        vertex = chroma.generator.vertex.constant_particle_gun('e-', (0,0,0), (1,0,0), 100)
+        for ev in itertools.islice(vertex, 100):
+            self.assertEquals(ev.primary_vertex.particle_name, 'e-')
+            self.assertTrue(np.allclose(ev.primary_vertex.pos, [0,0,0]))
+            self.assertTrue(np.allclose(ev.primary_vertex.dir, [1,0,0]))
+            self.assertTrue(np.allclose(ev.primary_vertex.ke, 100))
+
+    def test_off_center(self):
+        '''Generate electron vertices at (1,0,0) in the world volume.'''
+        vertex = chroma.generator.vertex.constant_particle_gun('e-', (1,0,0), (1,0,0), 100)
+        for ev in itertools.islice(vertex, 100):
+            self.assertEquals(ev.primary_vertex.particle_name, 'e-')
+            self.assertTrue(np.allclose(ev.primary_vertex.pos, [1,0,0]))
+            self.assertTrue(np.allclose(ev.primary_vertex.dir, [1,0,0]))
+            self.assertTrue(np.allclose(ev.primary_vertex.ke, 100))
diff --git a/test/test_pdf.py b/test/test_pdf.py
new file mode 100644
index 0000000..571cbd4
--- /dev/null
+++ b/test/test_pdf.py
@@ -0,0 +1,67 @@
+import unittest
+import numpy as np
+import itertools
+
+import chroma.detectors
+from chroma.generator.photon import G4ParallelGenerator
+from chroma.generator.vertex import constant_particle_gun
+from chroma.optics import water_wcsim
+from chroma import gpu
+from chroma.sim import Simulation
+
+class TestPDF(unittest.TestCase):
+    def setUp(self):
+        self.detector = chroma.detectors.microlbne()
+        self.detector.build()
+        self.vertex_gen = constant_particle_gun('e-', (0,0,0), (1,0,0), 10)
+
+    def testGPUPDF(self):
+        '''Create a hit count and (q,t) PDF for 10 MeV events in MicroLBNE'''
+
+        g4generator = G4ParallelGenerator(1, water_wcsim)
+
+        context = gpu.create_cuda_context()
+
+        gpu_geometry = gpu.GPUGeometry(self.detector)
+
+        nthreads_per_block, max_blocks = 64, 1024
+
+        rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks)
+
+        gpu_daq = gpu.GPUDaq(gpu_geometry, max(self.detector.pmtids))
+        gpu_pdf = gpu.GPUPDF()
+        gpu_pdf.setup_pdf(max(self.detector.pmtids), 100, (-0.5, 999.5), 10, (-0.5, 9.5))
+        
+        gpu_pdf.clear_pdf()
+
+        for ev in g4generator.generate_events(itertools.islice(self.vertex_gen, 10)):
+            gpu_photons = gpu.GPUPhotons(ev.photons_beg)
+            gpu_photons.propagate(gpu_geometry, rng_states, nthreads_per_block,
+                                  max_blocks)
+            gpu_channels = gpu_daq.acquire(gpu_photons, rng_states,
+                                           nthreads_per_block, max_blocks)
+            gpu_pdf.add_hits_to_pdf(gpu_channels)
+
+        hitcount, pdf = gpu_pdf.get_pdfs()
+        self.assertTrue( (hitcount > 0).any() )
+        self.assertTrue( (pdf > 0).any() )
+
+        # Consistency checks
+        for i, nhits in enumerate(hitcount):
+            self.assertEqual(nhits, pdf[i].sum())
+
+        context.pop()
+
+    def testSimPDF(self):
+        sim = Simulation(self.detector)
+
+        vertex_iter = itertools.islice(self.vertex_gen, 10)
+
+        hitcount, pdf = sim.create_pdf(vertex_iter, 100, (-0.5, 999.5), 10, (-0.5, 9.5))
+
+        self.assertTrue( (hitcount > 0).any() )
+        self.assertTrue( (pdf > 0).any() )
+
+        # Consistency checks
+        for i, nhits in enumerate(hitcount):
+            self.assertEqual(nhits, pdf[i].sum())
diff --git a/test/test_propagation.py b/test/test_propagation.py
new file mode 100644
index 0000000..43ecb9b
--- /dev/null
+++ b/test/test_propagation.py
@@ -0,0 +1,58 @@
+import unittest
+import numpy as np
+
+from chroma.geometry import Solid, Geometry
+from chroma.make import box
+from chroma.sim import Simulation
+from chroma.optics import vacuum
+from chroma.event import Photons
+
+class TestPropagation(unittest.TestCase):
+    def testAbort(self):
+        '''Photons that hit a triangle at normal incidence should not abort.
+
+        Photons that hit a triangle at exactly normal incidence can sometimes
+        produce a dot product that is outside the range allowed by acos().
+        Trigger these with axis aligned photons in a box.
+        '''
+
+        # Setup geometry
+        cube = Geometry(vacuum)
+        cube.add_solid(Solid(box(100,100,100), vacuum, vacuum))
+        cube.pmtids = [0]
+        cube.build(use_cache=False)
+        sim = Simulation(cube, geant4_processes=0)
+
+        # Create initial photons
+        nphotons = 10000
+        pos = np.tile([0,0,0], (nphotons,1)).astype(np.float32)
+        dir = np.tile([0,0,1], (nphotons,1)).astype(np.float32)
+        pol = np.zeros_like(pos)
+        phi = np.random.uniform(0, 2*np.pi, nphotons).astype(np.float32)
+        pol[:,0] = np.cos(phi)
+        pol[:,1] = np.sin(phi)
+        t = np.zeros(nphotons, dtype=np.float32)
+        wavelengths = np.empty(nphotons, np.float32)
+        wavelengths.fill(400.0)
+
+        photons = Photons(pos=pos, dir=dir, pol=pol, t=t,
+                          wavelengths=wavelengths)
+
+        # First make one step to check for strangeness
+        photons_end = sim.simulate([photons], keep_photons_end=True,
+                                   max_steps=1).next().photons_end
+
+        self.assertFalse(np.isnan(photons_end.pos).any())
+        self.assertFalse(np.isnan(photons_end.dir).any())
+        self.assertFalse(np.isnan(photons_end.pol).any())
+        self.assertFalse(np.isnan(photons_end.t).any())
+        self.assertFalse(np.isnan(photons_end.wavelengths).any())
+
+        # Now let it run the usual ten steps
+        photons_end = sim.simulate([photons], keep_photons_end=True,
+                                   max_steps=10).next().photons_end
+        aborted = (photons_end.flags & (1 << 31)) > 0
+        print 'aborted photons: %1.1f' % \
+            (float(np.count_nonzero(aborted)) / nphotons)
+        self.assertFalse(aborted.any())
+        
diff --git a/test/test_ray_intersection.py b/test/test_ray_intersection.py
new file mode 100644
index 0000000..7d0c53c
--- /dev/null
+++ b/test/test_ray_intersection.py
@@ -0,0 +1,27 @@
+import unittest
+import chroma
+import numpy as np
+import os
+from pycuda import gpuarray as ga
+
+class TestRayIntersection(unittest.TestCase):
+    def setUp(self):
+        self.context = chroma.gpu.create_cuda_context()
+        self.module = chroma.gpu.get_cu_module('mesh.h')
+        self.gpu_funcs = chroma.gpu.GPUFuncs(self.module)
+        self.box = chroma.gpu.GPUGeometry(chroma.build(chroma.make.cube()))
+
+        pos, dir = chroma.project.from_film()
+        self.pos_gpu = ga.to_gpu(chroma.gpu.to_float3(pos))
+        self.dir_gpu = ga.to_gpu(chroma.gpu.to_float3(dir))
+
+        testdir = os.path.dirname(os.path.abspath(chroma.tests.__file__))
+        self.dx_standard = np.load(os.path.join(testdir,
+                                                'data/ray_intersection.npz'))
+    def test_intersection_distance(self):
+        dx = ga.zeros(self.pos_gpu.size, dtype=np.float32)
+        self.gpu_funcs.distance_to_mesh(np.int32(self.pos_gpu.size), self.pos_gpu, self.dir_gpu, self.box.gpudata, dx, block=(64,1,1), grid=(self.pos_gpu.size//64+1,1))
+        self.assertTrue((dx.get() == self.dx_standard).all())
+
+    def tearDown(self):
+        self.context.pop()
diff --git a/test/test_rayleigh.py b/test/test_rayleigh.py
new file mode 100644
index 0000000..02ccb41
--- /dev/null
+++ b/test/test_rayleigh.py
@@ -0,0 +1,57 @@
+import unittest
+import numpy as np
+
+from chroma.geometry import Solid, Geometry
+from chroma.make import box
+from chroma.sim import Simulation
+from chroma.optics import water_wcsim
+from chroma.event import Photons
+import histogram
+from histogram.root import rootify
+import ROOT
+ROOT.gROOT.SetBatch(1)
+
+class TestRayleigh(unittest.TestCase):
+    def setUp(self):
+        self.cube = Geometry(water_wcsim)
+        self.cube.add_solid(Solid(box(100,100,100), water_wcsim, water_wcsim))
+        self.cube.pmtids = [0]
+        self.cube.build(use_cache=False)
+        self.sim = Simulation(self.cube, geant4_processes=0)
+
+        nphotons = 100000
+        pos = np.tile([0,0,0], (nphotons,1)).astype(np.float32)
+        dir = np.tile([0,0,1], (nphotons,1)).astype(np.float32)
+        pol = np.zeros_like(pos)
+        phi = np.random.uniform(0, 2*np.pi, nphotons).astype(np.float32)
+        pol[:,0] = np.cos(phi)
+        pol[:,1] = np.sin(phi)
+        t = np.zeros(nphotons, dtype=np.float32)
+        wavelengths = np.empty(nphotons, np.float32)
+        wavelengths.fill(400.0)
+
+        self.photons = Photons(pos=pos, dir=dir, pol=pol, t=t, wavelengths=wavelengths)
+
+    def testAngularDistributionPolarized(self):
+        # Fully polarized photons
+        self.photons.pol[:] = [1.0, 0.0, 0.0]
+
+        photons_end = self.sim.simulate([self.photons], keep_photons_end=True, max_steps=1).next().photons_end
+        aborted = (photons_end.flags & (1 << 31)) > 0
+        self.assertFalse(aborted.any())
+
+        # Compute the dot product between initial and final dir
+        rayleigh_scatters = (photons_end.flags & (1 << 4)) > 0
+        cos_scatter = (self.photons.dir[rayleigh_scatters] * photons_end.dir[rayleigh_scatters]).sum(axis=1)
+        theta_scatter = np.arccos(cos_scatter)
+        h = histogram.Histogram(bins=100, range=(0, np.pi))
+        h.fill(theta_scatter)
+        h = rootify(h)
+
+        # The functional form for polarized light should be
+        # (1 + \cos^2 \theta)\sin \theta according to GEANT4 physics
+        # reference manual.
+        f = ROOT.TF1("pol_func", "[0]*(1+cos(x)**2)*sin(x)", 0, np.pi)
+        h.Fit(f)
+        self.assertGreater(f.GetProb(), 1e-3)
+
diff --git a/test/test_sample_cdf.cu b/test/test_sample_cdf.cu
new file mode 100644
index 0000000..1401772
--- /dev/null
+++ b/test/test_sample_cdf.cu
@@ -0,0 +1,16 @@
+// -*-c++-*-
+#include "random.h"
+
+extern "C" {
+
+__global__ void test_sample_cdf(int offset, int ncdf, 
+				float *cdf_x, float *cdf_y, float *out)
+{
+  int id = blockDim.x * blockIdx.x + threadIdx.x;
+  curandState s;
+  curand_init(0, id, offset, &s);
+
+  out[id] = sample_cdf(&s, ncdf, cdf_x, cdf_y);
+}
+
+}
diff --git a/test/test_sample_cdf.py b/test/test_sample_cdf.py
new file mode 100644
index 0000000..2baa2ac
--- /dev/null
+++ b/test/test_sample_cdf.py
@@ -0,0 +1,64 @@
+import pycuda.autoinit
+import pycuda.driver as cuda
+from pycuda.compiler import SourceModule
+from pycuda import gpuarray
+import numpy as np
+import ROOT
+import os
+
+current_directory = os.path.split(os.path.realpath(__file__))[0]
+source_directory = current_directory + '/../src'
+
+source = open(current_directory + '/test_sample_cdf.cu').read()
+
+mod = SourceModule(source, options=['-I' + source_directory], no_extern_c=True, cache_dir=False)
+
+test_sample_cdf = mod.get_function('test_sample_cdf')
+
+def compare_sampling(hist, reps=10):
+    nbins = hist.GetNbinsX();
+    xaxis = hist.GetXaxis()
+    intg = hist.GetIntegral()
+    cdf_y = np.empty(nbins+1, dtype=float)
+    cdf_x = np.empty_like(cdf_y)
+
+    cdf_x[0] = xaxis.GetBinLowEdge(1)
+    cdf_y[0] = 0.0
+    for i in xrange(1,len(cdf_x)):
+        cdf_y[i] = intg[i]
+        cdf_x[i] = xaxis.GetBinUpEdge(i)
+
+    cdf_x_gpu = gpuarray.to_gpu(cdf_x.astype(np.float32))
+    cdf_y_gpu = gpuarray.to_gpu(cdf_y.astype(np.float32))
+    block =(128,1,1)
+    grid = (128, 1)
+    out_gpu = gpuarray.empty(shape=int(block[0]*grid[0]), dtype=np.float32)
+
+    out_h = ROOT.TH1D('out_h', '', hist.GetNbinsX(), 
+                      xaxis.GetXmin(),
+                      xaxis.GetXmax())
+    out_h.SetLineColor(ROOT.kGreen)
+
+    for i in xrange(reps):
+        test_sample_cdf(np.int32(i),
+                        np.int32(len(cdf_x_gpu)), 
+                        cdf_x_gpu, cdf_y_gpu, out_gpu, block=block, grid=grid)
+        out = out_gpu.get()
+        for v in out:
+            out_h.Fill(v)
+
+    prob = out_h.KolmogorovTest(hist)
+    return prob, out_h 
+
+def test_sampling():
+    '''Verify that the CDF-based sampler on the GPU reproduces a binned
+    Gaussian distribution'''
+    f = ROOT.TF1('f_gaussian', 'gaus(0)', -5, 5)
+    f.SetParameters(1.0/np.sqrt(np.pi * 2), 0.0, 1.0)
+    gaussian = ROOT.TH1D('gaussian', '', 100, -5, 5)
+    gaussian.Add(f)
+
+    prob, out_h = compare_sampling(gaussian, reps=50)
+
+    assert prob > 0.01
+
diff --git a/tests/__init__.py b/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/data/ray_intersection.npz b/tests/data/ray_intersection.npz
deleted file mode 100644
index fba0e40..0000000
Binary files a/tests/data/ray_intersection.npz and /dev/null differ
diff --git a/tests/linalg_test.cu b/tests/linalg_test.cu
deleted file mode 100644
index 4e9c983..0000000
--- a/tests/linalg_test.cu
+++ /dev/null
@@ -1,128 +0,0 @@
-//-*-c-*-
-
-#include "linalg.h"
-
-extern "C"
-{
-
-__global__ void float3add(float3 *a, float3 *b, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = a[idx] + b[idx];
-}
-
-__global__ void float3addequal(float3 *a, float3 *b)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	a[idx] += b[idx];
-}
-
-__global__ void float3sub(float3 *a, float3 *b, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = a[idx] - b[idx];
-}
-
-__global__ void float3subequal(float3 *a, float3 *b)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	a[idx] -= b[idx];
-}
-
-__global__ void float3addfloat(float3 *a, float c, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = a[idx] + c;
-}
-
-__global__ void float3addfloatequal(float3 *a, float c)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	a[idx] += c;
-}
-
-__global__ void floataddfloat3(float3 *a, float c, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = c + a[idx];
-}
-
-__global__ void float3subfloat(float3 *a, float c, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = a[idx] - c;
-}
-
-__global__ void float3subfloatequal(float3 *a, float c)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	a[idx] -= c;
-}
-
-__global__ void floatsubfloat3(float3 *a, float c, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = c - a[idx];
-}
-
-__global__ void float3mulfloat(float3 *a, float c, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = a[idx]*c;
-}
-
-__global__ void float3mulfloatequal(float3 *a, float c)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	a[idx] *= c;
-}
-
-__global__ void floatmulfloat3(float3 *a, float c, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = c*a[idx];
-}
-
-__global__ void float3divfloat(float3 *a, float c, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = a[idx]/c;
-}
-
-__global__ void float3divfloatequal(float3 *a, float c)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	a[idx] /= c;
-}
-
-__global__ void floatdivfloat3(float3 *a, float c, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = c/a[idx];
-}
-
-__global__ void dot(float3 *a, float3 *b, float *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = dot(a[idx],b[idx]);
-}
-
-__global__ void cross(float3 *a, float3 *b, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = cross(a[idx],b[idx]);
-}
-
-__global__ void norm(float3 *a, float *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = norm(a[idx]);
-}
-
-__global__ void minusfloat3(float3 *a, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = -a[idx];
-}
-
-} // extern "c"
diff --git a/tests/linalg_test.py b/tests/linalg_test.py
deleted file mode 100644
index 31688d9..0000000
--- a/tests/linalg_test.py
+++ /dev/null
@@ -1,214 +0,0 @@
-import os
-import numpy as np
-from pycuda import autoinit
-from pycuda.compiler import SourceModule
-import pycuda.driver as cuda
-from pycuda import gpuarray
-
-float3 = gpuarray.vec.float3
-
-print 'device %s' % autoinit.device.name()
-
-current_directory = os.path.split(os.path.realpath(__file__))[0]
-source_directory = current_directory + '/../src'
-
-source = open(current_directory + '/linalg_test.cu').read()
-
-mod = SourceModule(source, options=['-I' + source_directory], no_extern_c=True, cache_dir=False)
-
-float3add = mod.get_function('float3add')
-float3addequal = mod.get_function('float3addequal')
-float3sub = mod.get_function('float3sub')
-float3subequal = mod.get_function('float3subequal')
-float3addfloat = mod.get_function('float3addfloat')
-float3addfloatequal = mod.get_function('float3addfloatequal')
-floataddfloat3 = mod.get_function('floataddfloat3')
-float3subfloat = mod.get_function('float3subfloat')
-float3subfloatequal = mod.get_function('float3subfloatequal')
-floatsubfloat3 = mod.get_function('floatsubfloat3')
-float3mulfloat = mod.get_function('float3mulfloat')
-float3mulfloatequal = mod.get_function('float3mulfloatequal')
-floatmulfloat3 = mod.get_function('floatmulfloat3')
-float3divfloat = mod.get_function('float3divfloat')
-float3divfloatequal = mod.get_function('float3divfloatequal')
-floatdivfloat3 = mod.get_function('floatdivfloat3')
-dot = mod.get_function('dot')
-cross = mod.get_function('cross')
-norm = mod.get_function('norm')
-minusfloat3 = mod.get_function('minusfloat3')
-
-size = {'block': (256,1,1), 'grid': (1,1)}
-
-a = np.empty(size['block'][0], dtype=float3)
-b = np.empty(size['block'][0], dtype=float3)
-c = np.float32(np.random.random_sample())
-
-a['x'] = np.random.random_sample(size=a.size)
-a['y'] = np.random.random_sample(size=a.size)
-a['z'] = np.random.random_sample(size=a.size)
-
-b['x'] = np.random.random_sample(size=b.size)
-b['y'] = np.random.random_sample(size=b.size)
-b['z'] = np.random.random_sample(size=b.size)
-
-def testfloat3add():
-    dest = np.empty(a.size, dtype=float3)
-    float3add(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
-    if not np.allclose(a['x']+b['x'], dest['x']) or \
-            not np.allclose(a['y']+b['y'], dest['y']) or \
-            not np.allclose(a['z']+b['z'], dest['z']):
-        assert False
-
-def testfloat3sub():
-    dest = np.empty(a.size, dtype=float3)
-    float3sub(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
-    if not np.allclose(a['x']-b['x'], dest['x']) or \
-            not np.allclose(a['y']-b['y'], dest['y']) or \
-            not np.allclose(a['z']-b['z'], dest['z']):
-        assert False
-
-def testfloat3addequal():
-    dest = np.copy(a)
-    float3addequal(cuda.InOut(dest), cuda.In(b), **size)
-    if not np.allclose(a['x']+b['x'], dest['x']) or \
-            not np.allclose(a['y']+b['y'], dest['y']) or \
-            not np.allclose(a['z']+b['z'], dest['z']):
-        assert False
-
-def testfloat3subequal():
-    dest = np.copy(a)
-    float3subequal(cuda.InOut(dest), cuda.In(b), **size)
-    if not np.allclose(a['x']-b['x'], dest['x']) or \
-            not np.allclose(a['y']-b['y'], dest['y']) or \
-            not np.allclose(a['z']-b['z'], dest['z']):
-        assert False
-
-def testfloat3addfloat():
-    dest = np.empty(a.size, dtype=float3)
-    float3addfloat(cuda.In(a), c, cuda.Out(dest), **size)
-    if not np.allclose(a['x']+c, dest['x']) or \
-            not np.allclose(a['y']+c, dest['y']) or \
-            not np.allclose(a['z']+c, dest['z']):
-        assert False
-
-def testfloat3addfloatequal():
-    dest = np.copy(a)
-    float3addfloatequal(cuda.InOut(dest), c, **size)
-    if not np.allclose(a['x']+c, dest['x']) or \
-            not np.allclose(a['y']+c, dest['y']) or \
-            not np.allclose(a['z']+c, dest['z']):
-        assert False
-
-def testfloataddfloat3():
-    dest = np.empty(a.size, dtype=float3)
-    floataddfloat3(cuda.In(a), c, cuda.Out(dest), **size)
-    if not np.allclose(c+a['x'], dest['x']) or \
-            not np.allclose(c+a['y'], dest['y']) or \
-            not np.allclose(c+a['z'], dest['z']):
-        assert False
-
-def testfloat3subfloat():
-    dest = np.empty(a.size, dtype=float3)
-    float3subfloat(cuda.In(a), c, cuda.Out(dest), **size)
-    if not np.allclose(a['x']-c, dest['x']) or \
-            not np.allclose(a['y']-c, dest['y']) or \
-            not np.allclose(a['z']-c, dest['z']):
-        assert False
-
-def testfloat3subfloatequal():
-    dest = np.copy(a)
-    float3subfloatequal(cuda.InOut(dest), c, **size)
-    if not np.allclose(a['x']-c, dest['x']) or \
-            not np.allclose(a['y']-c, dest['y']) or \
-            not np.allclose(a['z']-c, dest['z']):
-        assert False
-
-def testfloatsubfloat3():
-    dest = np.empty(a.size, dtype=float3)
-    floatsubfloat3(cuda.In(a), c, cuda.Out(dest), **size)
-    if not np.allclose(c-a['x'], dest['x']) or \
-            not np.allclose(c-a['y'], dest['y']) or \
-            not np.allclose(c-a['z'], dest['z']):
-        assert False
-
-def testfloat3mulfloat():
-    dest = np.empty(a.size, dtype=float3)
-    float3mulfloat(cuda.In(a), c, cuda.Out(dest), **size)
-    if not np.allclose(a['x']*c, dest['x']) or \
-            not np.allclose(a['y']*c, dest['y']) or \
-            not np.allclose(a['z']*c, dest['z']):
-        assert False
-
-def testfloat3mulfloatequal():
-    dest = np.copy(a)
-    float3mulfloatequal(cuda.InOut(dest), c, **size)
-    if not np.allclose(a['x']*c, dest['x']) or \
-            not np.allclose(a['y']*c, dest['y']) or \
-            not np.allclose(a['z']*c, dest['z']):
-        assert False
-
-def testfloatmulfloat3():
-    dest = np.empty(a.size, dtype=float3)
-    floatmulfloat3(cuda.In(a), c, cuda.Out(dest), **size)
-    if not np.allclose(c*a['x'], dest['x']) or \
-            not np.allclose(c*a['y'], dest['y']) or \
-            not np.allclose(c*a['z'], dest['z']):
-        assert False
-
-def testfloat3divfloat():
-    dest = np.empty(a.size, dtype=float3)
-    float3divfloat(cuda.In(a), c, cuda.Out(dest), **size)
-    if not np.allclose(a['x']/c, dest['x']) or \
-            not np.allclose(a['y']/c, dest['y']) or \
-            not np.allclose(a['z']/c, dest['z']):
-        assert False
-
-def testfloat3divfloatequal():
-    dest = np.copy(a)
-    float3divfloatequal(cuda.InOut(dest), c, **size)
-    if not np.allclose(a['x']/c, dest['x']) or \
-            not np.allclose(a['y']/c, dest['y']) or \
-            not np.allclose(a['z']/c, dest['z']):
-        assert False
-
-def testfloatdivfloat3():
-    dest = np.empty(a.size, dtype=float3)
-    floatdivfloat3(cuda.In(a), c, cuda.Out(dest), **size)
-    if not np.allclose(c/a['x'], dest['x']) or \
-            not np.allclose(c/a['y'], dest['y']) or \
-            not np.allclose(c/a['z'], dest['z']):
-        assert false
-
-def testdot():
-    dest = np.empty(a.size, dtype=np.float32)
-    dot(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
-    if not np.allclose(a['x']*b['x'] + a['y']*b['y'] + a['z']*b['z'], dest):
-        assert False
-
-def testcross():
-    dest = np.empty(a.size, dtype=float3)
-    cross(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
-    for u, v, wdest in zip(a,b,dest):
-        w = np.cross((u['x'], u['y'], u['z']),(v['x'],v['y'],v['z']))
-        if not np.allclose(wdest['x'], w[0]) or \
-                not np.allclose(wdest['y'], w[1]) or \
-                not np.allclose(wdest['z'], w[2]):
-            print w
-            print wdest
-            assert False
-
-def testnorm():
-    dest = np.empty(a.size, dtype=np.float32)
-    norm(cuda.In(a), cuda.Out(dest), **size)
-
-    for i in range(len(dest)):
-        if not np.allclose(np.linalg.norm((a['x'][i],a['y'][i],a['z'][i])), dest[i]):
-            assert False
-
-def testminusfloat3():
-    dest = np.empty(a.size, dtype=float3)
-    minusfloat3(cuda.In(a), cuda.Out(dest), **size)
-    if not np.allclose(-a['x'], dest['x']) or \
-            not np.allclose(-a['y'], dest['y']) or \
-            not np.allclose(-a['z'], dest['z']):
-        assert False
diff --git a/tests/matrix_test.cu b/tests/matrix_test.cu
deleted file mode 100644
index d64cb34..0000000
--- a/tests/matrix_test.cu
+++ /dev/null
@@ -1,152 +0,0 @@
-//-*-c-*-
-
-#include "matrix.h"
-
-__device__ Matrix array2matrix(float *a)
-{
-	return make_matrix(a[0], a[1], a[2],
-			   a[3], a[4], a[5],
-			   a[6], a[7], a[8]);
-}
-
-__device__ void matrix2array(const Matrix &m, float *a)
-{
-	a[0] = m.a00;
-	a[1] = m.a01;
-	a[2] = m.a02;
-	a[3] = m.a10;
-	a[4] = m.a11;
-	a[5] = m.a12;
-	a[6] = m.a20;
-	a[7] = m.a21;
-	a[8] = m.a22;
-}
-
-extern "C"
-{
-
-__global__ void det(float *a, float *dest)
-{
-	Matrix m = array2matrix(a);
-	dest[0] = det(m);
-}
-
-__global__ void inv(float *a, float *dest)
-{
-	Matrix m = array2matrix(a);
-	matrix2array(inv(m), dest);
-}
-
-__global__ void minusmatrix(float *a, float *dest)
-{
-	matrix2array(-array2matrix(a), dest);
-}
-
-__global__ void matrixadd(float *a, float *b, float *dest)
-{
-	matrix2array(array2matrix(a)+array2matrix(b), dest);
-}
-
-__global__ void matrixsub(float *a, float *b, float *dest)
-{
-	matrix2array(array2matrix(a)-array2matrix(b), dest);
-}
-
-__global__ void matrixmul(float *a, float *b, float *dest)
-{
-	matrix2array(array2matrix(a)*array2matrix(b), dest);
-}
-
-__global__ void multiply(float *a, float3 *x, float3 *dest)
-{
-	dest[0] = array2matrix(a)*x[0];
-}
-
-__global__ void matrixaddfloat(float *a, float c, float *dest)
-{
-	matrix2array(array2matrix(a)+c, dest);
-}
-
-__global__ void matrixsubfloat(float *a, float c, float *dest)
-{
-	matrix2array(array2matrix(a)-c, dest);
-}
-
-__global__ void matrixmulfloat(float *a, float c, float *dest)
-{
-	matrix2array(array2matrix(a)*c, dest);
-}
-
-__global__ void matrixdivfloat(float *a, float c, float *dest)
-{
-	matrix2array(array2matrix(a)/c, dest);
-}
-
-__global__ void floataddmatrix(float *a, float c, float *dest)
-{
-	matrix2array(c+array2matrix(a), dest);
-}
-
-__global__ void floatsubmatrix(float *a, float c, float *dest)
-{
-	matrix2array(c-array2matrix(a), dest);
-}
-
-__global__ void floatmulmatrix(float *a, float c, float *dest)
-{
-	matrix2array(c*array2matrix(a), dest);
-}
-
-__global__ void floatdivmatrix(float *a, float c, float *dest)
-{
-	matrix2array(c/array2matrix(a), dest);
-}
-
-__global__ void matrixaddequals(float *a, float *b)
-{
-	Matrix m = array2matrix(a);
-	m += array2matrix(b);
-	matrix2array(m,a);
-}
-
-__global__ void matrixsubequals(float *a, float *b)
-{
-	Matrix m = array2matrix(a);
-	m -= array2matrix(b);
-	matrix2array(m,a);
-}
-
-__global__ void matrixaddequalsfloat(float *a, float c)
-{
-	Matrix m = array2matrix(a);
-	m += c;
-	matrix2array(m,a);
-}
-
-__global__ void matrixsubequalsfloat(float *a, float c)
-{
-	Matrix m = array2matrix(a);
-	m -= c;
-	matrix2array(m,a);
-}
-
-__global__ void matrixmulequalsfloat(float *a, float c)
-{
-	Matrix m = array2matrix(a);
-	m *= c;
-	matrix2array(m,a);
-}
-
-__global__ void matrixdivequalsfloat(float *a, float c)
-{
-	Matrix m = array2matrix(a);
-	m /= c;
-	matrix2array(m,a);
-}
-
-__global__ void outer(float3 a, float3 b, float* dest)
-{
-	matrix2array(outer(a,b), dest);
-}
-
-} // extern "c"
diff --git a/tests/matrix_test.py b/tests/matrix_test.py
deleted file mode 100644
index c843025..0000000
--- a/tests/matrix_test.py
+++ /dev/null
@@ -1,327 +0,0 @@
-import os
-import numpy as np
-from pycuda import autoinit
-from pycuda.compiler import SourceModule
-import pycuda.driver as cuda
-from pycuda import gpuarray
-
-float3 = gpuarray.vec.float3
-
-print 'device %s' % autoinit.device.name()
-
-current_directory = os.path.split(os.path.realpath(__file__))[0]
-source_directory = current_directory + '/../src'
-
-source = open(current_directory + '/matrix_test.cu').read()
-
-mod = SourceModule(source, options=['-I' + source_directory], no_extern_c=True, cache_dir=False)
-
-det = mod.get_function('det')
-inv = mod.get_function('inv')
-matrixadd = mod.get_function('matrixadd')
-matrixsub = mod.get_function('matrixsub')
-matrixmul = mod.get_function('matrixmul')
-multiply = mod.get_function('multiply')
-matrixaddfloat = mod.get_function('matrixaddfloat')
-matrixsubfloat = mod.get_function('matrixsubfloat')
-matrixmulfloat = mod.get_function('matrixmulfloat')
-matrixdivfloat = mod.get_function('matrixdivfloat')
-floataddmatrix = mod.get_function('floataddmatrix')
-floatsubmatrix = mod.get_function('floatsubmatrix')
-floatmulmatrix = mod.get_function('floatmulmatrix')
-floatdivmatrix = mod.get_function('floatdivmatrix')
-matrixaddequals = mod.get_function('matrixaddequals')
-matrixsubequals = mod.get_function('matrixsubequals')
-matrixaddequalsfloat = mod.get_function('matrixaddequalsfloat')
-matrixsubequalsfloat = mod.get_function('matrixsubequalsfloat')
-matrixmulequalsfloat = mod.get_function('matrixmulequalsfloat')
-matrixdivequalsfloat = mod.get_function('matrixdivequalsfloat')
-outer = mod.get_function('outer')
-minusmatrix = mod.get_function('minusmatrix')
-
-size = {'block': (1,1,1), 'grid': (1,1)}
-
-for i in range(1):
-    a = np.random.random_sample(size=9).astype(np.float32)
-    b = np.random.random_sample(size=9).astype(np.float32)
-    dest = np.empty(1, dtype=np.float32)
-    c = np.int32(np.random.random_sample())
-
-    print 'testing det...',
-
-    det(cuda.In(a), cuda.Out(dest), **size)
-
-    if not np.allclose(np.float32(np.linalg.det(a.reshape(3,3))), dest[0]):
-        print 'fail'
-        print np.float32(np.linalg.det(a.reshape(3,3)))
-        print dest[0]
-    else:
-        print 'success'
-
-    print 'testing inv...',
-
-    dest = np.empty(9, dtype=np.float32)
-
-    inv(cuda.In(a), cuda.Out(dest), **size)
-
-    if not np.allclose(np.linalg.inv(a.reshape(3,3)).flatten().astype(np.float32), dest):
-        print 'fail'
-        print np.linalg.inv(a.reshape(3,3)).flatten().astype(np.float32)
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixadd...',
-
-    matrixadd(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
-
-    if not np.allclose(a+b, dest):
-        print 'fail'
-        print a+b
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixsub...',
-
-    matrixsub(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
-
-    if not np.allclose(a-b, dest):
-        print 'fail'
-        print a-b
-        print dest
-    else:
-        print 'sucess'
-
-    print 'testing matrixmul...',
-
-    matrixmul(cuda.In(a), cuda.In(b), cuda.Out(dest), **size)
-
-    if not np.allclose(np.dot(a.reshape(3,3),b.reshape(3,3)).flatten(), dest):
-        print 'fail'
-        print np.dot(a.reshape(3,3),b.reshape(3,3)).flatten()
-        print dest
-    else:
-        print 'success'
-
-    print 'testing multiply...',
-
-    x_cpu = np.random.random_sample(size=3).astype(np.float32)
-    x_gpu = np.array((x_cpu[0], x_cpu[1], x_cpu[2]), dtype=float3)
-
-    dest = np.empty(1, dtype=float3)
-    
-    multiply(cuda.In(a), cuda.In(x_gpu), cuda.Out(dest), **size)
-
-    m = a.reshape(3,3)
-
-    if not np.allclose(np.dot(x_cpu,m[0]), dest[0]['x']) or \
-            not np.allclose(np.dot(x_cpu,m[1]), dest[0]['y']) or \
-            not np.allclose(np.dot(x_cpu,m[2]), dest[0]['z']):
-        print 'fail'
-        print np.dot(x_cpu,m[0])
-        print np.dot(x_cpu,m[1])
-        print np.dot(x_cpu,m[2])
-        print dest[0]['x']
-        print dest[0]['y']
-        print dest[0]['z']
-    else:
-        print 'success'
-
-    print 'testing matrixaddfloat...',
-
-    dest = np.empty(9, dtype=np.float32)
-
-    matrixaddfloat(cuda.In(a), c, cuda.Out(dest), **size)
-
-    if not np.allclose(a+c, dest):
-        print 'fail'
-        print a+c
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixsubfloat...',
-
-    matrixsubfloat(cuda.In(a), c, cuda.Out(dest), **size)
-
-    if not np.allclose(a-c, dest):
-        print 'fail'
-        print a-c
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixmulfloat...',
-
-    matrixmulfloat(cuda.In(a), c, cuda.Out(dest), **size)
-
-    if not np.allclose(a*c, dest):
-        print 'fail'
-        print a-c
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixdivfloat...',
-
-    matrixdivfloat(cuda.In(a), c, cuda.Out(dest), **size)
-
-    if not np.allclose(a/c, dest):
-        print 'fail'
-        print a/c
-        print dest
-    else:
-        print 'success'
-
-    print 'testing floataddmatrix...',
-
-    floataddmatrix(cuda.In(a), c, cuda.Out(dest), **size)
-
-    if not np.allclose(c+a, dest):
-        print 'fail'
-        print c+a
-        print dest
-    else:
-        print 'success'
-
-    print 'testing floatsubmatrix...',
-
-    floatsubmatrix(cuda.In(a), c, cuda.Out(dest), **size)
-
-    if not np.allclose(c-a, dest):
-        print 'fail'
-        print c-a
-        print dest
-    else:
-        print 'success'
-
-    print 'testing floatmulmatrix...',
-
-    floatmulmatrix(cuda.In(a), c, cuda.Out(dest), **size)
-
-    if not np.allclose(c*a, dest):
-        print 'fail'
-        print c*a
-        print dest
-    else:
-        print 'success'
-
-    print 'testing floatdivmatrix...',
-
-    floatdivmatrix(cuda.In(a), c, cuda.Out(dest), **size)
-
-    if not np.allclose(c/a, dest):
-        print 'fail'
-        print c/a
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixaddequals...',
-
-    dest = np.copy(a)
-
-    matrixaddequals(cuda.InOut(dest), cuda.In(b), **size)
-
-    if not np.allclose(a+b, dest):
-        print 'fail'
-        print a+b
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixsubequals...',
-
-    dest = np.copy(a)
-
-    matrixsubequals(cuda.InOut(dest), cuda.In(b), **size)
-
-    if not np.allclose(a-b, dest):
-        print 'fail'
-        print a-b
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixaddequalsfloat...',
-
-    dest = np.copy(a)
-
-    matrixaddequalsfloat(cuda.InOut(dest), c, **size)
-
-    if not np.allclose(a+c, dest):
-        print 'fail'
-        print a+c
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixsubequalsfloat...',
-
-    dest = np.copy(a)
-
-    matrixsubequalsfloat(cuda.InOut(dest), c, **size)
-
-    if not np.allclose(a-c, dest):
-        print 'fail'
-        print a-c
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixmulequalsfloat...',
-
-    dest = np.copy(a)
-
-    matrixmulequalsfloat(cuda.InOut(dest), c, **size)
-
-    if not np.allclose(a*c, dest):
-        print 'fail'
-        print a*c
-        print dest
-    else:
-        print 'success'
-
-    print 'testing matrixdivequalsfloat...',
-
-    dest = np.copy(a)
-
-    matrixdivequalsfloat(cuda.InOut(dest), c, **size)
-
-    if not np.allclose(a/c, dest):
-        print 'fail'
-        print a*c
-        print dest
-    else:
-        print 'success'
-
-    print 'testing outer...',
-
-    x1_cpu = np.random.random_sample(size=3).astype(np.float32)
-    x2_cpu = np.random.random_sample(size=3).astype(np.float32)
-
-    x1_gpu = np.array((x1_cpu[0], x1_cpu[1], x1_cpu[2]), dtype=float3)
-    x2_gpu = np.array((x2_cpu[0], x2_cpu[1], x2_cpu[2]), dtype=float3)
-
-    outer(x1_gpu, x2_gpu, cuda.Out(dest), **size)
-
-    if not np.allclose(np.outer(x1_cpu, x2_cpu).flatten(), dest):
-        print 'fail'
-        print np.outer(x1_cpu, x2_cpu).flatten()
-        print dest
-    else:
-        print 'success'
-
-    print 'testing minus matrix...',
-
-    dest = np.copy(a)
-
-    minusmatrix(cuda.In(a), cuda.Out(dest), **size)
-
-    if not np.allclose(-a, dest):
-        print 'fail'
-        print -a
-        print dest
-    else:
-        print 'success'
diff --git a/tests/rotate_test.cu b/tests/rotate_test.cu
deleted file mode 100644
index 6cafc12..0000000
--- a/tests/rotate_test.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-//-*-c-*-
-
-#include "rotate.h"
-
-extern "C"
-{
-
-__global__ void rotate(float3 *a, float *phi, float3 *n, float3 *dest)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	dest[idx] = rotate(a[idx], phi[idx], n[idx]);
-}
-
-} // extern "c"
diff --git a/tests/rotate_test.py b/tests/rotate_test.py
deleted file mode 100644
index 92eff84..0000000
--- a/tests/rotate_test.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import os
-import numpy as np
-from pycuda import autoinit
-from pycuda.compiler import SourceModule
-import pycuda.driver as cuda
-from pycuda import gpuarray
-float3 = gpuarray.vec.float3
-
-def rotate(x, phi, n):
-    x = np.asarray(x)
-    n = np.asarray(n)
-
-    r = np.cos(phi)*np.identity(3) + (1-np.cos(phi))*np.outer(n,n) + \
-        np.sin(phi)*np.array([[0,n[2],-n[1]],[-n[2],0,n[0]],[n[1],-n[0],0]])
-
-    return np.inner(x,r)
-
-print 'device %s' % autoinit.device.name()
-
-current_directory = os.path.split(os.path.realpath(__file__))[0]
-source_directory = current_directory + '/../src'
-
-source = open(current_directory + '/rotate_test.cu').read()
-
-mod = SourceModule(source, options=['-I' + source_directory], no_extern_c=True, cache_dir=False)
-
-rotate_gpu = mod.get_function('rotate')
-
-size = {'block': (100,1,1), 'grid': (1,1)}
-
-a = np.empty(size['block'][0], dtype=float3)
-n = np.empty(size['block'][0], dtype=float3)
-phi = np.random.random_sample(size=a.size).astype(np.float32)
-
-a['x'] = np.random.random_sample(size=a.size)
-a['y'] = np.random.random_sample(size=a.size)
-a['z'] = np.random.random_sample(size=a.size)
-
-n['x'] = np.random.random_sample(size=n.size)
-n['y'] = np.random.random_sample(size=n.size)
-n['z'] = np.random.random_sample(size=n.size)
-
-a['x'] = np.ones(a.size)
-a['y'] = np.zeros(a.size)
-a['z'] = np.zeros(a.size)
-
-n['x'] = np.zeros(n.size)
-n['y'] = np.zeros(n.size)
-n['z'] = np.ones(n.size)
-
-phi = np.array([np.pi/2]*a.size).astype(np.float32)
-
-def testrotate():
-    dest = np.empty(a.size, dtype=float3)
-    rotate_gpu(cuda.In(a), cuda.In(phi), cuda.In(n), cuda.Out(dest), **size)
-    for v, theta, w, rdest in zip(a,phi,n,dest):
-        r = rotate((v['x'], v['y'], v['z']), theta, (w['x'], w['y'], w['z']))
-        if not np.allclose(rdest['x'], r[0]) or \
-                not np.allclose(rdest['y'], r[1]) or \
-                not np.allclose(rdest['z'], r[2]):
-            print v
-            print theta
-            print w
-            print r
-            print rdest
-            assert False
-
diff --git a/tests/test_fileio.py b/tests/test_fileio.py
deleted file mode 100644
index 3869a9f..0000000
--- a/tests/test_fileio.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import unittest
-from chroma.fileio import root
-from chroma import event
-import numpy as np
-
-class TestFileIO(unittest.TestCase):
-    def test_file_write_and_read(self):
-        ev = event.Event(1, event.Vertex('e-', pos=(0,0,1), dir=(1,0,0),
-                                         ke=15.0, pol=(0,1,0)))
-
-        photons_beg = root.make_photon_with_arrays(1)
-        photons_beg.pos[0] = (1,2,3)
-        photons_beg.dir[0] = (4,5,6)
-        photons_beg.pol[0] = (7,8,9)
-        photons_beg.wavelengths[0] = 400.0
-        photons_beg.t[0] = 100.0
-        photons_beg.last_hit_triangles[0] = 5
-        photons_beg.flags[0] = 20
-        ev.photons_beg = photons_beg
-
-        photons_end = root.make_photon_with_arrays(1)
-        photons_end.pos[0] = (1,2,3)
-        photons_end.dir[0] = (4,5,6)
-        photons_end.pol[0] = (7,8,9)
-        photons_end.wavelengths[0] = 400.0
-        photons_end.t[0] = 100.0
-        photons_end.last_hit_triangles[0] = 5
-        photons_end.flags[0] = 20
-        ev.photons_end = photons_end
-
-        ev.vertices = [ev.primary_vertex]
-
-        channels = event.Channels(hit=np.array([True, False]), 
-                                  t=np.array([20.0, 1e9], dtype=np.float32),
-                                  q=np.array([2.0, 0.0], dtype=np.float32),
-                                  flags=np.array([8, 32], dtype=np.uint32))
-        ev.channels = channels
-
-        filename = '/tmp/chroma-filewritertest.root'
-        writer = root.RootWriter(filename)
-        writer.write_event(ev)
-        writer.close()
-
-        # Exercise the RootReader methods
-        reader = root.RootReader(filename)
-        self.assertEquals(len(reader), 1)
-
-        self.assertRaises(StopIteration, reader.prev)
-
-        reader.next()
-
-        self.assertEqual(reader.index(), 0)
-        self.assertRaises(StopIteration, reader.next)
-        
-        reader.jump_to(0)
-
-        # Enough screwing around, let's get the one event in the file
-        newev = reader.current()
-
-        # Now check if everything is correct in the event
-        for attribute in ['id']:
-            self.assertEqual(getattr(ev, attribute), getattr(newev, attribute), 'compare %s' % attribute)
-
-        for attribute in ['pos', 'dir', 'pol', 'ke', 't0']:
-            self.assertTrue(np.allclose(getattr(ev.primary_vertex, attribute), getattr(newev.primary_vertex, attribute)), 'compare %s' % attribute)
-
-            for i in range(len(ev.vertices)):
-                self.assertTrue(np.allclose(getattr(ev.vertices[i], attribute), getattr(newev.vertices[i], attribute)), 'compare %s' % attribute)
-
-        for attribute in ['pos', 'dir', 'pol', 'wavelengths', 't', 'last_hit_triangles', 'flags']:    
-            self.assertTrue(np.allclose(getattr(ev.photons_beg, attribute), 
-                                        getattr(newev.photons_beg, attribute)), 'compare %s' % attribute)
-            self.assertTrue(np.allclose(getattr(ev.photons_end, attribute), 
-                                        getattr(newev.photons_end, attribute)), 'compare %s' % attribute)
diff --git a/tests/test_generator_photon.py b/tests/test_generator_photon.py
deleted file mode 100644
index 13501fe..0000000
--- a/tests/test_generator_photon.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import unittest
-import itertools
-
-from chroma import generator
-from chroma.generator.vertex import constant_particle_gun
-from chroma import optics
-
-class TestG4ParallelGenerator(unittest.TestCase):
-    def test_center(self):
-        '''Generate Cherenkov light at the center of the world volume'''
-        gen = generator.photon.G4ParallelGenerator(1, optics.water_wcsim)
-        vertex = itertools.islice(constant_particle_gun('e-', (0,0,0), (1,0,0), 100), 10)
-        for event in gen.generate_events(vertex):
-            self.assertGreater(len(event.photons_beg.pos), 0)
-
-    def test_off_center(self):
-        '''Generate Cherenkov light at (1 m, 0 m, 0 m)'''
-        gen = generator.photon.G4ParallelGenerator(1, optics.water_wcsim)
-        vertex = itertools.islice(constant_particle_gun('e-', (1,0,0), (1,0,0), 100), 10)
-        for event in gen.generate_events(vertex):
-            self.assertGreater(len(event.photons_beg.pos), 0)
diff --git a/tests/test_generator_vertex.py b/tests/test_generator_vertex.py
deleted file mode 100644
index cec363d..0000000
--- a/tests/test_generator_vertex.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import unittest
-import itertools
-import numpy as np
-import chroma.generator.vertex
-
-class TestParticleGun(unittest.TestCase):
-    def test_constant_particle_gun_center(self):
-        '''Generate electron vertices at the center of the world volume.'''
-        vertex = chroma.generator.vertex.constant_particle_gun('e-', (0,0,0), (1,0,0), 100)
-        for ev in itertools.islice(vertex, 100):
-            self.assertEquals(ev.primary_vertex.particle_name, 'e-')
-            self.assertTrue(np.allclose(ev.primary_vertex.pos, [0,0,0]))
-            self.assertTrue(np.allclose(ev.primary_vertex.dir, [1,0,0]))
-            self.assertTrue(np.allclose(ev.primary_vertex.ke, 100))
-
-    def test_off_center(self):
-        '''Generate electron vertices at (1,0,0) in the world volume.'''
-        vertex = chroma.generator.vertex.constant_particle_gun('e-', (1,0,0), (1,0,0), 100)
-        for ev in itertools.islice(vertex, 100):
-            self.assertEquals(ev.primary_vertex.particle_name, 'e-')
-            self.assertTrue(np.allclose(ev.primary_vertex.pos, [1,0,0]))
-            self.assertTrue(np.allclose(ev.primary_vertex.dir, [1,0,0]))
-            self.assertTrue(np.allclose(ev.primary_vertex.ke, 100))
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
deleted file mode 100644
index 571cbd4..0000000
--- a/tests/test_pdf.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import unittest
-import numpy as np
-import itertools
-
-import chroma.detectors
-from chroma.generator.photon import G4ParallelGenerator
-from chroma.generator.vertex import constant_particle_gun
-from chroma.optics import water_wcsim
-from chroma import gpu
-from chroma.sim import Simulation
-
-class TestPDF(unittest.TestCase):
-    def setUp(self):
-        self.detector = chroma.detectors.microlbne()
-        self.detector.build()
-        self.vertex_gen = constant_particle_gun('e-', (0,0,0), (1,0,0), 10)
-
-    def testGPUPDF(self):
-        '''Create a hit count and (q,t) PDF for 10 MeV events in MicroLBNE'''
-
-        g4generator = G4ParallelGenerator(1, water_wcsim)
-
-        context = gpu.create_cuda_context()
-
-        gpu_geometry = gpu.GPUGeometry(self.detector)
-
-        nthreads_per_block, max_blocks = 64, 1024
-
-        rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks)
-
-        gpu_daq = gpu.GPUDaq(gpu_geometry, max(self.detector.pmtids))
-        gpu_pdf = gpu.GPUPDF()
-        gpu_pdf.setup_pdf(max(self.detector.pmtids), 100, (-0.5, 999.5), 10, (-0.5, 9.5))
-        
-        gpu_pdf.clear_pdf()
-
-        for ev in g4generator.generate_events(itertools.islice(self.vertex_gen, 10)):
-            gpu_photons = gpu.GPUPhotons(ev.photons_beg)
-            gpu_photons.propagate(gpu_geometry, rng_states, nthreads_per_block,
-                                  max_blocks)
-            gpu_channels = gpu_daq.acquire(gpu_photons, rng_states,
-                                           nthreads_per_block, max_blocks)
-            gpu_pdf.add_hits_to_pdf(gpu_channels)
-
-        hitcount, pdf = gpu_pdf.get_pdfs()
-        self.assertTrue( (hitcount > 0).any() )
-        self.assertTrue( (pdf > 0).any() )
-
-        # Consistency checks
-        for i, nhits in enumerate(hitcount):
-            self.assertEqual(nhits, pdf[i].sum())
-
-        context.pop()
-
-    def testSimPDF(self):
-        sim = Simulation(self.detector)
-
-        vertex_iter = itertools.islice(self.vertex_gen, 10)
-
-        hitcount, pdf = sim.create_pdf(vertex_iter, 100, (-0.5, 999.5), 10, (-0.5, 9.5))
-
-        self.assertTrue( (hitcount > 0).any() )
-        self.assertTrue( (pdf > 0).any() )
-
-        # Consistency checks
-        for i, nhits in enumerate(hitcount):
-            self.assertEqual(nhits, pdf[i].sum())
diff --git a/tests/test_propagation.py b/tests/test_propagation.py
deleted file mode 100644
index 43ecb9b..0000000
--- a/tests/test_propagation.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import unittest
-import numpy as np
-
-from chroma.geometry import Solid, Geometry
-from chroma.make import box
-from chroma.sim import Simulation
-from chroma.optics import vacuum
-from chroma.event import Photons
-
-class TestPropagation(unittest.TestCase):
-    def testAbort(self):
-        '''Photons that hit a triangle at normal incidence should not abort.
-
-        Photons that hit a triangle at exactly normal incidence can sometimes
-        produce a dot product that is outside the range allowed by acos().
-        Trigger these with axis aligned photons in a box.
-        '''
-
-        # Setup geometry
-        cube = Geometry(vacuum)
-        cube.add_solid(Solid(box(100,100,100), vacuum, vacuum))
-        cube.pmtids = [0]
-        cube.build(use_cache=False)
-        sim = Simulation(cube, geant4_processes=0)
-
-        # Create initial photons
-        nphotons = 10000
-        pos = np.tile([0,0,0], (nphotons,1)).astype(np.float32)
-        dir = np.tile([0,0,1], (nphotons,1)).astype(np.float32)
-        pol = np.zeros_like(pos)
-        phi = np.random.uniform(0, 2*np.pi, nphotons).astype(np.float32)
-        pol[:,0] = np.cos(phi)
-        pol[:,1] = np.sin(phi)
-        t = np.zeros(nphotons, dtype=np.float32)
-        wavelengths = np.empty(nphotons, np.float32)
-        wavelengths.fill(400.0)
-
-        photons = Photons(pos=pos, dir=dir, pol=pol, t=t,
-                          wavelengths=wavelengths)
-
-        # First make one step to check for strangeness
-        photons_end = sim.simulate([photons], keep_photons_end=True,
-                                   max_steps=1).next().photons_end
-
-        self.assertFalse(np.isnan(photons_end.pos).any())
-        self.assertFalse(np.isnan(photons_end.dir).any())
-        self.assertFalse(np.isnan(photons_end.pol).any())
-        self.assertFalse(np.isnan(photons_end.t).any())
-        self.assertFalse(np.isnan(photons_end.wavelengths).any())
-
-        # Now let it run the usual ten steps
-        photons_end = sim.simulate([photons], keep_photons_end=True,
-                                   max_steps=10).next().photons_end
-        aborted = (photons_end.flags & (1 << 31)) > 0
-        print 'aborted photons: %1.1f' % \
-            (float(np.count_nonzero(aborted)) / nphotons)
-        self.assertFalse(aborted.any())
-        
diff --git a/tests/test_ray_intersection.py b/tests/test_ray_intersection.py
deleted file mode 100644
index 7d0c53c..0000000
--- a/tests/test_ray_intersection.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import unittest
-import chroma
-import numpy as np
-import os
-from pycuda import gpuarray as ga
-
-class TestRayIntersection(unittest.TestCase):
-    def setUp(self):
-        self.context = chroma.gpu.create_cuda_context()
-        self.module = chroma.gpu.get_cu_module('mesh.h')
-        self.gpu_funcs = chroma.gpu.GPUFuncs(self.module)
-        self.box = chroma.gpu.GPUGeometry(chroma.build(chroma.make.cube()))
-
-        pos, dir = chroma.project.from_film()
-        self.pos_gpu = ga.to_gpu(chroma.gpu.to_float3(pos))
-        self.dir_gpu = ga.to_gpu(chroma.gpu.to_float3(dir))
-
-        testdir = os.path.dirname(os.path.abspath(chroma.tests.__file__))
-        self.dx_standard = np.load(os.path.join(testdir,
-                                                'data/ray_intersection.npz'))
-    def test_intersection_distance(self):
-        dx = ga.zeros(self.pos_gpu.size, dtype=np.float32)
-        self.gpu_funcs.distance_to_mesh(np.int32(self.pos_gpu.size), self.pos_gpu, self.dir_gpu, self.box.gpudata, dx, block=(64,1,1), grid=(self.pos_gpu.size//64+1,1))
-        self.assertTrue((dx.get() == self.dx_standard).all())
-
-    def tearDown(self):
-        self.context.pop()
diff --git a/tests/test_rayleigh.py b/tests/test_rayleigh.py
deleted file mode 100644
index 02ccb41..0000000
--- a/tests/test_rayleigh.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import unittest
-import numpy as np
-
-from chroma.geometry import Solid, Geometry
-from chroma.make import box
-from chroma.sim import Simulation
-from chroma.optics import water_wcsim
-from chroma.event import Photons
-import histogram
-from histogram.root import rootify
-import ROOT
-ROOT.gROOT.SetBatch(1)
-
-class TestRayleigh(unittest.TestCase):
-    def setUp(self):
-        self.cube = Geometry(water_wcsim)
-        self.cube.add_solid(Solid(box(100,100,100), water_wcsim, water_wcsim))
-        self.cube.pmtids = [0]
-        self.cube.build(use_cache=False)
-        self.sim = Simulation(self.cube, geant4_processes=0)
-
-        nphotons = 100000
-        pos = np.tile([0,0,0], (nphotons,1)).astype(np.float32)
-        dir = np.tile([0,0,1], (nphotons,1)).astype(np.float32)
-        pol = np.zeros_like(pos)
-        phi = np.random.uniform(0, 2*np.pi, nphotons).astype(np.float32)
-        pol[:,0] = np.cos(phi)
-        pol[:,1] = np.sin(phi)
-        t = np.zeros(nphotons, dtype=np.float32)
-        wavelengths = np.empty(nphotons, np.float32)
-        wavelengths.fill(400.0)
-
-        self.photons = Photons(pos=pos, dir=dir, pol=pol, t=t, wavelengths=wavelengths)
-
-    def testAngularDistributionPolarized(self):
-        # Fully polarized photons
-        self.photons.pol[:] = [1.0, 0.0, 0.0]
-
-        photons_end = self.sim.simulate([self.photons], keep_photons_end=True, max_steps=1).next().photons_end
-        aborted = (photons_end.flags & (1 << 31)) > 0
-        self.assertFalse(aborted.any())
-
-        # Compute the dot product between initial and final dir
-        rayleigh_scatters = (photons_end.flags & (1 << 4)) > 0
-        cos_scatter = (self.photons.dir[rayleigh_scatters] * photons_end.dir[rayleigh_scatters]).sum(axis=1)
-        theta_scatter = np.arccos(cos_scatter)
-        h = histogram.Histogram(bins=100, range=(0, np.pi))
-        h.fill(theta_scatter)
-        h = rootify(h)
-
-        # The functional form for polarized light should be
-        # (1 + \cos^2 \theta)\sin \theta according to GEANT4 physics
-        # reference manual.
-        f = ROOT.TF1("pol_func", "[0]*(1+cos(x)**2)*sin(x)", 0, np.pi)
-        h.Fit(f)
-        self.assertGreater(f.GetProb(), 1e-3)
-
diff --git a/tests/test_sample_cdf.cu b/tests/test_sample_cdf.cu
deleted file mode 100644
index 1401772..0000000
--- a/tests/test_sample_cdf.cu
+++ /dev/null
@@ -1,16 +0,0 @@
-// -*-c++-*-
-#include "random.h"
-
-extern "C" {
-
-__global__ void test_sample_cdf(int offset, int ncdf, 
-				float *cdf_x, float *cdf_y, float *out)
-{
-  int id = blockDim.x * blockIdx.x + threadIdx.x;
-  curandState s;
-  curand_init(0, id, offset, &s);
-
-  out[id] = sample_cdf(&s, ncdf, cdf_x, cdf_y);
-}
-
-}
diff --git a/tests/test_sample_cdf.py b/tests/test_sample_cdf.py
deleted file mode 100644
index 2baa2ac..0000000
--- a/tests/test_sample_cdf.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import pycuda.autoinit
-import pycuda.driver as cuda
-from pycuda.compiler import SourceModule
-from pycuda import gpuarray
-import numpy as np
-import ROOT
-import os
-
-current_directory = os.path.split(os.path.realpath(__file__))[0]
-source_directory = current_directory + '/../src'
-
-source = open(current_directory + '/test_sample_cdf.cu').read()
-
-mod = SourceModule(source, options=['-I' + source_directory], no_extern_c=True, cache_dir=False)
-
-test_sample_cdf = mod.get_function('test_sample_cdf')
-
-def compare_sampling(hist, reps=10):
-    nbins = hist.GetNbinsX();
-    xaxis = hist.GetXaxis()
-    intg = hist.GetIntegral()
-    cdf_y = np.empty(nbins+1, dtype=float)
-    cdf_x = np.empty_like(cdf_y)
-
-    cdf_x[0] = xaxis.GetBinLowEdge(1)
-    cdf_y[0] = 0.0
-    for i in xrange(1,len(cdf_x)):
-        cdf_y[i] = intg[i]
-        cdf_x[i] = xaxis.GetBinUpEdge(i)
-
-    cdf_x_gpu = gpuarray.to_gpu(cdf_x.astype(np.float32))
-    cdf_y_gpu = gpuarray.to_gpu(cdf_y.astype(np.float32))
-    block =(128,1,1)
-    grid = (128, 1)
-    out_gpu = gpuarray.empty(shape=int(block[0]*grid[0]), dtype=np.float32)
-
-    out_h = ROOT.TH1D('out_h', '', hist.GetNbinsX(), 
-                      xaxis.GetXmin(),
-                      xaxis.GetXmax())
-    out_h.SetLineColor(ROOT.kGreen)
-
-    for i in xrange(reps):
-        test_sample_cdf(np.int32(i),
-                        np.int32(len(cdf_x_gpu)), 
-                        cdf_x_gpu, cdf_y_gpu, out_gpu, block=block, grid=grid)
-        out = out_gpu.get()
-        for v in out:
-            out_h.Fill(v)
-
-    prob = out_h.KolmogorovTest(hist)
-    return prob, out_h 
-
-def test_sampling():
-    '''Verify that the CDF-based sampler on the GPU reproduces a binned
-    Gaussian distribution'''
-    f = ROOT.TF1('f_gaussian', 'gaus(0)', -5, 5)
-    f.SetParameters(1.0/np.sqrt(np.pi * 2), 0.0, 1.0)
-    gaussian = ROOT.TH1D('gaussian', '', 100, -5, 5)
-    gaussian.Add(f)
-
-    prob, out_h = compare_sampling(gaussian, reps=50)
-
-    assert prob > 0.01
-
-- 
cgit