17 files changed, 2154 insertions, 0 deletions
diff --git a/chroma/cuda/__init__.py b/chroma/cuda/__init__.py
new file mode 100644
index 0000000..dd87cbc
--- /dev/null
+++ b/chroma/cuda/__init__.py
@@ -0,0 +1,2 @@
+import os.path
+srcdir = os.path.dirname(os.path.abspath(__file__))
diff --git a/chroma/cuda/daq.cu b/chroma/cuda/daq.cu
new file mode 100644
index 0000000..5a68846
--- /dev/null
+++ b/chroma/cuda/daq.cu
@@ -0,0 +1,198 @@
+// -*-c++-*-
+#include <curand_kernel.h>
+
+__device__ unsigned int
+float_to_sortable_int(float f)
+{
+    return __float_as_int(f);
+    //int i = __float_as_int(f);
+    //unsigned int mask = -(int)(i >> 31) | 0x80000000;
+    //return i ^ mask;
+}
+
+__device__ float
+sortable_int_to_float(unsigned int i)
+{
+    return __int_as_float(i);
+    //unsigned int mask = ((i >> 31) - 1) | 0x80000000;
+    //return __int_as_float(i ^ mask);
+}
+
+extern "C"
+{
+
+__global__ void
+reset_earliest_time_int(float maxtime, int ntime_ints, unsigned int *time_ints)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    if (id < ntime_ints) {
+	unsigned int maxtime_int = float_to_sortable_int(maxtime);
+	time_ints[id] = maxtime_int;
+    }
+}
+
+__global__ void
+run_daq(curandState *s, unsigned int detection_state, float time_rms,
+	int first_photon, int nphotons, float *photon_times,
+	unsigned int *photon_histories, int *last_hit_triangles,
+	int *solid_map, int nsolids, unsigned int *earliest_time_int,
+	unsigned int *channel_q, unsigned int *channel_histories)
+{
+
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+
+    if (id < nphotons) {
+	curandState rng = s[id];
+	int photon_id = id + first_photon;
+	int triangle_id = last_hit_triangles[photon_id];
+		
+	if (triangle_id > -1) {
+	    int solid_id = solid_map[triangle_id];
+	    unsigned int history = photon_histories[photon_id];
+
+	    if (solid_id < nsolids && (history & detection_state)) {
+		float time = photon_times[photon_id] + curand_normal(&rng) * time_rms;
+		unsigned int time_int = float_to_sortable_int(time);
+			  
+		atomicMin(earliest_time_int + solid_id, time_int);
+		atomicAdd(channel_q + solid_id, 1);
+		atomicOr(channel_histories + solid_id, history);
+	    }
+
+	}
+
+	s[id] = rng;
+	
+    }
+    
+}
+
+__global__ void
+convert_sortable_int_to_float(int n, unsigned int *sortable_ints,
+			      float *float_output)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+	
+    if (id < n)
+	float_output[id] = sortable_int_to_float(sortable_ints[id]);
+}
+
+__global__ void
+bin_hits(int nchannels, unsigned int *channel_q, float *channel_time,
+	 unsigned int *hitcount, int tbins, float tmin, float tmax, int qbins,
+	 float qmin, float qmax, unsigned int *pdf)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+
+    if (id >= nchannels)
+	return;
+
+    unsigned int q = channel_q[id];
+    float t = channel_time[id];
+	
+    if (t < 1e8 && t >= tmin && t < tmax && q >= qmin && q < qmax) {
+	hitcount[id] += 1;
+		
+	int tbin = (t - tmin) / (tmax - tmin) * tbins;
+	int qbin = (q - qmin) / (qmax - qmin) * qbins;
+	    
+	// row major order (channel, t, q)
+	int bin = id * (tbins * qbins) + tbin * qbins + qbin;
+	pdf[bin] += 1;
+    }
+}
+	
+__global__ void
+accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
+		    float *event_time, float *event_charge, float *mc_time,
+		    unsigned int *mc_charge, // quirk of DAQ!
+		    unsigned int *hitcount, unsigned int *bincount,
+		    float min_twidth, float tmin, float tmax,
+		    float min_qwidth, float qmin, float qmax,
+		    float *nearest_mc, int min_bin_content)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+	
+    if (id >= nchannels)
+	return;
+	
+    // Was this channel hit in the Monte Carlo?
+    float channel_mc_time = mc_time[id];
+    if (channel_mc_time >= 1e8)
+	return; // Nothing else to do
+	
+    // Is this channel inside the range of the PDF?
+    float distance;
+    int channel_bincount = bincount[id];
+    if (time_only) {
+	if (channel_mc_time < tmin || channel_mc_time > tmax)
+	    return;  // Nothing else to do
+		
+	// This MC information is contained in the PDF
+	hitcount[id] += 1;
+	  
+	// Was this channel hit in the event-of-interest?
+	int channel_event_hit = event_hit[id];
+	if (!channel_event_hit)
+	    return; // No need to update PDF value for unhit channel
+		
+	// Are we inside the minimum size bin?
+	float channel_event_time = event_time[id];
+	distance = fabsf(channel_mc_time - channel_event_time);
+	if (distance < min_twidth/2.0)
+	    bincount[id] = channel_bincount + 1;
+
+    }
+    else { // time and charge PDF
+	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+	
+	if (channel_mc_time < tmin || channel_mc_time > tmax ||
+	    channel_mc_charge < qmin || channel_mc_charge > qmax)
+	    return;  // Nothing else to do
+		
+	// This MC information is contained in the PDF
+	hitcount[id] += 1;
+	  
+	// Was this channel hit in the event-of-interest?
+	int channel_event_hit = event_hit[id];
+	if (!channel_event_hit)
+	    return; // No need to update PDF value for unhit channel
+		
+	// Are we inside the minimum size bin?
+	float channel_event_time = event_time[id];
+	float channel_event_charge = event_charge[id];
+	float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
+	float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
+	distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
+
+	if (distance < 1.0f)
+	    bincount[id] = channel_bincount + 1;
+    }
+
+    // Do we need to keep updating the nearest_mc list?
+    if (channel_bincount + 1 >= min_bin_content)
+	return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
+
+    // insertion sort the distance into the array of nearest MC points
+    int offset = min_bin_content * id;
+    int entry = min_bin_content - 1;
+	
+    // If last entry less than new entry, nothing to update
+    if (distance > nearest_mc[offset + entry])
+	return;
+
+    // Find where to insert the new entry while sliding the rest
+    // to the right
+    entry--;
+    while (entry >= 0) {
+	if (nearest_mc[offset+entry] >= distance)
+	    nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
+	else
+	    break;
+	entry--;
+    }
+
+    nearest_mc[offset+entry+1] = distance;
+}
+
+} // extern "C"
diff --git a/chroma/cuda/geometry.h b/chroma/cuda/geometry.h
new file mode 100644
index 0000000..2b5eacb
--- /dev/null
+++ b/chroma/cuda/geometry.h
@@ -0,0 +1,74 @@
+#ifndef __GEOMETRY_H__
+#define __GEOMETRY_H__
+
+struct Material
+{
+    float *refractive_index;
+    float *absorption_length;
+    float *scattering_length;
+    unsigned int n;
+    float step;
+    float wavelength0;
+};
+
+struct Surface
+{
+    float *detect;
+    float *absorb;
+    float *reflect_diffuse;
+    float *reflect_specular;
+    unsigned int n;
+    float step;
+    float wavelength0;
+};
+
+struct Triangle
+{
+    float3 v0, v1, v2;
+};
+
+struct Geometry
+{
+    float3 *vertices;
+    uint3 *triangles;
+    unsigned int *material_codes;
+    unsigned int *colors;
+    float3 *lower_bounds;
+    float3 *upper_bounds;
+    unsigned int *node_map;
+    unsigned int *node_map_end;
+    Material **materials;
+    Surface **surfaces;
+    unsigned int start_node;
+    unsigned int first_node;
+};
+
+__device__ Triangle
+get_triangle(Geometry *geometry, const unsigned int &i)
+{
+    uint3 triangle_data = geometry->triangles[i];
+
+    Triangle triangle;
+    triangle.v0 = geometry->vertices[triangle_data.x];
+    triangle.v1 = geometry->vertices[triangle_data.y];
+    triangle.v2 = geometry->vertices[triangle_data.z];
+
+    return triangle;
+}
+
+template <class T>
+__device__ float
+interp_property(T *m, const float &x, const float *fp)
+{
+    if (x < m->wavelength0)
+	return fp[0];
+
+    if (x > (m->wavelength0 + (m->n-1)*m->step))
+	return fp[m->n-1];
+
+    int jl = (x-m->wavelength0)/m->step;
+
+    return fp[jl] + (x-(m->wavelength0 + jl*m->step))*(fp[jl+1]-fp[jl])/m->step;
+}
+
+#endif
diff --git a/chroma/cuda/hybrid_render.cu b/chroma/cuda/hybrid_render.cu
new file mode 100644
index 0000000..29edefa
--- /dev/null
+++ b/chroma/cuda/hybrid_render.cu
@@ -0,0 +1,202 @@
+//-*-c-*-
+#include <math_constants.h>
+#include <curand_kernel.h>
+
+#include "linalg.h"
+#include "matrix.h"
+#include "rotate.h"
+#include "mesh.h"
+#include "geometry.h"
+#include "photon.h"
+
+__device__ void
+fAtomicAdd(float *addr, float data)
+{
+    while (data)
+	data = atomicExch(addr, data+atomicExch(addr, 0.0f));
+}
+
+__device__ void
+to_diffuse(Photon &p, State &s, Geometry *g, curandState &rng, int max_steps)
+{
+    int steps = 0;
+    while (steps < max_steps) {
+	steps++;
+
+	int command;
+
+	fill_state(s, p, g);
+
+	if (p.last_hit_triangle == -1)
+	    break;
+
+	command = propagate_to_boundary(p, s, rng);
+
+	if (command == BREAK)
+	    break;
+
+	if (command == CONTINUE)
+	    continue;
+
+	if (s.surface_index != -1) {
+	    command = propagate_at_surface(p, s, rng, g);
+
+	    if (p.history & REFLECT_DIFFUSE)
+		break;
+
+	    if (command == BREAK)
+		break;
+
+	    if (command == CONTINUE)
+		continue;
+	}
+
+	propagate_at_boundary(p, s, rng);
+
+    } // while (steps < max_steps)
+
+} // to_diffuse
+
+extern "C"
+{
+
+__global__ void
+update_xyz_lookup(int nthreads, int total_threads, int offset, float3 position,
+		  curandState *rng_states, float wavelength, float3 xyz,
+		  float3 *xyz_lookup1, float3 *xyz_lookup2, int max_steps,
+		  Geometry *g)
+{
+    int kernel_id = blockIdx.x*blockDim.x + threadIdx.x;
+    int id = kernel_id + offset;
+
+    if (kernel_id >= nthreads || id >= total_threads)
+	return;
+
+    curandState rng = rng_states[kernel_id];
+
+    Triangle t = get_triangle(g, id);
+
+    float a = curand_uniform(&rng);
+    float b = uniform(&rng, 0.0f, (1.0f - a));
+    float c = 1.0f - a - b;
+
+    float3 direction = a*t.v0 + b*t.v1 + c*t.v2 - position;
+    direction /= norm(direction);
+
+    float distance;
+    int triangle_index = intersect_mesh(position, direction, g, distance);
+
+    if (triangle_index != id) {
+	rng_states[kernel_id] = rng;
+	return;
+    }
+
+    float3 v01 = t.v1 - t.v0;
+    float3 v12 = t.v2 - t.v1;
+    
+    float3 surface_normal = normalize(cross(v01,v12));
+
+    float cos_theta = dot(surface_normal,-direction);
+
+    if (cos_theta < 0.0f)
+	cos_theta = dot(-surface_normal,-direction);
+
+    Photon p;
+    p.position = position;
+    p.direction = direction;
+    p.wavelength = wavelength;
+    p.polarization = uniform_sphere(&rng);
+    p.last_hit_triangle = -1;
+    p.time = 0;
+    p.history = 0;
+
+    State s;
+    to_diffuse(p, s, g, rng, max_steps);
+
+    if (p.history & REFLECT_DIFFUSE) {
+	if (s.inside_to_outside) {
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].x, cos_theta*xyz.x);
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].y, cos_theta*xyz.y);
+	    fAtomicAdd(&xyz_lookup1[p.last_hit_triangle].z, cos_theta*xyz.z);
+	}
+	else {
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].x, cos_theta*xyz.x);
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].y, cos_theta*xyz.y);
+	    fAtomicAdd(&xyz_lookup2[p.last_hit_triangle].z, cos_theta*xyz.z);
+	}
+    }
+
+    rng_states[kernel_id] = rng;
+
+} // update_xyz_lookup
+
+__global__ void
+update_xyz_image(int nthreads, curandState *rng_states, float3 *positions,
+		 float3 *directions, float wavelength, float3 xyz,
+		 float3 *xyz_lookup1, float3 *xyz_lookup2, float3 *image,
+		 int nlookup_calls, int max_steps, Geometry *g)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    curandState rng = rng_states[id];
+
+    Photon p;
+    p.position = positions[id];
+    p.direction = directions[id];
+    p.direction /= norm(p.direction);
+    p.wavelength = wavelength;
+    p.polarization = uniform_sphere(&rng);
+    p.last_hit_triangle = -1;
+    p.time = 0;
+    p.history = 0;
+
+    State s;
+    to_diffuse(p, s, g, rng, max_steps);
+
+    if (p.history & REFLECT_DIFFUSE) {
+	if (s.inside_to_outside)
+	    image[id] += xyz*xyz_lookup1[p.last_hit_triangle]/nlookup_calls;
+	else
+	    image[id] += xyz*xyz_lookup2[p.last_hit_triangle]/nlookup_calls;
+    }
+
+    rng_states[id] = rng;
+
+} // update_xyz_image
+
+__global__ void
+process_image(int nthreads, float3 *image, unsigned int *pixels, int nimages)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    float3 rgb = image[id]/nimages;
+
+    if (rgb.x < 0.0f)
+	rgb.x = 0.0f;
+    if (rgb.y < 0.0f)
+	rgb.y = 0.0f;
+    if (rgb.z < 0.0f)
+	rgb.z = 0.0f;
+
+    if (rgb.x > 1.0f)
+	rgb.x = 1.0f;
+    if (rgb.y > 1.0f)
+	rgb.y = 1.0f;
+    if (rgb.z > 1.0f)
+	rgb.z = 1.0f;
+
+    unsigned int r = floorf(rgb.x*255.0f);
+    unsigned int g = floorf(rgb.y*255.0f);
+    unsigned int b = floorf(rgb.z*255.0f);
+
+    pixels[id] = 255 << 24 | r << 16 | g << 8 | b;
+
+} // process_image
+
+} // extern "c"
diff --git a/chroma/cuda/intersect.h b/chroma/cuda/intersect.h
new file mode 100644
index 0000000..978bde8
--- /dev/null
+++ b/chroma/cuda/intersect.h
@@ -0,0 +1,141 @@
+//-*-c-*-
+
+#ifndef __INTERSECT_H__
+#define __INTERSECT_H__
+
+#include "linalg.h"
+#include "matrix.h"
+#include "geometry.h"
+
+#define EPSILON 0.0f
+
+/* Tests the intersection between a ray and a triangle.
+   If the ray intersects the triangle, set `distance` to the distance from
+   `origin` to the intersection and return true, else return false.
+   `direction` must be normalized to one. */
+__device__ bool
+intersect_triangle(const float3 &origin, const float3 &direction,
+		   const Triangle &triangle, float &distance)
+{
+	float3 m1 = triangle.v1-triangle.v0;
+	float3 m2 = triangle.v2-triangle.v0;
+	float3 m3 = -direction;
+
+	Matrix m = make_matrix(m1, m2, m3);
+	
+	float determinant = det(m);
+
+	if (determinant == 0.0f)
+		return false;
+
+	float3 b = origin-triangle.v0;
+
+	float u1 = ((m.a11*m.a22 - m.a12*m.a21)*b.x +
+		    (m.a02*m.a21 - m.a01*m.a22)*b.y +
+		    (m.a01*m.a12 - m.a02*m.a11)*b.z)/determinant;
+
+	if (u1 < -EPSILON || u1 > 1.0f)
+		return false;
+
+	float u2 = ((m.a12*m.a20 - m.a10*m.a22)*b.x +
+		    (m.a00*m.a22 - m.a02*m.a20)*b.y +
+		    (m.a02*m.a10 - m.a00*m.a12)*b.z)/determinant;
+
+	if (u2 < -EPSILON || u2 > 1.0f)
+		return false;
+
+	float u3 = ((m.a10*m.a21 - m.a11*m.a20)*b.x +
+		    (m.a01*m.a20 - m.a00*m.a21)*b.y +
+		    (m.a00*m.a11 - m.a01*m.a10)*b.z)/determinant;
+
+	if (u3 <= 0.0f || (1.0f-u1-u2) < -EPSILON)
+		return false;
+
+	distance = u3;
+
+	return true;
+}
+
+/* Tests the intersection between a ray and an axis-aligned box defined by
+   an upper and lower bound. If the ray intersects the box, set
+   `distance_to_box` to the distance from `origin` to the intersection and
+   return true, else return false. `direction` must be normalized to one.
+
+   Source: "An Efficient and Robust Ray-Box Intersection Algorithm."
+   by Williams, et. al. */
+__device__ bool
+intersect_box(const float3 &origin, const float3 &direction,
+	      const float3 &lower_bound, const float3 &upper_bound,
+	      float& distance_to_box)
+{
+	float kmin, kmax, kymin, kymax, kzmin, kzmax;
+
+	float divx = 1.0f/direction.x;
+	if (divx >= 0.0f)
+	{
+		kmin = (lower_bound.x - origin.x)*divx;
+		kmax = (upper_bound.x - origin.x)*divx;
+	}
+	else
+	{
+		kmin = (upper_bound.x - origin.x)*divx;
+		kmax = (lower_bound.x - origin.x)*divx;
+	}
+
+	if (kmax < 0.0f)
+		return false;
+
+	float divy = 1.0f/direction.y;
+	if (divy >= 0.0f)
+	{
+		kymin = (lower_bound.y - origin.y)*divy;
+		kymax = (upper_bound.y - origin.y)*divy;
+	}
+	else
+	{
+		kymin = (upper_bound.y - origin.y)*divy;
+		kymax = (lower_bound.y - origin.y)*divy;
+	}
+
+	if (kymax < 0.0f)
+		return false;
+
+	if (kymin > kmin)
+		kmin = kymin;
+
+	if (kymax < kmax)
+		kmax = kymax;
+
+	if (kmin > kmax)
+		return false;
+
+	float divz = 1.0f/direction.z;
+	if (divz >= 0.0f)
+	{
+		kzmin = (lower_bound.z - origin.z)*divz;
+		kzmax = (upper_bound.z - origin.z)*divz;
+	}
+	else
+	{
+		kzmin = (upper_bound.z - origin.z)*divz;
+		kzmax = (lower_bound.z - origin.z)*divz;
+	}
+
+	if (kzmax < 0.0f)
+		return false;
+
+	if (kzmin > kmin)
+		kmin = kzmin;
+
+	if (kzmax < kmax)
+		kmax = kzmax;
+
+	if (kmin > kmax)
+		return false;
+
+	distance_to_box = kmin;
+
+	return true;
+}
+
+#endif
diff --git a/chroma/cuda/linalg.h b/chroma/cuda/linalg.h
new file mode 100644
index 0000000..35b2423
--- /dev/null
+++ b/chroma/cuda/linalg.h
@@ -0,0 +1,170 @@
+#ifndef __LINALG_H__
+#define __LINALG_H__
+
+__device__ float3
+operator- (const float3 &a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+
+__device__ float3
+operator* (const float3 &a, const float3 &b)
+{
+    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+
+__device__ float3
+operator/ (const float3 &a, const float3 &b)
+{
+    return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+}
+
+__device__ void
+operator*= (float3 &a, const float3 &b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+
+__device__ void
+operator/= (float3 &a, const float3 &b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
+}
+
+__device__ float3
+operator+ (const float3 &a, const float3 &b)
+{
+    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+
+__device__ void
+operator+= (float3 &a, const float3 &b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+
+__device__ float3
+operator- (const float3 &a, const float3 &b)
+{
+    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+
+__device__ void
+operator-= (float3 &a, const float3 &b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+
+__device__ float3
+operator+ (const float3 &a, const float &c)
+{
+    return make_float3(a.x+c, a.y+c, a.z+c);
+}
+
+__device__ void
+operator+= (float3 &a, const float &c)
+{
+    a.x += c;
+    a.y += c;
+    a.z += c;
+}
+
+__device__ float3
+operator+ (const float &c, const float3 &a)
+{
+    return make_float3(c+a.x, c+a.y, c+a.z);
+}
+
+__device__ float3
+operator- (const float3 &a, const float &c)
+{
+    return make_float3(a.x-c, a.y-c, a.z-c);
+}
+
+__device__ void
+operator-= (float3 &a, const float &c)
+{
+    a.x -= c;
+    a.y -= c;
+    a.z -= c;
+}
+
+__device__ float3
+operator- (const float &c, const float3& a)
+{
+    return make_float3(c-a.x, c-a.y, c-a.z);
+}
+
+__device__ float3
+operator* (const float3 &a, const float &c)
+{
+    return make_float3(a.x*c, a.y*c, a.z*c);
+}
+
+__device__ void
+operator*= (float3 &a, const float &c)
+{
+    a.x *= c;
+    a.y *= c;
+    a.z *= c;
+}
+
+__device__ float3 
+operator* (const float &c, const float3& a)
+{
+    return make_float3(c*a.x, c*a.y, c*a.z);
+}
+
+__device__ float3
+operator/ (const float3 &a, const float &c)
+{
+    return make_float3(a.x/c, a.y/c, a.z/c);
+}
+
+__device__ void
+operator/= (float3 &a, const float &c)
+{
+    a.x /= c;
+    a.y /= c;
+    a.z /= c;
+}
+
+__device__ float3
+operator/ (const float &c, const float3 &a)
+{
+    return make_float3(c/a.x, c/a.y, c/a.z);
+}
+
+__device__ float
+dot(const float3 &a, const float3 &b)
+{
+    return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+__device__ float3
+cross(const float3 &a, const float3 &b)
+{
+    return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+__device__ float
+norm(const float3 &a)
+{
+    return sqrtf(dot(a,a));
+}
+
+__device__ float3
+normalize(const float3 &a)
+{
+    return a/norm(a);
+}
+
+#endif
diff --git a/chroma/cuda/matrix.h b/chroma/cuda/matrix.h
new file mode 100644
index 0000000..0a66e58
--- /dev/null
+++ b/chroma/cuda/matrix.h
@@ -0,0 +1,249 @@
+#ifndef __MATRIX_H__
+#define __MATRIX_H__
+
+struct Matrix
+{
+    float a00, a01, a02, a10, a11, a12, a20, a21, a22;
+};
+
+__device__ Matrix
+make_matrix(float a00, float a01, float a02,
+	    float a10, float a11, float a12,
+	    float a20, float a21, float a22)
+{
+    Matrix m = {a00, a01, a02, a10, a11, a12, a20, a21, a22};
+    return m;
+}
+
+__device__ Matrix
+make_matrix(const float3 &u1, const float3 &u2, const float3 &u3)
+{
+    Matrix m = {u1.x, u2.x, u3.x, u1.y, u2.y, u3.y, u1.z, u2.z, u3.z};
+    return m;
+}
+
+__device__ Matrix
+operator- (const Matrix &m)
+{
+    return make_matrix(-m.a00, -m.a01, -m.a02,
+		       -m.a10, -m.a11, -m.a12,
+		       -m.a20, -m.a21, -m.a22);
+}
+
+__device__ float3
+operator* (const Matrix &m, const float3 &a)
+{
+    return make_float3(m.a00*a.x + m.a01*a.y + m.a02*a.z,
+		       m.a10*a.x + m.a11*a.y + m.a12*a.z,
+		       m.a20*a.x + m.a21*a.y + m.a22*a.z);
+}
+
+__device__ Matrix
+operator+ (const Matrix &m, const Matrix &n)
+{
+    return make_matrix(m.a00+n.a00, m.a01+n.a01, m.a02+n.a02,
+		       m.a10+n.a10, m.a11+n.a11, m.a12+n.a12,
+		       m.a20+n.a20, m.a21+n.a21, m.a22+n.a22);
+}
+
+__device__ void
+operator+= (Matrix &m, const Matrix &n)
+{
+    m.a00 += n.a00;
+    m.a01 += n.a01;
+    m.a02 += n.a02;
+    m.a10 += n.a10;
+    m.a11 += n.a11;
+    m.a12 += n.a12;
+    m.a20 += n.a20;
+    m.a21 += n.a21;
+    m.a22 += n.a22;
+}
+
+__device__ Matrix
+operator- (const Matrix &m, const Matrix &n)
+{
+    return make_matrix(m.a00-n.a00, m.a01-n.a01, m.a02-n.a02,
+		       m.a10-n.a10, m.a11-n.a11, m.a12-n.a12,
+		       m.a20-n.a20, m.a21-n.a21, m.a22-n.a22);
+}
+
+__device__ void
+operator-= (Matrix &m, const Matrix& n)
+{
+    m.a00 -= n.a00;
+    m.a01 -= n.a01;
+    m.a02 -= n.a02;
+    m.a10 -= n.a10;
+    m.a11 -= n.a11;
+    m.a12 -= n.a12;
+    m.a20 -= n.a20;
+    m.a21 -= n.a21;
+    m.a22 -= n.a22;
+}
+
+__device__ Matrix
+operator* (const Matrix &m, const Matrix &n)
+{
+    return make_matrix(m.a00*n.a00 + m.a01*n.a10 + m.a02*n.a20,
+		       m.a00*n.a01 + m.a01*n.a11 + m.a02*n.a21,
+		       m.a00*n.a02 + m.a01*n.a12 + m.a02*n.a22,
+		       m.a10*n.a00 + m.a11*n.a10 + m.a12*n.a20,
+		       m.a10*n.a01 + m.a11*n.a11 + m.a12*n.a21,
+		       m.a10*n.a02 + m.a11*n.a12 + m.a12*n.a22,
+		       m.a20*n.a00 + m.a21*n.a10 + m.a22*n.a20,
+		       m.a20*n.a01 + m.a21*n.a11 + m.a22*n.a21,
+		       m.a20*n.a02 + m.a21*n.a12 + m.a22*n.a22);
+}
+
+__device__ Matrix
+operator+ (const Matrix &m, const float &c)
+{
+    return make_matrix(m.a00+c, m.a01+c, m.a02+c,
+		       m.a10+c, m.a11+c, m.a12+c,
+		       m.a20+c, m.a21+c, m.a22+c);
+}
+
+__device__ void
+operator+= (Matrix &m, const float &c)
+{
+    m.a00 += c;
+    m.a01 += c;
+    m.a02 += c;
+    m.a10 += c;
+    m.a11 += c;
+    m.a12 += c;
+    m.a20 += c;
+    m.a21 += c;
+    m.a22 += c;
+}
+
+__device__ Matrix
+operator+ (const float &c, const Matrix &m)
+{
+    return make_matrix(c+m.a00, c+m.a01, c+m.a02,
+		       c+m.a10, c+m.a11, c+m.a12,
+		       c+m.a20, c+m.a21, c+m.a22);
+}
+
+__device__ Matrix
+operator- (const Matrix &m, const float &c)
+{
+    return make_matrix(m.a00-c, m.a01-c, m.a02-c,
+		       m.a10-c, m.a11-c, m.a12-c,
+		       m.a20-c, m.a21-c, m.a22-c);
+}
+
+__device__ void
+operator-= (Matrix &m, const float &c)
+{
+    m.a00 -= c;
+    m.a01 -= c;
+    m.a02 -= c;
+    m.a10 -= c;
+    m.a11 -= c;
+    m.a12 -= c;
+    m.a20 -= c;
+    m.a21 -= c;
+    m.a22 -= c;
+}
+
+__device__ Matrix
+operator- (const float &c, const Matrix &m)
+{
+    return make_matrix(c-m.a00, c-m.a01, c-m.a02,
+		       c-m.a10, c-m.a11, c-m.a12,
+		       c-m.a20, c-m.a21, c-m.a22);
+}
+
+__device__ Matrix
+operator* (const Matrix &m, const float &c)
+{
+    return make_matrix(m.a00*c, m.a01*c, m.a02*c,
+		       m.a10*c, m.a11*c, m.a12*c,
+		       m.a20*c, m.a21*c, m.a22*c);
+}
+
+__device__ void
+operator*= (Matrix &m, const float &c)
+{
+    m.a00 *= c;
+    m.a01 *= c;
+    m.a02 *= c;
+    m.a10 *= c;
+    m.a11 *= c;
+    m.a12 *= c;
+    m.a20 *= c;
+    m.a21 *= c;
+    m.a22 *= c;
+}
+
+__device__ Matrix
+operator* (const float &c, const Matrix &m)
+{
+    return make_matrix(c*m.a00, c*m.a01, c*m.a02,
+		       c*m.a10, c*m.a11, c*m.a12,
+		       c*m.a20, c*m.a21, c*m.a22);
+}
+
+__device__ Matrix
+operator/ (const Matrix &m, const float &c)
+{
+    return make_matrix(m.a00/c, m.a01/c, m.a02/c,
+		       m.a10/c, m.a11/c, m.a12/c,
+		       m.a20/c, m.a21/c, m.a22/c);
+}
+
+__device__ void
+operator/= (Matrix &m, const float &c)
+{
+    m.a00 /= c;
+    m.a01 /= c;
+    m.a02 /= c;
+    m.a10 /= c;
+    m.a11 /= c;
+    m.a12 /= c;
+    m.a20 /= c;
+    m.a21 /= c;
+    m.a22 /= c;
+}
+
+__device__ Matrix
+operator/ (const float &c, const Matrix &m)
+{
+    return make_matrix(c/m.a00, c/m.a01, c/m.a02,
+		       c/m.a10, c/m.a11, c/m.a12,
+		       c/m.a20, c/m.a21, c/m.a22);
+}
+
+__device__ float
+det(const Matrix &m)
+{
+    return m.a00*(m.a11*m.a22 - m.a12*m.a21) -
+	   m.a10*(m.a01*m.a22 - m.a02*m.a21) +
+	   m.a20*(m.a01*m.a12 - m.a02*m.a11);
+}
+
+__device__ Matrix
+inv(const Matrix &m)
+{
+    return make_matrix(m.a11*m.a22 - m.a12*m.a21,
+		       m.a02*m.a21 - m.a01*m.a22,
+		       m.a01*m.a12 - m.a02*m.a11,
+		       m.a12*m.a20 - m.a10*m.a22,
+		       m.a00*m.a22 - m.a02*m.a20,
+		       m.a02*m.a10 - m.a00*m.a12,
+		       m.a10*m.a21 - m.a11*m.a20,
+		       m.a01*m.a20 - m.a00*m.a21,
+		       m.a00*m.a11 - m.a01*m.a10)/det(m);
+}
+
+__device__ Matrix
+outer(const float3 &a, const float3 &b)
+{
+    return make_matrix(a.x*b.x, a.x*b.y, a.x*b.z,
+		       a.y*b.x, a.y*b.y, a.y*b.z,
+		       a.z*b.x, a.z*b.y, a.z*b.z);
+}
+
+#endif
diff --git a/chroma/cuda/mesh.h b/chroma/cuda/mesh.h
new file mode 100644
index 0000000..d60d801
--- /dev/null
+++ b/chroma/cuda/mesh.h
@@ -0,0 +1,175 @@
+#ifndef __MESH_H__
+#define __MESH_H__
+
+#include "intersect.h"
+#include "geometry.h"
+
+#include "stdio.h"
+
+#define STACK_SIZE 100
+
+/* Tests the intersection between a ray and a node in the bounding volume
+   hierarchy. If the ray intersects the bounding volume and `min_distance`
+   is less than zero or the distance from `origin` to the intersection is
+   less than `min_distance`, return true, else return false. */
+__device__ bool
+intersect_node(const float3 &origin, const float3 &direction,
+	       Geometry *g, int i, float min_distance=-1.0f)
+{
+    /* assigning these to local variables is faster for some reason */
+    float3 lower_bound = g->lower_bounds[i];
+    float3 upper_bound = g->upper_bounds[i];
+
+    float distance_to_box;
+
+    if (intersect_box(origin, direction, lower_bound, upper_bound,
+		      distance_to_box)) {
+	if (min_distance < 0.0f)
+	    return true;
+
+	if (distance_to_box > min_distance)
+	    return false;
+
+	return true;
+    }
+    else {
+	return false;
+    }
+
+}
+
+/* Finds the intersection between a ray and `geometry`. If the ray does
+   intersect the mesh and the index of the intersected triangle is not equal
+   to `last_hit_triangle`, set `min_distance` to the distance from `origin` to
+   the intersection and return the index of the triangle which the ray
+   intersected, else return -1. */
+__device__ int
+intersect_mesh(const float3 &origin, const float3& direction, Geometry *g,
+	       float &min_distance, int last_hit_triangle = -1)
+{
+    int triangle_index = -1;
+
+    float distance;
+    min_distance = -1.0f;
+
+    if (!intersect_node(origin, direction, g, g->start_node, min_distance))
+	return -1;
+
+    unsigned int stack[STACK_SIZE];
+
+    unsigned int *head = &stack[0];
+    unsigned int *node = &stack[1];
+    unsigned int *tail = &stack[STACK_SIZE-1];
+    *node = g->start_node;
+
+    unsigned int i;
+
+    do
+    {
+	unsigned int first_child = g->node_map[*node];
+	unsigned int stop = g->node_map_end[*node];
+
+	while (*node >= g->first_node && stop == first_child+1) {
+	    *node = first_child;
+	    first_child = g->node_map[*node];
+	    stop = g->node_map_end[*node];
+	}
+		
+	if (*node >= g->first_node) {
+	    for (i=first_child; i < stop; i++) {
+		if (intersect_node(origin, direction, g, i, min_distance)) {
+		    *node = i;
+		    node++;
+		}
+	    }
+	    
+	    node--;
+	}
+	else {
+	    // node is a leaf
+	    for (i=first_child; i < stop; i++) {
+		if (last_hit_triangle == i)
+		    continue;
+
+		Triangle t = get_triangle(g, i);
+
+		if (intersect_triangle(origin, direction, t, distance)) {
+		    if (triangle_index == -1) {
+			triangle_index = i;
+			min_distance = distance;
+			continue;
+		    }
+
+		    if (distance < min_distance) {
+			triangle_index = i;
+			min_distance = distance;
+		    }
+		}
+	    } // triangle loop
+
+	    node--;
+
+	} // node is a leaf
+
+	if (node > tail) {
+	    printf("warning: intersect_mesh() aborted; node > tail\n");
+	    break;
+	}
+
+    } // while loop
+    while (node != head);
+    
+    return triangle_index;
+}
+
+extern "C"
+{
+
+__global__ void
+distance_to_mesh(int nthreads, float3 *_origin, float3 *_direction,
+		 Geometry *g, float *_distance)
+{
+    __shared__ Geometry sg;
+
+    if (threadIdx.x == 0)
+	sg = *g;
+
+    __syncthreads();
+
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    g = &sg;
+
+    float3 origin = _origin[id];
+    float3 direction = _direction[id];
+    direction /= norm(direction);
+
+    float distance;
+
+    int triangle_index = intersect_mesh(origin, direction, g, distance);
+
+    if (triangle_index != -1)
+	_distance[id] = distance;
+}
+
+__global__ void
+color_solids(int first_triangle, int nthreads, int *solid_id_map,
+	     bool *solid_hit, unsigned int *solid_colors, Geometry *g)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    int triangle_id = first_triangle + id;
+    int solid_id = solid_id_map[triangle_id];
+    if (solid_hit[solid_id])
+	g->colors[triangle_id] = solid_colors[solid_id];
+}
+
+} // extern "C"
+
+#endif
diff --git a/chroma/cuda/photon.h b/chroma/cuda/photon.h
new file mode 100644
index 0000000..8b7e588
--- /dev/null
+++ b/chroma/cuda/photon.h
@@ -0,0 +1,325 @@
+#ifndef __PHOTON_H__
+#define __PHOTON_H__
+
+#include "stdio.h"
+#include "linalg.h"
+#include "rotate.h"
+#include "random.h"
+#include "physical_constants.h"
+#include "mesh.h"
+#include "geometry.h"
+
+struct Photon
+{
+    float3 position;
+    float3 direction;
+    float3 polarization;
+    float wavelength;
+    float time;
+
+    unsigned int history;
+
+    int last_hit_triangle;
+};
+
+struct State
+{
+    bool inside_to_outside;
+
+    float3 surface_normal;
+
+    float refractive_index1, refractive_index2;
+    float absorption_length;
+    float scattering_length;
+
+    int surface_index;
+
+    float distance_to_boundary;
+};
+
+enum
+{
+    NO_HIT           = 0x1 << 0,
+    BULK_ABSORB      = 0x1 << 1,
+    SURFACE_DETECT   = 0x1 << 2,
+    SURFACE_ABSORB   = 0x1 << 3,
+    RAYLEIGH_SCATTER = 0x1 << 4,
+    REFLECT_DIFFUSE  = 0x1 << 5,
+    REFLECT_SPECULAR = 0x1 << 6,
+    NAN_ABORT        = 0x1 << 31
+}; // processes
+
+enum {BREAK, CONTINUE, PASS}; // return value from propagate_to_boundary
+
+__device__ int
+convert(int c)
+{
+    if (c & 0x80)
+	return (0xFFFFFF00 | c);
+    else
+	return c;
+}
+
+__device__ float
+get_theta(const float3 &a, const float3 &b)
+{
+    return acosf(fmaxf(-1.0f,fminf(1.0f,dot(a,b))));
+}
+
+__device__ void
+fill_state(State &s, Photon &p, Geometry *g)
+{
+    p.last_hit_triangle = intersect_mesh(p.position, p.direction, g,
+					 s.distance_to_boundary,
+					 p.last_hit_triangle);
+
+    if (p.last_hit_triangle == -1) {
+	p.history |= NO_HIT;
+	return;
+    }
+    
+    Triangle t = get_triangle(g, p.last_hit_triangle);
+    
+    unsigned int material_code = g->material_codes[p.last_hit_triangle];
+    
+    int inner_material_index = convert(0xFF & (material_code >> 24));
+    int outer_material_index = convert(0xFF & (material_code >> 16));
+    s.surface_index = convert(0xFF & (material_code >> 8));
+    
+    float3 v01 = t.v1 - t.v0;
+    float3 v12 = t.v2 - t.v1;
+    
+    s.surface_normal = normalize(cross(v01, v12));
+				 
+    Material *material1, *material2;
+    if (dot(s.surface_normal,-p.direction) > 0.0f) {
+	// outside to inside
+	material1 = g->materials[outer_material_index];
+	material2 = g->materials[inner_material_index];
+
+	s.inside_to_outside = false;
+    }
+    else {
+	// inside to outside
+	material1 = g->materials[inner_material_index];
+	material2 = g->materials[outer_material_index];
+	s.surface_normal = -s.surface_normal;
+
+	s.inside_to_outside = true;
+    }
+
+    s.refractive_index1 = interp_property(material1, p.wavelength,
+					  material1->refractive_index);
+    s.refractive_index2 = interp_property(material2, p.wavelength,
+					  material2->refractive_index);
+    s.absorption_length = interp_property(material1, p.wavelength,
+					  material1->absorption_length);
+    s.scattering_length = interp_property(material1, p.wavelength,
+					  material1->scattering_length);
+
+} // fill_state
+
+__device__ float3
+pick_new_direction(float3 axis, float theta, float phi)
+{
+    // Taken from SNOMAN rayscatter.for
+    float cos_theta = cosf(theta);
+    float sin_theta = sinf(theta);
+    float cos_phi   = cosf(phi);
+    float sin_phi   = sinf(phi);
+	
+    float sin_axis_theta = sqrt(1.0f - axis.z*axis.z);
+    float cos_axis_phi, sin_axis_phi;
+	
+    if (isnan(sin_axis_theta) || sin_axis_theta < 0.00001f) {
+	cos_axis_phi = 1.0f;
+	sin_axis_phi = 0.0f;
+    }
+    else {
+	cos_axis_phi = axis.x / sin_axis_theta;
+	sin_axis_phi = axis.y / sin_axis_theta;
+    }
+
+    float dirx = cos_theta*axis.x +
+	sin_theta*(axis.z*cos_phi*cos_axis_phi - sin_phi*sin_axis_phi);
+    float diry = cos_theta*axis.y +
+	sin_theta*(cos_phi*axis.z*sin_axis_phi - sin_phi*cos_axis_phi);
+    float dirz = cos_theta*axis.z - sin_theta*cos_phi*sin_axis_theta;
+
+    return make_float3(dirx, diry, dirz);
+}
+
+__device__ void
+rayleigh_scatter(Photon &p, curandState &rng)
+{
+    float cos_theta = 2.0f*cosf((acosf(1.0f - 2.0f*curand_uniform(&rng))-2*PI)/3.0f);
+    if (cos_theta > 1.0f)
+	cos_theta = 1.0f;
+    else if (cos_theta < -1.0f)
+	cos_theta = -1.0f;
+
+    float theta = acosf(cos_theta);
+    float phi = uniform(&rng, 0.0f, 2.0f * PI);
+
+    p.direction = pick_new_direction(p.polarization, theta, phi);
+
+    if (1.0f - fabsf(cos_theta) < 1e-6f) {
+	p.polarization = pick_new_direction(p.polarization, PI/2.0f, phi);
+    }
+    else {
+	// linear combination of old polarization and new direction
+	p.polarization = p.polarization - cos_theta * p.direction;
+    }
+
+    p.direction /= norm(p.direction);
+    p.polarization /= norm(p.polarization);
+} // scatter
+
+__device__ int propagate_to_boundary(Photon &p, State &s, curandState &rng)
+{
+    float absorption_distance = -s.absorption_length*logf(curand_uniform(&rng));
+    float scattering_distance = -s.scattering_length*logf(curand_uniform(&rng));
+
+    if (absorption_distance <= scattering_distance) {
+	if (absorption_distance <= s.distance_to_boundary) {
+	    p.time += absorption_distance/(SPEED_OF_LIGHT/s.refractive_index1);
+	    p.position += absorption_distance*p.direction;
+	    p.history |= BULK_ABSORB;
+
+	    p.last_hit_triangle = -1;
+
+	    return BREAK;
+	} // photon is absorbed in material1
+    }
+    else {
+	if (scattering_distance <= s.distance_to_boundary) {
+	    p.time += scattering_distance/(SPEED_OF_LIGHT/s.refractive_index1);
+	    p.position += scattering_distance*p.direction;
+
+	    rayleigh_scatter(p, rng);
+
+	    p.history |= RAYLEIGH_SCATTER;
+
+	    p.last_hit_triangle = -1;
+
+	    return CONTINUE;
+	} // photon is scattered in material1
+    } // if scattering_distance < absorption_distance
+
+    p.position += s.distance_to_boundary*p.direction;
+    p.time += s.distance_to_boundary/(SPEED_OF_LIGHT/s.refractive_index1);
+
+    return PASS;
+
+} // propagate_to_boundary
+
+__device__ void
+propagate_at_boundary(Photon &p, State &s, curandState &rng)
+{
+    float incident_angle = get_theta(s.surface_normal,-p.direction);
+    float refracted_angle = asinf(sinf(incident_angle)*s.refractive_index1/s.refractive_index2);
+
+    float3 incident_plane_normal = cross(p.direction, s.surface_normal);
+    float incident_plane_normal_length = norm(incident_plane_normal);
+
+    // Photons at normal incidence do not have a unique plane of incidence,
+    // so we have to pick the plane normal to be the polarization vector
+    // to get the correct logic below
+    if (incident_plane_normal_length < 1e-6f)
+	incident_plane_normal = p.polarization;
+    else
+	incident_plane_normal /= incident_plane_normal_length;
+
+    float normal_coefficient = dot(p.polarization, incident_plane_normal);
+    float normal_probability = normal_coefficient*normal_coefficient;
+
+    float reflection_coefficient;
+    if (curand_uniform(&rng) < normal_probability) {
+	// photon polarization normal to plane of incidence
+	reflection_coefficient = -sinf(incident_angle-refracted_angle)/sinf(incident_angle+refracted_angle);
+
+	if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle)) {
+	    p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
+			
+	    p.history |= REFLECT_SPECULAR;
+	}
+	else {
+	    p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
+	}
+
+	p.polarization = incident_plane_normal;
+    }
+    else {
+	// photon polarization parallel to plane of incidence
+	reflection_coefficient = tanf(incident_angle-refracted_angle)/tanf(incident_angle+refracted_angle);
+
+	if ((curand_uniform(&rng) < reflection_coefficient*reflection_coefficient) || isnan(refracted_angle)) {
+	    p.direction = rotate(s.surface_normal, incident_angle, incident_plane_normal);
+			
+	    p.history |= REFLECT_SPECULAR;
+	}
+	else {
+	    p.direction = rotate(s.surface_normal, PI-refracted_angle, incident_plane_normal);
+	}
+
+	p.polarization = cross(incident_plane_normal, p.direction);
+	p.polarization /= norm(p.polarization);
+    }
+
+} // propagate_at_boundary
+
+__device__ int
+propagate_at_surface(Photon &p, State &s, curandState &rng, Geometry *geometry)
+{
+    Surface *surface = geometry->surfaces[s.surface_index];
+
+    /* since the surface properties are interpolated linearly, we are
+       guaranteed that they still sum to 1.0. */
+
+    float detect = interp_property(surface, p.wavelength, surface->detect);
+    float absorb = interp_property(surface, p.wavelength, surface->absorb);
+    float reflect_diffuse = interp_property(surface, p.wavelength, surface->reflect_diffuse);
+    float reflect_specular = interp_property(surface, p.wavelength, surface->reflect_specular);
+
+    float uniform_sample = curand_uniform(&rng);
+
+    if (uniform_sample < absorb) {
+	p.history |= SURFACE_ABSORB;
+	return BREAK;
+    }
+    else if (uniform_sample < absorb + detect) {
+	p.history |= SURFACE_DETECT;
+	return BREAK;
+    }
+    else if (uniform_sample < absorb + detect + reflect_diffuse) {
+	// diffusely reflect
+	p.direction = uniform_sphere(&rng);
+
+	if (dot(p.direction, s.surface_normal) < 0.0f)
+	    p.direction = -p.direction;
+
+	// randomize polarization?
+	p.polarization = cross(uniform_sphere(&rng), p.direction);
+	p.polarization /= norm(p.polarization);
+
+	p.history |= REFLECT_DIFFUSE;
+
+	return CONTINUE;
+    }
+    else {
+	// specularly reflect
+	float incident_angle = get_theta(s.surface_normal,-p.direction);
+	float3 incident_plane_normal = cross(p.direction, s.surface_normal);
+	incident_plane_normal /= norm(incident_plane_normal);
+
+	p.direction = rotate(s.surface_normal, incident_angle,
+			     incident_plane_normal);
+
+	p.history |= REFLECT_SPECULAR;
+
+	return CONTINUE;
+    }
+
+} // propagate_at_surface
+
+#endif
diff --git a/chroma/cuda/physical_constants.h b/chroma/cuda/physical_constants.h
new file mode 100644
index 0000000..2ff87cd
--- /dev/null
+++ b/chroma/cuda/physical_constants.h
@@ -0,0 +1,7 @@
+#ifndef __PHYSICAL_CONSTANTS_H__
+#define __PHYSICAL_CONSTANTS_H__
+
+#define SPEED_OF_LIGHT 2.99792458e8
+#define PI 3.141592653589793
+
+#endif
diff --git a/chroma/cuda/propagate.cu b/chroma/cuda/propagate.cu
new file mode 100644
index 0000000..fdcf532
--- /dev/null
+++ b/chroma/cuda/propagate.cu
@@ -0,0 +1,147 @@
+//-*-c-*-
+
+#include "linalg.h"
+#include "geometry.h"
+#include "photon.h"
+
+#include "stdio.h"
+
+extern "C"
+{
+
+__global__ void
+photon_duplicate(int first_photon, int nthreads,
+		 float3 *positions, float3 *directions,
+		 float *wavelengths, float3 *polarizations,
+		 float *times, unsigned int *histories,
+		 int *last_hit_triangles, 
+		 int copies, int stride)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    int photon_id = first_photon + id;
+
+    Photon p;
+    p.position = positions[photon_id];
+    p.direction = directions[photon_id];
+    p.polarization = polarizations[photon_id];
+    p.wavelength = wavelengths[photon_id];
+    p.time = times[photon_id];
+    p.last_hit_triangle = last_hit_triangles[photon_id];
+    p.history = histories[photon_id];
+
+    for (int i=1; i <= copies; i++) {
+      int target_photon_id = photon_id + stride * i;
+
+      positions[target_photon_id] = p.position;
+      directions[target_photon_id] = p.direction;
+      polarizations[target_photon_id] = p.polarization;
+      wavelengths[target_photon_id] = p.wavelength;
+      times[target_photon_id] = p.time;
+      last_hit_triangles[target_photon_id] = p.last_hit_triangle;
+      histories[target_photon_id] = p.history;
+    }
+}
+		 
+__global__ void
+propagate(int first_photon, int nthreads, unsigned int *input_queue,
+	  unsigned int *output_queue, curandState *rng_states,
+	  float3 *positions, float3 *directions,
+	  float *wavelengths, float3 *polarizations,
+	  float *times, unsigned int *histories,
+	  int *last_hit_triangles, int max_steps,
+	  Geometry *g)
+{
+    __shared__ Geometry sg;
+
+    if (threadIdx.x == 0)
+	sg = *g;
+
+    __syncthreads();
+
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    g = &sg;
+
+    curandState rng = rng_states[id];
+
+    int photon_id = input_queue[first_photon + id];
+
+    Photon p;
+    p.position = positions[photon_id];
+    p.direction = directions[photon_id];
+    p.direction /= norm(p.direction);
+    p.polarization = polarizations[photon_id];
+    p.polarization /= norm(p.polarization);
+    p.wavelength = wavelengths[photon_id];
+    p.time = times[photon_id];
+    p.last_hit_triangle = last_hit_triangles[photon_id];
+    p.history = histories[photon_id];
+
+    if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB))
+	return;
+
+    State s;
+
+    int steps = 0;
+    while (steps < max_steps) {
+	steps++;
+
+	int command;
+
+	// check for NaN and fail
+	if (isnan(p.direction.x*p.direction.y*p.direction.z*p.position.x*p.position.y*p.position.z)) {
+	    p.history |= NO_HIT | NAN_ABORT;
+	    break;
+	}
+
+	fill_state(s, p, g);
+
+	if (p.last_hit_triangle == -1)
+	    break;
+
+	command = propagate_to_boundary(p, s, rng);
+
+	if (command == BREAK)
+	    break;
+
+	if (command == CONTINUE)
+	    continue;
+
+	if (s.surface_index != -1) {
+	    command = propagate_at_surface(p, s, rng, g);
+
+	    if (command == BREAK)
+		break;
+
+	    if (command == CONTINUE)
+		continue;
+	}
+
+	propagate_at_boundary(p, s, rng);
+
+    } // while (steps < max_steps)
+
+    rng_states[id] = rng;
+    positions[photon_id] = p.position;
+    directions[photon_id] = p.direction;
+    polarizations[photon_id] = p.polarization;
+    wavelengths[photon_id] = p.wavelength;
+    times[photon_id] = p.time;
+    histories[photon_id] = p.history;
+    last_hit_triangles[photon_id] = p.last_hit_triangle;
+	
+    // Not done, put photon in output queue
+    if ((p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+	int out_idx = atomicAdd(output_queue, 1);
+	output_queue[out_idx] = photon_id;
+    }
+} // propagate
+
+} // extern "C"
diff --git a/chroma/cuda/random.h b/chroma/cuda/random.h
new file mode 100644
index 0000000..e9d2e75
--- /dev/null
+++ b/chroma/cuda/random.h
@@ -0,0 +1,87 @@
+#ifndef __RANDOM_H__
+#define __RANDOM_H__
+
+#include <curand_kernel.h>
+
+#include "physical_constants.h"
+
+__device__ float
+uniform(curandState *s, const float &low, const float &high)
+{
+    return low + curand_uniform(s)*(high-low);
+}
+
+__device__ float3
+uniform_sphere(curandState *s)
+{
+    float theta = uniform(s, 0.0f, 2*PI);
+    float u = uniform(s, -1.0f, 1.0f);
+    float c = sqrtf(1.0f-u*u);
+
+    return make_float3(c*cosf(theta), c*sinf(theta), u);
+}
+
+// Draw a random sample given a cumulative distribution function
+// Assumptions: ncdf >= 2, cdf_y[0] is 0.0, and cdf_y[ncdf-1] is 1.0
+__device__ float
+sample_cdf(curandState *rng, int ncdf, float *cdf_x, float *cdf_y)
+{
+    float u = curand_uniform(rng);
+
+    // Find u in cdf_y with binary search
+    // list must contain at least 2 elements: 0.0 and 1.0
+    int lower=0;
+    int upper=ncdf-1;
+    while(lower < upper-1) {
+	int half = (lower+upper) / 2;
+	if (u < cdf_y[half])
+	    upper = half;
+	else
+	    lower = half;
+    }
+  
+    float frac = (u - cdf_y[lower]) / (cdf_y[upper] - cdf_y[lower]);
+    return cdf_x[lower] * frac + cdf_x[upper] * (1.0f - frac);
+}
+
+extern "C"
+{
+
+__global__ void
+init_rng(int nthreads, curandState *s, unsigned long long seed,
+	 unsigned long long offset)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    curand_init(seed, id, offset, &s[id]);
+}
+
+__global__ void
+fill_uniform(int nthreads, curandState *s, float *a, float low, float high)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] = uniform(&s[id], low, high);
+
+}
+
+__global__ void
+fill_uniform_sphere(int nthreads, curandState *s, float3 *a)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] = uniform_sphere(&s[id]);
+}
+
+} // extern "c"
+
+#endif
diff --git a/chroma/cuda/render.cu b/chroma/cuda/render.cu
new file mode 100644
index 0000000..d4ed443
--- /dev/null
+++ b/chroma/cuda/render.cu
@@ -0,0 +1,171 @@
+//-*-c-*-
+
+#include "linalg.h"
+#include "intersect.h"
+#include "mesh.h"
+#include "sorting.h"
+#include "geometry.h"
+
+#include "stdio.h"
+
+__device__ float4
+get_color(const float3 &direction, const Triangle &t, unsigned int rgba)
+{
+    float3 v01 = t.v1 - t.v0;
+    float3 v12 = t.v2 - t.v1;
+    
+    float3 surface_normal = normalize(cross(v01,v12));
+
+    float cos_theta = dot(surface_normal,-direction);
+
+    if (cos_theta < 0.0f)
+	cos_theta = -cos_theta;
+
+    unsigned int a0 = 0xff & (rgba >> 24);
+    unsigned int r0 = 0xff & (rgba >> 16);
+    unsigned int g0 = 0xff & (rgba >> 8);
+    unsigned int b0 = 0xff & rgba;
+
+    float alpha = (255 - a0)/255.0f;
+
+    return make_float4(r0*cos_theta, g0*cos_theta, b0*cos_theta, alpha);
+}
+
+extern "C"
+{
+
+__global__ void
+render(int nthreads, float3 *_origin, float3 *_direction, Geometry *g,
+       unsigned int alpha_depth, unsigned int *pixels, float *_dx,
+       unsigned int *dxlen, float4 *_color)
+{
+    __shared__ Geometry sg;
+
+    if (threadIdx.x == 0)
+	sg = *g;
+
+    __syncthreads();
+
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+	
+    if (id >= nthreads)
+	return;
+
+    g = &sg;
+
+    float3 origin = _origin[id];
+    float3 direction = _direction[id];
+    unsigned int n = dxlen[id];
+
+    float distance;
+
+    if (n < 1 && !intersect_node(origin, direction, g, g->start_node)) {
+	pixels[id] = 0;
+	return;
+    }
+
+    unsigned int stack[STACK_SIZE];
+
+    unsigned int *head = &stack[0];
+    unsigned int *node = &stack[1];
+    unsigned int *tail = &stack[STACK_SIZE-1];
+    *node = g->start_node;
+
+    float *dx = _dx + id*alpha_depth;
+    float4 *color_a = _color + id*alpha_depth;
+
+    unsigned int i;
+
+    do {
+	unsigned int first_child = g->node_map[*node];
+	unsigned int stop = g->node_map_end[*node];
+	
+	while (*node >= g->first_node && stop == first_child+1) {
+	    *node = first_child;
+	    first_child = g->node_map[*node];
+	    stop = g->node_map_end[*node];
+	}
+		
+	if (*node >= g->first_node) {
+	    for (i=first_child; i < stop; i++) {
+		if (intersect_node(origin, direction, g, i)) {
+		    *node = i;
+		    node++;
+		}
+	    }
+
+	    node--;
+	}
+	else {
+	    // node is a leaf
+	    for (i=first_child; i < stop; i++) {
+		Triangle t = get_triangle(g, i);
+		
+		if (intersect_triangle(origin, direction, t, distance)) {
+		    if (n < 1) {
+			dx[0] = distance;
+			    
+			unsigned int rgba = g->colors[i];
+			float4 color = get_color(direction, t, rgba);
+
+			color_a[0] = color;
+		    }
+		    else {
+			unsigned long j = searchsorted(n, dx, distance);
+
+			if (j <= alpha_depth-1) {
+			    insert(alpha_depth, dx, j, distance);
+
+			    unsigned int rgba = g->colors[i];
+			    float4 color = get_color(direction, t, rgba);
+
+			    insert(alpha_depth, color_a, j, color);
+			}
+		    }
+					
+		    if (n < alpha_depth)
+			n++;
+		}
+				
+	    } // triangle loop
+	    
+	    node--;
+	    
+	} // node is a leaf
+	
+    } // while loop
+    while (node != head);
+
+    if (n < 1) {
+	pixels[id] = 0;
+	return;
+    }
+
+    dxlen[id] = n;
+
+    float scale = 1.0f;
+    float fr = 0.0f;
+    float fg = 0.0f;
+    float fb = 0.0f;
+    for (i=0; i < n; i++) {
+	float alpha = color_a[i].w;
+
+	fr += scale*color_a[i].x*alpha;
+	fg += scale*color_a[i].y*alpha;
+	fb += scale*color_a[i].z*alpha;
+	
+	scale *= (1.0f-alpha);
+    }
+    unsigned int a;
+    if (n < alpha_depth)
+	a = floorf(255*(1.0f-scale));
+    else
+    	a = 255;
+    unsigned int red = floorf(fr/(1.0f-scale));
+    unsigned int green = floorf(fg/(1.0f-scale));
+    unsigned int blue = floorf(fb/(1.0f-scale));
+
+    pixels[id] = a << 24 | red << 16 | green << 8 | blue;
+}
+
+} // extern "C"
diff --git a/chroma/cuda/rotate.h b/chroma/cuda/rotate.h
new file mode 100644
index 0000000..15f8037
--- /dev/null
+++ b/chroma/cuda/rotate.h
@@ -0,0 +1,27 @@
+#ifndef __ROTATE_H__
+#define __ROTATE_H__
+
+#include "linalg.h"
+#include "matrix.h"
+
+__device__ const Matrix IDENTITY_MATRIX = {1,0,0,0,1,0,0,0,1};
+
+__device__ Matrix
+make_rotation_matrix(float phi, const float3 &n)
+{
+    float cos_phi = cosf(phi);
+    float sin_phi = sinf(phi);
+
+    return IDENTITY_MATRIX*cos_phi + (1-cos_phi)*outer(n,n) +
+	sin_phi*make_matrix(0,n.z,-n.y,-n.z,0,n.x,n.y,-n.x,0);
+}
+
+/* rotate points counterclockwise, when looking towards +infinity,
+   through an angle `phi` about the axis `n`. */
+__device__ float3
+rotate(const float3 &a, float phi, const float3 &n)
+{
+    return make_rotation_matrix(phi,n)*a;
+}
+
+#endif
diff --git a/chroma/cuda/sorting.h b/chroma/cuda/sorting.h
new file mode 100644
index 0000000..2bf1a5a
--- /dev/null
+++ b/chroma/cuda/sorting.h
@@ -0,0 +1,107 @@
+template <class T>
+__device__ void
+swap(T &a, T &b)
+{
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template <class T>
+__device__ void
+reverse(int n, T *a)
+{
+    for (int i=0; i < n/2; i++)
+	swap(a[i],a[n-1-i]);
+}
+
+template <class T>
+__device__ void
+piksrt(int n, T *arr)
+{
+    int i,j;
+    T a;
+
+    for (j=1; j < n; j++) {
+	a = arr[j];
+	i = j-1;
+	while (i >= 0 && arr[i] > a) {
+	    arr[i+1] = arr[i];
+	    i--;
+	}
+	arr[i+1] = a;
+    }
+}
+
+template <class T, class U>
+__device__ void
+piksrt2(int n, T *arr, U *brr)
+{
+    int i,j;
+    T a;
+    U b;
+
+    for (j=1; j < n; j++) {
+	a = arr[j];
+	b = brr[j];
+	i = j-1;
+	while (i >= 0 && arr[i] > a) {
+	    arr[i+1] = arr[i];
+	    brr[i+1] = brr[i];
+	    i--;
+	}
+	arr[i+1] = a;
+	brr[i+1] = b;
+    }
+}
+
+/* Returns the index in `arr` where `x` should be inserted in order to
+   maintain order. If `n` equals one, return the index such that, when
+   `x` is inserted, `arr` will be in ascending order.
+*/
+template <class T>
+__device__ unsigned long
+searchsorted(unsigned long n, T *arr, const T &x)
+{
+    unsigned long ju,jm,jl;
+    int ascnd;
+
+    jl = 0;
+    ju = n;
+
+    ascnd = (arr[n-1] >= arr[0]);
+
+    while (ju-jl > 1) {
+	jm = (ju+jl) >> 1;
+
+	if ((x > arr[jm]) == ascnd)
+	    jl = jm;
+	else
+	    ju = jm;
+    }
+
+    if ((x <= arr[0]) == ascnd)
+	return 0;
+    else
+	return ju;
+}
+
+template <class T>
+__device__ void
+insert(unsigned long n, T *arr, unsigned long i, const T &x)
+{
+    unsigned long j;
+    for (j=n-1; j > i; j--)
+	arr[j] = arr[j-1];
+    arr[i] = x;
+}
+
+template <class T>
+__device__ void
+add_sorted(unsigned long n, T *arr, const T &x)
+{
+    unsigned long i = searchsorted(n, arr, x);
+
+    if (i < n)
+	insert(n, arr, i, x);
+}
diff --git a/chroma/cuda/tools.cu b/chroma/cuda/tools.cu
new file mode 100644
index 0000000..80d06ec
--- /dev/null
+++ b/chroma/cuda/tools.cu
@@ -0,0 +1,21 @@
+//--*-c-*-
+
+extern "C"
+{
+
+__global__ void
+interleave(int nthreads, unsigned long long *x, unsigned long long *y,
+	   unsigned long long *z, int bits, unsigned long long *dest)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    for (int i=0; i < bits; i++)
+	dest[id] |= (x[id] & 1 << i) << (2*i) |
+	            (y[id] & 1 << i) << (2*i+1) |
+	            (z[id] & 1 << i) << (2*i+2);
+}
+
+} // extern "C"
diff --git a/chroma/cuda/transform.cu b/chroma/cuda/transform.cu
new file mode 100644
index 0000000..1f4405e
--- /dev/null
+++ b/chroma/cuda/transform.cu
@@ -0,0 +1,51 @@
+//-*-c-*-
+
+#include "linalg.h"
+#include "rotate.h"
+
+extern "C"
+{
+
+/* Translate the points `a` by the vector `v` */
+__global__ void
+translate(int nthreads, float3 *a, float3 v)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] += v;
+}
+
+/* Rotate the points `a` through an angle `phi` counter-clockwise about the
+   axis `axis` (when looking towards +infinity). */
+__global__ void
+rotate(int nthreads, float3 *a, float phi, float3 axis)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] = rotate(a[id], phi, axis);
+}
+
+/* Rotate the points `a` through an angle `phi` counter-clockwise
+   (when looking towards +infinity along `axis`) about the axis defined
+   by the point `point` and the vector `axis` . */
+__global__ void
+rotate_around_point(int nthreads, float3 *a, float phi, float3 axis,
+		    float3 point)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+	return;
+
+    a[id] -= point;
+    a[id] = rotate(a[id], phi, axis);
+    a[id] += point;
+}
+
+} // extern "c"