3 files changed, 131 insertions, 66 deletions
diff --git a/render.py b/render.py
index fe70492..d7a9ec4 100755
--- a/render.py
+++ b/render.py
@@ -17,6 +17,8 @@ def render(viewable, size=(800,600), name='', bits=8, make_movie=False):
 
     lower_bound, upper_bound = geometry.mesh.get_bounds()
 
+    source_position = [0, 0, upper_bound[2]+1.0]
+
     scale = np.linalg.norm(upper_bound-lower_bound)
 
     print 'device %s' % autoinit.device.name()
@@ -39,32 +41,32 @@ def render(viewable, size=(800,600), name='', bits=8, make_movie=False):
 
     diagonal = np.linalg.norm(upper_bound-lower_bound)
 
-    source_position = [0, 0, upper_bound[2]+1.0]
-
     source_positions_gpu = gpuarray.empty(len(geometry.mesh.triangles), dtype=gpuarray.vec.float3)
     source_positions_gpu.fill(gpuarray.vec.make_float3(*source_position))
 
     source_directions = np.mean(geometry.mesh[:], axis=1) - source_position
     source_directions_gpu = gpuarray.to_gpu(source_directions.astype(np.float32).view(gpuarray.vec.float3))
 
-    rgb_lookup_gpu = gpuarray.zeros(source_positions_gpu.size, dtype=gpuarray.vec.float3)
+    rgb_lookup1_gpu = gpuarray.zeros(source_positions_gpu.size, dtype=gpuarray.vec.float3)
+    rgb_lookup2_gpu = gpuarray.zeros(source_positions_gpu.size, dtype=gpuarray.vec.float3)
 
     max_steps = 10
-    rgb_runs = 1000
+    rgb_runs = 100
 
     print 'building rgb lookup.'
-    cuda_build_rgb_lookup(np.int32(source_positions_gpu.size), rng_states_gpu, source_positions_gpu, source_directions_gpu, rgb_lookup_gpu, np.int32(rgb_runs), np.int32(max_steps), block=(64,1,1), grid=(source_positions_gpu.size//64+1,1))
+    cuda_build_rgb_lookup(np.int32(source_positions_gpu.size), rng_states_gpu, source_positions_gpu, source_directions_gpu, rgb_lookup1_gpu, rgb_lookup2_gpu, np.int32(rgb_runs), np.int32(max_steps), block=(64,1,1), grid=(source_positions_gpu.size//64+1,1))
     cuda.Context.synchronize()
     print 'done.'
 
-    rgb_lookup = rgb_lookup_gpu.get()
-    rgb_lookup['x'] /= rgb_runs
-    rgb_lookup['y'] /= rgb_runs
-    rgb_lookup['z'] /= rgb_runs
-    rgb_lookup['x'][rgb_lookup['x'] > 1.0] = 1.0
-    rgb_lookup['y'][rgb_lookup['y'] > 1.0] = 1.0
-    rgb_lookup['z'][rgb_lookup['z'] > 1.0] = 1.0
-    rgb_lookup_gpu = cuda.to_device(rgb_lookup)
+    rgb_lookup1 = rgb_lookup1_gpu.get().view(np.float32)
+    rgb_lookup1 /= rgb_runs
+    rgb_lookup1[rgb_lookup1 > 1.0] = 1.0
+    rgb_lookup1_gpu.set(rgb_lookup1.view(gpuarray.vec.float3))
+
+    rgb_lookup2 = rgb_lookup2_gpu.get().view(np.float32)
+    rgb_lookup2 /= rgb_runs
+    rgb_lookup2[rgb_lookup2 > 1.0] = 1.0
+    rgb_lookup2_gpu.set(rgb_lookup2.view(gpuarray.vec.float3))
 
     camera = Camera(size)
 
@@ -80,7 +82,7 @@ def render(viewable, size=(800,600), name='', bits=8, make_movie=False):
     directions_gpu = gpuarray.to_gpu(directions.astype(np.float32).view(gpuarray.vec.float3))
     pixels_gpu = gpuarray.zeros(width*height, dtype=np.int32)
 
-    nruns = 5
+    nruns = 1
 
     screen = pygame.display.set_mode(size)
     pygame.display.set_caption(name)
@@ -97,11 +99,11 @@ def render(viewable, size=(800,600), name='', bits=8, make_movie=False):
         global image_index
 
         t0 = time.time()
-        cuda_render(np.int32(pixels_gpu.size), rng_states_gpu, origins_gpu, directions_gpu, rgb_lookup_gpu, np.int32(nruns), pixels_gpu, np.int32(max_steps), block=(nblocks,1,1), grid=(pixels_gpu.size//nblocks+1,1))
+        cuda_render(np.int32(pixels_gpu.size), rng_states_gpu, origins_gpu, directions_gpu, rgb_lookup1_gpu, rgb_lookup2_gpu, np.int32(nruns), pixels_gpu, np.int32(max_steps), block=(nblocks,1,1), grid=(pixels_gpu.size//nblocks+1,1))
         cuda.Context.synchronize()
         elapsed = time.time() - t0
 
-        print 'elapsed %f sec' % elapsed
+        #print 'elapsed %f sec' % elapsed
 
         pygame.surfarray.blit_array(screen, pixels_gpu.get().reshape(size))
         pygame.display.flip()
diff --git a/src/kernel.cu b/src/kernel.cu
index 73800a3..2cd9347 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -12,20 +12,14 @@
 #define BLUE_WAVELENGTH 465
 #define GREEN_WAVELENGTH 545
 
-__device__ void myAtomicAdd(float *addr, float data)
+__device__ void fAtomicAdd(float *addr, float data)
 {
 	while (data)
 		data = atomicExch(addr, data+atomicExch(addr, 0.0f));
 }
 
-__device__ int to_diffuse(Photon p, int max_steps)
+__device__ void to_diffuse(Photon &p, State &s, const int &max_steps)
 {
-	// note that p is NOT passed by reference; this is intentional
-
-	p.last_hit_triangle = -1;
-
-	State s;
-
 	int steps = 0;
 	while (steps < max_steps)
 	{
@@ -51,7 +45,7 @@ __device__ int to_diffuse(Photon p, int max_steps)
 			command = propagate_at_surface(p, s);
 
 			if (p.history & REFLECT_DIFFUSE)
-				return p.last_hit_triangle;
+				break;
 
 			if (command == BREAK)
 				break;
@@ -64,7 +58,6 @@ __device__ int to_diffuse(Photon p, int max_steps)
 
 	} // while (steps < max_steps)
 
-	return -1;
 } // to_diffuse
 
 extern "C"
@@ -93,25 +86,25 @@ __global__ void rotate(int nthreads, float3 *points, float phi, float3 axis)
 	points[id] = rotate(points[id], phi, axis);
 }
 
-__global__ void build_rgb_lookup(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float3 *rgb_lookup, int runs, int max_steps)
+__global__ void build_rgb_lookup(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float3 *rgb_lookup1, float3 *rgb_lookup2, int runs, int max_steps)
 {
 	int id = blockIdx.x*blockDim.x + threadIdx.x;
 
 	if (id >= nthreads)
 		return;
 
-	Photon p;
-	p.rng = rng_states[id];
-	p.position = positions[id];
-	p.direction = directions[id];
-	p.direction /= norm(p.direction);
-	p.polarization = uniform_sphere(&p.rng);
-	p.time = 0.0f;
-	p.history = 0x0;
+	Photon seed;
+	seed.rng = rng_states[id];
+	seed.position = positions[id];
+	seed.direction = directions[id];
+	seed.direction /= norm(seed.direction);
+	seed.polarization = uniform_sphere(&seed.rng);
+	seed.time = 0.0f;
+	seed.history = 0x0;
 
 	float distance;
 
-	int hit_triangle = intersect_mesh(p.position, p.direction, distance);
+	int hit_triangle = intersect_mesh(seed.position, seed.direction, distance);
 
 	if (hit_triangle != id)
 		return;
@@ -124,79 +117,143 @@ __global__ void build_rgb_lookup(int nthreads, curandState *rng_states, float3 *
 	float3 v1 = g_vertices[triangle_data.y];
 	float3 v2 = g_vertices[triangle_data.z];
 
-	float cos_theta = dot(normalize(cross(v1-v0, v2-v1)), -p.direction);
+	float cos_theta = dot(normalize(cross(v1-v0, v2-v1)), -seed.direction);
 
 	if (cos_theta < 0.0f)
-		cos_theta = dot(-normalize(cross(v1-v0, v2-v1)), -p.direction);
+		cos_theta = dot(-normalize(cross(v1-v0, v2-v1)), -seed.direction);
+
+	Photon p;
+	State s;
 
 	for (int i=0; i < runs; i++)
 	{
+		p = seed;
 		p.wavelength = RED_WAVELENGTH;
 
-		hit_triangle = to_diffuse(p, max_steps);
+		to_diffuse(p, s, max_steps);
 
-		if (hit_triangle != -1)
-			myAtomicAdd(&rgb_lookup[hit_triangle].x, cos_theta);
+		if (p.history & REFLECT_DIFFUSE)
+		{
+			if (s.inside_to_outside)
+			{
+				fAtomicAdd(&rgb_lookup1[p.last_hit_triangle].x, cos_theta);
+			}
+			else
+			{
+				fAtomicAdd(&rgb_lookup2[p.last_hit_triangle].x, cos_theta);
+			}
+		}
 
+		p = seed;
 		p.wavelength = BLUE_WAVELENGTH;
 
-		hit_triangle = to_diffuse(p, max_steps);
+		to_diffuse(p, s, max_steps);
 
-		if (hit_triangle != -1)
-			myAtomicAdd(&rgb_lookup[hit_triangle].y, cos_theta);
+		if (p.history & REFLECT_DIFFUSE)
+		{
+			if (s.inside_to_outside)
+			{
+				fAtomicAdd(&rgb_lookup1[p.last_hit_triangle].y, cos_theta);
+			}
+			else
+			{
+				fAtomicAdd(&rgb_lookup2[p.last_hit_triangle].y, cos_theta);
+			}
+		}
 
+		p = seed;
 		p.wavelength = GREEN_WAVELENGTH;
 
-		hit_triangle = to_diffuse(p, max_steps);
+		to_diffuse(p, s, max_steps);
 
-		if (hit_triangle != -1)
-			myAtomicAdd(&rgb_lookup[hit_triangle].z, cos_theta);
+		if (p.history & REFLECT_DIFFUSE)
+		{
+			if (s.inside_to_outside)
+			{
+				fAtomicAdd(&rgb_lookup1[p.last_hit_triangle].z, cos_theta);
+			}
+			else
+			{
+				fAtomicAdd(&rgb_lookup2[p.last_hit_triangle].z, cos_theta);
+			}
+		}
 	}
 
 } // build_rgb_lookup
 
-__global__ void render(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float3 *rgb_lookup, int runs, int *pixels, int max_steps)
+__global__ void render(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float3 *rgb_lookup1, float3 *rgb_lookup2, int runs, int *pixels, int max_steps)
 {
 	int id = blockIdx.x*blockDim.x + threadIdx.x;
 
 	if (id >= nthreads)
 		return;
 
-	Photon p;
-	p.rng = rng_states[id];
-	p.position = positions[id];
-	p.direction = directions[id];
-	p.direction /= norm(p.direction);
-	p.polarization = uniform_sphere(&p.rng);
-	p.time = 0.0f;
-	p.history = 0x0;
+	Photon seed;
+	seed.rng = rng_states[id];
+	seed.position = positions[id];
+	seed.direction = directions[id];
+	seed.direction /= norm(seed.direction);
+	seed.polarization = uniform_sphere(&seed.rng);
+	seed.time = 0.0f;
+	seed.history = 0x0;
 
 	float3 rgb = make_float3(0.0, 0.0, 0.0);
 
-	int hit_triangle;
+	Photon p;
+	State s;
 
 	for (int i=0; i < runs; i++)
 	{
+		p = seed;
 		p.wavelength = RED_WAVELENGTH;
 
-		hit_triangle = to_diffuse(p, max_steps);
+		to_diffuse(p, s, max_steps);
 
-		if (hit_triangle != -1)
-			rgb.x += rgb_lookup[hit_triangle].x;
+		if (p.history & REFLECT_DIFFUSE)
+		{
+			if (s.inside_to_outside)
+			{
+				rgb.x += rgb_lookup1[p.last_hit_triangle].x;
+			}
+			else
+			{
+				rgb.x += rgb_lookup2[p.last_hit_triangle].x;
+			}
+		}
 
+		p = seed;
 		p.wavelength = BLUE_WAVELENGTH;
 
-		hit_triangle = to_diffuse(p, max_steps);
+		to_diffuse(p, s, max_steps);
 
-		if (hit_triangle != -1)
-			rgb.y += rgb_lookup[hit_triangle].y;
+		if (p.history & REFLECT_DIFFUSE)
+		{
+			if (s.inside_to_outside)
+			{
+				rgb.y += rgb_lookup1[p.last_hit_triangle].y;
+			}
+			else
+			{
+				rgb.y += rgb_lookup2[p.last_hit_triangle].y;
+			}
+		}
 
+		p = seed;
 		p.wavelength = GREEN_WAVELENGTH;
 
-		hit_triangle = to_diffuse(p, max_steps);
+		to_diffuse(p, s, max_steps);
 
-		if (hit_triangle != -1)
-			rgb.z += rgb_lookup[hit_triangle].z;
+		if (p.history & REFLECT_DIFFUSE)
+		{
+			if (s.inside_to_outside)
+			{
+				rgb.z += rgb_lookup1[p.last_hit_triangle].z;
+			}
+			else
+			{
+				rgb.z += rgb_lookup2[p.last_hit_triangle].z;
+			}
+		}
 	}
 
 	rgb /= runs;
diff --git a/src/photon.h b/src/photon.h
index 69ecd34..11bcfc1 100644
--- a/src/photon.h
+++ b/src/photon.h
@@ -25,6 +25,8 @@ struct Photon
 
 struct State
 {
+	bool inside_to_outside;
+
 	float3 surface_normal;
 
 	float refractive_index1, refractive_index2;
@@ -78,6 +80,8 @@ __device__ int fill_state(State &s, Photon &p)
 		// outside to inside
 		material1 = materials[outer_material_index];
 		material2 = materials[inner_material_index];
+
+		s.inside_to_outside = false;
 	}
 	else
 	{
@@ -85,6 +89,8 @@ __device__ int fill_state(State &s, Photon &p)
 		material1 = materials[inner_material_index];
 		material2 = materials[outer_material_index];
 		s.surface_normal = -s.surface_normal;
+
+		s.inside_to_outside = true;
 	}
 
 	s.refractive_index1 = interp_property(p.wavelength, material1.refractive_index);