5 files changed, 381 insertions, 61 deletions
diff --git a/build.py b/build.py
new file mode 100644
index 0000000..8c05c5d
--- /dev/null
+++ b/build.py
@@ -0,0 +1,102 @@
+import numpy as np
+
+class Leaf(object):
+    def __init__(self, meshasdf, mesh_idx):
+        self.xmin, self.ymin, self.zmin = meshasdf[0]
+        self.xmax, self.ymax, self.zmax = meshasdf[0]
+
+        for x, y, z in meshasdf[1:]:
+            if x < self.xmin:
+                self.xmin = x
+            if x > self.xmax:
+                self.xmax = x
+            if y < self.ymin:
+                self.ymin = y
+            if y > self.ymax:
+                self.ymax = y
+            if z < self.zmin:
+                self.zmin = z
+            if z > self.zmax:
+                self.zmax = z
+
+        self.mesh_idx = mesh_idx
+        self.size = meshasdf.size//3
+
+class Node(object):
+    def __init__(self, children):
+        self.xmin = children[0].xmin
+        self.xmax = children[0].xmax
+        self.ymin = children[0].ymin
+        self.ymax = children[0].ymax
+        self.zmin = children[0].zmin
+        self.zmax = children[0].zmax
+
+        for child in children[1:]:
+            if child.xmin < self.xmin:
+                self.xmin = child.xmin
+            if child.xmax > self.xmax:
+                self.xmax = child.xmax
+            if child.ymin < self.ymin:
+                self.ymin = child.ymin
+            if child.ymax > self.ymax:
+                self.ymax = child.ymax
+            if child.zmin < self.zmin:
+                self.zmin = child.zmin
+            if child.zmax > self.zmax:
+                self.zmax = child.zmax
+
+        self.children = children
+        self.size = len(children)
+
+class Graph(object):
+    def __init__(self, mesh, triangles_per_leaf=2, children_per_node=4):
+        leafs = []
+        for i in range(0, mesh.size, triangles_per_leaf*3):
+            leafs.append(Leaf(mesh[i:i+triangles_per_leaf*3], i))
+
+        layers = []
+        layers.append(leafs)
+
+        while len(layers[-1]) > 1:
+            nodes = []
+            for i in range(0, len(layers[-1]), children_per_node):
+                nodes.append(Node(layers[-1][i:i+children_per_node]))
+
+            layers.append(nodes)
+
+        # leaves last
+        layers.reverse()
+
+        nodes = []
+        for layer in layers:
+            for node in layer:
+                nodes.append(node)
+
+        lower = []
+        upper = []
+        start = []
+        count = [] # number of child nodes or triangles
+        first_leaf = -1
+
+        for i, node in enumerate(nodes):
+            lower.append([node.xmin, node.ymin, node.zmin])
+            upper.append([node.xmax, node.ymax, node.zmax])
+            count.append(node.size)
+
+            if isinstance(node, Node):
+                for j, child in enumerate(nodes):
+                    if child is node.children[0]:
+                        start.append(j)
+                        break
+
+            if isinstance(node, Leaf):
+                start.append(node.mesh_idx)
+
+                if first_leaf == -1:
+                    first_leaf = i
+
+        self.lower = np.array(lower)
+        self.upper = np.array(upper)
+        self.start = np.array(start)
+        self.count = np.array(count)
+        self.first_leaf = first_leaf
diff --git a/src/intersect.cu b/src/intersect.cu
index c78350e..8c2bf45 100644
--- a/src/intersect.cu
+++ b/src/intersect.cu
@@ -1,17 +1,33 @@
 //-*-c-*-
+#include <math_constants.h>
+
+#include "linalg.h"
+#include "matrix.h"
+#include "rotate.h"
 
 texture<float4, 1, cudaReadModeElementType> mesh;
 
-__device__ bool intersect_triangle(const float3 &x, const float3 &p, const float3 &v0, const float3 &v1, const float3 &v2, float3 &intersection)
+texture<float4, 1, cudaReadModeElementType> upper_bound_arr;
+texture<float4, 1, cudaReadModeElementType> lower_bound_arr;
+
+texture<uint, 1, cudaReadModeElementType> child_map_arr;
+texture<uint, 1, cudaReadModeElementType> child_len_arr;
+
+__device__ float3 make_float3(const float4 &a)
 {
-	Matrix m = make_matrix(v1-v0, v2-v0, -p);
+	return make_float3(a.x, a.y, a.z);
+}
+
+__device__ bool intersect_triangle(const float3 &origin, const float3 &direction, const float3 &v0, const float3 &v1, const float3 &v2, float3 &intersection)
+{
+	Matrix m = make_matrix(v1-v0, v2-v0, -direction);
 	
 	float determinant = det(m);
 
 	if (determinant == 0.0)
 		return false;
 
-	float3 b = x-v0;
+	float3 b = origin-v0;
 
 	float u1 = ((m.a11*m.a22 - m.a12*m.a21)*b.x + (m.a02*m.a21 - m.a01*m.a22)*b.y + (m.a01*m.a12 - m.a02*m.a11)*b.z)/determinant;
 
@@ -28,103 +44,213 @@ __device__ bool intersect_triangle(const float3 &x, const float3 &p, const float
 	if (u3 < 0.0 || (1-u1-u2) < 0.0)
 		return false;
 
-	intersection = x + p*u3;
+	intersection = origin + direction*u3;
 
 	return true;
 }
 
-__device__ int get_color(const float3 &p, const float3 &v0, const float3& v1, const float3 &v2)
+__device__ int get_color(const float3 &direction, const float3 &v0, const float3& v1, const float3 &v2)
 {
-	float3 normal = cross(v1-v0,v2-v0);
+	int rgb = floorf(255*dot(normalize(cross(v1-v0,v2-v0)),-direction));
+
+	return rgb << 16 | rgb << 8 | rgb;
+}
+
+
+__device__ bool intersect_box(const float3 &origin, const float3 &direction, const float3 &lower_bound, const float3 &upper_bound)
+{
+	float kmin, kmax, kymin, kymax, kzmin, kzmax;
+
+	if (direction.x >= 0.0)
+	{
+		kmin = (lower_bound.x - origin.x)/direction.x;
+		kmax = (upper_bound.x - origin.x)/direction.x;
+	}
+	else
+	{
+		kmin = (upper_bound.x - origin.x)/direction.x;
+		kmax = (lower_bound.x - origin.x)/direction.x;
+	}
+
+	if (kmax < kmin)
+		return false;
+
+	if (direction.y >= 0.0)
+	{
+		kymin = (lower_bound.y - origin.y)/direction.y;
+		kymax = (upper_bound.y - origin.y)/direction.y;
+	}
+	else
+	{
+		kymin = (upper_bound.y - origin.y)/direction.y;
+		kymax = (lower_bound.y - origin.y)/direction.y;
+	}
+
+	if (kymax < kymin)
+		return false;
+
+	if (kymin > kmin)
+		kmin = kymin;
+
+	if (kymax < kmax)
+		kmax = kymax;
+
+	if (kmin > kmax)
+		return false;
+
+	if (direction.z >= 0.0)
+	{
+		kzmin = (lower_bound.z - origin.z)/direction.z;
+		kzmax = (upper_bound.z - origin.z)/direction.z;
+	}
+	else
+	{
+		kzmin = (upper_bound.z - origin.z)/direction.z;
+		kzmax = (lower_bound.z - origin.z)/direction.z;
+	}
 
-	float scale;
-	scale = dot(normal,-p)/(norm(normal)*norm(p));
-	if (scale < 0.0)
-		scale = dot(-normal,-p)/(norm(normal)*norm(p));
+	if (kzmax < kzmin)
+		return false;
 
-	int rgb = floorf(255*scale);
+	if (kzmin > kmin)
+		kmin = kzmin;
 
-	return rgb*65536 + rgb*256 + rgb;
+	if (kzmax < kmax)
+		kmax = kzmax;
+
+	if (kmin > kmax)
+		return false;
+
+	if (kmax < 0.0)
+		return false;
+
+	return true;
 }
 
-__device__ float3 make_float3(const float4 &a)
+__device__ bool intersect_node(const float3 &origin, const float3 &direction, const int &i)
 {
-	return make_float3(a.x, a.y, a.z);
+	float3 lower_bound = make_float3(tex1Dfetch(lower_bound_arr, i));
+	float3 upper_bound = make_float3(tex1Dfetch(upper_bound_arr, i));
+
+	return intersect_box(origin, direction, lower_bound, upper_bound);
 }
 
 extern "C"
 {
 
-__global__ void translate(int max_idx, float3 *x, float3 v)
+__global__ void translate(int max_idx, float3 *pt, float3 v)
 {
 	int idx = blockIdx.x*blockDim.x + threadIdx.x;
 
 	if (idx > max_idx)
 		return;
 
-	x[idx] += v;
+	pt[idx] += v;
 }
 
-__global__ void rotate(int max_idx, float3 *x, float phi, float3 axis)
+__global__ void rotate(int max_idx, float3 *pt, float phi, float3 axis)
 {
 	int idx = blockIdx.x*blockDim.x + threadIdx.x;
 
 	if (idx > max_idx)
 		return;
 
-	x[idx] = rotate(x[idx], phi, axis);
+	pt[idx] = rotate(pt[idx], phi, axis);
 }
 
-__global__ void intersect_triangle_mesh(int max_idx, float3 *xarr, float3 *parr, int n, int *pixelarr)
+__global__ void intersect_mesh(int max_idx, float3 *origin_arr, float3 *direction_arr, int *pixel_arr, int first_leaf)
 {
 	int idx = blockIdx.x*blockDim.x + threadIdx.x;
 
 	if (idx > max_idx)
 		return;
 
-	float3 x = xarr[idx];
-	float3 p = parr[idx];
-	int *pixel = pixelarr+idx;
+	float3 origin = origin_arr[idx];
+	float3 direction = direction_arr[idx];
+
+	int *pixel = pixel_arr+idx;
 
 	bool hit = false;
 
 	float distance, min_distance;
 	float3 intersection, min_intersection;
+	
+	if (!intersect_node(origin, direction, 0))
+	{
+		*pixel = 0;
+		return;
+	}
+
+	int stack[100];
+
+	int *head = &stack[0];
+	int *node = &stack[1];
+	*node = 0;
 
 	int i;
-	for (i=0; i < n; i++)
+
+	do
 	{
-		float3 v0 = make_float3(tex1Dfetch(mesh, 3*i));
-		float3 v1 = make_float3(tex1Dfetch(mesh, 3*i+1));
-		float3 v2 = make_float3(tex1Dfetch(mesh, 3*i+2));
-			
-		if (intersect_triangle(x, p, v0, v1, v2, intersection))
-		{
-			if (!hit)
-			{
-				*pixel = get_color(p, v0, v1, v2);
+		int child_map = tex1Dfetch(child_map_arr, *node);
+		int child_len = tex1Dfetch(child_len_arr, *node);
 
-				min_distance = norm(intersection-x);
-				min_intersection = intersection;
-				hit = true;
-				continue;
-			}
+		if (*node < first_leaf)
+		{
+			for (i=0; i < child_len; i++)
+				if (intersect_node(origin, direction, child_map+i))
+					*node++ = child_map+i;
 
-			distance = norm(intersection-x);
+			if (node == head)
+				break;
+			
+			node--;
 
-			if (distance < min_distance)
+		}
+		else // node is a leaf
+		{
+			for (i=0; i < child_len; i++)
 			{
-				*pixel = get_color(p, v0, v1, v2);
+				int mesh_idx = child_map + 3*i;
+
+				float3 v0 = make_float3(tex1Dfetch(mesh, mesh_idx));
+				float3 v1 = make_float3(tex1Dfetch(mesh, mesh_idx+1));
+				float3 v2 = make_float3(tex1Dfetch(mesh, mesh_idx+2));
+				
+				if (intersect_triangle(origin, direction, v0, v1, v2, intersection))
+				{
+					if (!hit)
+					{
+						*pixel = get_color(direction, v0, v1, v2);
+						
+						min_distance = norm(intersection-origin);
+						min_intersection = intersection;
+						hit = true;
+						continue;
+					}
+					
+					distance = norm(intersection-origin);
+					
+					if (distance < min_distance)
+					{
+						*pixel = get_color(direction, v0, v1, v2);
+						
+						min_distance = distance;
+						min_intersection = intersection;
+					}
+				}
+
+			} // triangle loop
+
+			node--;
+
+		} // node is a leaf
+
+	} // while loop
+	while (node != head);
 
-				min_distance = distance;
-				min_intersection = intersection;
-			}
-		}
-	}
-	
 	if (!hit)
 		*pixel = 0;
-	
-}
+
+} // intersect mesh
 
 } // extern "c"
diff --git a/src/linalg.h b/src/linalg.h
index 593aa9d..fe6b1b2 100644
--- a/src/linalg.h
+++ b/src/linalg.h
@@ -113,4 +113,9 @@ __device__ __host__ float norm(const float3 &a)
 	return sqrtf(dot(a,a));
 }
 
+__device__ __host__ float3 normalize(const float3 &a)
+{
+	return a/norm(a);
+}
+
 #endif
diff --git a/test.py b/test.py
index 4390c50..15b7102 100644
--- a/test.py
+++ b/test.py
@@ -5,6 +5,7 @@ from pycuda import autoinit
 from pycuda.compiler import SourceModule
 import pycuda.driver as cuda
 from pycuda import gpuarray
+from string import Template
 
 def array2vector(arr, dtype=gpuarray.vec.float3):
     if len(arr.shape) != 2 or arr.shape[-1] != 3:
@@ -19,18 +20,16 @@ def array2vector(arr, dtype=gpuarray.vec.float3):
 
 print 'device %s' % autoinit.device.name()
 
-source = open('src/linalg.h').read() + open('src/matrix.h').read() + \
-    open('src/rotate.h').read() + open('src/intersect.cu').read()
+source = open('src/intersect.cu').read()
+mod = SourceModule(source, options=['-I /home/tlatorre/projects/chroma/src'], no_extern_c=True, arch='sm_13')
 
-mod = SourceModule(source, no_extern_c=True, arch='sm_13')
-
-intersect = mod.get_function('intersect_triangle_mesh')
 rotate = mod.get_function('rotate')
 translate = mod.get_function('translate')
+intersect_mesh = mod.get_function('intersect_mesh')
 
 import pygame
 size = width, height = 800, 600
-screen = pygame.display.set_mode(size)
+screen = pygame.display.set_mode(size, (pygame.NOFRAME | pygame.DOUBLEBUF))
 
 film_size = (0.035, 0.024)
 focal_length = 0.05
@@ -46,15 +45,65 @@ grid += (0,300,0)
 x = array2vector(grid)
 x_gpu = cuda.to_device(x)
 
-p = array2vector(((0,300,0)-grid))
+p = (0,300,0)-grid
+
+for i in range(p.shape[0]):
+    p[i] /= np.linalg.norm(p[i])
+
+p = array2vector(p)
 p_gpu = cuda.to_device(p)
 
-mesh3 = array2vector(read_stl('models/tie_interceptor6.stl'))
+
+from zcurve import *
+
+mesh = read_stl('models/tie_interceptor6.stl')
+mesh = mesh.reshape(mesh.shape[0]//3,3,3)
+mesh = morton_order(mesh)
+mesh = mesh.reshape(mesh.shape[0]*3, 3)
+
+mesh3 = array2vector(mesh)
+
+from build import Graph
+
 
 rotate(np.int32(mesh3.size), cuda.InOut(mesh3), np.float32(-np.pi/2), gpuarray.vec.make_float3(1,0,0), block=(256,1,1), grid=(mesh3.size//256+1,1))
 
 translate(np.int32(mesh3.size), cuda.InOut(mesh3), gpuarray.vec.make_float3(0,30,0), block=(256,1,1), grid=(mesh3.size//256+1,1))
 
+graph = Graph(mesh3)
+
+lower = array2vector(graph.lower, dtype=gpuarray.vec.float4)
+upper = array2vector(graph.upper, dtype=gpuarray.vec.float4)
+start = graph.start.astype(np.uint32)
+count = graph.count.astype(np.uint32)
+stack = np.zeros(lower.size, dtype=np.int32)
+
+lower_gpu = cuda.to_device(lower)
+upper_gpu = cuda.to_device(upper)
+
+lower_tex = mod.get_texref('lower_bound_arr')
+upper_tex = mod.get_texref('upper_bound_arr')
+
+lower_tex.set_address(lower_gpu, lower.nbytes)
+upper_tex.set_address(upper_gpu, upper.nbytes)
+
+lower_tex.set_format(cuda.array_format.FLOAT, 4)
+upper_tex.set_format(cuda.array_format.FLOAT, 4)
+
+start_gpu = cuda.to_device(start)
+count_gpu = cuda.to_device(count)
+stack_gpu = cuda.mem_alloc(stack.nbytes)
+cuda.memcpy_htod(stack_gpu, stack)
+
+child_map_tex = mod.get_texref('child_map_arr')
+child_len_tex = mod.get_texref('child_len_arr')
+
+child_map_tex.set_address(start_gpu, start.nbytes)
+child_len_tex.set_address(count_gpu, count.nbytes)
+
+child_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
+child_len_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
+
 mesh = np.empty(mesh3.size, dtype=gpuarray.vec.float4)
 mesh['x'] = mesh3['x']
 mesh['y'] = mesh3['y']
@@ -66,7 +115,14 @@ mesh_tex.set_address(mesh_gpu, mesh.nbytes)
 mesh_tex.set_format(cuda.array_format.FLOAT, 4)
 
 pixel = np.empty(size, dtype=np.int32).flatten()
-pixel_gpu = cuda.to_device(pixel)
+
+pixel_gpu = cuda.mem_alloc(pixel.nbytes)
+cuda.memcpy_htod(pixel_gpu, pixel)
+
+speed = []
+elapsed = []
+
+t0total = time.time()
 
 block_size = 64
 for i in range(100):
@@ -75,16 +131,22 @@ for i in range(100):
     rotate(np.int32(p.size), p_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(block_size,1,1), grid=(width*height//block_size+1,1))
 
     t0 = time.time()
-    intersect(np.int32(x.size), x_gpu, p_gpu, np.int32(mesh.size//3), pixel_gpu, block=(block_size,1,1), grid=(width*height//block_size+1,1), texrefs=[mesh_tex])
+    intersect_mesh(np.int32(x.size), x_gpu, p_gpu, pixel_gpu, np.int32(graph.first_leaf), block=(block_size,1,1), grid=(width*height//block_size+1,1), texrefs=[mesh_tex, upper_tex, lower_tex, child_map_tex, child_len_tex])
     cuda.Context.synchronize()
 
-    elapsed = time.time() - t0
+    elapsed.append(time.time() - t0)
 
-    print '%i triangles, %i photons, %f sec; (%f photons/s)' % \
-        (mesh.size//3, pixel.size, elapsed, pixel.size/elapsed)
+    print '%i triangles, %i photons, %f sec; (%f photons/sec)' % \
+        (mesh.size//3, pixel.size, elapsed[-1], pixel.size/elapsed[-1])
+
+    speed.append(pixel.size/elapsed[-1])
 
     cuda.memcpy_dtoh(pixel, pixel_gpu)
     pygame.surfarray.blit_array(screen, pixel.reshape(size))
     pygame.display.flip()
 
+print 'average time = %f sec' % np.mean(elapsed)
+print 'average speed = %f photons/sec' % np.mean(speed)
+print 'total time = %f sec' % (time.time() - t0total)
+
 raw_input('press enter to exit')
diff --git a/zcurve.py b/zcurve.py
new file mode 100644
index 0000000..5ef37f3
--- /dev/null
+++ b/zcurve.py
@@ -0,0 +1,25 @@
+import numpy as np
+from itertools import chain
+
+def interleave(*args):
+    return int("".join(chain(*zip(*[bin(x)[2:].zfill(x.nbytes*8) for x in args]))), 2)
+
+def morton_order(mesh, dtype=np.uint8):
+    vertices = mesh.reshape(mesh.shape[0]*3, 3)
+    
+    upper_bound = np.max(vertices, axis=0)
+    lower_bound = np.min(vertices, axis=0)
+
+    quantize = lambda x: dtype((x-lower_bound)*np.iinfo(dtype).max/(upper_bound-lower_bound))
+
+    zvalue = []
+    for triangle in mesh:
+        center = np.mean(np.vstack((triangle[0], triangle[1], triangle[2])), axis=0)
+        zvalue.append(interleave(*quantize(center)))
+
+    ordered_mesh = np.empty(mesh.shape)
+
+    for i, idx in enumerate(zip(*sorted(zip(zvalue, range(len(zvalue)))))[-1]):
+        ordered_mesh[i] = mesh[idx]
+
+    return ordered_mesh