6 files changed, 90 insertions, 102 deletions
diff --git a/camera.py b/camera.py
index 797fdbe..1e9ceb1 100755
--- a/camera.py
+++ b/camera.py
@@ -104,7 +104,7 @@ class Camera(Thread):
         self.xyz_lookup1_gpu = gpuarray.zeros(len(self.geometry.mesh.triangles), dtype=gpuarray.vec.float3)
         self.xyz_lookup2_gpu = gpuarray.zeros(len(self.geometry.mesh.triangles), dtype=gpuarray.vec.float3)
         self.image_gpu = gpuarray.zeros(self.width*self.height, dtype=gpuarray.vec.float3)
-        self.context.synchronize()
+        self.gpu.context.synchronize()
 
         self.source_position = self.point
 
@@ -382,8 +382,6 @@ class Camera(Thread):
         
         self.done = False
         self.clicked = False
-        #shift = False
-        #ctrl = False
 
         #current_layer = 0
 
@@ -416,7 +414,6 @@ class EventViewer(Camera):
         self.T = self.f.Get('T')
         self.T.GetEntry(0)
 
-    #@timeit
     def color_hit_pmts(self):
         self.gpu.reset_colors()
 
diff --git a/geometry.py b/geometry.py
index a98b5cd..0657a77 100644
--- a/geometry.py
+++ b/geometry.py
@@ -1,8 +1,5 @@
 import sys
 import numpy as np
-import pycuda.driver as cuda
-from pycuda import gpuarray
-from itertools import *
 from itertoolset import *
 from tools import timeit
 
@@ -159,7 +156,7 @@ def interleave(arr, bits):
     if len(arr.shape) != 2 or arr.shape[1] != 3:
         raise Exception('shape mismatch')
 
-    z = np.zeros(arr.shape[0], dtype=np.uint32)
+    z = np.zeros(arr.shape[0], dtype=np.uint64)
     for i in range(bits):
         z |= (arr[:,2] & 1 << i) << (2*i) | \
              (arr[:,1] & 1 << i) << (2*i+1) | \
@@ -174,13 +171,13 @@ def morton_order(mesh, bits):
     """
     lower_bound, upper_bound = mesh.get_bounds()
 
-    if bits <= 0 or bits > 10:
-        raise Exception('number of bits must be in the range (0,10].')
+    if bits <= 0 or bits > 21:
+        raise Exception('number of bits must be in the range (0,21].')
 
     max_value = 2**bits - 1
 
     def quantize(x):
-        return np.uint32((x-lower_bound)*max_value/(upper_bound-lower_bound))
+        return np.uint64((x-lower_bound)*max_value/(upper_bound-lower_bound))
 
     mean_positions = quantize(np.mean(mesh.assemble(), axis=1))
 
@@ -212,7 +209,6 @@ class Geometry(object):
         return len(self.solids)-1
 
     @timeit
-    #@profile
     def build(self, bits=8, shift=3):
         offsets = [ (0,0) ]
         for solid in self.solids:
@@ -275,61 +271,40 @@ class Geometry(object):
         self.upper_bounds = np.empty((unique_zvalues.size,3), dtype=np.float32)
 
         assembled_mesh = self.mesh.assemble(group=False)
-        zi1_array = np.searchsorted(zvalues_mesh, unique_zvalues)
-        zi2_array = np.searchsorted(zvalues_mesh, unique_zvalues, side='right')
+        self.node_map = np.searchsorted(zvalues_mesh, unique_zvalues)
+        self.node_map_end = np.searchsorted(zvalues_mesh, unique_zvalues, side='right')
 
-        for i, (zi1, zi2) in enumerate(zip(zi1_array,zi2_array)):
-            self.lower_bounds[i] = assembled_mesh[zi1*3:zi2*3].min(0)
-            self.upper_bounds[i] = assembled_mesh[zi1*3:zi2*3].max(0)
+        for i, (zi1, zi2) in enumerate(izip(self.node_map, self.node_map_end)):
+            self.lower_bounds[i] = assembled_mesh[zi1*3:zi2*3].min(axis=0)
+            self.upper_bounds[i] = assembled_mesh[zi1*3:zi2*3].max(axis=0)
 
-        zvalues = np.array(unique_zvalues, copy=True)
-        self.node_map = np.array(zi1_array, copy=True)
-        self.node_length = zi2_array - zi1_array
         self.layers = np.zeros(unique_zvalues.size, dtype=np.uint32)
         self.first_node = unique_zvalues.size
 
         begin_last_layer = 0
 
-        layer = 1
+        for layer in count(1):
+            bit_shifted_zvalues = unique_zvalues >> shift
+            unique_zvalues = np.unique(bit_shifted_zvalues)
 
-        while True:
-            bit_shifted_zvalues = zvalues >> shift
-            unique_bit_shifted_zvalues = np.unique(bit_shifted_zvalues)
-            zvalues = np.empty(unique_bit_shifted_zvalues.size, dtype=np.uint64)
+            i0 = begin_last_layer + bit_shifted_zvalues.size
 
-            self.lower_bounds.resize(\
-                (self.lower_bounds.shape[0]+unique_bit_shifted_zvalues.size,3))
-            self.upper_bounds.resize(\
-                (self.upper_bounds.shape[0]+unique_bit_shifted_zvalues.size,3))
+            self.node_map.resize(self.node_map.size+unique_zvalues.size)
+            self.node_map[i0:] = np.searchsorted(bit_shifted_zvalues, unique_zvalues) + begin_last_layer
+            self.node_map_end.resize(self.node_map_end.size+unique_zvalues.size)
+            self.node_map_end[i0:] = np.searchsorted(bit_shifted_zvalues, unique_zvalues, side='right') + begin_last_layer
 
-            self.node_map.resize(\
-                self.node_map.size+unique_bit_shifted_zvalues.size)
-            self.node_length.resize(\
-                self.node_length.size+unique_bit_shifted_zvalues.size)
+            self.layers.resize(self.layers.size+unique_zvalues.size)
+            self.layers[i0:] = layer
 
-            self.layers.resize(self.layers.size+unique_bit_shifted_zvalues.size)
+            self.lower_bounds.resize((self.lower_bounds.shape[0]+unique_zvalues.size,3))
+            self.upper_bounds.resize((self.upper_bounds.shape[0]+unique_zvalues.size,3))
 
-            for i, z in enumerate(unique_bit_shifted_zvalues):
-                i1 = np.searchsorted(bit_shifted_zvalues, z) + \
-                    begin_last_layer
-                i2 = np.searchsorted(bit_shifted_zvalues, z, side='right') + \
-                    begin_last_layer
-
-                zvalues[i] = z
-
-                i += begin_last_layer + bit_shifted_zvalues.size
-
-                self.lower_bounds[i] = self.lower_bounds[i1:i2].min(axis=0)
-                self.upper_bounds[i] = self.upper_bounds[i1:i2].max(axis=0)
-
-                self.node_map[i] = i1
-                self.node_length[i] = i2-i1
-                self.layers[i] = layer
+            for i, zi1, zi2 in izip(count(i0), self.node_map[i0:], self.node_map_end[i0:]):
+                self.lower_bounds[i] = self.lower_bounds[zi1:zi2].min(axis=0)
+                self.upper_bounds[i] = self.upper_bounds[zi1:zi2].max(axis=0)
 
             begin_last_layer += bit_shifted_zvalues.size
 
-            layer += 1
-
-            if unique_bit_shifted_zvalues.size == 1:
+            if unique_zvalues.size == 1:
                 break
-
diff --git a/gpu.py b/gpu.py
index a21bc38..9017f20 100644
--- a/gpu.py
+++ b/gpu.py
@@ -72,13 +72,13 @@ class GPU(object):
             device = cuda.Device(device_id)
             self.context = device.make_context()
         print 'device %s' % self.context.get_device().name()
-        self.module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
+        self.module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True)
         self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
 
         self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
         self.nthread_per_block = 64
         self.max_blocks = 1024
-        self.daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
+        self.daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True)
         self.daq_funcs = CUDAFuncs(self.daq_module, 
                               ['reset_earliest_time_int', 'run_daq',
                                'convert_sortable_int_to_float'])
@@ -90,8 +90,8 @@ class GPU(object):
         print format_array('lower_bounds', self.lower_bounds_gpu)
         print format_array('upper_bounds', self.upper_bounds_gpu)
         print format_array('node_map', self.node_map_gpu)
-        print format_array('node_length', self.node_length_gpu)
-        print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_length_gpu.nbytes))
+        print format_array('node_map_end', self.node_map_end_gpu)
+        print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
 
     def load_geometry(self, geometry):
         if not hasattr(geometry, 'mesh'):
@@ -155,7 +155,7 @@ class GPU(object):
 
         self.colors_gpu = gpuarray.to_gpu(geometry.colors.astype(np.uint32))
         self.node_map_gpu = gpuarray.to_gpu(geometry.node_map.astype(np.uint32))
-        self.node_length_gpu = gpuarray.to_gpu(geometry.node_length.astype(np.uint32))
+        self.node_map_end_gpu = gpuarray.to_gpu(geometry.node_map_end.astype(np.uint32))
         self.solid_id_map_gpu = gpuarray.to_gpu(geometry.solid_id.astype(np.uint32))
 
         self.geo_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(geometry.node_map.size-1), np.uint32(geometry.first_node), block=(1,1,1), grid=(1,1))
@@ -163,27 +163,25 @@ class GPU(object):
         self.lower_bounds_tex = self.module.get_texref('lower_bounds')
         self.upper_bounds_tex = self.module.get_texref('upper_bounds')
         self.node_map_tex = self.module.get_texref('node_map')
-        self.node_length_tex = self.module.get_texref('node_length')
+        self.node_map_end_tex = self.module.get_texref('node_map_end')
 
         self.lower_bounds_tex.set_address(self.lower_bounds_gpu.gpudata, self.lower_bounds_gpu.nbytes)
         self.upper_bounds_tex.set_address(self.upper_bounds_gpu.gpudata, self.upper_bounds_gpu.nbytes)
         self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes)
-        self.node_length_tex.set_address(self.node_length_gpu.gpudata, self.node_length_gpu.nbytes)
+        self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes)
 
         self.lower_bounds_tex.set_format(cuda.array_format.FLOAT, 4)
         self.upper_bounds_tex.set_format(cuda.array_format.FLOAT, 4)
         self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
-        self.node_length_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
+        self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
 
         self.geometry = geometry
 
         self.print_device_usage()
 
-    #@timeit
     def reset_colors(self):
         self.colors_gpu.set(self.geometry.colors.astype(np.uint32))
 
-    #@timeit
     def color_solids(self, solid_ids, colors):
         solid_ids_gpu = gpuarray.to_gpu(np.array(solid_ids, dtype=np.int32))
         solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
diff --git a/src/alpha.h b/src/alpha.h
index 5e3e803..a9963eb 100644
--- a/src/alpha.h
+++ b/src/alpha.h
@@ -17,14 +17,14 @@ __device__ void swap(T &a, T &b)
 
 struct HitList
 {
-	int size;
-	int indices[ALPHA_DEPTH];
+	unsigned int size;
+	unsigned int indices[ALPHA_DEPTH];
 	float distances[ALPHA_DEPTH];
 };
 
 __device__ void add_to_hit_list(const float &distance, const int &index, HitList &h)
 {
-	int i;
+	unsigned int i;
 	if (h.size >= ALPHA_DEPTH)
 	{
 		if (distance > h.distances[ALPHA_DEPTH-1])
@@ -50,7 +50,7 @@ __device__ void add_to_hit_list(const float &distance, const int &index, HitList
 	}
 }
 
-__device__ __noinline__ int get_color_alpha(const float3 &origin, const float3& direction)
+__device__ int get_color_alpha(const float3 &origin, const float3& direction)
 {
 	HitList h;
 	h.size = 0;
@@ -60,34 +60,34 @@ __device__ __noinline__ int get_color_alpha(const float3 &origin, const float3&
 	if (!intersect_node(origin, direction, g_start_node))
 		return 0;
 
-	int stack[STACK_SIZE];
+	unsigned int stack[STACK_SIZE];
 
-	int *head = &stack[0];
-	int *node = &stack[1];
-	int *tail = &stack[STACK_SIZE-1];
+	unsigned int *head = &stack[0];
+	unsigned int *node = &stack[1];
+	unsigned int *tail = &stack[STACK_SIZE-1];
 	*node = g_start_node;
 
-	int i;
+	unsigned int i;
 
 	do
 	{
-		int first_child = tex1Dfetch(node_map, *node);
-		int child_count = tex1Dfetch(node_length, *node);
+		unsigned int first_child = tex1Dfetch(node_map, *node);
+		unsigned int stop = tex1Dfetch(node_map_end, *node);
 
-		while (*node >= g_first_node && child_count == 1)
+		while (*node >= g_first_node && stop == first_child+1)
 		{
 			*node = first_child;
 			first_child = tex1Dfetch(node_map, *node);
-			child_count = tex1Dfetch(node_length, *node);
+			stop = tex1Dfetch(node_map_end, *node);
 		}
 		
 		if (*node >= g_first_node)
 		{
-			for (i=0; i < child_count; i++)
+			for (i=first_child; i < stop; i++)
 			{
-				if (intersect_node(origin, direction, first_child+i))
+				if (intersect_node(origin, direction, i))
 				{
-					*node = first_child+i;
+					*node = i;
 					node++;
 				}
 			}
@@ -96,9 +96,9 @@ __device__ __noinline__ int get_color_alpha(const float3 &origin, const float3&
 		}
 		else // node is a leaf
 		{
-			for (i=0; i < child_count; i++)
+			for (i=first_child; i < stop; i++)
 			{
-				uint4 triangle_data = g_triangles[first_child+i];
+				uint4 triangle_data = g_triangles[i];
 
 				float3 v0 = g_vertices[triangle_data.x];
 				float3 v1 = g_vertices[triangle_data.y];
@@ -106,7 +106,7 @@ __device__ __noinline__ int get_color_alpha(const float3 &origin, const float3&
 
 				if (intersect_triangle(origin, direction, v0, v1, v2, distance))
 				{
-					add_to_hit_list(distance, first_child+i, h);
+					add_to_hit_list(distance, i, h);
 				}
 
 			} // triangle loop
diff --git a/src/mesh.h b/src/mesh.h
index e0170ff..7c148af 100644
--- a/src/mesh.h
+++ b/src/mesh.h
@@ -18,7 +18,8 @@ texture<float4, 1, cudaReadModeElementType> lower_bounds;
 
 /* map to child nodes/triangles and the number of child nodes/triangles */
 texture<unsigned int, 1, cudaReadModeElementType> node_map;
-texture<unsigned int, 1, cudaReadModeElementType> node_length;
+texture<unsigned int, 1, cudaReadModeElementType> node_map_end;
+//texture<unsigned int, 1, cudaReadModeElementType> node_length;
 
 __device__ float3 make_float3(const float4 &a)
 {
@@ -48,7 +49,7 @@ __device__ bool intersect_node(const float3 &origin, const float3 &direction, co
    direction `direction` and the global mesh texture. If the ray intersects
    the texture return the index of the triangle which the ray intersected,
    else return -1. */
-__device__ __noinline__ int intersect_mesh(const float3 &origin, const float3& direction, float &min_distance, int last_hit_triangle = -1)
+__device__ int intersect_mesh(const float3 &origin, const float3& direction, float &min_distance, int last_hit_triangle = -1)
 {
 	int triangle_index = -1;
 
@@ -57,34 +58,34 @@ __device__ __noinline__ int intersect_mesh(const float3 &origin, const float3& d
 	if (!intersect_node(origin, direction, g_start_node))
 		return -1;
 
-	int stack[STACK_SIZE];
+	unsigned int stack[STACK_SIZE];
 
-	int *head = &stack[0];
-	int *node = &stack[1];
-	int *tail = &stack[STACK_SIZE-1];
+	unsigned int *head = &stack[0];
+	unsigned int *node = &stack[1];
+	unsigned int *tail = &stack[STACK_SIZE-1];
 	*node = g_start_node;
 
-	int i;
+	unsigned int i;
 
 	do
 	{
-		int first_child = tex1Dfetch(node_map, *node);
-		int child_count = tex1Dfetch(node_length, *node);
+		unsigned int first_child = tex1Dfetch(node_map, *node);
+		unsigned int stop = tex1Dfetch(node_map_end, *node);
 
-		while (*node >= g_first_node && child_count == 1)
+		while (*node >= g_first_node && stop == first_child+1)
 		{
 			*node = first_child;
 			first_child = tex1Dfetch(node_map, *node);
-			child_count = tex1Dfetch(node_length, *node);
+			stop = tex1Dfetch(node_map_end, *node);
 		}
 		
 		if (*node >= g_first_node)
 		{
-			for (i=0; i < child_count; i++)
+			for (i=first_child; i < stop; i++)
 			{
-				if (intersect_node(origin, direction, first_child+i))
+				if (intersect_node(origin, direction, i))
 				{
-					*node = first_child+i;
+					*node = i;
 					node++;
 				}
 			}
@@ -93,12 +94,12 @@ __device__ __noinline__ int intersect_mesh(const float3 &origin, const float3& d
 		}
 		else // node is a leaf
 		{
-			for (i=0; i < child_count; i++)
+			for (i=first_child; i < stop; i++)
 			{
-				if (last_hit_triangle == first_child+i)
+				if (last_hit_triangle == i)
 					continue;
 
-				uint4 triangle_data = g_triangles[first_child+i];
+				uint4 triangle_data = g_triangles[i];
 
 				float3 v0 = g_vertices[triangle_data.x];
 				float3 v1 = g_vertices[triangle_data.y];
@@ -108,14 +109,14 @@ __device__ __noinline__ int intersect_mesh(const float3 &origin, const float3& d
 				{
 					if (triangle_index == -1)
 					{
-						triangle_index = first_child + i;
+						triangle_index = i;
 						min_distance = distance;
 						continue;
 					}
 
 					if (distance < min_distance)
 					{
-						triangle_index = first_child + i;
+						triangle_index = i;
 						min_distance = distance;
 					}
 				}
diff --git a/src/tools.cu b/src/tools.cu
new file mode 100644
index 0000000..3d3fed7
--- /dev/null
+++ b/src/tools.cu
@@ -0,0 +1,17 @@
+//--*-c-*-
+
+extern "C"
+{
+
+__global__ void interleave(int nthreads, unsigned long long *x, unsigned long long *y, unsigned long long *z, int bits, unsigned long long *dest)
+{
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (id >= nthreads)
+		return;
+
+	for (int i=0; i < bits; i++)
+		dest[id] |= (x[id] & 1 << i) << (2*i) | (y[id] & 1 << i) << (2*i+1) | (z[id] & 1 << i) << (2*i+2);
+}
+
+}