5 files changed, 483 insertions, 135 deletions
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
new file mode 100644
index 0000000..8543649
--- /dev/null
+++ b/chroma/cuda/bvh.cu
@@ -0,0 +1,183 @@
+//-*-c++-*-
+#include <cuda.h>
+
+#include "linalg.h"
+
+__device__ float3
+fminf(const float3 &a, const float3 &b)
+{
+  return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z));
+}
+
+__device__ float3
+fmaxf(const float3 &a, const float3 &b)
+{
+  return make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z));
+}
+
+__device__ uint3
+min(const uint3 &a, const uint3 &b)
+{
+  return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+
+__device__ uint3
+max(const uint3 &a, const uint3 &b)
+{
+  return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+
+
+__device__ uint3
+operator+ (const uint3 &a, const unsigned int &b)
+{
+  return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+
+// spread out the first 16 bits in x to occupy every 3rd slot in the return value
+__device__ unsigned long long spread3_16(unsigned int input)
+{
+  // method from http://stackoverflow.com/a/4838734
+  unsigned long long x = input;
+  x = (x | (x << 16)) & 0x00000000FF0000FFul;
+  x = (x | (x << 8)) & 0x000000F00F00F00Ful;
+  x = (x | (x << 4)) & 0x00000C30C30C30C3ul;
+  x = (x | (x << 2)) & 0X0000249249249249ul;
+  
+  return x;
+}
+
+__device__ unsigned int quantize(float v, float world_origin, float world_scale)
+{
+  // truncate!
+  return (unsigned int) ((v - world_origin) / world_scale);
+}
+
+__device__ uint3 quantize3(float3 v, float3 world_origin, float3 world_scale)
+{
+  return make_uint3(quantize(v.x, world_origin.x, world_scale.x),
+		    quantize(v.y, world_origin.y, world_scale.y),
+		    quantize(v.z, world_origin.z, world_scale.z));
+}
+
+const unsigned int LEAF_BIT = (1U << 31);
+
+extern "C"
+{
+
+  __global__ void
+  make_leaves(unsigned int first_triangle,
+	      unsigned int ntriangles, uint3 *triangles, float3 *vertices,
+	      float3 world_origin, float3 world_scale,
+	      uint4 *leaf_nodes, unsigned long long *morton_codes)
+
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= ntriangles)
+      return;
+
+    unsigned int triangle_id = first_triangle + thread_id;
+
+    // Find bounding corners and centroid
+    uint3 triangle = triangles[triangle_id];
+    float3 lower = vertices[triangle.x];
+    float3 centroid = lower;
+    float3 upper = lower;
+    float3 temp_vertex;
+
+    temp_vertex = vertices[triangle.y];
+    lower = fminf(lower, temp_vertex);
+    upper = fmaxf(upper, temp_vertex);
+    centroid  += temp_vertex;
+
+    temp_vertex = vertices[triangle.z];
+    lower = fminf(lower, temp_vertex);
+    upper = fmaxf(upper, temp_vertex);
+    centroid  += temp_vertex;
+
+    centroid /= 3.0f;
+
+    // Quantize bounding corners and centroid
+    uint3 q_lower = quantize3(lower, world_origin, world_scale);
+    uint3 q_upper = quantize3(upper, world_origin, world_scale) + 1;
+    uint3 q_centroid = quantize3(centroid, world_origin, world_scale);
+
+    // Compute Morton code from quantized centroid
+    unsigned long long morton = 
+      spread3_16(q_centroid.x) 
+      | (spread3_16(q_centroid.y) << 1)
+      | (spread3_16(q_centroid.z) << 2);
+    
+    // Write leaf and morton code
+    uint4 leaf_node;
+    leaf_node.x = q_lower.x | (q_upper.x << 16);
+    leaf_node.y = q_lower.y | (q_upper.y << 16);
+    leaf_node.z = q_lower.z | (q_upper.z << 16);
+    leaf_node.w = triangle_id | LEAF_BIT;
+
+    leaf_nodes[triangle_id] = leaf_node;
+    morton_codes[triangle_id] = morton;
+  }
+
+  __global__ void
+  reorder_leaves(unsigned int first_triangle,
+		 unsigned int ntriangles,
+		 uint4 *leaf_nodes_in, 
+		 uint4 *leaf_nodes_out,
+		 unsigned int *remap_order)
+
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= ntriangles)
+      return;
+
+    unsigned int dest_id = first_triangle + thread_id;
+    unsigned int source_id = remap_order[dest_id];
+
+    leaf_nodes_out[dest_id] = leaf_nodes_in[source_id];
+  }
+
+  __global__ void
+  build_layer(unsigned int first_node,
+	      unsigned int n_parent_nodes, 
+	      unsigned int n_children_per_node,
+	      uint4 *nodes,
+	      unsigned int parent_layer_offset,
+	      unsigned int child_layer_offset)
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= n_parent_nodes)
+      return;
+
+    unsigned int parent_id = first_node + thread_id;
+    unsigned int first_child = child_layer_offset + parent_id * n_children_per_node;
+
+    // Load first child
+    uint4 parent_node = nodes[first_child];
+    uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF, parent_node.z & 0xFFFF);
+    uint3 upper = make_uint3(parent_node.x >> 16, parent_node.y >> 16, parent_node.z >> 16);
+    
+
+    // Scan remaining children
+    for (unsigned int i=1; i < n_children_per_node; i++) {
+      uint4 child_node = nodes[first_child + i];
+      
+      if (child_node.x == 0)
+	break;  // Hit first padding node in list of children
+
+      uint3 child_lower = make_uint3(child_node.x & 0xFFFF, child_node.y & 0xFFFF, child_node.z & 0xFFFF);
+      uint3 child_upper = make_uint3(child_node.x >> 16, child_node.y >> 16, child_node.z >> 16);
+
+      lower = min(lower, child_lower);
+      upper = max(upper, child_upper);
+    }
+
+    parent_node.w = first_child;
+    parent_node.x = upper.x << 16 | lower.x;
+    parent_node.y = upper.y << 16 | lower.y;
+    parent_node.z = upper.z << 16 | lower.z;
+
+    nodes[parent_layer_offset + parent_id] = parent_node;
+  }
+
+} // extern "C"
diff --git a/chroma/cuda/geometry.h b/chroma/cuda/geometry.h
index 2b5eacb..86ced6c 100644
--- a/chroma/cuda/geometry.h
+++ b/chroma/cuda/geometry.h
@@ -1,47 +1,40 @@
 #ifndef __GEOMETRY_H__
 #define __GEOMETRY_H__
 
-struct Material
-{
-    float *refractive_index;
-    float *absorption_length;
-    float *scattering_length;
-    unsigned int n;
-    float step;
-    float wavelength0;
-};
-
-struct Surface
-{
-    float *detect;
-    float *absorb;
-    float *reflect_diffuse;
-    float *reflect_specular;
-    unsigned int n;
-    float step;
-    float wavelength0;
-};
-
-struct Triangle
+#include "geometry_types.h"
+#include "linalg.h"
+
+const unsigned int LEAF_BIT = (1U << 31);
+
+
+__device__ float3 
+to_float3(const uint3 &a)
 {
-    float3 v0, v1, v2;
-};
+  return make_float3(a.x, a.y, a.z);
+}
 
-struct Geometry
+__device__ Node
+get_node(Geometry *geometry, const unsigned int &i)
 {
-    float3 *vertices;
-    uint3 *triangles;
-    unsigned int *material_codes;
-    unsigned int *colors;
-    float3 *lower_bounds;
-    float3 *upper_bounds;
-    unsigned int *node_map;
-    unsigned int *node_map_end;
-    Material **materials;
-    Surface **surfaces;
-    unsigned int start_node;
-    unsigned int first_node;
-};
+    uint4 node = geometry->nodes[i];
+    Node node_struct;
+
+    if (node.x == 0) {
+      node_struct.kind = PADDING_NODE;
+      return node_struct;
+    }
+
+    uint3 lower_int = make_uint3(node.x & 0xFFFF, node.y & 0xFFFF, node.z & 0xFFFF);
+    uint3 upper_int = make_uint3(node.x >> 16, node.y >> 16, node.z >> 16);
+
+
+    node_struct.lower = geometry->world_origin + to_float3(lower_int) * geometry->world_scale;
+    node_struct.upper = geometry->world_origin + to_float3(upper_int) * geometry->world_scale;
+    node_struct.child = node.w & ~LEAF_BIT; // Mask off leaf bit
+    node_struct.kind = node.w & LEAF_BIT ? LEAF_NODE : INTERNAL_NODE;
+    
+    return node_struct;
+}
 
 __device__ Triangle
 get_triangle(Geometry *geometry, const unsigned int &i)
diff --git a/chroma/cuda/geometry_types.h b/chroma/cuda/geometry_types.h
new file mode 100644
index 0000000..31ff3c6
--- /dev/null
+++ b/chroma/cuda/geometry_types.h
@@ -0,0 +1,55 @@
+#ifndef __GEOMETRY_TYPES_H__
+#define __GEOMETRY_TYPES_H__
+
+struct Material
+{
+    float *refractive_index;
+    float *absorption_length;
+    float *scattering_length;
+    unsigned int n;
+    float step;
+    float wavelength0;
+};
+
+struct Surface
+{
+    float *detect;
+    float *absorb;
+    float *reflect_diffuse;
+    float *reflect_specular;
+    unsigned int n;
+    float step;
+    float wavelength0;
+};
+
+struct Triangle
+{
+    float3 v0, v1, v2;
+};
+
+enum { INTERNAL_NODE, LEAF_NODE, PADDING_NODE };
+
+struct Node
+{
+    float3 lower;
+    float3 upper;
+    unsigned int child;
+    unsigned int kind;
+};
+
+struct Geometry
+{
+    float3 *vertices;
+    uint3 *triangles;
+    unsigned int *material_codes;
+    unsigned int *colors;
+    uint4 *nodes;
+    Material **materials;
+    Surface **surfaces;
+    float3 world_origin;
+    float _dummy1; // for alignment
+    float3 world_scale;
+    unsigned int branch_degree;
+};
+
+#endif
diff --git a/chroma/cuda/mesh.h b/chroma/cuda/mesh.h
index d60d801..de15156 100644
--- a/chroma/cuda/mesh.h
+++ b/chroma/cuda/mesh.h
@@ -6,7 +6,7 @@
 
 #include "stdio.h"
 
-#define STACK_SIZE 100
+#define STACK_SIZE 1000
 
 /* Tests the intersection between a ray and a node in the bounding volume
    hierarchy. If the ray intersects the bounding volume and `min_distance`
@@ -14,15 +14,11 @@
    less than `min_distance`, return true, else return false. */
 __device__ bool
 intersect_node(const float3 &origin, const float3 &direction,
-	       Geometry *g, int i, float min_distance=-1.0f)
+	       Geometry *g, const Node &node, const float min_distance=-1.0f)
 {
-    /* assigning these to local variables is faster for some reason */
-    float3 lower_bound = g->lower_bounds[i];
-    float3 upper_bound = g->upper_bounds[i];
-
     float distance_to_box;
 
-    if (intersect_box(origin, direction, lower_bound, upper_bound,
+    if (intersect_box(origin, direction, node.lower, node.upper,
 		      distance_to_box)) {
 	if (min_distance < 0.0f)
 	    return true;
@@ -35,7 +31,6 @@ intersect_node(const float3 &origin, const float3 &direction,
     else {
 	return false;
     }
-
 }
 
 /* Finds the intersection between a ray and `geometry`. If the ray does
@@ -52,73 +47,74 @@ intersect_mesh(const float3 &origin, const float3& direction, Geometry *g,
     float distance;
     min_distance = -1.0f;
 
-    if (!intersect_node(origin, direction, g, g->start_node, min_distance))
+    Node root = get_node(g, 0);
+
+    if (!intersect_node(origin, direction, g, root, min_distance))
 	return -1;
 
-    unsigned int stack[STACK_SIZE];
-
-    unsigned int *head = &stack[0];
-    unsigned int *node = &stack[1];
-    unsigned int *tail = &stack[STACK_SIZE-1];
-    *node = g->start_node;
-
-    unsigned int i;
-
-    do
-    {
-	unsigned int first_child = g->node_map[*node];
-	unsigned int stop = g->node_map_end[*node];
-
-	while (*node >= g->first_node && stop == first_child+1) {
-	    *node = first_child;
-	    first_child = g->node_map[*node];
-	    stop = g->node_map_end[*node];
-	}
-		
-	if (*node >= g->first_node) {
-	    for (i=first_child; i < stop; i++) {
-		if (intersect_node(origin, direction, g, i, min_distance)) {
-		    *node = i;
-		    node++;
-		}
-	    }
+    //min_distance = 100.0f;
+    //return 1;
+
+    unsigned int child_ptr_stack[STACK_SIZE];
+    child_ptr_stack[0] = root.child;
+
+    const unsigned int *head = &child_ptr_stack[0];
+    unsigned int *curr = &child_ptr_stack[0];
+    const unsigned int *tail = &child_ptr_stack[STACK_SIZE-1];
+
+    unsigned int count = 0;
+    unsigned int tri_count = 0;
+
+    while (curr >= head) {
+	unsigned int first_child = *curr;
+	curr--;
+
+	for (unsigned int i=first_child; i < first_child + g->branch_degree; i++) {
+	    Node node = get_node(g, i);
+	    count++;
+
+	    if (node.kind == PADDING_NODE)
+	      break; // this node and rest of children are padding
+
+	    if (intersect_node(origin, direction, g, node, min_distance)) {
+
+		if (node.kind == LEAF_NODE) {
+		    // This node wraps a triangle
+
+		    if (node.child != last_hit_triangle) {
+			// Can't hit same triangle twice in a row
+			tri_count++;
+			Triangle t = get_triangle(g, node.child);			
+			if (intersect_triangle(origin, direction, t, distance)) {
+
+			    if (triangle_index == -1 || distance < min_distance) {
+				triangle_index = node.child;
+				min_distance = distance;
+			    } // if hit triangle is closer than previous hits
+
+			} // if hit triangle
+			
+		    } // if not hitting same triangle as last step
+
+		} else {
+		    curr++;
+		    *curr = node.child;
+		} // leaf or internal node?
+	    } // hit node?
 	    
-	    node--;
-	}
-	else {
-	    // node is a leaf
-	    for (i=first_child; i < stop; i++) {
-		if (last_hit_triangle == i)
-		    continue;
-
-		Triangle t = get_triangle(g, i);
-
-		if (intersect_triangle(origin, direction, t, distance)) {
-		    if (triangle_index == -1) {
-			triangle_index = i;
-			min_distance = distance;
-			continue;
-		    }
-
-		    if (distance < min_distance) {
-			triangle_index = i;
-			min_distance = distance;
-		    }
-		}
-	    } // triangle loop
-
-	    node--;
-
-	} // node is a leaf
-
-	if (node > tail) {
-	    printf("warning: intersect_mesh() aborted; node > tail\n");
-	    break;
-	}
-
-    } // while loop
-    while (node != head);
-    
+	    if (curr > tail) {
+		printf("warning: intersect_mesh() aborted; node > tail\n");
+		break;
+	    }
+	} // loop over children, starting with first_child
+
+    } // while nodes on stack
+
+    if (threadIdx.x == 0) {
+	printf("node count: %d\n", count);
+	printf("triangle count: %d\n", tri_count);
+    }
+
     return triangle_index;
 }
 
diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py
index 081563f..6682f8f 100644
--- a/chroma/gpu/geometry.py
+++ b/chroma/gpu/geometry.py
@@ -6,11 +6,35 @@ from pycuda import characterize
 from chroma.geometry import standard_wavelengths
 from chroma.gpu.tools import get_cu_module, get_cu_source, cuda_options, \
     chunk_iterator, format_array, format_size, to_uint3, to_float3, \
-    make_gpu_struct
+    make_gpu_struct, GPUFuncs
+
 from chroma.log import logger
 
+def round_up_to_multiple(x, multiple):
+    remainder = x % multiple
+    if remainder == 0:
+        return x
+    else:
+        return x + multiple - remainder
+
+def compute_layer_configuration(n, branch_degree):
+    if n == 1:
+        # Special case for root
+        return [ (1, 1) ]
+    else:
+        layer_conf = [ (n, round_up_to_multiple(n, branch_degree)) ]
+
+    while layer_conf[0][1] > 1:
+        nparent = int(np.ceil( float(layer_conf[0][1]) / branch_degree ))
+        if nparent == 1:
+            layer_conf = [ (1, 1) ] + layer_conf
+        else:
+            layer_conf = [ (nparent, round_up_to_multiple(nparent, branch_degree)) ] + layer_conf
+
+    return layer_conf
+
 class GPUGeometry(object):
-    def __init__(self, geometry, wavelengths=None, print_usage=False):
+    def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=32):
         if wavelengths is None:
             wavelengths = standard_wavelengths
 
@@ -19,7 +43,7 @@ class GPUGeometry(object):
         except ValueError:
             raise ValueError('wavelengths must be equally spaced apart.')
 
-        geometry_source = get_cu_source('geometry.h')
+        geometry_source = get_cu_source('geometry_types.h')
         material_struct_size = characterize.sizeof('Material', geometry_source)
         surface_struct_size = characterize.sizeof('Surface', geometry_source)
         geometry_struct_size = characterize.sizeof('Geometry', geometry_source)
@@ -104,32 +128,50 @@ class GPUGeometry(object):
         self.surface_pointer_array = \
             make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)
 
-        self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices))
-        self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles))
+        self.pagelocked_vertices = cuda.pagelocked_empty(shape=len(geometry.mesh.vertices),
+                                                         dtype=ga.vec.float3,
+                                                         mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED)
+        self.pagelocked_triangles = cuda.pagelocked_empty(shape=len(geometry.mesh.triangles),
+                                                         dtype=ga.vec.uint3,
+                                                         mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED)
+        self.pagelocked_vertices[:] = to_float3(geometry.mesh.vertices)
+        self.pagelocked_triangles[:] = to_uint3(geometry.mesh.triangles)
+        self.vertices = np.intp(self.pagelocked_vertices.base.get_device_pointer())
+        self.triangles = np.intp(self.pagelocked_triangles.base.get_device_pointer())
+
+
+        self.branch_degree = branch_degree
+        print 'bvh', cuda.mem_get_info()
+        self.world_origin, self.world_scale, self.nodes = self.make_bvh(geometry.mesh.vertices,
+                                                                        self.vertices,
+                                                                        len(geometry.mesh.triangles),
+                                                                        self.triangles,
+                                                                        self.branch_degree)
+        print 'bvh after'
 
         material_codes = (((geometry.material1_index & 0xff) << 24) |
                           ((geometry.material2_index & 0xff) << 16) |
                           ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
-
-        self.material_codes = ga.to_gpu(material_codes)
-
-        self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds))
-        self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds))
-        self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
-        self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32))
-        self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32))
+        self.pagelocked_material_codes = cuda.pagelocked_empty_like(material_codes, mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED)
+        self.pagelocked_material_codes[:] = material_codes
+        self.material_codes = np.intp(self.pagelocked_material_codes.base.get_device_pointer())
+
+        colors = geometry.colors.astype(np.uint32)
+        self.pagelocked_colors = cuda.pagelocked_empty_like(colors, mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED)
+        self.pagelocked_colors[:] = colors
+        self.colors = np.intp(self.pagelocked_colors.base.get_device_pointer())
         self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
 
         self.gpudata = make_gpu_struct(geometry_struct_size,
                                        [self.vertices, self.triangles,
                                         self.material_codes,
-                                        self.colors, self.lower_bounds,
-                                        self.upper_bounds, self.node_map,
-                                        self.node_map_end,
+                                        self.colors, self.nodes,
                                         self.material_pointer_array,
                                         self.surface_pointer_array,
-                                        np.uint32(geometry.start_node),
-                                        np.uint32(geometry.first_node)])
+                                        self.world_origin,
+                                        np.float32(0.0), # dummy1
+                                        self.world_scale,
+                                        np.uint32(self.branch_degree)])
 
         self.geometry = geometry
 
@@ -137,17 +179,96 @@ class GPUGeometry(object):
             self.print_device_usage()
         logger.info(self.device_usage_str())
 
+    def make_bvh(self, vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
+        assert branch_degree > 1
+        bvh_module = get_cu_module('bvh.cu', options=cuda_options, 
+                                    include_source_directory=True)
+        bvh_funcs = GPUFuncs(bvh_module)
+        
+        world_min = vertices.min(axis=0)
+        # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
+        # up by one count after quantization
+        world_scale = (vertices.max(axis=0) - world_min) / (2**16 - 2)
+
+        world_origin = ga.vec.make_float3(*world_min)
+        world_scale  = ga.vec.make_float3(*world_scale)
+
+        layer_conf = compute_layer_configuration(ntriangles, branch_degree)
+        layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
+
+        # Last entry is number of nodes, trim off and add zero to get offset of each layer
+        n_nodes = int(layer_offsets[-1])
+        layer_offsets = [0] + layer_offsets[:-1]
+
+        leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
+        morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
+
+        # Step 1: Make leaves
+        nthreads_per_block=256
+        for first_index, elements_this_iter, nblocks_this_iter in \
+                chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
+            bvh_funcs.make_leaves(np.uint32(first_index),
+                                  np.uint32(elements_this_iter),
+                                  gpu_triangles, gpu_vertices, 
+                                  world_origin, world_scale,
+                                  leaf_nodes, morton_codes,
+                                  block=(nthreads_per_block,1,1),
+                                  grid=(nblocks_this_iter,1))
+
+        # argsort on the CPU because I'm too lazy to do it on the GPU
+        argsort = morton_codes.get().argsort().astype(np.uint32)
+        del morton_codes
+        #local_leaf_nodes = leaf_nodes.get()[argsort]
+        #
+        remap_order = ga.to_gpu(argsort)
+        #m = morton_codes.get()
+        #m.sort()
+        #print m
+        #assert False
+        # Step 2: sort leaf nodes into full node list
+        print cuda.mem_get_info(), leaf_nodes.nbytes
+        nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
+        #cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes)
+
+        for first_index, elements_this_iter, nblocks_this_iter in \
+               chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
+           bvh_funcs.reorder_leaves(np.uint32(first_index),
+                                    np.uint32(elements_this_iter),
+                                    leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
+                                    block=(nthreads_per_block,1,1),
+                                    grid=(nblocks_this_iter,1))
+
+        del leaf_nodes
+        del remap_order
+
+        # Step 3: Create parent layers in reverse order
+        layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
+        layer_parameters.reverse()
+
+        for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
+            for first_index, elements_this_iter, nblocks_this_iter in \
+                    chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+                print parent_offset, first_index, elements_this_iter, nblocks_this_iter
+                bvh_funcs.build_layer(np.uint32(first_index),
+                                      np.uint32(elements_this_iter),
+                                      np.uint32(branch_degree),
+                                      nodes,
+                                      np.uint32(parent_offset),
+                                      np.uint32(child_offset),
+                                      block=(nthreads_per_block,1,1),
+                                      grid=(nblocks_this_iter,1))
+
+        return world_origin, world_scale, nodes
+
+
     def device_usage_str(self):
         '''Returns a formatted string displaying the memory usage.'''
         s = 'device usage:\n'
         s += '-'*10 + '\n'
-        s += format_array('vertices', self.vertices) + '\n'
-        s += format_array('triangles', self.triangles) + '\n'
-        s += format_array('lower_bounds', self.lower_bounds) + '\n'
-        s += format_array('upper_bounds', self.upper_bounds) + '\n'
-        s += format_array('node_map', self.node_map) + '\n'
-        s += format_array('node_map_end', self.node_map_end) + '\n'
-        s += '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes)) + '\n'
+        #s += format_array('vertices', self.vertices) + '\n'
+        #s += format_array('triangles', self.triangles) + '\n'
+        s += format_array('nodes', self.nodes) + '\n'
+        s += '%-15s %6s %6s' % ('total', '', format_size(self.nodes.nbytes)) + '\n'
         s += '-'*10 + '\n'
         free, total = cuda.mem_get_info()
         s += '%-15s %6s %6s' % ('device total', '', format_size(total)) + '\n'