3 files changed, 36 insertions, 1 deletions
diff --git a/chroma/bvh/grid.py b/chroma/bvh/grid.py
index d23b0bb..d8e1f38 100644
--- a/chroma/bvh/grid.py
+++ b/chroma/bvh/grid.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from chroma.bvh.bvh import BVH, CHILD_BITS
-from chroma.gpu.bvh import create_leaf_nodes, merge_nodes_detailed, concatenate_layers
+from chroma.gpu.bvh import create_leaf_nodes, merge_nodes_detailed, concatenate_layers, collapse_chains
 
 MAX_CHILD = 2**(32 - CHILD_BITS) - 1
 
@@ -87,4 +87,5 @@ def make_recursive_grid_bvh(mesh, target_degree=3):
         morton_codes = parent_morton_codes
 
     nodes, layer_bounds = concatenate_layers(layers)
+    nodes = collapse_chains(nodes, layer_bounds)
     return BVH(world_coords, nodes, layer_bounds[:-1])
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index 0c64157..882c22e 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -522,4 +522,19 @@ extern "C"
     node[b_index] = temp4;
   }
 
+  // Overwrite each single-child node in [start, end) with a copy of its child.
+  __global__ void collapse_child(unsigned int start, unsigned int end,
+                                 uint4 *node)
+  {
+    const unsigned int step = gridDim.x * blockDim.x;
+    unsigned int idx = start + blockDim.x * blockIdx.x + threadIdx.x;
+
+    for (; idx < end; idx += step) {
+      const uint4 current = node[idx];
+      // w >> CHILD_BITS is the child count; w & ~NCHILD_MASK is the child index.
+      if ((current.w >> CHILD_BITS) == 1)
+        node[idx] = node[current.w & ~NCHILD_MASK];
+    }
+  }
+
 } // extern "C"
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py
index 00e2e69..b138bb9 100644
--- a/chroma/gpu/bvh.py
+++ b/chroma/gpu/bvh.py
@@ -111,6 +111,25 @@ def merge_nodes_detailed(nodes, first_child, nchild):
 
     return gpu_parent_nodes.get()
 
+def collapse_chains(nodes, layer_bounds):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+    
+    gpu_nodes = ga.to_gpu(nodes)
+
+    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
+    bounds.reverse()
+    nthreads_per_block = 256
+    for start, end in bounds:
+        bvh_funcs.collapse_child(np.uint32(start),
+                                 np.uint32(end),
+                                 gpu_nodes,
+                                 block=(nthreads_per_block,1,1),
+                                 grid=(120,1))
+    return gpu_nodes.get()
+
+
 def merge_nodes(nodes, degree, max_ratio=None):
     bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                                include_source_directory=True)