 chroma/bvh/__init__.py  |   1
 chroma/bvh/bvh.py       |  23
 chroma/bvh/grid.py      |  15
 chroma/bvh/simple.py    |  31
 chroma/cuda/bvh.cu      |  75
 chroma/gpu/bvh.py       | 139
 chroma/gpu/tools.py     |  33
 test/test_bvh_simple.py |  14
 8 files changed, 308 insertions(+), 23 deletions(-)
diff --git a/chroma/bvh/__init__.py b/chroma/bvh/__init__.py
index fa3d476..7ca6620 100644
--- a/chroma/bvh/__init__.py
+++ b/chroma/bvh/__init__.py
@@ -1,2 +1,3 @@
 from chroma.bvh.bvh import *
 from chroma.bvh.grid import make_recursive_grid_bvh
+from chroma.bvh.simple import make_simple_bvh
diff --git a/chroma/bvh/bvh.py b/chroma/bvh/bvh.py
index d13d285..f530b09 100644
--- a/chroma/bvh/bvh.py
+++ b/chroma/bvh/bvh.py
@@ -196,6 +196,19 @@ class BVH(object):
         '''Returns the number of nodes in this BVH'''
         return len(self.nodes)
 
+def node_area(nodes):
+    '''Returns the total surface area of a list of nodes in fixed point units.'''
+    unpacked = unpack_nodes(nodes)
+    delta = np.empty(shape=len(nodes),
+                     dtype=[('x', float), ('y', float), ('z', float)])
+    for axis in ['x', 'y', 'z']:
+        delta[axis] = unpacked[axis+'hi'] - unpacked[axis+'lo']
+
+    half_area = delta['x']*delta['y'] + delta['y']*delta['z'] \
+        + delta['z']*delta['x']
+
+    return 2.0 * half_area.sum()
+
 class BVHLayerSlice(object):
     '''A single layer in a bounding volume hierarchy represented as a
     slice in the node array of the parent.
@@ -219,12 +232,4 @@ class BVHLayerSlice(object):
 
     def area(self):
         '''Return the surface area of all the nodes in this layer in
         world units.'''
-        unpacked = unpack_nodes(self.nodes)
-        delta = np.empty(shape=len(self.nodes),
-                         dtype=[('x', float), ('y', float), ('z', float)])
-        for axis in ['x', 'y', 'z']:
-            delta[axis] = unpacked[axis+'hi'] - unpacked[axis+'lo']
-
-        half_area = delta['x']*delta['y'] + delta['y']*delta['z'] \
-            + delta['z']*delta['x']
-        return 2.0 * half_area.sum() * self.world_coords.world_scale**2
+        return node_area(self.nodes) * self.world_coords.world_scale**2
diff --git a/chroma/bvh/grid.py b/chroma/bvh/grid.py
index 9fcf125..0b36a2f 100644
--- a/chroma/bvh/grid.py
+++ b/chroma/bvh/grid.py
@@ -1,6 +1,7 @@
-from chroma.bvh.bvh import BVH
+from chroma.bvh.bvh import BVH, node_area
+from chroma.gpu.bvh import create_leaf_nodes
 
-def make_recursive_grid_bvh(self, mesh, bits=11):
+def make_recursive_grid_bvh(mesh, bits=11):
     '''Returns a binary tree BVH created using a 'recursive grid' method.
 
     This method is somewhat similar to the original Chroma BVH generator,
@@ -14,6 +15,14 @@ def make_recursive_grid_bvh(mesh, bits=11):
     speed things up.
     '''
-    pass
+    world_coords, leaf_nodes, morton_codes = create_leaf_nodes(mesh, bits)
+
+    # rearrange in morton order
+    argsort = morton_codes.argsort()
+    leaf_nodes = leaf_nodes[argsort]
+    morton_codes = morton_codes[argsort]
+
+    print node_area(leaf_nodes) * world_coords.world_scale**2
diff --git a/chroma/bvh/simple.py b/chroma/bvh/simple.py
new file mode 100644
index 0000000..6c0e02b
--- /dev/null
+++ b/chroma/bvh/simple.py
@@ -0,0 +1,31 @@
+from chroma.bvh.bvh import BVH, node_area
+from chroma.gpu.bvh import create_leaf_nodes, merge_nodes, concatenate_layers
+
+def make_simple_bvh(mesh, degree):
+    '''Returns a BVH tree created by simple grouping of Morton-ordered
+    nodes.
+    '''
+    world_coords, leaf_nodes, morton_codes = \
+        create_leaf_nodes(mesh, round_to_multiple=degree)
+
+    # rearrange in morton order
+    argsort = morton_codes.argsort()
+    leaf_nodes = leaf_nodes[argsort]
+    assert len(leaf_nodes) % degree == 0
+
+    # Create parent layers, bottom-up
+    layers = [leaf_nodes]
+    while len(layers[0]) > 1:
+        parent = merge_nodes(layers[0], degree=degree)
+        if len(parent) > 1:
+            assert len(parent) % degree == 0
+        layers = [parent] + layers
+
+    # Concatenate the layers into one flat node array
+    nodes, layer_bounds = concatenate_layers(layers)
+
+    for i, (layer_start, layer_end) in enumerate(zip(layer_bounds[:-1],
+                                                     layer_bounds[1:])):
+        print i, node_area(nodes[layer_start:layer_end]) * world_coords.world_scale**2
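The simple BVH above relies entirely on the Morton sort for spatial quality: interleaving the bits of each triangle's quantized centroid coordinates gives nearby triangles nearby codes, so grouping consecutive sorted nodes tends to yield tight boxes. A minimal CPU sketch of the same bit interleaving, shown with 10 bits per axis for brevity (the spread3_16 device function in bvh.cu below spreads 16); this helper is illustrative only and not part of the commit:

import numpy as np

def spread3_10(x):
    # Insert two zero bits between each of the low 10 bits of x,
    # so bit i moves to bit 3*i (a 30-bit result).
    x = (x | (x << 16)) & 0x30000FF
    x = (x | (x << 8))  & 0x300F00F
    x = (x | (x << 4))  & 0x30C30C3
    x = (x | (x << 2))  & 0x9249249
    return x

def morton_codes_10bit(quantized):
    # quantized: integer ndarray of shape (N, 3), values in [0, 1023]
    q = quantized.astype(np.uint64)
    return (spread3_10(q[:, 0])
            | (spread3_10(q[:, 1]) << 1)
            | (spread3_10(q[:, 2]) << 2))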
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index 50e877a..a6888db 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -186,17 +186,6 @@ extern "C"
       spread3_16(q_centroid.x)
       | (spread3_16(q_centroid.y) << 1)
       | (spread3_16(q_centroid.z) << 2);
-
-    //unsigned long long morton =
-    //  spread3_16(triangle.x & 0xFFFF)
-    //  | (spread3_16(triangle.y & 0xFFFF) << 1)
-    //  | (spread3_16(triangle.z & 0xFFFF) << 2);
-
-    //unsigned long long morton =
-    //  ( ((unsigned long long) q_centroid.x) << 32 )
-    //  | ( ((unsigned long long) q_centroid.y) << 16 )
-    //  | ( ((unsigned long long) q_centroid.z) << 0 );
 
     // Write leaf and morton code
     uint4 leaf_node;
@@ -270,6 +259,70 @@ extern "C"
     nodes[parent_layer_offset + parent_id] = parent_node;
   }
 
+  __global__ void
+  make_parents(unsigned int first_node,
+               unsigned int elements_this_launch,
+               unsigned int n_children_per_node,
+               uint4 *parent_nodes,
+               uint4 *child_nodes,
+               unsigned int child_id_offset)
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= elements_this_launch)
+      return;
+
+    unsigned int parent_id = first_node + thread_id;
+    unsigned int first_child = parent_id * n_children_per_node;
+
+    // Load first child
+    uint4 parent_node = child_nodes[first_child];
+    uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF,
+                             parent_node.z & 0xFFFF);
+    uint3 upper = make_uint3(parent_node.x >> 16, parent_node.y >> 16,
+                             parent_node.z >> 16);
+
+    // Scan remaining children
+    for (unsigned int i=1; i < n_children_per_node; i++) {
+      uint4 child_node = child_nodes[first_child + i];
+
+      if (child_node.x == 0)
+        break;  // Hit first padding node in list of children
+
+      uint3 child_lower = make_uint3(child_node.x & 0xFFFF, child_node.y & 0xFFFF,
+                                     child_node.z & 0xFFFF);
+      uint3 child_upper = make_uint3(child_node.x >> 16, child_node.y >> 16,
+                                     child_node.z >> 16);
+
+      lower = min(lower, child_lower);
+      upper = max(upper, child_upper);
+    }
+
+    parent_node.w = first_child;
+    parent_node.x = upper.x << 16 | lower.x;
+    parent_node.y = upper.y << 16 | lower.y;
+    parent_node.z = upper.z << 16 | lower.z;
+
+    parent_nodes[parent_id] = parent_node;
+  }
+
+  __global__ void
+  copy_and_offset(unsigned int first_node,
+                  unsigned int elements_this_launch,
+                  unsigned int child_id_offset,
+                  uint4 *src_nodes,
+                  uint4 *dest_nodes)
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= elements_this_launch)
+      return;
+
+    unsigned int node_id = first_node + thread_id;
+    uint4 src_node = src_nodes[node_id];
+
+    unsigned int leaf_flag = src_node.w & 0x80000000;
+    unsigned int child_id = src_node.w & 0x7FFFFFFF;
+    src_node.w = leaf_flag | (child_id + child_id_offset);
+
+    dest_nodes[node_id] = src_node;
+  }
+
   __global__ void distance_to_prev(unsigned int first_node,
                                    unsigned int threads_this_round,
                                    uint4 *node, unsigned int *area)
   {
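For reading make_parents and copy_and_offset above: each uint4 node packs, per axis, a 16-bit fixed-point lower bound in the low half and the upper bound in the high half of x, y, and z, while w holds the child index in its low 31 bits and a leaf flag in the high bit. A small Python unpacking sketch, with the layout inferred from the kernels above (not part of the commit):

LEAF_BIT = 0x80000000

def unpack_node(node):
    # node: one element of a structured ndarray with dtype ga.vec.uint4
    lower = tuple(int(node[axis]) & 0xFFFF for axis in ('x', 'y', 'z'))
    upper = tuple(int(node[axis]) >> 16 for axis in ('x', 'y', 'z'))
    is_leaf = bool(int(node['w']) & LEAF_BIT)
    child_id = int(node['w']) & 0x7FFFFFFF  # triangle offset for leaf nodes
    return lower, upper, is_leaf, child_id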
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py
new file mode 100644
index 0000000..b9c920d
--- /dev/null
+++ b/chroma/gpu/bvh.py
@@ -0,0 +1,139 @@
+import numpy as np
+import pycuda.driver as cuda
+from pycuda import gpuarray as ga
+from pycuda import characterize
+
+from chroma.gpu.tools import get_cu_module, cuda_options, \
+    chunk_iterator, to_uint3, to_float3, GPUFuncs, mapped_empty, Mapped
+
+from chroma.bvh.bvh import WorldCoords
+
+def round_up_to_multiple(x, multiple):
+    remainder = x % multiple
+    if remainder == 0:
+        return x
+    else:
+        return x + multiple - remainder
+
+def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
+    '''Compute the leaf nodes surrounding a triangle mesh.
+
+      ``mesh``: chroma.geometry.Mesh
+        Triangles to box
+      ``morton_bits``: int
+        Number of bits to use per dimension when computing Morton code.
+        Must be <= 16.
+      ``round_to_multiple``: int
+        Round the number of nodes created up to a multiple of this
+        number.  Extra nodes will be all zero.
+
+    Returns (world_coords, nodes, morton_codes), where
+      ``world_coords``: chroma.bvh.WorldCoords
+        Defines the fixed point coordinate system
+      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
+        List of leaf nodes.  Child IDs will be set to triangle offsets.
+      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
+        Morton codes for each triangle, using ``morton_bits`` per axis.
+    '''
+    # Load GPU functions
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    # Compute the fixed point world coordinate system
+    world_origin = mesh.vertices.min(axis=0)
+    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
+        / (2**16 - 2)
+    world_coords = WorldCoords(world_origin=world_origin,
+                               world_scale=world_scale)
+
+    # Put triangles and vertices into mapped host memory
+    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
+                             write_combined=True)
+    triangles[:] = to_uint3(mesh.triangles)
+    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
+                            write_combined=True)
+    vertices[:] = to_float3(mesh.vertices)
+
+    # Call GPU to compute nodes
+    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
+                                                round_to_multiple),
+                     dtype=ga.vec.uint4)
+    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)
+
+    # Convert world coords to GPU-friendly types
+    world_origin = ga.vec.make_float3(*world_origin)
+    world_scale = np.float32(world_scale)
+
+    nthreads_per_block = 256
+    for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(len(triangles), nthreads_per_block,
+                           max_blocks=30000):
+        bvh_funcs.make_leaves(np.uint32(first_index),
+                              np.uint32(elements_this_iter),
+                              Mapped(triangles), Mapped(vertices),
+                              world_origin, world_scale,
+                              nodes, morton_codes,
+                              block=(nthreads_per_block,1,1),
+                              grid=(nblocks_this_iter,1))
+
+    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
+    return world_coords, nodes.get(), morton_codes_host
+
+def merge_nodes(nodes, degree):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    nparent = len(nodes) // degree
+    if nparent == 1:
+        nparent_pad = nparent
+    else:
+        nparent_pad = round_up_to_multiple(nparent, degree)
+    parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
+
+    nthreads_per_block = 256
+    for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+        bvh_funcs.make_parents(np.uint32(first_index),
+                               np.uint32(elements_this_iter),
+                               np.uint32(degree),
+                               parent_nodes,
+                               cuda.In(nodes),
+                               np.uint32(0),
+                               block=(nthreads_per_block,1,1),
+                               grid=(nblocks_this_iter,1))
+
+    return parent_nodes.get()
+
+def concatenate_layers(layers):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    # Put 0 at beginning of list
+    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)
+    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
+    nthreads_per_block = 256
+
+    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
+                                             layer_bounds[1:],
+                                             layers):
+        if layer_end == layer_bounds[-1]:
+            # leaf nodes need no offset
+            child_offset = 0
+        else:
+            child_offset = layer_end
+
+        for first_index, elements_this_iter, nblocks_this_iter in \
+                chunk_iterator(layer_end-layer_start, nthreads_per_block,
+                               max_blocks=10000):
+            bvh_funcs.copy_and_offset(np.uint32(first_index),
+                                      np.uint32(elements_this_iter),
+                                      np.uint32(child_offset),
+                                      cuda.In(layer),
+                                      nodes[layer_start:],
+                                      block=(nthreads_per_block,1,1),
+                                      grid=(nblocks_this_iter,1))
+
+    return nodes.get(), layer_bounds
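The child_offset arithmetic in concatenate_layers is easiest to check with concrete numbers. make_parents stores child indices local to the child layer, and after flattening, each non-leaf layer's children start exactly at that layer's own end offset; leaf nodes keep w untouched because it holds triangle offsets, not node IDs. A worked example (layer sizes chosen purely for illustration):

import numpy as np

sizes = [1, 4, 16]                                # root, middle, leaves
layer_bounds = np.insert(np.cumsum(sizes), 0, 0)  # -> [0, 1, 5, 21]
# Root layer [0, 1): children live in [1, 5), so offset = layer_end = 1.
# Middle layer [1, 5): children live in [5, 21), so offset = 5.
# Leaf layer [5, 21): layer_end == layer_bounds[-1], so offset = 0.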
diff --git a/chroma/gpu/tools.py b/chroma/gpu/tools.py
index 707a45d..b151f80 100644
--- a/chroma/gpu/tools.py
+++ b/chroma/gpu/tools.py
@@ -181,3 +181,36 @@ def format_size(size):
 def format_array(name, array):
     return '%-15s %6s %6s' % \
         (name, format_size(len(array)), format_size(array.nbytes))
+
+def Mapped(array):
+    '''Analog to pycuda.driver.InOut(), but indicates this array
+    is memory mapped to the device space and should not be copied.'''
+    return np.intp(array.base.get_device_pointer())
+
+def mapped_alloc(pagelocked_alloc_func, shape, dtype, write_combined):
+    '''Returns a pagelocked host array mapped into the CUDA device
+    address space, with a gpudata field set so it just works with CUDA
+    functions.'''
+    flags = cuda.host_alloc_flags.DEVICEMAP
+    if write_combined:
+        flags |= cuda.host_alloc_flags.WRITECOMBINED
+    array = pagelocked_alloc_func(shape=shape, dtype=dtype, mem_flags=flags)
+    return array
+
+def mapped_empty(shape, dtype, write_combined=False):
+    '''See mapped_alloc()'''
+    return mapped_alloc(cuda.pagelocked_empty, shape, dtype, write_combined)
+
+def mapped_empty_like(other, write_combined=False):
+    '''See mapped_alloc()'''
+    return mapped_alloc(cuda.pagelocked_empty, other.shape, other.dtype,
+                        write_combined)
+
+def mapped_zeros(shape, dtype, write_combined=False):
+    '''See mapped_alloc()'''
+    return mapped_alloc(cuda.pagelocked_zeros, shape, dtype, write_combined)
+
+def mapped_zeros_like(other, write_combined=False):
+    '''See mapped_alloc()'''
+    return mapped_alloc(cuda.pagelocked_zeros, other.shape, other.dtype,
+                        write_combined)
diff --git a/test/test_bvh_simple.py b/test/test_bvh_simple.py
new file mode 100644
index 0000000..20c0df5
--- /dev/null
+++ b/test/test_bvh_simple.py
@@ -0,0 +1,14 @@
+import pycuda.autoinit
+import unittest
+from chroma.bvh import make_simple_bvh, BVH
+import chroma.models
+
+import numpy as np
+#from numpy.testing import assert_array_max_ulp, assert_array_equal, \
+#    assert_approx_equal
+
+def test_simple_bvh():
+    mesh = chroma.models.lionsolid()
+    bvh = make_simple_bvh(mesh, degree=2)
+    assert isinstance(bvh, BVH)
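The mapped-memory helpers added to tools.py support the zero-copy pattern used by create_leaf_nodes: the host fills pagelocked memory that the GPU then reads directly through Mapped(), with no explicit transfer. A hedged usage sketch (the kernel call is commented out because the kernel name is hypothetical; this also assumes the CUDA context supports host-mapped memory, e.g. one created with the MAP_HOST flag):

import numpy as np
import pycuda.autoinit  # context must support host-mapped memory
from chroma.gpu.tools import mapped_empty, Mapped

data = mapped_empty(shape=1024, dtype=np.float32, write_combined=True)
data[:] = np.arange(1024, dtype=np.float32)  # write on the host side

# Pass Mapped(data) where a kernel expects a device pointer; unlike
# cuda.In(data), nothing is copied -- the GPU dereferences host memory.
# some_kernel(Mapped(data), np.uint32(len(data)),
#             block=(256, 1, 1), grid=(4, 1))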