2 files changed, 189 insertions, 19 deletions
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index a6888db..2ce0580 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -114,11 +114,11 @@ __device__ uint4 node_union(const uint4 &a, const uint4 &b)
 
 
 
-__device__ unsigned int surface_half_area(const uint4 &node)
+__device__ unsigned long long surface_half_area(const uint4 &node)
 {
-  unsigned int x = (node.x >> 16) - (node.x & 0xFFFF);
-  unsigned int y = (node.y >> 16) - (node.y & 0xFFFF);
-  unsigned int z = (node.z >> 16) - (node.z & 0xFFFF);
+  unsigned long long x = (node.x >> 16) - (node.x & 0xFFFF);
+  unsigned long long y = (node.y >> 16) - (node.y & 0xFFFF);
+  unsigned long long z = (node.z >> 16) - (node.z & 0xFFFF);
 
   return x*y + y*z + z*x;
 }
@@ -265,7 +265,8 @@ extern "C"
 	       unsigned int n_children_per_node,
 	       uint4 *parent_nodes,
 	       uint4 *child_nodes,
-	       unsigned int child_id_offset)
+	       unsigned int child_id_offset,
+	       unsigned int num_children)
   {
     unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
     if (thread_id >= elements_this_launch)
@@ -274,6 +275,9 @@ extern "C"
     unsigned int parent_id = first_node + thread_id;
     unsigned int first_child = parent_id * n_children_per_node;
 
+    if (first_child >= num_children)
+      return;
+
     // Load first child
     uint4 parent_node = child_nodes[first_child];
     uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF, parent_node.z & 0xFFFF);
@@ -293,7 +297,7 @@ extern "C"
       upper = max(upper, child_upper);
     }
 
-    parent_node.w = first_child;
+    parent_node.w = first_child + child_id_offset;
     parent_node.x = upper.x << 16 | lower.x;
     parent_node.y = upper.y << 16 | lower.y;
     parent_node.z = upper.z << 16 | lower.z;
@@ -339,7 +343,29 @@ extern "C"
     area[node_id] = surface_half_area(u);
   }
 
-  __global__ void distance_to(unsigned int first_node, unsigned int threads_this_round,
+  __global__ void pair_area(unsigned int first_node,
+			    unsigned int threads_this_round,
+			    uint4 *node, unsigned long long *area)
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= threads_this_round)
+      return;
+
+    unsigned int node_id = first_node + thread_id;
+    unsigned int child_id = node_id * 2;
+
+    uint4 a = node[child_id];
+    uint4 b = node[child_id+1];
+    if (b.x == 0)
+      b = a;
+
+    uint4 u = node_union(a, b);
+
+    area[node_id] = 2*surface_half_area(u);
+  }
+
+  __global__ void distance_to(unsigned int first_node, 
+			      unsigned int threads_this_round,
                               unsigned int target_index,
                               uint4 *node, unsigned int *area)
   {
@@ -364,19 +390,18 @@ extern "C"
                                   unsigned int target_index,
                                   uint4 *node,
                                   unsigned int block_offset,
-                                  unsigned int *min_area_block,
+                                  unsigned long long *min_area_block,
                                   unsigned int *min_index_block,
                                   unsigned int *flag)
   {
-    __shared__ unsigned int min_area;
-    __shared__ unsigned int adjacent_area;
+    __shared__ unsigned long long min_area[128];
+    __shared__ unsigned long long adjacent_area;
 
     target_index += blockIdx.y;
 
     uint4 a = node[target_index];
 
     if (threadIdx.x == 0) {
-      min_area = 0xFFFFFFFF;
       adjacent_area = surface_half_area(node_union(a, node[target_index+1]));
     }
 
@@ -389,38 +414,47 @@ extern "C"
     if (thread_id >= threads_this_round)
       node_id = target_index;
 
-    unsigned int area;
+    unsigned long long area;
 
     if (node_id == target_index) {
-      area = 0xFFFFFFFF;
+      area = 0xFFFFFFFFFFFFFFFF;
     } else {
       uint4 b = node[node_id];
 
       if (b.x == 0) {
-	area = 0xFFFFFFFF;
+	area = 0xFFFFFFFFFFFFFFFF;
       } else {
 	uint4 u = node_union(a, b);
 	area = surface_half_area(u);
       }
     }
 
-    atomicMin(&min_area, area);
+    min_area[threadIdx.x] = area;
+
+    __syncthreads();
+
+    // Too lazy to code parallel reduction right now
+    if (threadIdx.x == 0) {
+      for (int i=1; i < blockDim.x; i++)
+	min_area[0] = min(min_area[0], min_area[i]);
+    }
 
     __syncthreads();
 
-    if (min_area == area) {
+    if (min_area[0] == area) {
 
       if (blockIdx.y == 0) {
-        if (min_area < adjacent_area) {
+        if (min_area[0] < adjacent_area) {
           min_index_block[block_offset + blockIdx.x] = node_id;
           min_area_block[block_offset + blockIdx.x] = area;
           flag[0] = 1;
         } else {
-          min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFF;
+          min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFFFFFFFFFF;
+	  min_index_block[block_offset + blockIdx.x] = target_index + 1;
         }
       } else {
 
-        if (min_area < adjacent_area)
+        if (min_area[0] < adjacent_area)
 	  flag[blockIdx.y] = 1;
       }
 
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py
index b9c920d..7f2794e 100644
--- a/chroma/gpu/bvh.py
+++ b/chroma/gpu/bvh.py
@@ -102,6 +102,7 @@ def merge_nodes(nodes, degree):
                                parent_nodes,
                                cuda.In(nodes),
                                np.uint32(0),
+                               np.uint32(len(nodes)),
                                block=(nthreads_per_block,1,1),
                                grid=(nblocks_this_iter,1))
 
@@ -137,3 +138,138 @@ def concatenate_layers(layers):
                                       grid=(nblocks_this_iter,1))
         
     return nodes.get(), layer_bounds
+
+def rebuild_tree(bvh, start_layer):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    layer_bounds = bvh.layer_bounds
+    layer_ranges = zip(layer_bounds[:start_layer], 
+                       layer_bounds[1:start_layer+1],
+                       layer_bounds[2:start_layer+2])
+    layer_ranges.reverse()
+
+    gpu_nodes = ga.to_gpu(bvh.nodes)
+    nthreads_per_block = 256
+
+    for parent_start, parent_end, child_end in layer_ranges:
+        nparent = parent_end - parent_start
+        child_start = parent_end
+        nchild = child_end - child_start
+        parent_nodes = gpu_nodes[parent_start:]
+        child_nodes = gpu_nodes[child_start:]
+
+        for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+            bvh_funcs.make_parents(np.uint32(first_index),
+                                   np.uint32(elements_this_iter),
+                                   np.uint32(bvh.degree),
+                                   parent_nodes,
+                                   child_nodes,
+                                   np.uint32(child_start),
+                                   np.uint32(nchild),
+                                   block=(nthreads_per_block,1,1),
+                                   grid=(nblocks_this_iter,1))
+        
+    return gpu_nodes.get()
+
+def optimize_layer(orig_nodes):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    nodes = ga.to_gpu(orig_nodes)
+    n = len(nodes)
+    areas = ga.empty(shape=n/2, dtype=np.uint64)
+    nthreads_per_block = 128
+
+    min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64)
+    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)
+
+    update = 10000
+
+    skip_size = 1
+    flag = mapped_empty(shape=skip_size, dtype=np.uint32)
+
+    i = 0
+    skips = 0
+    swaps = 0
+    while i < n/2 - 1:
+        # How are we doing?
+        if i % update == 0:
+            for first_index, elements_this_iter, nblocks_this_iter in \
+                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+                bvh_funcs.pair_area(np.uint32(first_index),
+                                    np.uint32(elements_this_iter),
+                                    nodes,
+                                    areas,
+                                    block=(nthreads_per_block,1,1),
+                                    grid=(nblocks_this_iter,1))
+                
+            areas_host = areas.get()
+            #print nodes.get(), areas_host.astype(float)
+            print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum())
+            print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+        test_index = i * 2
+
+        blocks = 0
+        look_forward = min(8192, n - test_index - 2)
+        skip_this_round = min(skip_size, n - test_index - 1)
+        flag[:] = 0
+        for first_index, elements_this_iter, nblocks_this_iter in \
+                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
+            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
+                                      np.uint32(elements_this_iter),
+                                      np.uint32(test_index),
+                                      nodes,
+                                      np.uint32(blocks),
+                                      min_areas,
+                                      min_index,
+                                      Mapped(flag),
+                                      block=(nthreads_per_block,1,1),
+                                      grid=(nblocks_this_iter, skip_this_round))
+            blocks += nblocks_this_iter
+            #print i, first_index, nblocks_this_iter, look_forward
+        cuda.Context.get_current().synchronize()
+
+        if flag[0] == 0:
+            flag_nonzero = flag.nonzero()[0]
+            if len(flag_nonzero) == 0:
+                no_swap_required = skip_size
+            else:
+                no_swap_required = flag_nonzero[0]
+            i += no_swap_required
+            skips += no_swap_required
+            continue
+
+        min_areas_host = min_areas[:blocks].get()
+        min_index_host = min_index[:blocks].get()
+        best_block = min_areas_host.argmin()
+        better_i = min_index_host[best_block]
+
+        swaps += 1
+        #print 'swap', test_index+1, better_i
+        bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
+                       nodes, block=(1,1,1), grid=(1,1))
+        cuda.Context.get_current().synchronize()
+        i += 1
+
+    for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+        bvh_funcs.pair_area(np.uint32(first_index),
+                            np.uint32(elements_this_iter),
+                            nodes,
+                            areas,
+                            block=(nthreads_per_block,1,1),
+                            grid=(nblocks_this_iter,1))
+        
+    areas_host = areas.get()
+
+    print 'Final area of parent layer: %1.12e' % areas_host.sum()
+    print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+    return nodes.get()