Add more BVH manipulation commands:

* chroma-bvh create [name] [degree] - Creates a new BVH with the specified branching degree. * chroma-bvh node_swap [name] [layer] - Optimizes a BVH layer with a "greedy, short-sighted" algorithm that swaps around nodes to minimize the surface area of the immediate parent layer. Rebuilds the tree above the modified layer when finished. Also modified the chroma-bvh stat command to print the sum of the logarithms of the areas of each layer. It seems to be a rough predictor of the simulation speed of the BVH.
author: Stan Seibert <stan@mtrr.org> 2012-01-22 10:49:21 -0500
committer: tlatorre <tlatorre@uchicago.edu> 2021-05-09 08:42:38 -0700
commit: a302076191a4600da66bd461e1c2d568d341ed34 (patch)
tree: 0685370da647a3db9a0460b19f766bbaec61bfdd
parent: deda5c8fa3264bf80e3d4dcba52f13913d6d254c (diff)
download: chroma-a302076191a4600da66bd461e1c2d568d341ed34.tar.gz
chroma-a302076191a4600da66bd461e1c2d568d341ed34.tar.bz2
chroma-a302076191a4600da66bd461e1c2d568d341ed34.zip
3 files changed, 243 insertions, 21 deletions
diff --git a/bin/chroma-bvh b/bin/chroma-bvh
index fcd20a8..d67904a 100644
--- a/bin/chroma-bvh
+++ b/bin/chroma-bvh
@@ -4,9 +4,14 @@ import optparse
 import sys
 import time
 
+import numpy as np
+
 from chroma.cache import Cache
 from chroma.loader import load_geometry_from_string
 from chroma.log import logger, logging
+from chroma.bvh import make_simple_bvh
+from chroma.gpu import create_cuda_context
+from chroma.gpu.bvh import rebuild_tree, optimize_layer
 
 def parse_bvh_id(id_str):
     '''Parse a BVH identifier given on the command line into
@@ -26,9 +31,50 @@ def parse_bvh_id(id_str):
 
 def node_swap(cache, args):
     geo_name, bvh_name = parse_bvh_id(args[0])
+    opt_layer_index = int(args[1])
     mesh_hash = cache.get_geometry_hash(geo_name)
     bvh = cache.load_bvh(mesh_hash, bvh_name)
-    
+
+    if opt_layer_index < 1 or opt_layer_index > bvh.layer_count() - 1:
+        print 'Can only optimize node order in this tree for layers 1 through %d' % (bvh.layer_count() - 1)
+        return
+
+    original_parent_area = bvh.get_layer(opt_layer_index-1).area()
+    context = create_cuda_context()
+
+    print 'Optimizing layer %d through node swaps' % opt_layer_index
+    opt_layer = bvh.get_layer(opt_layer_index)
+    new_layer_nodes = optimize_layer(bvh.get_layer(opt_layer_index).nodes)
+    bvh.nodes[bvh.layer_bounds[opt_layer_index]:bvh.layer_bounds[opt_layer_index+1]] = new_layer_nodes
+
+    print 'Rebuilding tree...'
+    new_nodes = rebuild_tree(bvh, opt_layer_index)
+    bvh.nodes = new_nodes
+
+    print 'Original parent area: %e' % original_parent_area
+    print 'New parent area:      %e' % bvh.get_layer(opt_layer_index-1).area()
+
+    print 'Saving new BVH...'
+    context.pop()
+    print_stat(geo_name, bvh_name, mesh_hash, bvh)
+    cache.save_bvh(bvh, mesh_hash, bvh_name)
+
+def create(cache, args):
+    geo_name, bvh_name = parse_bvh_id(args[0])
+    degree = int(args[1])
+    mesh_hash = cache.get_geometry_hash(geo_name)
+    print 'Loading geometry (MD5=%s): %s' % (mesh_hash, geo_name)
+    geometry = cache.load_geometry(geo_name)
+
+    print 'Creating degree %d BVH...' % degree
+
+    start = time.time()
+    context = create_cuda_context()
+    bvh = make_simple_bvh(geometry.mesh, degree=degree)
+    context.pop()
+    logger.info('BVH generated in %1.1f seconds.' % (time.time() - start))
+
+    cache.save_bvh(bvh, mesh_hash, bvh_name)
 
 def list_cmd(cache, args):
     geo_name = args[0]
@@ -66,20 +112,26 @@ def print_stat(geo_name, bvh_name, mesh_hash, bvh):
     print 'Nodes: %d' % len(bvh)
     print 'Layers:'
 
+    areas = []
     for i in xrange(num_layers):
         layer = bvh.get_layer(i)
         if len(layer) == 1:
             node_str = 'node, '
         else:
             node_str = 'nodes,'
+        area = layer.area()
+        areas.append(area)
         print '  % 3d) % 10s %s    area = % 9e' % \
-            (i, len(layer), node_str, layer.area())
+            (i, len(layer), node_str, area)
+    print 'Log sum area: %f' % np.log(areas).sum()
 
 commands = {
     'stat' : stat,
+    'create': create,
     'list' : list_cmd,
     'copy' : copy,
     'remove' : remove,
+    'node_swap' : node_swap,
     }
              
 
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index a6888db..2ce0580 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -114,11 +114,11 @@ __device__ uint4 node_union(const uint4 &a, const uint4 &b)
 
 
 
-__device__ unsigned int surface_half_area(const uint4 &node)
+__device__ unsigned long long surface_half_area(const uint4 &node)
 {
-  unsigned int x = (node.x >> 16) - (node.x & 0xFFFF);
-  unsigned int y = (node.y >> 16) - (node.y & 0xFFFF);
-  unsigned int z = (node.z >> 16) - (node.z & 0xFFFF);
+  unsigned long long x = (node.x >> 16) - (node.x & 0xFFFF);
+  unsigned long long y = (node.y >> 16) - (node.y & 0xFFFF);
+  unsigned long long z = (node.z >> 16) - (node.z & 0xFFFF);
 
   return x*y + y*z + z*x;
 }
@@ -265,7 +265,8 @@ extern "C"
 	       unsigned int n_children_per_node,
 	       uint4 *parent_nodes,
 	       uint4 *child_nodes,
-	       unsigned int child_id_offset)
+	       unsigned int child_id_offset,
+	       unsigned int num_children)
   {
     unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
     if (thread_id >= elements_this_launch)
@@ -274,6 +275,9 @@ extern "C"
     unsigned int parent_id = first_node + thread_id;
     unsigned int first_child = parent_id * n_children_per_node;
 
+    if (first_child >= num_children)
+      return;
+
     // Load first child
     uint4 parent_node = child_nodes[first_child];
     uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF, parent_node.z & 0xFFFF);
@@ -293,7 +297,7 @@ extern "C"
       upper = max(upper, child_upper);
     }
 
-    parent_node.w = first_child;
+    parent_node.w = first_child + child_id_offset;
     parent_node.x = upper.x << 16 | lower.x;
     parent_node.y = upper.y << 16 | lower.y;
     parent_node.z = upper.z << 16 | lower.z;
@@ -339,7 +343,29 @@ extern "C"
     area[node_id] = surface_half_area(u);
   }
 
-  __global__ void distance_to(unsigned int first_node, unsigned int threads_this_round,
+  __global__ void pair_area(unsigned int first_node,
+			    unsigned int threads_this_round,
+			    uint4 *node, unsigned long long *area)
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= threads_this_round)
+      return;
+
+    unsigned int node_id = first_node + thread_id;
+    unsigned int child_id = node_id * 2;
+
+    uint4 a = node[child_id];
+    uint4 b = node[child_id+1];
+    if (b.x == 0)
+      b = a;
+
+    uint4 u = node_union(a, b);
+
+    area[node_id] = 2*surface_half_area(u);
+  }
+
+  __global__ void distance_to(unsigned int first_node, 
+			      unsigned int threads_this_round,
                               unsigned int target_index,
                               uint4 *node, unsigned int *area)
   {
@@ -364,19 +390,18 @@ extern "C"
                                   unsigned int target_index,
                                   uint4 *node,
                                   unsigned int block_offset,
-                                  unsigned int *min_area_block,
+                                  unsigned long long *min_area_block,
                                   unsigned int *min_index_block,
                                   unsigned int *flag)
   {
-    __shared__ unsigned int min_area;
-    __shared__ unsigned int adjacent_area;
+    __shared__ unsigned long long min_area[128];
+    __shared__ unsigned long long adjacent_area;
 
     target_index += blockIdx.y;
 
     uint4 a = node[target_index];
 
     if (threadIdx.x == 0) {
-      min_area = 0xFFFFFFFF;
       adjacent_area = surface_half_area(node_union(a, node[target_index+1]));
     }
 
@@ -389,38 +414,47 @@ extern "C"
     if (thread_id >= threads_this_round)
       node_id = target_index;
 
-    unsigned int area;
+    unsigned long long area;
 
     if (node_id == target_index) {
-      area = 0xFFFFFFFF;
+      area = 0xFFFFFFFFFFFFFFFF;
     } else {
       uint4 b = node[node_id];
 
       if (b.x == 0) {
-	area = 0xFFFFFFFF;
+	area = 0xFFFFFFFFFFFFFFFF;
       } else {
 	uint4 u = node_union(a, b);
 	area = surface_half_area(u);
       }
     }
 
-    atomicMin(&min_area, area);
+    min_area[threadIdx.x] = area;
+
+    __syncthreads();
+
+    // Too lazy to code parallel reduction right now
+    if (threadIdx.x == 0) {
+      for (int i=1; i < blockDim.x; i++)
+	min_area[0] = min(min_area[0], min_area[i]);
+    }
 
     __syncthreads();
 
-    if (min_area == area) {
+    if (min_area[0] == area) {
 
       if (blockIdx.y == 0) {
-        if (min_area < adjacent_area) {
+        if (min_area[0] < adjacent_area) {
           min_index_block[block_offset + blockIdx.x] = node_id;
           min_area_block[block_offset + blockIdx.x] = area;
           flag[0] = 1;
         } else {
-          min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFF;
+          min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFFFFFFFFFF;
+	  min_index_block[block_offset + blockIdx.x] = target_index + 1;
         }
       } else {
 
-        if (min_area < adjacent_area)
+        if (min_area[0] < adjacent_area)
 	  flag[blockIdx.y] = 1;
       }
 
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py
index b9c920d..7f2794e 100644
--- a/chroma/gpu/bvh.py
+++ b/chroma/gpu/bvh.py
@@ -102,6 +102,7 @@ def merge_nodes(nodes, degree):
                                parent_nodes,
                                cuda.In(nodes),
                                np.uint32(0),
+                               np.uint32(len(nodes)),
                                block=(nthreads_per_block,1,1),
                                grid=(nblocks_this_iter,1))
 
@@ -137,3 +138,138 @@ def concatenate_layers(layers):
                                       grid=(nblocks_this_iter,1))
         
     return nodes.get(), layer_bounds
+
+def rebuild_tree(bvh, start_layer):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    layer_bounds = bvh.layer_bounds
+    layer_ranges = zip(layer_bounds[:start_layer], 
+                       layer_bounds[1:start_layer+1],
+                       layer_bounds[2:start_layer+2])
+    layer_ranges.reverse()
+
+    gpu_nodes = ga.to_gpu(bvh.nodes)
+    nthreads_per_block = 256
+
+    for parent_start, parent_end, child_end in layer_ranges:
+        nparent = parent_end - parent_start
+        child_start = parent_end
+        nchild = child_end - child_start
+        parent_nodes = gpu_nodes[parent_start:]
+        child_nodes = gpu_nodes[child_start:]
+
+        for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+            bvh_funcs.make_parents(np.uint32(first_index),
+                                   np.uint32(elements_this_iter),
+                                   np.uint32(bvh.degree),
+                                   parent_nodes,
+                                   child_nodes,
+                                   np.uint32(child_start),
+                                   np.uint32(nchild),
+                                   block=(nthreads_per_block,1,1),
+                                   grid=(nblocks_this_iter,1))
+        
+    return gpu_nodes.get()
+
+def optimize_layer(orig_nodes):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    nodes = ga.to_gpu(orig_nodes)
+    n = len(nodes)
+    areas = ga.empty(shape=n/2, dtype=np.uint64)
+    nthreads_per_block = 128
+
+    min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64)
+    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)
+
+    update = 10000
+
+    skip_size = 1
+    flag = mapped_empty(shape=skip_size, dtype=np.uint32)
+
+    i = 0
+    skips = 0
+    swaps = 0
+    while i < n/2 - 1:
+        # How are we doing?
+        if i % update == 0:
+            for first_index, elements_this_iter, nblocks_this_iter in \
+                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+                bvh_funcs.pair_area(np.uint32(first_index),
+                                    np.uint32(elements_this_iter),
+                                    nodes,
+                                    areas,
+                                    block=(nthreads_per_block,1,1),
+                                    grid=(nblocks_this_iter,1))
+                
+            areas_host = areas.get()
+            #print nodes.get(), areas_host.astype(float)
+            print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum())
+            print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+        test_index = i * 2
+
+        blocks = 0
+        look_forward = min(8192, n - test_index - 2)
+        skip_this_round = min(skip_size, n - test_index - 1)
+        flag[:] = 0
+        for first_index, elements_this_iter, nblocks_this_iter in \
+                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
+            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
+                                      np.uint32(elements_this_iter),
+                                      np.uint32(test_index),
+                                      nodes,
+                                      np.uint32(blocks),
+                                      min_areas,
+                                      min_index,
+                                      Mapped(flag),
+                                      block=(nthreads_per_block,1,1),
+                                      grid=(nblocks_this_iter, skip_this_round))
+            blocks += nblocks_this_iter
+            #print i, first_index, nblocks_this_iter, look_forward
+        cuda.Context.get_current().synchronize()
+
+        if flag[0] == 0:
+            flag_nonzero = flag.nonzero()[0]
+            if len(flag_nonzero) == 0:
+                no_swap_required = skip_size
+            else:
+                no_swap_required = flag_nonzero[0]
+            i += no_swap_required
+            skips += no_swap_required
+            continue
+
+        min_areas_host = min_areas[:blocks].get()
+        min_index_host = min_index[:blocks].get()
+        best_block = min_areas_host.argmin()
+        better_i = min_index_host[best_block]
+
+        swaps += 1
+        #print 'swap', test_index+1, better_i
+        bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
+                       nodes, block=(1,1,1), grid=(1,1))
+        cuda.Context.get_current().synchronize()
+        i += 1
+
+    for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+        bvh_funcs.pair_area(np.uint32(first_index),
+                            np.uint32(elements_this_iter),
+                            nodes,
+                            areas,
+                            block=(nthreads_per_block,1,1),
+                            grid=(nblocks_this_iter,1))
+        
+    areas_host = areas.get()
+
+    print 'Final area of parent layer: %1.12e' % areas_host.sum()
+    print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+    return nodes.get()
author	Stan Seibert <stan@mtrr.org>	2012-01-22 10:49:21 -0500
committer	tlatorre <tlatorre@uchicago.edu>	2021-05-09 08:42:38 -0700
commit	a302076191a4600da66bd461e1c2d568d341ed34 (patch)
tree	0685370da647a3db9a0460b19f766bbaec61bfdd
parent	deda5c8fa3264bf80e3d4dcba52f13913d6d254c (diff)
download	chroma-a302076191a4600da66bd461e1c2d568d341ed34.tar.gz chroma-a302076191a4600da66bd461e1c2d568d341ed34.tar.bz2 chroma-a302076191a4600da66bd461e1c2d568d341ed34.zip