summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStan Seibert <stan@mtrr.org>2012-01-22 10:49:21 -0500
committertlatorre <tlatorre@uchicago.edu>2021-05-09 08:42:38 -0700
commita302076191a4600da66bd461e1c2d568d341ed34 (patch)
tree0685370da647a3db9a0460b19f766bbaec61bfdd
parentdeda5c8fa3264bf80e3d4dcba52f13913d6d254c (diff)
downloadchroma-a302076191a4600da66bd461e1c2d568d341ed34.tar.gz
chroma-a302076191a4600da66bd461e1c2d568d341ed34.tar.bz2
chroma-a302076191a4600da66bd461e1c2d568d341ed34.zip
Add more BVH manipulation commands:
* chroma-bvh create [name] [degree] - Creates a new BVH with the specified branching degree. * chroma-bvh node_swap [name] [layer] - Optimizes a BVH layer with a "greedy, short-sighted" algorithm that swaps around nodes to minimize the surface area of the immediate parent layer. Rebuilds the tree above the modified layer when finished. Also modified the chroma-bvh stat command to print the sum of the logarithms of the areas of each layer. It seems to be a rough predictor of the simulation speed of the BVH.
-rw-r--r--bin/chroma-bvh56
-rw-r--r--chroma/cuda/bvh.cu72
-rw-r--r--chroma/gpu/bvh.py136
3 files changed, 243 insertions, 21 deletions
diff --git a/bin/chroma-bvh b/bin/chroma-bvh
index fcd20a8..d67904a 100644
--- a/bin/chroma-bvh
+++ b/bin/chroma-bvh
@@ -4,9 +4,14 @@ import optparse
import sys
import time
+import numpy as np
+
from chroma.cache import Cache
from chroma.loader import load_geometry_from_string
from chroma.log import logger, logging
+from chroma.bvh import make_simple_bvh
+from chroma.gpu import create_cuda_context
+from chroma.gpu.bvh import rebuild_tree, optimize_layer
def parse_bvh_id(id_str):
'''Parse a BVH identifier given on the command line into
@@ -26,9 +31,50 @@ def parse_bvh_id(id_str):
def node_swap(cache, args):
geo_name, bvh_name = parse_bvh_id(args[0])
+ opt_layer_index = int(args[1])
mesh_hash = cache.get_geometry_hash(geo_name)
bvh = cache.load_bvh(mesh_hash, bvh_name)
-
+
+ if opt_layer_index < 1 or opt_layer_index > bvh.layer_count() - 1:
+ print 'Can only optimize node order in this tree for layers 1 through %d' % (bvh.layer_count() - 1)
+ return
+
+ original_parent_area = bvh.get_layer(opt_layer_index-1).area()
+ context = create_cuda_context()
+
+ print 'Optimizing layer %d through node swaps' % opt_layer_index
+ opt_layer = bvh.get_layer(opt_layer_index)
+ new_layer_nodes = optimize_layer(bvh.get_layer(opt_layer_index).nodes)
+ bvh.nodes[bvh.layer_bounds[opt_layer_index]:bvh.layer_bounds[opt_layer_index+1]] = new_layer_nodes
+
+ print 'Rebuilding tree...'
+ new_nodes = rebuild_tree(bvh, opt_layer_index)
+ bvh.nodes = new_nodes
+
+ print 'Original parent area: %e' % original_parent_area
+ print 'New parent area: %e' % bvh.get_layer(opt_layer_index-1).area()
+
+ print 'Saving new BVH...'
+ context.pop()
+ print_stat(geo_name, bvh_name, mesh_hash, bvh)
+ cache.save_bvh(bvh, mesh_hash, bvh_name)
+
+def create(cache, args):
+ geo_name, bvh_name = parse_bvh_id(args[0])
+ degree = int(args[1])
+ mesh_hash = cache.get_geometry_hash(geo_name)
+ print 'Loading geometry (MD5=%s): %s' % (mesh_hash, geo_name)
+ geometry = cache.load_geometry(geo_name)
+
+ print 'Creating degree %d BVH...' % degree
+
+ start = time.time()
+ context = create_cuda_context()
+ bvh = make_simple_bvh(geometry.mesh, degree=degree)
+ context.pop()
+ logger.info('BVH generated in %1.1f seconds.' % (time.time() - start))
+
+ cache.save_bvh(bvh, mesh_hash, bvh_name)
def list_cmd(cache, args):
geo_name = args[0]
@@ -66,20 +112,26 @@ def print_stat(geo_name, bvh_name, mesh_hash, bvh):
print 'Nodes: %d' % len(bvh)
print 'Layers:'
+ areas = []
for i in xrange(num_layers):
layer = bvh.get_layer(i)
if len(layer) == 1:
node_str = 'node, '
else:
node_str = 'nodes,'
+ area = layer.area()
+ areas.append(area)
print ' % 3d) % 10s %s area = % 9e' % \
- (i, len(layer), node_str, layer.area())
+ (i, len(layer), node_str, area)
+ print 'Log sum area: %f' % np.log(areas).sum()
commands = {
'stat' : stat,
+ 'create': create,
'list' : list_cmd,
'copy' : copy,
'remove' : remove,
+ 'node_swap' : node_swap,
}
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index a6888db..2ce0580 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -114,11 +114,11 @@ __device__ uint4 node_union(const uint4 &a, const uint4 &b)
-__device__ unsigned int surface_half_area(const uint4 &node)
+__device__ unsigned long long surface_half_area(const uint4 &node)
{
- unsigned int x = (node.x >> 16) - (node.x & 0xFFFF);
- unsigned int y = (node.y >> 16) - (node.y & 0xFFFF);
- unsigned int z = (node.z >> 16) - (node.z & 0xFFFF);
+ unsigned long long x = (node.x >> 16) - (node.x & 0xFFFF);
+ unsigned long long y = (node.y >> 16) - (node.y & 0xFFFF);
+ unsigned long long z = (node.z >> 16) - (node.z & 0xFFFF);
return x*y + y*z + z*x;
}
@@ -265,7 +265,8 @@ extern "C"
unsigned int n_children_per_node,
uint4 *parent_nodes,
uint4 *child_nodes,
- unsigned int child_id_offset)
+ unsigned int child_id_offset,
+ unsigned int num_children)
{
unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_id >= elements_this_launch)
@@ -274,6 +275,9 @@ extern "C"
unsigned int parent_id = first_node + thread_id;
unsigned int first_child = parent_id * n_children_per_node;
+ if (first_child >= num_children)
+ return;
+
// Load first child
uint4 parent_node = child_nodes[first_child];
uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF, parent_node.z & 0xFFFF);
@@ -293,7 +297,7 @@ extern "C"
upper = max(upper, child_upper);
}
- parent_node.w = first_child;
+ parent_node.w = first_child + child_id_offset;
parent_node.x = upper.x << 16 | lower.x;
parent_node.y = upper.y << 16 | lower.y;
parent_node.z = upper.z << 16 | lower.z;
@@ -339,7 +343,29 @@ extern "C"
area[node_id] = surface_half_area(u);
}
- __global__ void distance_to(unsigned int first_node, unsigned int threads_this_round,
+ __global__ void pair_area(unsigned int first_node,
+ unsigned int threads_this_round,
+ uint4 *node, unsigned long long *area)
+ {
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+ if (thread_id >= threads_this_round)
+ return;
+
+ unsigned int node_id = first_node + thread_id;
+ unsigned int child_id = node_id * 2;
+
+ uint4 a = node[child_id];
+ uint4 b = node[child_id+1];
+ if (b.x == 0)
+ b = a;
+
+ uint4 u = node_union(a, b);
+
+ area[node_id] = 2*surface_half_area(u);
+ }
+
+ __global__ void distance_to(unsigned int first_node,
+ unsigned int threads_this_round,
unsigned int target_index,
uint4 *node, unsigned int *area)
{
@@ -364,19 +390,18 @@ extern "C"
unsigned int target_index,
uint4 *node,
unsigned int block_offset,
- unsigned int *min_area_block,
+ unsigned long long *min_area_block,
unsigned int *min_index_block,
unsigned int *flag)
{
- __shared__ unsigned int min_area;
- __shared__ unsigned int adjacent_area;
+ __shared__ unsigned long long min_area[128];
+ __shared__ unsigned long long adjacent_area;
target_index += blockIdx.y;
uint4 a = node[target_index];
if (threadIdx.x == 0) {
- min_area = 0xFFFFFFFF;
adjacent_area = surface_half_area(node_union(a, node[target_index+1]));
}
@@ -389,38 +414,47 @@ extern "C"
if (thread_id >= threads_this_round)
node_id = target_index;
- unsigned int area;
+ unsigned long long area;
if (node_id == target_index) {
- area = 0xFFFFFFFF;
+ area = 0xFFFFFFFFFFFFFFFF;
} else {
uint4 b = node[node_id];
if (b.x == 0) {
- area = 0xFFFFFFFF;
+ area = 0xFFFFFFFFFFFFFFFF;
} else {
uint4 u = node_union(a, b);
area = surface_half_area(u);
}
}
- atomicMin(&min_area, area);
+ min_area[threadIdx.x] = area;
+
+ __syncthreads();
+
+ // Too lazy to code parallel reduction right now
+ if (threadIdx.x == 0) {
+ for (int i=1; i < blockDim.x; i++)
+ min_area[0] = min(min_area[0], min_area[i]);
+ }
__syncthreads();
- if (min_area == area) {
+ if (min_area[0] == area) {
if (blockIdx.y == 0) {
- if (min_area < adjacent_area) {
+ if (min_area[0] < adjacent_area) {
min_index_block[block_offset + blockIdx.x] = node_id;
min_area_block[block_offset + blockIdx.x] = area;
flag[0] = 1;
} else {
- min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFF;
+ min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFFFFFFFFFF;
+ min_index_block[block_offset + blockIdx.x] = target_index + 1;
}
} else {
- if (min_area < adjacent_area)
+ if (min_area[0] < adjacent_area)
flag[blockIdx.y] = 1;
}
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py
index b9c920d..7f2794e 100644
--- a/chroma/gpu/bvh.py
+++ b/chroma/gpu/bvh.py
@@ -102,6 +102,7 @@ def merge_nodes(nodes, degree):
parent_nodes,
cuda.In(nodes),
np.uint32(0),
+ np.uint32(len(nodes)),
block=(nthreads_per_block,1,1),
grid=(nblocks_this_iter,1))
@@ -137,3 +138,138 @@ def concatenate_layers(layers):
grid=(nblocks_this_iter,1))
return nodes.get(), layer_bounds
+
+def rebuild_tree(bvh, start_layer):
+ bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+ include_source_directory=True)
+ bvh_funcs = GPUFuncs(bvh_module)
+
+ layer_bounds = bvh.layer_bounds
+ layer_ranges = zip(layer_bounds[:start_layer],
+ layer_bounds[1:start_layer+1],
+ layer_bounds[2:start_layer+2])
+ layer_ranges.reverse()
+
+ gpu_nodes = ga.to_gpu(bvh.nodes)
+ nthreads_per_block = 256
+
+ for parent_start, parent_end, child_end in layer_ranges:
+ nparent = parent_end - parent_start
+ child_start = parent_end
+ nchild = child_end - child_start
+ parent_nodes = gpu_nodes[parent_start:]
+ child_nodes = gpu_nodes[child_start:]
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.make_parents(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ np.uint32(bvh.degree),
+ parent_nodes,
+ child_nodes,
+ np.uint32(child_start),
+ np.uint32(nchild),
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ return gpu_nodes.get()
+
+def optimize_layer(orig_nodes):
+ bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+ include_source_directory=True)
+ bvh_funcs = GPUFuncs(bvh_module)
+
+ nodes = ga.to_gpu(orig_nodes)
+ n = len(nodes)
+ areas = ga.empty(shape=n/2, dtype=np.uint64)
+ nthreads_per_block = 128
+
+ min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64)
+ min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)
+
+ update = 10000
+
+ skip_size = 1
+ flag = mapped_empty(shape=skip_size, dtype=np.uint32)
+
+ i = 0
+ skips = 0
+ swaps = 0
+ while i < n/2 - 1:
+ # How are we doing?
+ if i % update == 0:
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.pair_area(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ nodes,
+ areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ areas_host = areas.get()
+ #print nodes.get(), areas_host.astype(float)
+ print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum())
+ print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+ test_index = i * 2
+
+ blocks = 0
+ look_forward = min(8192, n - test_index - 2)
+ skip_this_round = min(skip_size, n - test_index - 1)
+ flag[:] = 0
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
+ np.uint32(elements_this_iter),
+ np.uint32(test_index),
+ nodes,
+ np.uint32(blocks),
+ min_areas,
+ min_index,
+ Mapped(flag),
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter, skip_this_round))
+ blocks += nblocks_this_iter
+ #print i, first_index, nblocks_this_iter, look_forward
+ cuda.Context.get_current().synchronize()
+
+ if flag[0] == 0:
+ flag_nonzero = flag.nonzero()[0]
+ if len(flag_nonzero) == 0:
+ no_swap_required = skip_size
+ else:
+ no_swap_required = flag_nonzero[0]
+ i += no_swap_required
+ skips += no_swap_required
+ continue
+
+ min_areas_host = min_areas[:blocks].get()
+ min_index_host = min_index[:blocks].get()
+ best_block = min_areas_host.argmin()
+ better_i = min_index_host[best_block]
+
+ swaps += 1
+ #print 'swap', test_index+1, better_i
+ bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
+ nodes, block=(1,1,1), grid=(1,1))
+ cuda.Context.get_current().synchronize()
+ i += 1
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.pair_area(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ nodes,
+ areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ areas_host = areas.get()
+
+ print 'Final area of parent layer: %1.12e' % areas_host.sum()
+ print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+ return nodes.get()