diff options
author | Stan Seibert <stan@mtrr.org> | 2012-01-22 10:49:21 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2021-05-09 08:42:38 -0700 |
commit | a302076191a4600da66bd461e1c2d568d341ed34 (patch) | |
tree | 0685370da647a3db9a0460b19f766bbaec61bfdd | |
parent | deda5c8fa3264bf80e3d4dcba52f13913d6d254c (diff) | |
download | chroma-a302076191a4600da66bd461e1c2d568d341ed34.tar.gz chroma-a302076191a4600da66bd461e1c2d568d341ed34.tar.bz2 chroma-a302076191a4600da66bd461e1c2d568d341ed34.zip |
Add more BVH manipulation commands:
* chroma-bvh create [name] [degree] - Creates a new BVH with the specified
branching degree.
* chroma-bvh node_swap [name] [layer] - Optimizes a BVH layer with a
"greedy, short-sighted" algorithm that swaps around nodes to minimize
the surface area of the immediate parent layer. Rebuilds the tree
above the modified layer when finished.
Also modified the chroma-bvh stat command to print the sum of the
logarithms of the areas of each layer. It seems to be a rough
predictor of the simulation speed of the BVH.
-rw-r--r-- | bin/chroma-bvh | 56 | ||||
-rw-r--r-- | chroma/cuda/bvh.cu | 72 | ||||
-rw-r--r-- | chroma/gpu/bvh.py | 136 |
3 files changed, 243 insertions, 21 deletions
diff --git a/bin/chroma-bvh b/bin/chroma-bvh index fcd20a8..d67904a 100644 --- a/bin/chroma-bvh +++ b/bin/chroma-bvh @@ -4,9 +4,14 @@ import optparse import sys import time +import numpy as np + from chroma.cache import Cache from chroma.loader import load_geometry_from_string from chroma.log import logger, logging +from chroma.bvh import make_simple_bvh +from chroma.gpu import create_cuda_context +from chroma.gpu.bvh import rebuild_tree, optimize_layer def parse_bvh_id(id_str): '''Parse a BVH identifier given on the command line into @@ -26,9 +31,50 @@ def parse_bvh_id(id_str): def node_swap(cache, args): geo_name, bvh_name = parse_bvh_id(args[0]) + opt_layer_index = int(args[1]) mesh_hash = cache.get_geometry_hash(geo_name) bvh = cache.load_bvh(mesh_hash, bvh_name) - + + if opt_layer_index < 1 or opt_layer_index > bvh.layer_count() - 1: + print 'Can only optimize node order in this tree for layers 1 through %d' % (bvh.layer_count() - 1) + return + + original_parent_area = bvh.get_layer(opt_layer_index-1).area() + context = create_cuda_context() + + print 'Optimizing layer %d through node swaps' % opt_layer_index + opt_layer = bvh.get_layer(opt_layer_index) + new_layer_nodes = optimize_layer(bvh.get_layer(opt_layer_index).nodes) + bvh.nodes[bvh.layer_bounds[opt_layer_index]:bvh.layer_bounds[opt_layer_index+1]] = new_layer_nodes + + print 'Rebuilding tree...' + new_nodes = rebuild_tree(bvh, opt_layer_index) + bvh.nodes = new_nodes + + print 'Original parent area: %e' % original_parent_area + print 'New parent area: %e' % bvh.get_layer(opt_layer_index-1).area() + + print 'Saving new BVH...' + context.pop() + print_stat(geo_name, bvh_name, mesh_hash, bvh) + cache.save_bvh(bvh, mesh_hash, bvh_name) + +def create(cache, args): + geo_name, bvh_name = parse_bvh_id(args[0]) + degree = int(args[1]) + mesh_hash = cache.get_geometry_hash(geo_name) + print 'Loading geometry (MD5=%s): %s' % (mesh_hash, geo_name) + geometry = cache.load_geometry(geo_name) + + print 'Creating degree %d BVH...' % degree + + start = time.time() + context = create_cuda_context() + bvh = make_simple_bvh(geometry.mesh, degree=degree) + context.pop() + logger.info('BVH generated in %1.1f seconds.' % (time.time() - start)) + + cache.save_bvh(bvh, mesh_hash, bvh_name) def list_cmd(cache, args): geo_name = args[0] @@ -66,20 +112,26 @@ def print_stat(geo_name, bvh_name, mesh_hash, bvh): print 'Nodes: %d' % len(bvh) print 'Layers:' + areas = [] for i in xrange(num_layers): layer = bvh.get_layer(i) if len(layer) == 1: node_str = 'node, ' else: node_str = 'nodes,' + area = layer.area() + areas.append(area) print ' % 3d) % 10s %s area = % 9e' % \ - (i, len(layer), node_str, layer.area()) + (i, len(layer), node_str, area) + print 'Log sum area: %f' % np.log(areas).sum() commands = { 'stat' : stat, + 'create': create, 'list' : list_cmd, 'copy' : copy, 'remove' : remove, + 'node_swap' : node_swap, } diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu index a6888db..2ce0580 100644 --- a/chroma/cuda/bvh.cu +++ b/chroma/cuda/bvh.cu @@ -114,11 +114,11 @@ __device__ uint4 node_union(const uint4 &a, const uint4 &b) -__device__ unsigned int surface_half_area(const uint4 &node) +__device__ unsigned long long surface_half_area(const uint4 &node) { - unsigned int x = (node.x >> 16) - (node.x & 0xFFFF); - unsigned int y = (node.y >> 16) - (node.y & 0xFFFF); - unsigned int z = (node.z >> 16) - (node.z & 0xFFFF); + unsigned long long x = (node.x >> 16) - (node.x & 0xFFFF); + unsigned long long y = (node.y >> 16) - (node.y & 0xFFFF); + unsigned long long z = (node.z >> 16) - (node.z & 0xFFFF); return x*y + y*z + z*x; } @@ -265,7 +265,8 @@ extern "C" unsigned int n_children_per_node, uint4 *parent_nodes, uint4 *child_nodes, - unsigned int child_id_offset) + unsigned int child_id_offset, + unsigned int num_children) { unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x; if (thread_id >= elements_this_launch) @@ -274,6 +275,9 @@ extern "C" unsigned int parent_id = first_node + thread_id; unsigned int first_child = parent_id * n_children_per_node; + if (first_child >= num_children) + return; + // Load first child uint4 parent_node = child_nodes[first_child]; uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF, parent_node.z & 0xFFFF); @@ -293,7 +297,7 @@ extern "C" upper = max(upper, child_upper); } - parent_node.w = first_child; + parent_node.w = first_child + child_id_offset; parent_node.x = upper.x << 16 | lower.x; parent_node.y = upper.y << 16 | lower.y; parent_node.z = upper.z << 16 | lower.z; @@ -339,7 +343,29 @@ extern "C" area[node_id] = surface_half_area(u); } - __global__ void distance_to(unsigned int first_node, unsigned int threads_this_round, + __global__ void pair_area(unsigned int first_node, + unsigned int threads_this_round, + uint4 *node, unsigned long long *area) + { + unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_id >= threads_this_round) + return; + + unsigned int node_id = first_node + thread_id; + unsigned int child_id = node_id * 2; + + uint4 a = node[child_id]; + uint4 b = node[child_id+1]; + if (b.x == 0) + b = a; + + uint4 u = node_union(a, b); + + area[node_id] = 2*surface_half_area(u); + } + + __global__ void distance_to(unsigned int first_node, + unsigned int threads_this_round, unsigned int target_index, uint4 *node, unsigned int *area) { @@ -364,19 +390,18 @@ extern "C" unsigned int target_index, uint4 *node, unsigned int block_offset, - unsigned int *min_area_block, + unsigned long long *min_area_block, unsigned int *min_index_block, unsigned int *flag) { - __shared__ unsigned int min_area; - __shared__ unsigned int adjacent_area; + __shared__ unsigned long long min_area[128]; + __shared__ unsigned long long adjacent_area; target_index += blockIdx.y; uint4 a = node[target_index]; if (threadIdx.x == 0) { - min_area = 0xFFFFFFFF; adjacent_area = surface_half_area(node_union(a, node[target_index+1])); } @@ -389,38 +414,47 @@ extern "C" if (thread_id >= threads_this_round) node_id = target_index; - unsigned int area; + unsigned long long area; if (node_id == target_index) { - area = 0xFFFFFFFF; + area = 0xFFFFFFFFFFFFFFFF; } else { uint4 b = node[node_id]; if (b.x == 0) { - area = 0xFFFFFFFF; + area = 0xFFFFFFFFFFFFFFFF; } else { uint4 u = node_union(a, b); area = surface_half_area(u); } } - atomicMin(&min_area, area); + min_area[threadIdx.x] = area; + + __syncthreads(); + + // Too lazy to code parallel reduction right now + if (threadIdx.x == 0) { + for (int i=1; i < blockDim.x; i++) + min_area[0] = min(min_area[0], min_area[i]); + } __syncthreads(); - if (min_area == area) { + if (min_area[0] == area) { if (blockIdx.y == 0) { - if (min_area < adjacent_area) { + if (min_area[0] < adjacent_area) { min_index_block[block_offset + blockIdx.x] = node_id; min_area_block[block_offset + blockIdx.x] = area; flag[0] = 1; } else { - min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFF; + min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFFFFFFFFFF; + min_index_block[block_offset + blockIdx.x] = target_index + 1; } } else { - if (min_area < adjacent_area) + if (min_area[0] < adjacent_area) flag[blockIdx.y] = 1; } diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py index b9c920d..7f2794e 100644 --- a/chroma/gpu/bvh.py +++ b/chroma/gpu/bvh.py @@ -102,6 +102,7 @@ def merge_nodes(nodes, degree): parent_nodes, cuda.In(nodes), np.uint32(0), + np.uint32(len(nodes)), block=(nthreads_per_block,1,1), grid=(nblocks_this_iter,1)) @@ -137,3 +138,138 @@ def concatenate_layers(layers): grid=(nblocks_this_iter,1)) return nodes.get(), layer_bounds + +def rebuild_tree(bvh, start_layer): + bvh_module = get_cu_module('bvh.cu', options=cuda_options, + include_source_directory=True) + bvh_funcs = GPUFuncs(bvh_module) + + layer_bounds = bvh.layer_bounds + layer_ranges = zip(layer_bounds[:start_layer], + layer_bounds[1:start_layer+1], + layer_bounds[2:start_layer+2]) + layer_ranges.reverse() + + gpu_nodes = ga.to_gpu(bvh.nodes) + nthreads_per_block = 256 + + for parent_start, parent_end, child_end in layer_ranges: + nparent = parent_end - parent_start + child_start = parent_end + nchild = child_end - child_start + parent_nodes = gpu_nodes[parent_start:] + child_nodes = gpu_nodes[child_start:] + + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(nparent, nthreads_per_block, max_blocks=10000): + bvh_funcs.make_parents(np.uint32(first_index), + np.uint32(elements_this_iter), + np.uint32(bvh.degree), + parent_nodes, + child_nodes, + np.uint32(child_start), + np.uint32(nchild), + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + return gpu_nodes.get() + +def optimize_layer(orig_nodes): + bvh_module = get_cu_module('bvh.cu', options=cuda_options, + include_source_directory=True) + bvh_funcs = GPUFuncs(bvh_module) + + nodes = ga.to_gpu(orig_nodes) + n = len(nodes) + areas = ga.empty(shape=n/2, dtype=np.uint64) + nthreads_per_block = 128 + + min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64) + min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32) + + update = 10000 + + skip_size = 1 + flag = mapped_empty(shape=skip_size, dtype=np.uint32) + + i = 0 + skips = 0 + swaps = 0 + while i < n/2 - 1: + # How are we doing? + if i % update == 0: + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(n/2, nthreads_per_block, max_blocks=10000): + + bvh_funcs.pair_area(np.uint32(first_index), + np.uint32(elements_this_iter), + nodes, + areas, + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + areas_host = areas.get() + #print nodes.get(), areas_host.astype(float) + print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum()) + print 'Skips: %d, Swaps: %d' % (skips, swaps) + + test_index = i * 2 + + blocks = 0 + look_forward = min(8192, n - test_index - 2) + skip_this_round = min(skip_size, n - test_index - 1) + flag[:] = 0 + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000): + bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2), + np.uint32(elements_this_iter), + np.uint32(test_index), + nodes, + np.uint32(blocks), + min_areas, + min_index, + Mapped(flag), + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter, skip_this_round)) + blocks += nblocks_this_iter + #print i, first_index, nblocks_this_iter, look_forward + cuda.Context.get_current().synchronize() + + if flag[0] == 0: + flag_nonzero = flag.nonzero()[0] + if len(flag_nonzero) == 0: + no_swap_required = skip_size + else: + no_swap_required = flag_nonzero[0] + i += no_swap_required + skips += no_swap_required + continue + + min_areas_host = min_areas[:blocks].get() + min_index_host = min_index[:blocks].get() + best_block = min_areas_host.argmin() + better_i = min_index_host[best_block] + + swaps += 1 + #print 'swap', test_index+1, better_i + bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i), + nodes, block=(1,1,1), grid=(1,1)) + cuda.Context.get_current().synchronize() + i += 1 + + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(n/2, nthreads_per_block, max_blocks=10000): + + bvh_funcs.pair_area(np.uint32(first_index), + np.uint32(elements_this_iter), + nodes, + areas, + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + areas_host = areas.get() + + print 'Final area of parent layer: %1.12e' % areas_host.sum() + print 'Skips: %d, Swaps: %d' % (skips, swaps) + + return nodes.get() |