diff options
Diffstat (limited to 'chroma/gpu/bvh.py')
-rw-r--r-- | chroma/gpu/bvh.py | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py index b9c920d..7f2794e 100644 --- a/chroma/gpu/bvh.py +++ b/chroma/gpu/bvh.py @@ -102,6 +102,7 @@ def merge_nodes(nodes, degree): parent_nodes, cuda.In(nodes), np.uint32(0), + np.uint32(len(nodes)), block=(nthreads_per_block,1,1), grid=(nblocks_this_iter,1)) @@ -137,3 +138,138 @@ def concatenate_layers(layers): grid=(nblocks_this_iter,1)) return nodes.get(), layer_bounds + +def rebuild_tree(bvh, start_layer): + bvh_module = get_cu_module('bvh.cu', options=cuda_options, + include_source_directory=True) + bvh_funcs = GPUFuncs(bvh_module) + + layer_bounds = bvh.layer_bounds + layer_ranges = zip(layer_bounds[:start_layer], + layer_bounds[1:start_layer+1], + layer_bounds[2:start_layer+2]) + layer_ranges.reverse() + + gpu_nodes = ga.to_gpu(bvh.nodes) + nthreads_per_block = 256 + + for parent_start, parent_end, child_end in layer_ranges: + nparent = parent_end - parent_start + child_start = parent_end + nchild = child_end - child_start + parent_nodes = gpu_nodes[parent_start:] + child_nodes = gpu_nodes[child_start:] + + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(nparent, nthreads_per_block, max_blocks=10000): + bvh_funcs.make_parents(np.uint32(first_index), + np.uint32(elements_this_iter), + np.uint32(bvh.degree), + parent_nodes, + child_nodes, + np.uint32(child_start), + np.uint32(nchild), + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + return gpu_nodes.get() + +def optimize_layer(orig_nodes): + bvh_module = get_cu_module('bvh.cu', options=cuda_options, + include_source_directory=True) + bvh_funcs = GPUFuncs(bvh_module) + + nodes = ga.to_gpu(orig_nodes) + n = len(nodes) + areas = ga.empty(shape=n/2, dtype=np.uint64) + nthreads_per_block = 128 + + min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64) + min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32) + + update = 10000 + + skip_size = 1 + flag = mapped_empty(shape=skip_size, dtype=np.uint32) + + i = 0 + skips = 0 + swaps = 0 + while i < n/2 - 1: + # How are we doing? + if i % update == 0: + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(n/2, nthreads_per_block, max_blocks=10000): + + bvh_funcs.pair_area(np.uint32(first_index), + np.uint32(elements_this_iter), + nodes, + areas, + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + areas_host = areas.get() + #print nodes.get(), areas_host.astype(float) + print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum()) + print 'Skips: %d, Swaps: %d' % (skips, swaps) + + test_index = i * 2 + + blocks = 0 + look_forward = min(8192, n - test_index - 2) + skip_this_round = min(skip_size, n - test_index - 1) + flag[:] = 0 + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000): + bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2), + np.uint32(elements_this_iter), + np.uint32(test_index), + nodes, + np.uint32(blocks), + min_areas, + min_index, + Mapped(flag), + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter, skip_this_round)) + blocks += nblocks_this_iter + #print i, first_index, nblocks_this_iter, look_forward + cuda.Context.get_current().synchronize() + + if flag[0] == 0: + flag_nonzero = flag.nonzero()[0] + if len(flag_nonzero) == 0: + no_swap_required = skip_size + else: + no_swap_required = flag_nonzero[0] + i += no_swap_required + skips += no_swap_required + continue + + min_areas_host = min_areas[:blocks].get() + min_index_host = min_index[:blocks].get() + best_block = min_areas_host.argmin() + better_i = min_index_host[best_block] + + swaps += 1 + #print 'swap', test_index+1, better_i + bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i), + nodes, block=(1,1,1), grid=(1,1)) + cuda.Context.get_current().synchronize() + i += 1 + + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(n/2, nthreads_per_block, max_blocks=10000): + + bvh_funcs.pair_area(np.uint32(first_index), + np.uint32(elements_this_iter), + nodes, + areas, + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + areas_host = areas.get() + + print 'Final area of parent layer: %1.12e' % areas_host.sum() + print 'Skips: %d, Swaps: %d' % (skips, swaps) + + return nodes.get() |