path: root/chroma/gpu/bvh.py
Diffstat (limited to 'chroma/gpu/bvh.py')
-rw-r--r--  chroma/gpu/bvh.py  |  136
1 file changed, 136 insertions(+), 0 deletions(-)
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py
index b9c920d..7f2794e 100644
--- a/chroma/gpu/bvh.py
+++ b/chroma/gpu/bvh.py
@@ -102,6 +102,7 @@ def merge_nodes(nodes, degree):
parent_nodes,
cuda.In(nodes),
np.uint32(0),
+ np.uint32(len(nodes)),
block=(nthreads_per_block,1,1),
grid=(nblocks_this_iter,1))
@@ -137,3 +138,138 @@ def concatenate_layers(layers):
grid=(nblocks_this_iter,1))
return nodes.get(), layer_bounds
+
+def rebuild_tree(bvh, start_layer):
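+    # Rebuild the BVH layers above ``start_layer``: working from the deepest
+    # parent layer up to the root, regenerate each parent layer from its
+    # child layer with the make_parents kernel, then return the updated
+    # node array.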
+ bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+ include_source_directory=True)
+ bvh_funcs = GPUFuncs(bvh_module)
+
+ layer_bounds = bvh.layer_bounds
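+    # (parent_start, parent_end, child_end) node offsets for every layer
+    # above start_layer; reversed below so the deepest layer comes first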
+ layer_ranges = zip(layer_bounds[:start_layer],
+ layer_bounds[1:start_layer+1],
+ layer_bounds[2:start_layer+2])
+ layer_ranges.reverse()
+
+ gpu_nodes = ga.to_gpu(bvh.nodes)
+ nthreads_per_block = 256
+
+ for parent_start, parent_end, child_end in layer_ranges:
+ nparent = parent_end - parent_start
+ child_start = parent_end
+ nchild = child_end - child_start
+ parent_nodes = gpu_nodes[parent_start:]
+ child_nodes = gpu_nodes[child_start:]
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.make_parents(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ np.uint32(bvh.degree),
+ parent_nodes,
+ child_nodes,
+ np.uint32(child_start),
+ np.uint32(nchild),
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ return gpu_nodes.get()
+
+def optimize_layer(orig_nodes):
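+    # Greedy, in-place optimization of a single layer: walk the layer in
+    # consecutive pairs and, for each pair, search ahead for a node whose
+    # swap with the pair's second element yields a smaller combined bound.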
+ bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+ include_source_directory=True)
+ bvh_funcs = GPUFuncs(bvh_module)
+
+ nodes = ga.to_gpu(orig_nodes)
+ n = len(nodes)
+ areas = ga.empty(shape=n/2, dtype=np.uint64)
+ nthreads_per_block = 128
+
+ min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64)
+ min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)
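+    # per-block minimum pair area and matching node index written by the
+    # candidate-search kernel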
+
+ update = 10000
+
+ skip_size = 1
+ flag = mapped_empty(shape=skip_size, dtype=np.uint32)
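+    # zero-copy flag array; the search kernel records in flag[k] whether a
+    # better partner was found for pair i+k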
+
+ i = 0
+ skips = 0
+ swaps = 0
+ while i < n/2 - 1:
+ # How are we doing?
+ if i % update == 0:
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.pair_area(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ nodes,
+ areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ areas_host = areas.get()
+ #print nodes.get(), areas_host.astype(float)
+ print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum())
+ print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
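+        # the current pair occupies nodes test_index and test_index+1;
+        # candidate partners are searched starting at test_index + 2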
+ test_index = i * 2
+
+ blocks = 0
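+        # search a window of at most 8192 nodes past the pair, clamped to
+        # the end of the layer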
+ look_forward = min(8192, n - test_index - 2)
+ skip_this_round = min(skip_size, n - test_index - 1)
+ flag[:] = 0
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
+ np.uint32(elements_this_iter),
+ np.uint32(test_index),
+ nodes,
+ np.uint32(blocks),
+ min_areas,
+ min_index,
+ Mapped(flag),
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter, skip_this_round))
+ blocks += nblocks_this_iter
+ #print i, first_index, nblocks_this_iter, look_forward
+ cuda.Context.get_current().synchronize()
+
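+        # if the current pair needs no swap, jump ahead past every leading
+        # pair that also needs none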
+ if flag[0] == 0:
+ flag_nonzero = flag.nonzero()[0]
+ if len(flag_nonzero) == 0:
+ no_swap_required = skip_size
+ else:
+ no_swap_required = flag_nonzero[0]
+ i += no_swap_required
+ skips += no_swap_required
+ continue
+
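+        # reduce the per-block results to the single best candidate and
+        # swap it in as the pair's second element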
+ min_areas_host = min_areas[:blocks].get()
+ min_index_host = min_index[:blocks].get()
+ best_block = min_areas_host.argmin()
+ better_i = min_index_host[best_block]
+
+ swaps += 1
+ #print 'swap', test_index+1, better_i
+ bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
+ nodes, block=(1,1,1), grid=(1,1))
+ cuda.Context.get_current().synchronize()
+ i += 1
+
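+    # recompute the pair areas one final time for the summary report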
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.pair_area(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ nodes,
+ areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ areas_host = areas.get()
+
+ print 'Final area of parent layer: %1.12e' % areas_host.sum()
+ print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+ return nodes.get()