summaryrefslogtreecommitdiff
path: root/chroma
diff options
context:
space:
mode:
Diffstat (limited to 'chroma')
-rw-r--r--chroma/cuda/bvh.cu72
-rw-r--r--chroma/gpu/bvh.py136
2 files changed, 189 insertions, 19 deletions
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index a6888db..2ce0580 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -114,11 +114,11 @@ __device__ uint4 node_union(const uint4 &a, const uint4 &b)
-__device__ unsigned int surface_half_area(const uint4 &node)
+__device__ unsigned long long surface_half_area(const uint4 &node)
{
- unsigned int x = (node.x >> 16) - (node.x & 0xFFFF);
- unsigned int y = (node.y >> 16) - (node.y & 0xFFFF);
- unsigned int z = (node.z >> 16) - (node.z & 0xFFFF);
+ unsigned long long x = (node.x >> 16) - (node.x & 0xFFFF);
+ unsigned long long y = (node.y >> 16) - (node.y & 0xFFFF);
+ unsigned long long z = (node.z >> 16) - (node.z & 0xFFFF);
return x*y + y*z + z*x;
}
@@ -265,7 +265,8 @@ extern "C"
unsigned int n_children_per_node,
uint4 *parent_nodes,
uint4 *child_nodes,
- unsigned int child_id_offset)
+ unsigned int child_id_offset,
+ unsigned int num_children)
{
unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_id >= elements_this_launch)
@@ -274,6 +275,9 @@ extern "C"
unsigned int parent_id = first_node + thread_id;
unsigned int first_child = parent_id * n_children_per_node;
+ if (first_child >= num_children)
+ return;
+
// Load first child
uint4 parent_node = child_nodes[first_child];
uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF, parent_node.z & 0xFFFF);
@@ -293,7 +297,7 @@ extern "C"
upper = max(upper, child_upper);
}
- parent_node.w = first_child;
+ parent_node.w = first_child + child_id_offset;
parent_node.x = upper.x << 16 | lower.x;
parent_node.y = upper.y << 16 | lower.y;
parent_node.z = upper.z << 16 | lower.z;
@@ -339,7 +343,29 @@ extern "C"
area[node_id] = surface_half_area(u);
}
- __global__ void distance_to(unsigned int first_node, unsigned int threads_this_round,
+ __global__ void pair_area(unsigned int first_node,
+ unsigned int threads_this_round,
+ uint4 *node, unsigned long long *area)
+ {
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+ if (thread_id >= threads_this_round)
+ return;
+
+ unsigned int node_id = first_node + thread_id;
+ unsigned int child_id = node_id * 2;
+
+ uint4 a = node[child_id];
+ uint4 b = node[child_id+1];
+ if (b.x == 0)
+ b = a;
+
+ uint4 u = node_union(a, b);
+
+ area[node_id] = 2*surface_half_area(u);
+ }
+
+ __global__ void distance_to(unsigned int first_node,
+ unsigned int threads_this_round,
unsigned int target_index,
uint4 *node, unsigned int *area)
{
@@ -364,19 +390,18 @@ extern "C"
unsigned int target_index,
uint4 *node,
unsigned int block_offset,
- unsigned int *min_area_block,
+ unsigned long long *min_area_block,
unsigned int *min_index_block,
unsigned int *flag)
{
- __shared__ unsigned int min_area;
- __shared__ unsigned int adjacent_area;
+ __shared__ unsigned long long min_area[128];
+ __shared__ unsigned long long adjacent_area;
target_index += blockIdx.y;
uint4 a = node[target_index];
if (threadIdx.x == 0) {
- min_area = 0xFFFFFFFF;
adjacent_area = surface_half_area(node_union(a, node[target_index+1]));
}
@@ -389,38 +414,47 @@ extern "C"
if (thread_id >= threads_this_round)
node_id = target_index;
- unsigned int area;
+ unsigned long long area;
if (node_id == target_index) {
- area = 0xFFFFFFFF;
+ area = 0xFFFFFFFFFFFFFFFF;
} else {
uint4 b = node[node_id];
if (b.x == 0) {
- area = 0xFFFFFFFF;
+ area = 0xFFFFFFFFFFFFFFFF;
} else {
uint4 u = node_union(a, b);
area = surface_half_area(u);
}
}
- atomicMin(&min_area, area);
+ min_area[threadIdx.x] = area;
+
+ __syncthreads();
+
+ // Too lazy to code parallel reduction right now
+ if (threadIdx.x == 0) {
+ for (int i=1; i < blockDim.x; i++)
+ min_area[0] = min(min_area[0], min_area[i]);
+ }
__syncthreads();
- if (min_area == area) {
+ if (min_area[0] == area) {
if (blockIdx.y == 0) {
- if (min_area < adjacent_area) {
+ if (min_area[0] < adjacent_area) {
min_index_block[block_offset + blockIdx.x] = node_id;
min_area_block[block_offset + blockIdx.x] = area;
flag[0] = 1;
} else {
- min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFF;
+ min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFFFFFFFFFF;
+ min_index_block[block_offset + blockIdx.x] = target_index + 1;
}
} else {
- if (min_area < adjacent_area)
+ if (min_area[0] < adjacent_area)
flag[blockIdx.y] = 1;
}
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py
index b9c920d..7f2794e 100644
--- a/chroma/gpu/bvh.py
+++ b/chroma/gpu/bvh.py
@@ -102,6 +102,7 @@ def merge_nodes(nodes, degree):
parent_nodes,
cuda.In(nodes),
np.uint32(0),
+ np.uint32(len(nodes)),
block=(nthreads_per_block,1,1),
grid=(nblocks_this_iter,1))
@@ -137,3 +138,138 @@ def concatenate_layers(layers):
grid=(nblocks_this_iter,1))
return nodes.get(), layer_bounds
+
+def rebuild_tree(bvh, start_layer):
+ bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+ include_source_directory=True)
+ bvh_funcs = GPUFuncs(bvh_module)
+
+ layer_bounds = bvh.layer_bounds
+ layer_ranges = zip(layer_bounds[:start_layer],
+ layer_bounds[1:start_layer+1],
+ layer_bounds[2:start_layer+2])
+ layer_ranges.reverse()
+
+ gpu_nodes = ga.to_gpu(bvh.nodes)
+ nthreads_per_block = 256
+
+ for parent_start, parent_end, child_end in layer_ranges:
+ nparent = parent_end - parent_start
+ child_start = parent_end
+ nchild = child_end - child_start
+ parent_nodes = gpu_nodes[parent_start:]
+ child_nodes = gpu_nodes[child_start:]
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.make_parents(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ np.uint32(bvh.degree),
+ parent_nodes,
+ child_nodes,
+ np.uint32(child_start),
+ np.uint32(nchild),
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ return gpu_nodes.get()
+
+def optimize_layer(orig_nodes):
+ bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+ include_source_directory=True)
+ bvh_funcs = GPUFuncs(bvh_module)
+
+ nodes = ga.to_gpu(orig_nodes)
+ n = len(nodes)
+ areas = ga.empty(shape=n/2, dtype=np.uint64)
+ nthreads_per_block = 128
+
+ min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64)
+ min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)
+
+ update = 10000
+
+ skip_size = 1
+ flag = mapped_empty(shape=skip_size, dtype=np.uint32)
+
+ i = 0
+ skips = 0
+ swaps = 0
+ while i < n/2 - 1:
+ # How are we doing?
+ if i % update == 0:
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.pair_area(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ nodes,
+ areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ areas_host = areas.get()
+ #print nodes.get(), areas_host.astype(float)
+ print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum())
+ print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+ test_index = i * 2
+
+ blocks = 0
+ look_forward = min(8192, n - test_index - 2)
+ skip_this_round = min(skip_size, n - test_index - 1)
+ flag[:] = 0
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
+ np.uint32(elements_this_iter),
+ np.uint32(test_index),
+ nodes,
+ np.uint32(blocks),
+ min_areas,
+ min_index,
+ Mapped(flag),
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter, skip_this_round))
+ blocks += nblocks_this_iter
+ #print i, first_index, nblocks_this_iter, look_forward
+ cuda.Context.get_current().synchronize()
+
+ if flag[0] == 0:
+ flag_nonzero = flag.nonzero()[0]
+ if len(flag_nonzero) == 0:
+ no_swap_required = skip_size
+ else:
+ no_swap_required = flag_nonzero[0]
+ i += no_swap_required
+ skips += no_swap_required
+ continue
+
+ min_areas_host = min_areas[:blocks].get()
+ min_index_host = min_index[:blocks].get()
+ best_block = min_areas_host.argmin()
+ better_i = min_index_host[best_block]
+
+ swaps += 1
+ #print 'swap', test_index+1, better_i
+ bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
+ nodes, block=(1,1,1), grid=(1,1))
+ cuda.Context.get_current().synchronize()
+ i += 1
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.pair_area(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ nodes,
+ areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ areas_host = areas.get()
+
+ print 'Final area of parent layer: %1.12e' % areas_host.sum()
+ print 'Skips: %d, Swaps: %d' % (skips, swaps)
+
+ return nodes.get()