 chroma/bvh/__init__.py  |   1
 chroma/bvh/bvh.py       |  23
 chroma/bvh/grid.py      |  15
 chroma/bvh/simple.py    |  31
 chroma/cuda/bvh.cu      |  75
 chroma/gpu/bvh.py       | 139
 chroma/gpu/tools.py     |  33
 test/test_bvh_simple.py |  14
 8 files changed, 308 insertions(+), 23 deletions(-)
diff --git a/chroma/bvh/__init__.py b/chroma/bvh/__init__.py
index fa3d476..7ca6620 100644
--- a/chroma/bvh/__init__.py
+++ b/chroma/bvh/__init__.py
@@ -1,2 +1,3 @@
 from chroma.bvh.bvh import *
 from chroma.bvh.grid import make_recursive_grid_bvh
+from chroma.bvh.simple import make_simple_bvh
diff --git a/chroma/bvh/bvh.py b/chroma/bvh/bvh.py
index d13d285..f530b09 100644
--- a/chroma/bvh/bvh.py
+++ b/chroma/bvh/bvh.py
@@ -196,6 +196,19 @@ class BVH(object):
         '''Returns the number of nodes in this BVH'''
         return len(self.nodes)
 
+def node_area(nodes):
+    '''Returns the total surface area of a list of nodes in fixed point units.'''
+    unpacked = unpack_nodes(nodes)
+    delta = np.empty(shape=len(nodes),
+                     dtype=[('x', float), ('y', float), ('z', float)])
+    for axis in ['x', 'y', 'z']:
+        delta[axis] = unpacked[axis+'hi'] - unpacked[axis+'lo']
+
+    half_area = delta['x']*delta['y'] + delta['y']*delta['z'] \
+        + delta['z']*delta['x']
+
+    return 2.0 * half_area.sum()
+
 class BVHLayerSlice(object):
     '''A single layer in a bounding volume hierarchy represented as a
     slice in the node array of the parent.
@@ -219,12 +232,4 @@ class BVHLayerSlice(object):
 
     def area(self):
         '''Return the surface area of all the nodes in this layer in
         world units.'''
-        unpacked = unpack_nodes(self.nodes)
-        delta = np.empty(shape=len(self.nodes),
-                         dtype=[('x', float), ('y', float), ('z', float)])
-        for axis in ['x', 'y', 'z']:
-            delta[axis] = unpacked[axis+'hi'] - unpacked[axis+'lo']
-
-        half_area = delta['x']*delta['y'] + delta['y']*delta['z'] \
-            + delta['z']*delta['x']
-        return 2.0 * half_area.sum() * self.world_coords.world_scale**2
+        return node_area(self.nodes) * self.world_coords.world_scale**2
diff --git a/chroma/bvh/grid.py b/chroma/bvh/grid.py
index 9fcf125..0b36a2f 100644
--- a/chroma/bvh/grid.py
+++ b/chroma/bvh/grid.py
@@ -1,6 +1,7 @@
-from chroma.bvh.bvh import BVH
+from chroma.bvh.bvh import BVH, node_area
+from chroma.gpu.bvh import create_leaf_nodes
 
-def make_recursive_grid_bvh(self, mesh, bits=11):
+def make_recursive_grid_bvh(mesh, bits=11):
     '''Returns a binary tree BVH created using a 'recursive grid' method.
 
     This method is somewhat similar to the original Chroma BVH generator,
@@ -14,6 +15,14 @@ def make_recursive_grid_bvh(mesh, bits=11):
     speed things up.
     '''
-    pass
+    world_coords, leaf_nodes, morton_codes = create_leaf_nodes(mesh, bits)
+
+    # rearrange in morton order
+    argsort = morton_codes.argsort()
+    leaf_nodes = leaf_nodes[argsort]
+    morton_codes = morton_codes[argsort]
+
+    print node_area(leaf_nodes) * world_coords.world_scale**2
diff --git a/chroma/bvh/simple.py b/chroma/bvh/simple.py
new file mode 100644
index 0000000..6c0e02b
--- /dev/null
+++ b/chroma/bvh/simple.py
@@ -0,0 +1,31 @@
+from chroma.bvh.bvh import BVH, node_area
+from chroma.gpu.bvh import create_leaf_nodes, merge_nodes, concatenate_layers
+
+def make_simple_bvh(mesh, degree):
+    '''Returns a BVH tree created by simple grouping of Morton-ordered
+    nodes.
+    '''
+    world_coords, leaf_nodes, morton_codes = \
+        create_leaf_nodes(mesh, round_to_multiple=degree)
+
+    # rearrange in morton order
+    argsort = morton_codes.argsort()
+    leaf_nodes = leaf_nodes[argsort]
+    assert len(leaf_nodes) % degree == 0
+
+    # Create parent layers, bottom-up
+    layers = [leaf_nodes]
+    while len(layers[0]) > 1:
+        parent = merge_nodes(layers[0], degree=degree)
+        if len(parent) > 1:
+            assert len(parent) % degree == 0
+        layers = [parent] + layers
+
+    # Concatenate the layers into one flat node array
+    nodes, layer_bounds = concatenate_layers(layers)
+
+    for i, (layer_start, layer_end) in enumerate(zip(layer_bounds[:-1],
+                                                     layer_bounds[1:])):
+        print i, node_area(nodes[layer_start:layer_end]) * world_coords.world_scale**2
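The simple BVH above relies entirely on the Morton sort for spatial quality: interleaving the bits of each triangle's quantized centroid coordinates gives nearby triangles nearby codes, so grouping consecutive sorted nodes tends to yield tight boxes. A minimal CPU sketch of the same bit interleaving, shown with 10 bits per axis for brevity (the spread3_16 device function in bvh.cu below spreads 16); this helper is illustrative only and not part of the commit:

import numpy as np

def spread3_10(x):
    # Insert two zero bits between each of the low 10 bits of x,
    # so bit i moves to bit 3*i (a 30-bit result).
    x = (x | (x << 16)) & 0x30000FF
    x = (x | (x << 8))  & 0x300F00F
    x = (x | (x << 4))  & 0x30C30C3
    x = (x | (x << 2))  & 0x9249249
    return x

def morton_codes_10bit(quantized):
    # quantized: integer ndarray of shape (N, 3), values in [0, 1023]
    q = quantized.astype(np.uint64)
    return (spread3_10(q[:, 0])
            | (spread3_10(q[:, 1]) << 1)
            | (spread3_10(q[:, 2]) << 2))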
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index 50e877a..a6888db 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -186,17 +186,6 @@ extern "C"
       spread3_16(q_centroid.x)
       | (spread3_16(q_centroid.y) << 1)
       | (spread3_16(q_centroid.z) << 2);
-
-    //unsigned long long morton =
-    //  spread3_16(triangle.x & 0xFFFF)
-    //  | (spread3_16(triangle.y & 0xFFFF) << 1)
-    //  | (spread3_16(triangle.z & 0xFFFF) << 2);
-
-    //unsigned long long morton =
-    //  ( ((unsigned long long) q_centroid.x) << 32 )
-    //  | ( ((unsigned long long) q_centroid.y) << 16 )
-    //  | ( ((unsigned long long) q_centroid.z) << 0 );
 
     // Write leaf and morton code
     uint4 leaf_node;
@@ -270,6 +259,70 @@ extern "C"
     nodes[parent_layer_offset + parent_id] = parent_node;
   }
 
+  __global__ void
+  make_parents(unsigned int first_node,
+               unsigned int elements_this_launch,
+               unsigned int n_children_per_node,
+               uint4 *parent_nodes,
+               uint4 *child_nodes,
+               unsigned int child_id_offset)
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= elements_this_launch)
+      return;
+
+    unsigned int parent_id = first_node + thread_id;
+    unsigned int first_child = parent_id * n_children_per_node;
+
+    // Load first child
+    uint4 parent_node = child_nodes[first_child];
+    uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF,
+                             parent_node.z & 0xFFFF);
+    uint3 upper = make_uint3(parent_node.x >> 16, parent_node.y >> 16,
+                             parent_node.z >> 16);
+
+    // Scan remaining children
+    for (unsigned int i=1; i < n_children_per_node; i++) {
+      uint4 child_node = child_nodes[first_child + i];
+
+      if (child_node.x == 0)
+        break;  // Hit first padding node in list of children
+
+      uint3 child_lower = make_uint3(child_node.x & 0xFFFF, child_node.y & 0xFFFF,
+                                     child_node.z & 0xFFFF);
+      uint3 child_upper = make_uint3(child_node.x >> 16, child_node.y >> 16,
+                                     child_node.z >> 16);
+
+      lower = min(lower, child_lower);
+      upper = max(upper, child_upper);
+    }
+
+    parent_node.w = first_child;
+    parent_node.x = upper.x << 16 | lower.x;
+    parent_node.y = upper.y << 16 | lower.y;
+    parent_node.z = upper.z << 16 | lower.z;
+
+    parent_nodes[parent_id] = parent_node;
+  }
+
+  __global__ void
+  copy_and_offset(unsigned int first_node,
+                  unsigned int elements_this_launch,
+                  unsigned int child_id_offset,
+                  uint4 *src_nodes,
+                  uint4 *dest_nodes)
+  {
+    unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    if (thread_id >= elements_this_launch)
+      return;
+
+    unsigned int node_id = first_node + thread_id;
+    uint4 src_node = src_nodes[node_id];
+
+    unsigned int leaf_flag = src_node.w & 0x80000000;
+    unsigned int child_id = src_node.w & 0x7FFFFFFF;
+    src_node.w = leaf_flag | (child_id + child_id_offset);
+
+    dest_nodes[node_id] = src_node;
+  }
+
   __global__ void distance_to_prev(unsigned int first_node,
                                    unsigned int threads_this_round,
                                    uint4 *node, unsigned int *area)
   {
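For reading make_parents and copy_and_offset above: each uint4 node packs, per axis, a 16-bit fixed-point lower bound in the low half and the upper bound in the high half of x, y, and z, while w holds the child index in its low 31 bits and a leaf flag in the high bit. A small Python unpacking sketch, with the layout inferred from the kernels above (not part of the commit):

LEAF_BIT = 0x80000000

def unpack_node(node):
    # node: one element of a structured ndarray with dtype ga.vec.uint4
    lower = tuple(int(node[axis]) & 0xFFFF for axis in ('x', 'y', 'z'))
    upper = tuple(int(node[axis]) >> 16 for axis in ('x', 'y', 'z'))
    is_leaf = bool(int(node['w']) & LEAF_BIT)
    child_id = int(node['w']) & 0x7FFFFFFF  # triangle offset for leaf nodes
    return lower, upper, is_leaf, child_id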
diff --git a/chroma/gpu/bvh.py b/chroma/gpu/bvh.py
new file mode 100644
index 0000000..b9c920d
--- /dev/null
+++ b/chroma/gpu/bvh.py
@@ -0,0 +1,139 @@
+import numpy as np
+import pycuda.driver as cuda
+from pycuda import gpuarray as ga
+from pycuda import characterize
+
+from chroma.gpu.tools import get_cu_module, cuda_options, \
+    chunk_iterator, to_uint3, to_float3, GPUFuncs, mapped_empty, Mapped
+
+from chroma.bvh.bvh import WorldCoords
+
+def round_up_to_multiple(x, multiple):
+    remainder = x % multiple
+    if remainder == 0:
+        return x
+    else:
+        return x + multiple - remainder
+
+def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
+    '''Compute the leaf nodes surrounding a triangle mesh.
+
+      ``mesh``: chroma.geometry.Mesh
+        Triangles to box
+      ``morton_bits``: int
+        Number of bits to use per dimension when computing Morton code.
+        Must be <= 16.
+      ``round_to_multiple``: int
+        Round the number of nodes created up to a multiple of this
+        number.  Extra nodes will be all zero.
+
+    Returns (world_coords, nodes, morton_codes), where
+      ``world_coords``: chroma.bvh.WorldCoords
+        Defines the fixed point coordinate system
+      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
+        List of leaf nodes.  Child IDs will be set to triangle offsets.
+      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
+        Morton codes for each triangle, using ``morton_bits`` per axis.
+    '''
+    # Load GPU functions
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    # Compute the fixed point world coordinate system
+    world_origin = mesh.vertices.min(axis=0)
+    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
+        / (2**16 - 2)
+    world_coords = WorldCoords(world_origin=world_origin,
+                               world_scale=world_scale)
+
+    # Put triangles and vertices into mapped host memory
+    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
+                             write_combined=True)
+    triangles[:] = to_uint3(mesh.triangles)
+    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
+                            write_combined=True)
+    vertices[:] = to_float3(mesh.vertices)
+
+    # Call GPU to compute nodes
+    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
+                                                round_to_multiple),
+                     dtype=ga.vec.uint4)
+    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)
+
+    # Convert world coords to GPU-friendly types
+    world_origin = ga.vec.make_float3(*world_origin)
+    world_scale = np.float32(world_scale)
+
+    nthreads_per_block = 256
+    for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(len(triangles), nthreads_per_block,
+                           max_blocks=30000):
+        bvh_funcs.make_leaves(np.uint32(first_index),
+                              np.uint32(elements_this_iter),
+                              Mapped(triangles), Mapped(vertices),
+                              world_origin, world_scale,
+                              nodes, morton_codes,
+                              block=(nthreads_per_block,1,1),
+                              grid=(nblocks_this_iter,1))
+
+    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
+    return world_coords, nodes.get(), morton_codes_host
+
+def merge_nodes(nodes, degree):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    nparent = len(nodes) // degree
+    if nparent == 1:
+        nparent_pad = nparent
+    else:
+        nparent_pad = round_up_to_multiple(nparent, degree)
+    parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
+
+    nthreads_per_block = 256
+    for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+        bvh_funcs.make_parents(np.uint32(first_index),
+                               np.uint32(elements_this_iter),
+                               np.uint32(degree),
+                               parent_nodes,
+                               cuda.In(nodes),
+                               np.uint32(0),
+                               block=(nthreads_per_block,1,1),
+                               grid=(nblocks_this_iter,1))
+
+    return parent_nodes.get()
+
+def concatenate_layers(layers):
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    # Put 0 at beginning of list
+    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)
+    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
+    nthreads_per_block = 256
+
+    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
+                                             layer_bounds[1:],
+                                             layers):
+        if layer_end == layer_bounds[-1]:
+            # leaf nodes need no offset
+            child_offset = 0
+        else:
+            child_offset = layer_end
+
+        for first_index, elements_this_iter, nblocks_this_iter in \
+                chunk_iterator(layer_end-layer_start, nthreads_per_block,
+                               max_blocks=10000):
+            bvh_funcs.copy_and_offset(np.uint32(first_index),
+                                      np.uint32(elements_this_iter),
+                                      np.uint32(child_offset),
+                                      cuda.In(layer),
+                                      nodes[layer_start:],
+                                      block=(nthreads_per_block,1,1),
+                                      grid=(nblocks_this_iter,1))
+
+    return nodes.get(), layer_bounds
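The child_offset arithmetic in concatenate_layers is easiest to check with concrete numbers. make_parents stores child indices local to the child layer, and after flattening, each non-leaf layer's children start exactly at that layer's own end offset; leaf nodes keep w untouched because it holds triangle offsets, not node IDs. A worked example (layer sizes chosen purely for illustration):

import numpy as np

sizes = [1, 4, 16]                                # root, middle, leaves
layer_bounds = np.insert(np.cumsum(sizes), 0, 0)  # -> [0, 1, 5, 21]
# Root layer [0, 1): children live in [1, 5), so offset = layer_end = 1.
# Middle layer [1, 5): children live in [5, 21), so offset = 5.
# Leaf layer [5, 21): layer_end == layer_bounds[-1], so offset = 0.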
diff --git a/chroma/gpu/tools.py b/chroma/gpu/tools.py
index 707a45d..b151f80 100644
--- a/chroma/gpu/tools.py
+++ b/chroma/gpu/tools.py
@@ -181,3 +181,36 @@ def format_size(size):
 def format_array(name, array):
     return '%-15s %6s %6s' % \
         (name, format_size(len(array)), format_size(array.nbytes))
+
+def Mapped(array):
+    '''Analog to pycuda.driver.InOut(), but indicates this array
+    is memory mapped to the device space and should not be copied.'''
+    return np.intp(array.base.get_device_pointer())
+
+def mapped_alloc(pagelocked_alloc_func, shape, dtype, write_combined):
+    '''Returns a pagelocked host array mapped into the CUDA device
+    address space, with a gpudata field set so it just works with CUDA
+    functions.'''
+    flags = cuda.host_alloc_flags.DEVICEMAP
+    if write_combined:
+        flags |= cuda.host_alloc_flags.WRITECOMBINED
+    array = pagelocked_alloc_func(shape=shape, dtype=dtype, mem_flags=flags)
+    return array
+
+def mapped_empty(shape, dtype, write_combined=False):
+    '''See mapped_alloc()'''
+    return mapped_alloc(cuda.pagelocked_empty, shape, dtype, write_combined)
+
+def mapped_empty_like(other, write_combined=False):
+    '''See mapped_alloc()'''
+    return mapped_alloc(cuda.pagelocked_empty, other.shape, other.dtype,
+                        write_combined)
+
+def mapped_zeros(shape, dtype, write_combined=False):
+    '''See mapped_alloc()'''
+    return mapped_alloc(cuda.pagelocked_zeros, shape, dtype, write_combined)
+
+def mapped_zeros_like(other, write_combined=False):
+    '''See mapped_alloc()'''
+    return mapped_alloc(cuda.pagelocked_zeros, other.shape, other.dtype,
+                        write_combined)
diff --git a/test/test_bvh_simple.py b/test/test_bvh_simple.py
new file mode 100644
index 0000000..20c0df5
--- /dev/null
+++ b/test/test_bvh_simple.py
@@ -0,0 +1,14 @@
+import pycuda.autoinit
+import unittest
+from chroma.bvh import make_simple_bvh, BVH
+import chroma.models
+
+import numpy as np
+#from numpy.testing import assert_array_max_ulp, assert_array_equal, \
+#    assert_approx_equal
+
+def test_simple_bvh():
+    mesh = chroma.models.lionsolid()
+    bvh = make_simple_bvh(mesh, degree=2)
+    assert isinstance(bvh, BVH)
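The mapped-memory helpers added to tools.py support the zero-copy pattern used by create_leaf_nodes: the host fills pagelocked memory that the GPU then reads directly through Mapped(), with no explicit transfer. A hedged usage sketch (the kernel call is commented out because the kernel name is hypothetical; this also assumes the CUDA context supports host-mapped memory, e.g. one created with the MAP_HOST flag):

import numpy as np
import pycuda.autoinit  # context must support host-mapped memory
from chroma.gpu.tools import mapped_empty, Mapped

data = mapped_empty(shape=1024, dtype=np.float32, write_combined=True)
data[:] = np.arange(1024, dtype=np.float32)  # write on the host side

# Pass Mapped(data) where a kernel expects a device pointer; unlike
# cuda.In(data), nothing is copied -- the GPU dereferences host memory.
# some_kernel(Mapped(data), np.uint32(len(data)),
#             block=(256, 1, 1), grid=(4, 1))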