diff options
-rw-r--r-- | chroma/cuda/bvh.cu | 183 | ||||
-rw-r--r-- | chroma/cuda/geometry.h | 69 | ||||
-rw-r--r-- | chroma/cuda/geometry_types.h | 55 | ||||
-rw-r--r-- | chroma/cuda/mesh.h | 140 | ||||
-rw-r--r-- | chroma/gpu/geometry.py | 171 |
5 files changed, 483 insertions, 135 deletions
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu new file mode 100644 index 0000000..8543649 --- /dev/null +++ b/chroma/cuda/bvh.cu @@ -0,0 +1,183 @@ +//-*-c++-*- +#include <cuda.h> + +#include "linalg.h" + +__device__ float3 +fminf(const float3 &a, const float3 &b) +{ + return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z)); +} + +__device__ float3 +fmaxf(const float3 &a, const float3 &b) +{ + return make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z)); +} + +__device__ uint3 +min(const uint3 &a, const uint3 &b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} + +__device__ uint3 +max(const uint3 &a, const uint3 &b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} + + +__device__ uint3 +operator+ (const uint3 &a, const unsigned int &b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} + +// spread out the first 16 bits in x to occupy every 3rd slot in the return value +__device__ unsigned long long spread3_16(unsigned int input) +{ + // method from http://stackoverflow.com/a/4838734 + unsigned long long x = input; + x = (x | (x << 16)) & 0x00000000FF0000FFul; + x = (x | (x << 8)) & 0x000000F00F00F00Ful; + x = (x | (x << 4)) & 0x00000C30C30C30C3ul; + x = (x | (x << 2)) & 0X0000249249249249ul; + + return x; +} + +__device__ unsigned int quantize(float v, float world_origin, float world_scale) +{ + // truncate! + return (unsigned int) ((v - world_origin) / world_scale); +} + +__device__ uint3 quantize3(float3 v, float3 world_origin, float3 world_scale) +{ + return make_uint3(quantize(v.x, world_origin.x, world_scale.x), + quantize(v.y, world_origin.y, world_scale.y), + quantize(v.z, world_origin.z, world_scale.z)); +} + +const unsigned int LEAF_BIT = (1U << 31); + +extern "C" +{ + + __global__ void + make_leaves(unsigned int first_triangle, + unsigned int ntriangles, uint3 *triangles, float3 *vertices, + float3 world_origin, float3 world_scale, + uint4 *leaf_nodes, unsigned long long *morton_codes) + + { + unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_id >= ntriangles) + return; + + unsigned int triangle_id = first_triangle + thread_id; + + // Find bounding corners and centroid + uint3 triangle = triangles[triangle_id]; + float3 lower = vertices[triangle.x]; + float3 centroid = lower; + float3 upper = lower; + float3 temp_vertex; + + temp_vertex = vertices[triangle.y]; + lower = fminf(lower, temp_vertex); + upper = fmaxf(upper, temp_vertex); + centroid += temp_vertex; + + temp_vertex = vertices[triangle.z]; + lower = fminf(lower, temp_vertex); + upper = fmaxf(upper, temp_vertex); + centroid += temp_vertex; + + centroid /= 3.0f; + + // Quantize bounding corners and centroid + uint3 q_lower = quantize3(lower, world_origin, world_scale); + uint3 q_upper = quantize3(upper, world_origin, world_scale) + 1; + uint3 q_centroid = quantize3(centroid, world_origin, world_scale); + + // Compute Morton code from quantized centroid + unsigned long long morton = + spread3_16(q_centroid.x) + | (spread3_16(q_centroid.y) << 1) + | (spread3_16(q_centroid.z) << 2); + + // Write leaf and morton code + uint4 leaf_node; + leaf_node.x = q_lower.x | (q_upper.x << 16); + leaf_node.y = q_lower.y | (q_upper.y << 16); + leaf_node.z = q_lower.z | (q_upper.z << 16); + leaf_node.w = triangle_id | LEAF_BIT; + + leaf_nodes[triangle_id] = leaf_node; + morton_codes[triangle_id] = morton; + } + + __global__ void + reorder_leaves(unsigned int first_triangle, + unsigned int ntriangles, + uint4 *leaf_nodes_in, + uint4 *leaf_nodes_out, + unsigned int *remap_order) + + { + unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_id >= ntriangles) + return; + + unsigned int dest_id = first_triangle + thread_id; + unsigned int source_id = remap_order[dest_id]; + + leaf_nodes_out[dest_id] = leaf_nodes_in[source_id]; + } + + __global__ void + build_layer(unsigned int first_node, + unsigned int n_parent_nodes, + unsigned int n_children_per_node, + uint4 *nodes, + unsigned int parent_layer_offset, + unsigned int child_layer_offset) + { + unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_id >= n_parent_nodes) + return; + + unsigned int parent_id = first_node + thread_id; + unsigned int first_child = child_layer_offset + parent_id * n_children_per_node; + + // Load first child + uint4 parent_node = nodes[first_child]; + uint3 lower = make_uint3(parent_node.x & 0xFFFF, parent_node.y & 0xFFFF, parent_node.z & 0xFFFF); + uint3 upper = make_uint3(parent_node.x >> 16, parent_node.y >> 16, parent_node.z >> 16); + + + // Scan remaining children + for (unsigned int i=1; i < n_children_per_node; i++) { + uint4 child_node = nodes[first_child + i]; + + if (child_node.x == 0) + break; // Hit first padding node in list of children + + uint3 child_lower = make_uint3(child_node.x & 0xFFFF, child_node.y & 0xFFFF, child_node.z & 0xFFFF); + uint3 child_upper = make_uint3(child_node.x >> 16, child_node.y >> 16, child_node.z >> 16); + + lower = min(lower, child_lower); + upper = max(upper, child_upper); + } + + parent_node.w = first_child; + parent_node.x = upper.x << 16 | lower.x; + parent_node.y = upper.y << 16 | lower.y; + parent_node.z = upper.z << 16 | lower.z; + + nodes[parent_layer_offset + parent_id] = parent_node; + } + +} // extern "C" diff --git a/chroma/cuda/geometry.h b/chroma/cuda/geometry.h index 2b5eacb..86ced6c 100644 --- a/chroma/cuda/geometry.h +++ b/chroma/cuda/geometry.h @@ -1,47 +1,40 @@ #ifndef __GEOMETRY_H__ #define __GEOMETRY_H__ -struct Material -{ - float *refractive_index; - float *absorption_length; - float *scattering_length; - unsigned int n; - float step; - float wavelength0; -}; - -struct Surface -{ - float *detect; - float *absorb; - float *reflect_diffuse; - float *reflect_specular; - unsigned int n; - float step; - float wavelength0; -}; - -struct Triangle +#include "geometry_types.h" +#include "linalg.h" + +const unsigned int LEAF_BIT = (1U << 31); + + +__device__ float3 +to_float3(const uint3 &a) { - float3 v0, v1, v2; -}; + return make_float3(a.x, a.y, a.z); +} -struct Geometry +__device__ Node +get_node(Geometry *geometry, const unsigned int &i) { - float3 *vertices; - uint3 *triangles; - unsigned int *material_codes; - unsigned int *colors; - float3 *lower_bounds; - float3 *upper_bounds; - unsigned int *node_map; - unsigned int *node_map_end; - Material **materials; - Surface **surfaces; - unsigned int start_node; - unsigned int first_node; -}; + uint4 node = geometry->nodes[i]; + Node node_struct; + + if (node.x == 0) { + node_struct.kind = PADDING_NODE; + return node_struct; + } + + uint3 lower_int = make_uint3(node.x & 0xFFFF, node.y & 0xFFFF, node.z & 0xFFFF); + uint3 upper_int = make_uint3(node.x >> 16, node.y >> 16, node.z >> 16); + + + node_struct.lower = geometry->world_origin + to_float3(lower_int) * geometry->world_scale; + node_struct.upper = geometry->world_origin + to_float3(upper_int) * geometry->world_scale; + node_struct.child = node.w & ~LEAF_BIT; // Mask off leaf bit + node_struct.kind = node.w & LEAF_BIT ? LEAF_NODE : INTERNAL_NODE; + + return node_struct; +} __device__ Triangle get_triangle(Geometry *geometry, const unsigned int &i) diff --git a/chroma/cuda/geometry_types.h b/chroma/cuda/geometry_types.h new file mode 100644 index 0000000..31ff3c6 --- /dev/null +++ b/chroma/cuda/geometry_types.h @@ -0,0 +1,55 @@ +#ifndef __GEOMETRY_TYPES_H__ +#define __GEOMETRY_TYPES_H__ + +struct Material +{ + float *refractive_index; + float *absorption_length; + float *scattering_length; + unsigned int n; + float step; + float wavelength0; +}; + +struct Surface +{ + float *detect; + float *absorb; + float *reflect_diffuse; + float *reflect_specular; + unsigned int n; + float step; + float wavelength0; +}; + +struct Triangle +{ + float3 v0, v1, v2; +}; + +enum { INTERNAL_NODE, LEAF_NODE, PADDING_NODE }; + +struct Node +{ + float3 lower; + float3 upper; + unsigned int child; + unsigned int kind; +}; + +struct Geometry +{ + float3 *vertices; + uint3 *triangles; + unsigned int *material_codes; + unsigned int *colors; + uint4 *nodes; + Material **materials; + Surface **surfaces; + float3 world_origin; + float _dummy1; // for alignment + float3 world_scale; + unsigned int branch_degree; +}; + +#endif diff --git a/chroma/cuda/mesh.h b/chroma/cuda/mesh.h index d60d801..de15156 100644 --- a/chroma/cuda/mesh.h +++ b/chroma/cuda/mesh.h @@ -6,7 +6,7 @@ #include "stdio.h" -#define STACK_SIZE 100 +#define STACK_SIZE 1000 /* Tests the intersection between a ray and a node in the bounding volume hierarchy. If the ray intersects the bounding volume and `min_distance` @@ -14,15 +14,11 @@ less than `min_distance`, return true, else return false. */ __device__ bool intersect_node(const float3 &origin, const float3 &direction, - Geometry *g, int i, float min_distance=-1.0f) + Geometry *g, const Node &node, const float min_distance=-1.0f) { - /* assigning these to local variables is faster for some reason */ - float3 lower_bound = g->lower_bounds[i]; - float3 upper_bound = g->upper_bounds[i]; - float distance_to_box; - if (intersect_box(origin, direction, lower_bound, upper_bound, + if (intersect_box(origin, direction, node.lower, node.upper, distance_to_box)) { if (min_distance < 0.0f) return true; @@ -35,7 +31,6 @@ intersect_node(const float3 &origin, const float3 &direction, else { return false; } - } /* Finds the intersection between a ray and `geometry`. If the ray does @@ -52,73 +47,74 @@ intersect_mesh(const float3 &origin, const float3& direction, Geometry *g, float distance; min_distance = -1.0f; - if (!intersect_node(origin, direction, g, g->start_node, min_distance)) + Node root = get_node(g, 0); + + if (!intersect_node(origin, direction, g, root, min_distance)) return -1; - unsigned int stack[STACK_SIZE]; - - unsigned int *head = &stack[0]; - unsigned int *node = &stack[1]; - unsigned int *tail = &stack[STACK_SIZE-1]; - *node = g->start_node; - - unsigned int i; - - do - { - unsigned int first_child = g->node_map[*node]; - unsigned int stop = g->node_map_end[*node]; - - while (*node >= g->first_node && stop == first_child+1) { - *node = first_child; - first_child = g->node_map[*node]; - stop = g->node_map_end[*node]; - } - - if (*node >= g->first_node) { - for (i=first_child; i < stop; i++) { - if (intersect_node(origin, direction, g, i, min_distance)) { - *node = i; - node++; - } - } + //min_distance = 100.0f; + //return 1; + + unsigned int child_ptr_stack[STACK_SIZE]; + child_ptr_stack[0] = root.child; + + const unsigned int *head = &child_ptr_stack[0]; + unsigned int *curr = &child_ptr_stack[0]; + const unsigned int *tail = &child_ptr_stack[STACK_SIZE-1]; + + unsigned int count = 0; + unsigned int tri_count = 0; + + while (curr >= head) { + unsigned int first_child = *curr; + curr--; + + for (unsigned int i=first_child; i < first_child + g->branch_degree; i++) { + Node node = get_node(g, i); + count++; + + if (node.kind == PADDING_NODE) + break; // this node and rest of children are padding + + if (intersect_node(origin, direction, g, node, min_distance)) { + + if (node.kind == LEAF_NODE) { + // This node wraps a triangle + + if (node.child != last_hit_triangle) { + // Can't hit same triangle twice in a row + tri_count++; + Triangle t = get_triangle(g, node.child); + if (intersect_triangle(origin, direction, t, distance)) { + + if (triangle_index == -1 || distance < min_distance) { + triangle_index = node.child; + min_distance = distance; + } // if hit triangle is closer than previous hits + + } // if hit triangle + + } // if not hitting same triangle as last step + + } else { + curr++; + *curr = node.child; + } // leaf or internal node? + } // hit node? - node--; - } - else { - // node is a leaf - for (i=first_child; i < stop; i++) { - if (last_hit_triangle == i) - continue; - - Triangle t = get_triangle(g, i); - - if (intersect_triangle(origin, direction, t, distance)) { - if (triangle_index == -1) { - triangle_index = i; - min_distance = distance; - continue; - } - - if (distance < min_distance) { - triangle_index = i; - min_distance = distance; - } - } - } // triangle loop - - node--; - - } // node is a leaf - - if (node > tail) { - printf("warning: intersect_mesh() aborted; node > tail\n"); - break; - } - - } // while loop - while (node != head); - + if (curr > tail) { + printf("warning: intersect_mesh() aborted; node > tail\n"); + break; + } + } // loop over children, starting with first_child + + } // while nodes on stack + + if (threadIdx.x == 0) { + printf("node count: %d\n", count); + printf("triangle count: %d\n", tri_count); + } + return triangle_index; } diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py index 081563f..6682f8f 100644 --- a/chroma/gpu/geometry.py +++ b/chroma/gpu/geometry.py @@ -6,11 +6,35 @@ from pycuda import characterize from chroma.geometry import standard_wavelengths from chroma.gpu.tools import get_cu_module, get_cu_source, cuda_options, \ chunk_iterator, format_array, format_size, to_uint3, to_float3, \ - make_gpu_struct + make_gpu_struct, GPUFuncs + from chroma.log import logger +def round_up_to_multiple(x, multiple): + remainder = x % multiple + if remainder == 0: + return x + else: + return x + multiple - remainder + +def compute_layer_configuration(n, branch_degree): + if n == 1: + # Special case for root + return [ (1, 1) ] + else: + layer_conf = [ (n, round_up_to_multiple(n, branch_degree)) ] + + while layer_conf[0][1] > 1: + nparent = int(np.ceil( float(layer_conf[0][1]) / branch_degree )) + if nparent == 1: + layer_conf = [ (1, 1) ] + layer_conf + else: + layer_conf = [ (nparent, round_up_to_multiple(nparent, branch_degree)) ] + layer_conf + + return layer_conf + class GPUGeometry(object): - def __init__(self, geometry, wavelengths=None, print_usage=False): + def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=32): if wavelengths is None: wavelengths = standard_wavelengths @@ -19,7 +43,7 @@ class GPUGeometry(object): except ValueError: raise ValueError('wavelengths must be equally spaced apart.') - geometry_source = get_cu_source('geometry.h') + geometry_source = get_cu_source('geometry_types.h') material_struct_size = characterize.sizeof('Material', geometry_source) surface_struct_size = characterize.sizeof('Surface', geometry_source) geometry_struct_size = characterize.sizeof('Geometry', geometry_source) @@ -104,32 +128,50 @@ class GPUGeometry(object): self.surface_pointer_array = \ make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs) - self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices)) - self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles)) + self.pagelocked_vertices = cuda.pagelocked_empty(shape=len(geometry.mesh.vertices), + dtype=ga.vec.float3, + mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED) + self.pagelocked_triangles = cuda.pagelocked_empty(shape=len(geometry.mesh.triangles), + dtype=ga.vec.uint3, + mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED) + self.pagelocked_vertices[:] = to_float3(geometry.mesh.vertices) + self.pagelocked_triangles[:] = to_uint3(geometry.mesh.triangles) + self.vertices = np.intp(self.pagelocked_vertices.base.get_device_pointer()) + self.triangles = np.intp(self.pagelocked_triangles.base.get_device_pointer()) + + + self.branch_degree = branch_degree + print 'bvh', cuda.mem_get_info() + self.world_origin, self.world_scale, self.nodes = self.make_bvh(geometry.mesh.vertices, + self.vertices, + len(geometry.mesh.triangles), + self.triangles, + self.branch_degree) + print 'bvh after' material_codes = (((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8)).astype(np.uint32) - - self.material_codes = ga.to_gpu(material_codes) - - self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds)) - self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds)) - self.colors = ga.to_gpu(geometry.colors.astype(np.uint32)) - self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32)) - self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32)) + self.pagelocked_material_codes = cuda.pagelocked_empty_like(material_codes, mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED) + self.pagelocked_material_codes[:] = material_codes + self.material_codes = np.intp(self.pagelocked_material_codes.base.get_device_pointer()) + + colors = geometry.colors.astype(np.uint32) + self.pagelocked_colors = cuda.pagelocked_empty_like(colors, mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED) + self.pagelocked_colors[:] = colors + self.colors = np.intp(self.pagelocked_colors.base.get_device_pointer()) self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32)) self.gpudata = make_gpu_struct(geometry_struct_size, [self.vertices, self.triangles, self.material_codes, - self.colors, self.lower_bounds, - self.upper_bounds, self.node_map, - self.node_map_end, + self.colors, self.nodes, self.material_pointer_array, self.surface_pointer_array, - np.uint32(geometry.start_node), - np.uint32(geometry.first_node)]) + self.world_origin, + np.float32(0.0), # dummy1 + self.world_scale, + np.uint32(self.branch_degree)]) self.geometry = geometry @@ -137,17 +179,96 @@ class GPUGeometry(object): self.print_device_usage() logger.info(self.device_usage_str()) + def make_bvh(self, vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree): + assert branch_degree > 1 + bvh_module = get_cu_module('bvh.cu', options=cuda_options, + include_source_directory=True) + bvh_funcs = GPUFuncs(bvh_module) + + world_min = vertices.min(axis=0) + # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round + # up by one count after quantization + world_scale = (vertices.max(axis=0) - world_min) / (2**16 - 2) + + world_origin = ga.vec.make_float3(*world_min) + world_scale = ga.vec.make_float3(*world_scale) + + layer_conf = compute_layer_configuration(ntriangles, branch_degree) + layer_offsets = list(np.cumsum([npad for n, npad in layer_conf])) + + # Last entry is number of nodes, trim off and add zero to get offset of each layer + n_nodes = int(layer_offsets[-1]) + layer_offsets = [0] + layer_offsets[:-1] + + leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4) + morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64) + + # Step 1: Make leaves + nthreads_per_block=256 + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000): + bvh_funcs.make_leaves(np.uint32(first_index), + np.uint32(elements_this_iter), + gpu_triangles, gpu_vertices, + world_origin, world_scale, + leaf_nodes, morton_codes, + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + # argsort on the CPU because I'm too lazy to do it on the GPU + argsort = morton_codes.get().argsort().astype(np.uint32) + del morton_codes + #local_leaf_nodes = leaf_nodes.get()[argsort] + # + remap_order = ga.to_gpu(argsort) + #m = morton_codes.get() + #m.sort() + #print m + #assert False + # Step 2: sort leaf nodes into full node list + print cuda.mem_get_info(), leaf_nodes.nbytes + nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4) + #cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes) + + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000): + bvh_funcs.reorder_leaves(np.uint32(first_index), + np.uint32(elements_this_iter), + leaf_nodes, nodes[layer_offsets[-1]:], remap_order, + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + del leaf_nodes + del remap_order + + # Step 3: Create parent layers in reverse order + layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf) + layer_parameters.reverse() + + for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters: + for first_index, elements_this_iter, nblocks_this_iter in \ + chunk_iterator(nparent, nthreads_per_block, max_blocks=10000): + print parent_offset, first_index, elements_this_iter, nblocks_this_iter + bvh_funcs.build_layer(np.uint32(first_index), + np.uint32(elements_this_iter), + np.uint32(branch_degree), + nodes, + np.uint32(parent_offset), + np.uint32(child_offset), + block=(nthreads_per_block,1,1), + grid=(nblocks_this_iter,1)) + + return world_origin, world_scale, nodes + + def device_usage_str(self): '''Returns a formatted string displaying the memory usage.''' s = 'device usage:\n' s += '-'*10 + '\n' - s += format_array('vertices', self.vertices) + '\n' - s += format_array('triangles', self.triangles) + '\n' - s += format_array('lower_bounds', self.lower_bounds) + '\n' - s += format_array('upper_bounds', self.upper_bounds) + '\n' - s += format_array('node_map', self.node_map) + '\n' - s += format_array('node_map_end', self.node_map_end) + '\n' - s += '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes)) + '\n' + #s += format_array('vertices', self.vertices) + '\n' + #s += format_array('triangles', self.triangles) + '\n' + s += format_array('nodes', self.nodes) + '\n' + s += '%-15s %6s %6s' % ('total', '', format_size(self.nodes.nbytes)) + '\n' s += '-'*10 + '\n' free, total = cuda.mem_get_info() s += '%-15s %6s %6s' % ('device total', '', format_size(total)) + '\n' |