author | Stan Seibert <stan@mtrr.org> | 2012-01-12 13:27:31 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2021-05-09 08:42:38 -0700 |
commit | 3961d2b3aa0b0b0783ddd1138556c374a2731e9e (patch) | |
tree | 3002f685d75785dabe7ded6c80c2817ede82bce3 | |
parent | 7535581339b839ad17f79ee3e931a0e907cf5071 (diff) | |
download | chroma-3961d2b3aa0b0b0783ddd1138556c374a2731e9e.tar.gz chroma-3961d2b3aa0b0b0783ddd1138556c374a2731e9e.tar.bz2 chroma-3961d2b3aa0b0b0783ddd1138556c374a2731e9e.zip |
Pull BVH calculation out of GPUGeometry class and make the world scale factor the same in all dimensions so that distance and area calculations are easy to do.
-rw-r--r-- | chroma/cuda/bvh.cu           | 214
-rw-r--r-- | chroma/cuda/geometry_types.h |   3
-rw-r--r-- | chroma/gpu/geometry.py       | 288
3 files changed, 409 insertions, 96 deletions
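The commit message's point about a uniform world scale is easy to check on the host: with one scale factor shared by all axes, a distance measured in quantized grid counts converts back to physical units with a single multiplication. The numpy sketch below is illustrative only and is not taken from the commit:

```python
import numpy as np

# Quantize vertices onto a 16-bit grid with one isotropic scale factor,
# mirroring what make_bvh in this commit does.
rng = np.random.RandomState(0)
vertices = rng.uniform(-1000.0, 1000.0, size=(100, 3))

world_origin = vertices.min(axis=0)
# Full scale at 2**16 - 2 so there is headroom to round up by one count.
world_scale = (vertices.max(axis=0) - world_origin).max() / (2**16 - 2)

quantized = ((vertices - world_origin) / world_scale).astype(np.uint32)

# With a single scale, grid distance * world_scale approximates physical
# distance; per-axis scale factors would not allow this shortcut.
d_grid = np.linalg.norm(quantized[0].astype(float) - quantized[1].astype(float))
d_phys = np.linalg.norm(vertices[0] - vertices[1])
assert abs(d_grid * world_scale - d_phys) <= 2.0 * world_scale
```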
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index 8543649..50e877a 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -2,6 +2,7 @@
 #include <cuda.h>
 
 #include "linalg.h"
+#include "physical_constants.h"
 
 __device__ float3
 fminf(const float3 &a, const float3 &b)
@@ -47,17 +48,79 @@ __device__ unsigned long long spread3_16(unsigned int input)
     return x;
 }
 
+__device__ unsigned long long spread2_16(unsigned int input)
+{
+    unsigned long long x = input;
+    x = (x | (x << 16)) & 0x000000ff00ff00fful;
+    x = (x | (x << 8))  & 0x00000f0f0f0f0f0ful;
+    x = (x | (x << 4))  & 0x0000333333333333ul;
+    x = (x | (x << 2))  & 0x0000555555555555ul;
+    return x;
+}
+
 __device__ unsigned int quantize(float v, float world_origin, float world_scale)
 {
     // truncate!
     return (unsigned int) ((v - world_origin) / world_scale);
 }
 
-__device__ uint3 quantize3(float3 v, float3 world_origin, float3 world_scale)
+__device__ uint3 quantize3(float3 v, float3 world_origin, float world_scale)
+{
+    return make_uint3(quantize(v.x, world_origin.x, world_scale),
+                      quantize(v.y, world_origin.y, world_scale),
+                      quantize(v.z, world_origin.z, world_scale));
+}
+
+__device__ uint3 quantize3_cyl(float3 v, float3 world_origin, float world_scale)
+{
+    float3 rescaled_v = (v - world_origin) / world_scale / sqrtf(3.0f);
+    unsigned int z = rescaled_v.z;
+    rescaled_v.z = 0.0f;
+    unsigned int rho = (unsigned int) norm(rescaled_v);
+    unsigned int phi = (unsigned int) ((atan2f(v.y, v.x)/PI/2.0f + 1.0f) * 65535.0f);
+
+    return make_uint3(rho, phi, z);
+}
+
+__device__ uint3 quantize3_sph(float3 v, float3 world_origin, float world_scale)
+{
+    float3 rescaled_v = (v - world_origin) / world_scale;
+
+    unsigned int r = (unsigned int) (norm(rescaled_v) / sqrt(3.0f));
+
+    unsigned int phi = (unsigned int) ((atan2f(rescaled_v.y, rescaled_v.x)/PI/2.0f + 1.0f) * 65535.0f);
+
+    unsigned int theta = (unsigned int) (acosf(rescaled_v.z / norm(rescaled_v)) / PI * 65535.0f);
+
+    return make_uint3(r, theta, phi);
+}
+
+__device__ uint4 node_union(const uint4 &a, const uint4 &b)
+{
+    uint3 lower = make_uint3(min(a.x & 0xFFFF, b.x & 0xFFFF),
+                             min(a.y & 0xFFFF, b.y & 0xFFFF),
+                             min(a.z & 0xFFFF, b.z & 0xFFFF));
+    uint3 upper = make_uint3(max(a.x >> 16, b.x >> 16),
+                             max(a.y >> 16, b.y >> 16),
+                             max(a.z >> 16, b.z >> 16));
+
+    return make_uint4(upper.x << 16 | lower.x,
+                      upper.y << 16 | lower.y,
+                      upper.z << 16 | lower.z,
+                      0);
+}
+
+__device__ unsigned int surface_half_area(const uint4 &node)
 {
-    return make_uint3(quantize(v.x, world_origin.x, world_scale.x),
-                      quantize(v.y, world_origin.y, world_scale.y),
-                      quantize(v.z, world_origin.z, world_scale.z));
+    unsigned int x = (node.x >> 16) - (node.x & 0xFFFF);
+    unsigned int y = (node.y >> 16) - (node.y & 0xFFFF);
+    unsigned int z = (node.z >> 16) - (node.z & 0xFFFF);
+
+    return x*y + y*z + z*x;
 }
 
 const unsigned int LEAF_BIT = (1U << 31);
@@ -66,9 +129,25 @@ extern "C"
 {
 
     __global__ void
+    node_area(unsigned int first_node,
+              unsigned int nnodes_this_round,
+              uint4 *nodes,
+              unsigned int *areas)
+    {
+        unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+        if (thread_id >= nnodes_this_round)
+            return;
+
+        unsigned int node_id = first_node + thread_id;
+
+        areas[node_id] = surface_half_area(nodes[node_id]);
+    }
+
+    __global__ void
     make_leaves(unsigned int first_triangle,
                 unsigned int ntriangles, uint3 *triangles, float3 *vertices,
-                float3 world_origin, float3 world_scale,
+                float3 world_origin, float world_scale,
                 uint4 *leaf_nodes, unsigned long long *morton_codes)
     {
@@ -107,6 +186,17 @@ extern "C"
             spread3_16(q_centroid.x)
             | (spread3_16(q_centroid.y) << 1)
             | (spread3_16(q_centroid.z) << 2);
+
+        //unsigned long long morton =
+        //    spread3_16(triangle.x & 0xFFFF)
+        //    | (spread3_16(triangle.y & 0xFFFF) << 1)
+        //    | (spread3_16(triangle.z & 0xFFFF) << 2);
+
+        //unsigned long long morton =
+        //    ( ((unsigned long long) q_centroid.x) << 32 )
+        //    | ( ((unsigned long long) q_centroid.y) << 16 )
+        //    | ( ((unsigned long long) q_centroid.z) << 0 );
 
         // Write leaf and morton code
         uint4 leaf_node;
@@ -180,4 +270,118 @@ extern "C"
         nodes[parent_layer_offset + parent_id] = parent_node;
     }
 
+    __global__ void distance_to_prev(unsigned int first_node, unsigned int threads_this_round,
+                                     uint4 *node, unsigned int *area)
+    {
+        unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+        if (thread_id >= threads_this_round)
+            return;
+
+        unsigned int node_id = first_node + thread_id;
+
+        uint4 a = node[node_id - 1];
+        uint4 b = node[node_id];
+        uint4 u = node_union(a, b);
+
+        area[node_id] = surface_half_area(u);
+    }
+
+    __global__ void distance_to(unsigned int first_node, unsigned int threads_this_round,
+                                unsigned int target_index,
+                                uint4 *node, unsigned int *area)
+    {
+        unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+        if (thread_id >= threads_this_round)
+            return;
+
+        unsigned int node_id = first_node + thread_id;
+
+        if (node_id == target_index) {
+            area[node_id] = 0xFFFFFFFF;
+        } else {
+            uint4 a = node[target_index];
+            uint4 b = node[node_id];
+            uint4 u = node_union(a, b);
+
+            area[node_id] = surface_half_area(u);
+        }
+    }
+
+    __global__ void min_distance_to(unsigned int first_node, unsigned int threads_this_round,
+                                    unsigned int target_index,
+                                    uint4 *node,
+                                    unsigned int block_offset,
+                                    unsigned int *min_area_block,
+                                    unsigned int *min_index_block,
+                                    unsigned int *flag)
+    {
+        __shared__ unsigned int min_area;
+        __shared__ unsigned int adjacent_area;
+
+        target_index += blockIdx.y;
+
+        uint4 a = node[target_index];
+
+        if (threadIdx.x == 0) {
+            min_area = 0xFFFFFFFF;
+            adjacent_area = surface_half_area(node_union(a, node[target_index+1]));
+        }
+
+        __syncthreads();
+
+        unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+
+        unsigned int node_id = first_node + thread_id;
+
+        if (thread_id >= threads_this_round)
+            node_id = target_index;
+
+        unsigned int area;
+
+        if (node_id == target_index) {
+            area = 0xFFFFFFFF;
+        } else {
+            uint4 b = node[node_id];
+
+            if (b.x == 0) {
+                area = 0xFFFFFFFF;
+            } else {
+                uint4 u = node_union(a, b);
+                area = surface_half_area(u);
+            }
+        }
+
+        atomicMin(&min_area, area);
+
+        __syncthreads();
+
+        if (min_area == area) {
+
+            if (blockIdx.y == 0) {
+                if (min_area < adjacent_area) {
+                    min_index_block[block_offset + blockIdx.x] = node_id;
+                    min_area_block[block_offset + blockIdx.x] = area;
+                    flag[0] = 1;
+                } else {
+                    min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFF;
+                }
+            } else {
+
+                if (min_area < adjacent_area)
+                    flag[blockIdx.y] = 1;
+            }
+
+        }
+    }
+
+    __global__ void swap(unsigned int a_index, unsigned int b_index,
+                         uint4 *node)
+    {
+        uint4 temp4 = node[a_index];
+        node[a_index] = node[b_index];
+        node[b_index] = temp4;
+    }
+
 } // extern "C"
diff --git a/chroma/cuda/geometry_types.h b/chroma/cuda/geometry_types.h
index 31ff3c6..89c53bc 100644
--- a/chroma/cuda/geometry_types.h
+++ b/chroma/cuda/geometry_types.h
@@ -47,8 +47,7 @@ struct Geometry
     Material **materials;
     Surface **surfaces;
     float3 world_origin;
-    float _dummy1; // for alignment
-    float3 world_scale;
+    float world_scale;
     unsigned int branch_degree;
 };
diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py
index f3a3b19..6cb991c 100644
--- a/chroma/gpu/geometry.py
+++ b/chroma/gpu/geometry.py
@@ -33,8 +33,201 @@ def compute_layer_configuration(n, branch_degree):
 
     return layer_conf
 
+def optimize_bvh_layer(layer, bvh_funcs):
+    n = len(layer)
+    areas = ga.empty(shape=n, dtype=np.uint32)
+    union_areas = ga.empty(shape=n, dtype=np.uint32)
+    nthreads_per_block = 128
+    min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint32)
+    min_index = ga.empty_like(min_areas)
+
+    update = 50000
+
+    skip_size = 1
+    flag = cuda.pagelocked_empty(shape=skip_size, dtype=np.uint32,
+                                 mem_flags=cuda.host_alloc_flags.DEVICEMAP)
+    flag_gpu = np.intp(flag.base.get_device_pointer())
+    print 'starting optimization'
+
+    i = 0
+    skips = 0
+    while i < (n/2 - 1):
+        # How are we doing?
+        if i % update == 0:
+            for first_index, elements_this_iter, nblocks_this_iter in \
+                    chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
+
+                bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
+                                           np.uint32(elements_this_iter),
+                                           layer,
+                                           union_areas,
+                                           block=(nthreads_per_block,1,1),
+                                           grid=(nblocks_this_iter,1))
+
+            union_areas_host = union_areas.get()[1::2]
+            print 'Area of parent layer: %1.12e' % union_areas_host.astype(float).sum()
+            print 'Area of parent layer so far (%d): %1.12e' % (i*2, union_areas_host.astype(float)[:i].sum())
+            print 'Skips:', skips
+
+        test_index = i * 2
+
+        blocks = 0
+        look_forward = min(8192*400, n - test_index - 2)
+        skip_this_round = min(skip_size, n - test_index - 1)
+        flag[:] = 0
+        for first_index, elements_this_iter, nblocks_this_iter in \
+                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
+            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
+                                      np.uint32(elements_this_iter),
+                                      np.uint32(test_index),
+                                      layer,
+                                      np.uint32(blocks),
+                                      min_areas,
+                                      min_index,
+                                      flag_gpu,
+                                      block=(nthreads_per_block,1,1),
+                                      grid=(nblocks_this_iter, skip_this_round))
+            blocks += nblocks_this_iter
+        cuda.Context.get_current().synchronize()
+
+        if flag[0] == 0:
+            flag_nonzero = flag.nonzero()[0]
+            if len(flag_nonzero) == 0:
+                no_swap_required = skip_size
+            else:
+                no_swap_required = flag_nonzero[0]
+            i += no_swap_required
+            skips += no_swap_required
+            continue
+
+        areas_host = min_areas[:blocks].get()
+        min_index_host = min_index[:blocks].get()
+        best_block = areas_host.argmin()
+        better_i = min_index_host[best_block]
+
+        if i % update == 0:
+            print 'swapping %d and %d' % (test_index + 1, better_i)
+
+        bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
+                       layer, block=(1,1,1), grid=(1,1))
+        i += 1
+
+    for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
+
+        bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
+                                   np.uint32(elements_this_iter),
+                                   layer,
+                                   union_areas,
+                                   block=(nthreads_per_block,1,1),
+                                   grid=(nblocks_this_iter,1))
+
+    union_areas_host = union_areas.get()[1::2]
+    print 'Final area of parent layer: %1.12e' % union_areas_host.sum()
+    print 'Skips:', skips
+
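optimize_bvh_layer walks the layer in pairs and, for each even slot, searches ahead for the sibling whose bounding-box union has the smallest half surface area, swapping it into place; the flag/skip machinery avoids a swap when the adjacent node is already the best match. The cost being minimized is exactly surface_half_area(node_union(a, b)) from the kernels above. A direct Python port of those two helpers, for checking the arithmetic on the host (illustrative, not part of the commit):

```python
# Python port of node_union and surface_half_area from bvh.cu. Nodes use
# the same packed layout as the uint4 kernels: for each axis, the upper
# bound sits in the high 16 bits and the lower bound in the low 16 bits.
def node_union(a, b):
    out = []
    for ax, bx in zip(a[:3], b[:3]):
        lower = min(ax & 0xFFFF, bx & 0xFFFF)
        upper = max(ax >> 16, bx >> 16)
        out.append((upper << 16) | lower)
    return tuple(out) + (0,)

def surface_half_area(node):
    x = (node[0] >> 16) - (node[0] & 0xFFFF)
    y = (node[1] >> 16) - (node[1] & 0xFFFF)
    z = (node[2] >> 16) - (node[2] & 0xFFFF)
    return x*y + y*z + z*x  # half the surface area of the box

# A unit box at the origin and a unit box two counts away along x:
a = ((1 << 16) | 0, (1 << 16) | 0, (1 << 16) | 0, 0)
b = ((3 << 16) | 2, (1 << 16) | 0, (1 << 16) | 0, 0)
assert surface_half_area(node_union(a, b)) == 3*1 + 1*1 + 1*3  # a 3x1x1 box
```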
+def make_bvh(vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
+    assert branch_degree > 1
+    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+                               include_source_directory=True)
+    bvh_funcs = GPUFuncs(bvh_module)
+
+    world_min = vertices.min(axis=0)
+    # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
+    # up by one count after quantization
+    world_scale = np.max((vertices.max(axis=0) - world_min)) / (2**16 - 2)
+
+    world_origin = ga.vec.make_float3(*world_min)
+    world_scale = np.float32(world_scale)
+
+    layer_conf = compute_layer_configuration(ntriangles, branch_degree)
+    layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
+
+    # Last entry is number of nodes, trim off and add zero to get offset of each layer
+    n_nodes = int(layer_offsets[-1])
+    layer_offsets = [0] + layer_offsets[:-1]
+
+    leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
+    morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
+
+    # Step 1: Make leaves
+    nthreads_per_block=256
+    for first_index, elements_this_iter, nblocks_this_iter in \
+            chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
+        bvh_funcs.make_leaves(np.uint32(first_index),
+                              np.uint32(elements_this_iter),
+                              gpu_triangles, gpu_vertices,
+                              world_origin, world_scale,
+                              leaf_nodes, morton_codes,
+                              block=(nthreads_per_block,1,1),
+                              grid=(nblocks_this_iter,1))
+
+    # argsort on the CPU because I'm too lazy to do it on the GPU
+    argsort = morton_codes.get().argsort().astype(np.uint32)
+    del morton_codes
+    local_leaf_nodes = leaf_nodes.get()[argsort]
+    del leaf_nodes
+    #del remap_order
+    #
+    #remap_order = ga.to_gpu(argsort)
+    #m = morton_codes.get()
+    #m.sort()
+    #print m
+    #assert False
+    # Step 2: sort leaf nodes into full node list
+    #print cuda.mem_get_info(), leaf_nodes.nbytes
+    nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
+    areas = ga.zeros(shape=n_nodes, dtype=np.uint32)
+    cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes)
+
+    #for first_index, elements_this_iter, nblocks_this_iter in \
+    #        chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
+    #    bvh_funcs.reorder_leaves(np.uint32(first_index),
+    #                             np.uint32(elements_this_iter),
+    #                             leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
+    #                             block=(nthreads_per_block,1,1),
+    #                             grid=(nblocks_this_iter,1))
+
+    # Step 3: Create parent layers in reverse order
+    layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
+    layer_parameters.reverse()
+
+    i = len(layer_parameters)
+    for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
+        #if i < 30:
+        #    optimize_bvh_layer(nodes[child_offset:child_offset+nparent*branch_degree],
+        #                       bvh_funcs)
+
+        for first_index, elements_this_iter, nblocks_this_iter in \
+                chunk_iterator(nparent * branch_degree, nthreads_per_block,
+                               max_blocks=10000):
+            bvh_funcs.node_area(np.uint32(first_index+child_offset),
+                                np.uint32(elements_this_iter),
+                                nodes,
+                                areas,
+                                block=(nthreads_per_block,1,1),
+                                grid=(nblocks_this_iter,1))
+
+        print 'area', i, nparent * branch_degree, '%e' % areas[child_offset:child_offset+nparent*branch_degree].get().astype(float).sum()
+
+        for first_index, elements_this_iter, nblocks_this_iter in \
+                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+            bvh_funcs.build_layer(np.uint32(first_index),
+                                  np.uint32(elements_this_iter),
+                                  np.uint32(branch_degree),
+                                  nodes,
+                                  np.uint32(parent_offset),
+                                  np.uint32(child_offset),
+                                  block=(nthreads_per_block,1,1),
+                                  grid=(nblocks_this_iter,1))
+
+        i -= 1
+
+    return world_origin, world_scale, nodes
+
 class GPUGeometry(object):
-    def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=3):
+    def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=2):
         if wavelengths is None:
             wavelengths = standard_wavelengths
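make_bvh turns the per-layer configuration into flat node-array offsets with a cumulative sum. A small worked example of that bookkeeping (the [(n, npad), ...] shape of layer_conf is inferred from how it is consumed here; compute_layer_configuration itself is outside this diff):

```python
import numpy as np

# Root-first layer configuration for a small tree: (nodes, padded nodes).
layer_conf = [(1, 1), (3, 4), (9, 12)]
layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))

# Last entry is the total node count; trim it and prepend zero to get the
# starting offset of each layer, exactly as make_bvh does.
n_nodes = int(layer_offsets[-1])
layer_offsets = [0] + layer_offsets[:-1]

assert n_nodes == 17
assert layer_offsets == [0, 1, 5]  # the leaf layer starts at the last offset
```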
@@ -142,11 +335,11 @@ class GPUGeometry(object):
         self.branch_degree = branch_degree
 
         print 'bvh', cuda.mem_get_info()
-        self.world_origin, self.world_scale, self.nodes = self.make_bvh(geometry.mesh.vertices,
-                                                                        self.vertices,
-                                                                        len(geometry.mesh.triangles),
-                                                                        self.triangles,
-                                                                        self.branch_degree)
+        self.world_origin, self.world_scale, self.nodes = make_bvh(geometry.mesh.vertices,
+                                                                   self.vertices,
+                                                                   len(geometry.mesh.triangles),
+                                                                   self.triangles,
+                                                                   self.branch_degree)
         print 'bvh after', cuda.mem_get_info()
 
         material_codes = (((geometry.material1_index & 0xff) << 24) |
@@ -164,7 +357,6 @@ class GPUGeometry(object):
                           self.material_pointer_array,
                           self.surface_pointer_array,
                           self.world_origin,
-                          np.float32(0.0), # dummy1
                           self.world_scale,
                           np.uint32(self.branch_degree)])
 
@@ -174,88 +366,6 @@ class GPUGeometry(object):
             self.print_device_usage()
         logger.info(self.device_usage_str())
 
-    def make_bvh(self, vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
-        assert branch_degree > 1
-        bvh_module = get_cu_module('bvh.cu', options=cuda_options,
-                                   include_source_directory=True)
-        bvh_funcs = GPUFuncs(bvh_module)
-
-        world_min = vertices.min(axis=0)
-        # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
-        # up by one count after quantization
-        world_scale = (vertices.max(axis=0) - world_min) / (2**16 - 2)
-
-        world_origin = ga.vec.make_float3(*world_min)
-        world_scale = ga.vec.make_float3(*world_scale)
-
-        layer_conf = compute_layer_configuration(ntriangles, branch_degree)
-        layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
-
-        # Last entry is number of nodes, trim off and add zero to get offset of each layer
-        n_nodes = int(layer_offsets[-1])
-        layer_offsets = [0] + layer_offsets[:-1]
-
-        leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
-        morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
-
-        # Step 1: Make leaves
-        nthreads_per_block=256
-        for first_index, elements_this_iter, nblocks_this_iter in \
-                chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
-            bvh_funcs.make_leaves(np.uint32(first_index),
-                                  np.uint32(elements_this_iter),
-                                  gpu_triangles, gpu_vertices,
-                                  world_origin, world_scale,
-                                  leaf_nodes, morton_codes,
-                                  block=(nthreads_per_block,1,1),
-                                  grid=(nblocks_this_iter,1))
-
-        # argsort on the CPU because I'm too lazy to do it on the GPU
-        argsort = morton_codes.get().argsort().astype(np.uint32)
-        del morton_codes
-        #local_leaf_nodes = leaf_nodes.get()[argsort]
-        #
-        remap_order = ga.to_gpu(argsort)
-        #m = morton_codes.get()
-        #m.sort()
-        #print m
-        #assert False
-        # Step 2: sort leaf nodes into full node list
-        print cuda.mem_get_info(), leaf_nodes.nbytes
-        nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
-        #cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes)
-
-        for first_index, elements_this_iter, nblocks_this_iter in \
-                chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
-            bvh_funcs.reorder_leaves(np.uint32(first_index),
-                                     np.uint32(elements_this_iter),
-                                     leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
-                                     block=(nthreads_per_block,1,1),
-                                     grid=(nblocks_this_iter,1))
-
-        del leaf_nodes
-        del remap_order
-
-        # Step 3: Create parent layers in reverse order
-        layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
-        layer_parameters.reverse()
-
-        for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
-            for first_index, elements_this_iter, nblocks_this_iter in \
-                    chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
-                print parent_offset, first_index, elements_this_iter, nblocks_this_iter
-                bvh_funcs.build_layer(np.uint32(first_index),
-                                      np.uint32(elements_this_iter),
-                                      np.uint32(branch_degree),
-                                      nodes,
-                                      np.uint32(parent_offset),
-                                      np.uint32(child_offset),
-                                      block=(nthreads_per_block,1,1),
-                                      grid=(nblocks_this_iter,1))
-
-        return world_origin, world_scale, nodes
-
-
     def device_usage_str(self):
         '''Returns a formatted string displaying the memory usage.'''
         s = 'device usage:\n'
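With world_scale reduced to a single float in the Geometry struct (no more float3 plus alignment padding), converting a packed node back to world coordinates needs only the shared origin and one multiplication per corner. A hypothetical helper, not part of the commit, using the packing layout the kernels above establish:

```python
import numpy as np

def node_bounds_world(node, world_origin, world_scale):
    """Unpack a quantized node (high 16 bits = upper bound, low 16 bits =
    lower bound per axis, as in bvh.cu) into world-space corners."""
    lower = np.array([c & 0xFFFF for c in node[:3]], dtype=np.float64)
    upper = np.array([c >> 16 for c in node[:3]], dtype=np.float64)
    return (world_origin + lower * world_scale,
            world_origin + upper * world_scale)

origin = np.array([-100.0, -100.0, -100.0])
scale = 0.05
node = ((200 << 16) | 100, (300 << 16) | 250, (50 << 16) | 0, 0)
lo, hi = node_bounds_world(node, origin, scale)
assert np.allclose(lo, [-95.0, -87.5, -100.0])
assert np.allclose(hi, [-90.0, -85.0, -97.5])
```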