author     Stan Seibert <stan@mtrr.org>       2012-01-12 13:27:31 -0500
committer  tlatorre <tlatorre@uchicago.edu>   2021-05-09 08:42:38 -0700
commit     3961d2b3aa0b0b0783ddd1138556c374a2731e9e (patch)
tree       3002f685d75785dabe7ded6c80c2817ede82bce3
parent     7535581339b839ad17f79ee3e931a0e907cf5071 (diff)
download   chroma-3961d2b3aa0b0b0783ddd1138556c374a2731e9e.tar.gz
           chroma-3961d2b3aa0b0b0783ddd1138556c374a2731e9e.tar.bz2
           chroma-3961d2b3aa0b0b0783ddd1138556c374a2731e9e.zip
Pull BVH calculation out of GPUGeometry class and make the world scale factor the same in all dimensions so that distance and area calculations are easy to do.
-rw-r--r--   chroma/cuda/bvh.cu             214
-rw-r--r--   chroma/cuda/geometry_types.h     3
-rw-r--r--   chroma/gpu/geometry.py         288
3 files changed, 409 insertions, 96 deletions
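
The core change: the per-axis float3 world_scale becomes a single scalar, so one count in quantized space corresponds to the same physical length on every axis, and node surface areas can be compared directly in integer units. A minimal numpy sketch of the scheme (illustrative only; quantize_uniform and the variable names are not chroma API):

    import numpy as np

    def quantize_uniform(vertices):
        # One scalar scale for all axes; full range at 2**16 - 2 leaves
        # headroom to round a coordinate up by one count after truncation.
        world_origin = vertices.min(axis=0)
        world_scale = (vertices.max(axis=0) - world_origin).max() / (2**16 - 2)
        quantized = ((vertices - world_origin) / world_scale).astype(np.uint32)
        return world_origin, world_scale, quantized

    verts = np.random.uniform(-100.0, 100.0, size=(1000, 3))
    origin, scale, q = quantize_uniform(verts)

    # With an isotropic scale, a box's half surface area in quantized
    # units is just the real half area divided by scale**2, so candidate
    # node unions can be ranked entirely with integer arithmetic.
    ext = (q.max(axis=0) - q.min(axis=0)).astype(np.int64)
    print(ext[0]*ext[1] + ext[1]*ext[2] + ext[2]*ext[0])
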
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index 8543649..50e877a 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -2,6 +2,7 @@
#include <cuda.h>
#include "linalg.h"
+#include "physical_constants.h"
__device__ float3
fminf(const float3 &a, const float3 &b)
@@ -47,17 +48,79 @@ __device__ unsigned long long spread3_16(unsigned int input)
return x;
}
+__device__ unsigned long long spread2_16(unsigned int input)
+{
+ unsigned long long x = input;
+ x = (x | (x << 16)) & 0x000000ff00ff00fful;
+ x = (x | (x << 8)) & 0x00000f0f0f0f0f0ful;
+ x = (x | (x << 4)) & 0x0000333333333333ul;
+ x = (x | (x << 2)) & 0x0000555555555555ul;
+ return x;
+}
+
+
__device__ unsigned int quantize(float v, float world_origin, float world_scale)
{
// truncate!
return (unsigned int) ((v - world_origin) / world_scale);
}
-__device__ uint3 quantize3(float3 v, float3 world_origin, float3 world_scale)
+__device__ uint3 quantize3(float3 v, float3 world_origin, float world_scale)
+{
+ return make_uint3(quantize(v.x, world_origin.x, world_scale),
+ quantize(v.y, world_origin.y, world_scale),
+ quantize(v.z, world_origin.z, world_scale));
+}
+
+__device__ uint3 quantize3_cyl(float3 v, float3 world_origin, float world_scale)
+{
+ float3 rescaled_v = (v - world_origin) / world_scale / sqrtf(3.0f);
+ unsigned int z = rescaled_v.z;
+ rescaled_v.z = 0.0f;
+ unsigned int rho = (unsigned int) norm(rescaled_v);
+ unsigned int phi = (unsigned int) ((atan2f(rescaled_v.y, rescaled_v.x)/PI/2.0f + 0.5f) * 65535.0f); // map [-pi,pi] to [0,65535]
+
+ return make_uint3(rho, phi, z);
+}
+
+__device__ uint3 quantize3_sph(float3 v, float3 world_origin, float world_scale)
+{
+ float3 rescaled_v = (v - world_origin) / world_scale;
+
+ unsigned int r = (unsigned int) (norm(rescaled_v) / sqrtf(3.0f));
+
+ unsigned int phi = (unsigned int) ((atan2f(rescaled_v.y, rescaled_v.x)/PI/2.0f + 0.5f) * 65535.0f); // map [-pi,pi] to [0,65535]
+
+ unsigned int theta = (unsigned int) (acosf(rescaled_v.z / norm(rescaled_v)) / PI * 65535.0f);
+
+ return make_uint3(r, theta, phi);
+}
+
+__device__ uint4 node_union(const uint4 &a, const uint4 &b)
+{
+ uint3 lower = make_uint3(min(a.x & 0xFFFF, b.x & 0xFFFF),
+ min(a.y & 0xFFFF, b.y & 0xFFFF),
+ min(a.z & 0xFFFF, b.z & 0xFFFF));
+ uint3 upper = make_uint3(max(a.x >> 16, b.x >> 16),
+ max(a.y >> 16, b.y >> 16),
+ max(a.z >> 16, b.z >> 16));
+
+ return make_uint4(upper.x << 16 | lower.x,
+ upper.y << 16 | lower.y,
+ upper.z << 16 | lower.z,
+ 0);
+
+}
+
+
+
+__device__ unsigned int surface_half_area(const uint4 &node)
{
- return make_uint3(quantize(v.x, world_origin.x, world_scale.x),
- quantize(v.y, world_origin.y, world_scale.y),
- quantize(v.z, world_origin.z, world_scale.z));
+ unsigned int x = (node.x >> 16) - (node.x & 0xFFFF);
+ unsigned int y = (node.y >> 16) - (node.y & 0xFFFF);
+ unsigned int z = (node.z >> 16) - (node.z & 0xFFFF);
+
+ return x*y + y*z + z*x;
}
const unsigned int LEAF_BIT = (1U << 31);
@@ -66,9 +129,25 @@ extern "C"
{
__global__ void
+ node_area(unsigned int first_node,
+ unsigned int nnodes_this_round,
+ uint4 *nodes,
+ unsigned int *areas)
+ {
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+ if (thread_id >= nnodes_this_round)
+ return;
+
+ unsigned int node_id = first_node + thread_id;
+
+ areas[node_id] = surface_half_area(nodes[node_id]);
+ }
+
+
+ __global__ void
make_leaves(unsigned int first_triangle,
unsigned int ntriangles, uint3 *triangles, float3 *vertices,
- float3 world_origin, float3 world_scale,
+ float3 world_origin, float world_scale,
uint4 *leaf_nodes, unsigned long long *morton_codes)
{
@@ -107,6 +186,17 @@ extern "C"
spread3_16(q_centroid.x)
| (spread3_16(q_centroid.y) << 1)
| (spread3_16(q_centroid.z) << 2);
+
+ //unsigned long long morton =
+ // spread3_16(triangle.x & 0xFFFF)
+ // | (spread3_16(triangle.y & 0xFFFF) << 1)
+ // | (spread3_16(triangle.z & 0xFFFF) << 2);
+
+
+ //unsigned long long morton =
+ // ( ((unsigned long long) q_centroid.x) << 32 )
+ // | ( ((unsigned long long) q_centroid.y) << 16 )
+ // | ( ((unsigned long long) q_centroid.z) << 0 );
// Write leaf and morton code
uint4 leaf_node;
@@ -180,4 +270,118 @@ extern "C"
nodes[parent_layer_offset + parent_id] = parent_node;
}
+ __global__ void distance_to_prev(unsigned int first_node, unsigned int threads_this_round,
+ uint4 *node, unsigned int *area)
+ {
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+ if (thread_id >= threads_this_round)
+ return;
+
+ unsigned int node_id = first_node + thread_id;
+
+ uint4 a = node[node_id - 1];
+ uint4 b = node[node_id];
+ uint4 u = node_union(a, b);
+
+ area[node_id] = surface_half_area(u);
+ }
+
+ __global__ void distance_to(unsigned int first_node, unsigned int threads_this_round,
+ unsigned int target_index,
+ uint4 *node, unsigned int *area)
+ {
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+ if (thread_id >= threads_this_round)
+ return;
+
+ unsigned int node_id = first_node + thread_id;
+
+ if (node_id == target_index) {
+ area[node_id] = 0xFFFFFFFF;
+ } else {
+ uint4 a = node[target_index];
+ uint4 b = node[node_id];
+ uint4 u = node_union(a, b);
+
+ area[node_id] = surface_half_area(u);
+ }
+ }
+
+ __global__ void min_distance_to(unsigned int first_node, unsigned int threads_this_round,
+ unsigned int target_index,
+ uint4 *node,
+ unsigned int block_offset,
+ unsigned int *min_area_block,
+ unsigned int *min_index_block,
+ unsigned int *flag)
+ {
+ __shared__ unsigned int min_area;
+ __shared__ unsigned int adjacent_area;
+
+ target_index += blockIdx.y;
+
+ uint4 a = node[target_index];
+
+ if (threadIdx.x == 0) {
+ min_area = 0xFFFFFFFF;
+ adjacent_area = surface_half_area(node_union(a, node[target_index+1]));
+ }
+
+ __syncthreads();
+
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+
+ unsigned int node_id = first_node + thread_id;
+
+ if (thread_id >= threads_this_round)
+ node_id = target_index;
+
+ unsigned int area;
+
+ if (node_id == target_index) {
+ area = 0xFFFFFFFF;
+ } else {
+ uint4 b = node[node_id];
+
+ if (b.x == 0) {
+ area = 0xFFFFFFFF;
+ } else {
+ uint4 u = node_union(a, b);
+ area = surface_half_area(u);
+ }
+ }
+
+ atomicMin(&min_area, area);
+
+ __syncthreads();
+
+ if (min_area == area) {
+
+ if (blockIdx.y == 0) {
+ if (min_area < adjacent_area) {
+ min_index_block[block_offset + blockIdx.x] = node_id;
+ min_area_block[block_offset + blockIdx.x] = area;
+ flag[0] = 1;
+ } else {
+ min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFF;
+ }
+ } else {
+
+ if (min_area < adjacent_area)
+ flag[blockIdx.y] = 1;
+ }
+
+ }
+ }
+
+
+
+ __global__ void swap(unsigned int a_index, unsigned int b_index,
+ uint4 *node)
+ {
+ uint4 temp4 = node[a_index];
+ node[a_index] = node[b_index];
+ node[b_index] = temp4;
+ }
+
} // extern "C"
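
The kernels above all operate on nodes packed as uint4, one axis per component, with the lower 16 bits holding the quantized minimum and the upper 16 bits the maximum; node_union and surface_half_area reduce to bit twiddling on that layout. A CPU-side Python mirror, for illustration only (the helpers are hypothetical, not part of the commit):

    def pack_axis(lo, hi):
        # upper 16 bits = max, lower 16 bits = min, as in the uint4 nodes
        return (hi << 16) | lo

    def node_union(a, b):
        # per-axis min of the mins, max of the maxes
        return tuple(pack_axis(min(x & 0xFFFF, y & 0xFFFF),
                               max(x >> 16, y >> 16))
                     for x, y in zip(a, b))

    def surface_half_area(node):
        x, y, z = ((c >> 16) - (c & 0xFFFF) for c in node)
        return x*y + y*z + z*x  # half of the box area 2*(xy + yz + zx)

    a = (pack_axis(0, 10),) * 3   # box spanning [0, 10] on every axis
    b = (pack_axis(5, 20),) * 3   # box spanning [5, 20] on every axis
    print(surface_half_area(node_union(a, b)))  # union is [0, 20]: 3*20*20 = 1200
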
diff --git a/chroma/cuda/geometry_types.h b/chroma/cuda/geometry_types.h
index 31ff3c6..89c53bc 100644
--- a/chroma/cuda/geometry_types.h
+++ b/chroma/cuda/geometry_types.h
@@ -47,8 +47,7 @@ struct Geometry
Material **materials;
Surface **surfaces;
float3 world_origin;
- float _dummy1; // for alignment
- float3 world_scale;
+ float world_scale;
unsigned int branch_degree;
};
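
With world_scale reduced to a scalar, the float3 world_origin (12 bytes, 4-byte aligned in CUDA) is followed directly by one more float, so the explicit _dummy1 pad can go; the matching np.float32(0.0) is dropped from the Python-side struct packing below. A numpy sketch of the resulting field layout (illustrative only; the real marshalling in chroma goes through make_gpu_struct):

    import numpy as np

    tail = np.dtype([('world_origin', np.float32, 3),
                     ('world_scale', np.float32),
                     ('branch_degree', np.uint32)])
    print(tail.itemsize)  # 20 bytes, no padding field required
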
diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py
index f3a3b19..6cb991c 100644
--- a/chroma/gpu/geometry.py
+++ b/chroma/gpu/geometry.py
@@ -33,8 +33,201 @@ def compute_layer_configuration(n, branch_degree):
return layer_conf
+def optimize_bvh_layer(layer, bvh_funcs):
+ n = len(layer)
+ areas = ga.empty(shape=n, dtype=np.uint32)
+ union_areas = ga.empty(shape=n, dtype=np.uint32)
+ nthreads_per_block = 128
+ min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint32)
+ min_index = ga.empty_like(min_areas)
+
+ update = 50000
+
+ skip_size = 1
+ flag = cuda.pagelocked_empty(shape=skip_size, dtype=np.uint32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
+ flag_gpu = np.intp(flag.base.get_device_pointer())
+ print 'starting optimization'
+
+ i = 0
+ skips = 0
+ while i < (n/2 - 1):
+ # How are we doing?
+ if i % update == 0:
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
+ np.uint32(elements_this_iter),
+ layer,
+ union_areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ union_areas_host = union_areas.get()[1::2]
+ print 'Area of parent layer: %1.12e' % union_areas_host.astype(float).sum()
+ print 'Area of parent layer so far (%d): %1.12e' % (i*2, union_areas_host.astype(float)[:i].\
+sum())
+ print 'Skips:', skips
+
+ test_index = i * 2
+
+ blocks = 0
+ look_forward = min(8192*400, n - test_index - 2)
+ skip_this_round = min(skip_size, n - test_index - 1)
+ flag[:] = 0
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
+ np.uint32(elements_this_iter),
+ np.uint32(test_index),
+ layer,
+ np.uint32(blocks),
+ min_areas,
+ min_index,
+ flag_gpu,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter, skip_this_round))
+ blocks += nblocks_this_iter
+ cuda.Context.get_current().synchronize()
+
+ if flag[0] == 0:
+ flag_nonzero = flag.nonzero()[0]
+ if len(flag_nonzero) == 0:
+ no_swap_required = skip_size
+ else:
+ no_swap_required = flag_nonzero[0]
+ i += no_swap_required
+ skips += no_swap_required
+ continue
+
+ areas_host = min_areas[:blocks].get()
+ min_index_host = min_index[:blocks].get()
+ best_block = areas_host.argmin()
+ better_i = min_index_host[best_block]
+
+ if i % update == 0:
+ print 'swapping %d and %d' % (test_index + 1, better_i)
+
+ bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
+ layer, block=(1,1,1), grid=(1,1))
+ i += 1
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
+ np.uint32(elements_this_iter),
+ layer,
+ union_areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ union_areas_host = union_areas.get()[1::2]
+ print 'Final area of parent layer: %1.12e' % union_areas_host.sum()
+ print 'Skips:', skips
+
+def make_bvh(vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
+ assert branch_degree > 1
+ bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+ include_source_directory=True)
+ bvh_funcs = GPUFuncs(bvh_module)
+
+ world_min = vertices.min(axis=0)
+ # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
+ # up by one count after quantization
+ world_scale = np.max((vertices.max(axis=0) - world_min)) / (2**16 - 2)
+
+ world_origin = ga.vec.make_float3(*world_min)
+ world_scale = np.float32(world_scale)
+
+ layer_conf = compute_layer_configuration(ntriangles, branch_degree)
+ layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
+
+ # Last entry is number of nodes, trim off and add zero to get offset of each layer
+ n_nodes = int(layer_offsets[-1])
+ layer_offsets = [0] + layer_offsets[:-1]
+
+ leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
+ morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
+
+ # Step 1: Make leaves
+ nthreads_per_block=256
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.make_leaves(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ gpu_triangles, gpu_vertices,
+ world_origin, world_scale,
+ leaf_nodes, morton_codes,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ # argsort on the CPU because I'm too lazy to do it on the GPU
+ argsort = morton_codes.get().argsort().astype(np.uint32)
+ del morton_codes
+ local_leaf_nodes = leaf_nodes.get()[argsort]
+ del leaf_nodes
+ #del remap_order
+ #
+ #remap_order = ga.to_gpu(argsort)
+ #m = morton_codes.get()
+ #m.sort()
+ #print m
+ #assert False
+ # Step 2: sort leaf nodes into full node list
+ #print cuda.mem_get_info(), leaf_nodes.nbytes
+ nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
+ areas = ga.zeros(shape=n_nodes, dtype=np.uint32)
+ cuda.memcpy_htod(int(nodes.gpudata) + int(layer_offsets[-1])*nodes.dtype.itemsize, local_leaf_nodes) # offset in bytes
+
+ #for first_index, elements_this_iter, nblocks_this_iter in \
+ # chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
+ # bvh_funcs.reorder_leaves(np.uint32(first_index),
+ # np.uint32(elements_this_iter),
+ # leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
+ # block=(nthreads_per_block,1,1),
+ # grid=(nblocks_this_iter,1))
+
+
+ # Step 3: Create parent layers in reverse order
+ layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
+ layer_parameters.reverse()
+
+ i = len(layer_parameters)
+ for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
+ #if i < 30:
+ # optimize_bvh_layer(nodes[child_offset:child_offset+nparent*branch_degree],
+ # bvh_funcs)
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(nparent * branch_degree, nthreads_per_block,
+ max_blocks=10000):
+ bvh_funcs.node_area(np.uint32(first_index+child_offset),
+ np.uint32(elements_this_iter),
+ nodes,
+ areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ print 'area', i, nparent * branch_degree, '%e' % areas[child_offset:child_offset+nparent*branch_degree].get().astype(float).sum()
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.build_layer(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ np.uint32(branch_degree),
+ nodes,
+ np.uint32(parent_offset),
+ np.uint32(child_offset),
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ i -= 1
+
+ return world_origin, world_scale, nodes
+
class GPUGeometry(object):
- def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=3):
+ def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=2):
if wavelengths is None:
wavelengths = standard_wavelengths
@@ -142,11 +335,11 @@ class GPUGeometry(object):
self.branch_degree = branch_degree
print 'bvh', cuda.mem_get_info()
- self.world_origin, self.world_scale, self.nodes = self.make_bvh(geometry.mesh.vertices,
- self.vertices,
- len(geometry.mesh.triangles),
- self.triangles,
- self.branch_degree)
+ self.world_origin, self.world_scale, self.nodes = make_bvh(geometry.mesh.vertices,
+ self.vertices,
+ len(geometry.mesh.triangles),
+ self.triangles,
+ self.branch_degree)
print 'bvh after', cuda.mem_get_info()
material_codes = (((geometry.material1_index & 0xff) << 24) |
@@ -164,7 +357,6 @@ class GPUGeometry(object):
self.material_pointer_array,
self.surface_pointer_array,
self.world_origin,
- np.float32(0.0), # dummy1
self.world_scale,
np.uint32(self.branch_degree)])
@@ -174,88 +366,6 @@ class GPUGeometry(object):
self.print_device_usage()
logger.info(self.device_usage_str())
- def make_bvh(self, vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
- assert branch_degree > 1
- bvh_module = get_cu_module('bvh.cu', options=cuda_options,
- include_source_directory=True)
- bvh_funcs = GPUFuncs(bvh_module)
-
- world_min = vertices.min(axis=0)
- # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
- # up by one count after quantization
- world_scale = (vertices.max(axis=0) - world_min) / (2**16 - 2)
-
- world_origin = ga.vec.make_float3(*world_min)
- world_scale = ga.vec.make_float3(*world_scale)
-
- layer_conf = compute_layer_configuration(ntriangles, branch_degree)
- layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
-
- # Last entry is number of nodes, trim off and add zero to get offset of each layer
- n_nodes = int(layer_offsets[-1])
- layer_offsets = [0] + layer_offsets[:-1]
-
- leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
- morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
-
- # Step 1: Make leaves
- nthreads_per_block=256
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
- bvh_funcs.make_leaves(np.uint32(first_index),
- np.uint32(elements_this_iter),
- gpu_triangles, gpu_vertices,
- world_origin, world_scale,
- leaf_nodes, morton_codes,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- # argsort on the CPU because I'm too lazy to do it on the GPU
- argsort = morton_codes.get().argsort().astype(np.uint32)
- del morton_codes
- #local_leaf_nodes = leaf_nodes.get()[argsort]
- #
- remap_order = ga.to_gpu(argsort)
- #m = morton_codes.get()
- #m.sort()
- #print m
- #assert False
- # Step 2: sort leaf nodes into full node list
- print cuda.mem_get_info(), leaf_nodes.nbytes
- nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
- #cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes)
-
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
- bvh_funcs.reorder_leaves(np.uint32(first_index),
- np.uint32(elements_this_iter),
- leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- del leaf_nodes
- del remap_order
-
- # Step 3: Create parent layers in reverse order
- layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
- layer_parameters.reverse()
-
- for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
- print parent_offset, first_index, elements_this_iter, nblocks_this_iter
- bvh_funcs.build_layer(np.uint32(first_index),
- np.uint32(elements_this_iter),
- np.uint32(branch_degree),
- nodes,
- np.uint32(parent_offset),
- np.uint32(child_offset),
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- return world_origin, world_scale, nodes
-
-
def device_usage_str(self):
'''Returns a formatted string displaying the memory usage.'''
s = 'device usage:\n'