author    Stan Seibert <stan@mtrr.org>                2012-01-12 13:27:31 -0500
committer tlatorre <tlatorre@uchicago.edu>            2021-05-09 08:42:38 -0700
commit    3961d2b3aa0b0b0783ddd1138556c374a2731e9e (patch)
tree      3002f685d75785dabe7ded6c80c2817ede82bce3 /chroma
parent    7535581339b839ad17f79ee3e931a0e907cf5071 (diff)
Pull BVH calculation out of GPUGeometry class and make the world scale factor the same in all dimensions so that distance and area calculations are easy to do.
Diffstat (limited to 'chroma')
-rw-r--r--  chroma/cuda/bvh.cu            | 214
-rw-r--r--  chroma/cuda/geometry_types.h  |   3
-rw-r--r--  chroma/gpu/geometry.py        | 288
3 files changed, 409 insertions, 96 deletions
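
The heart of the change is that every axis now shares one quantization scale, so box extents and surface areas measured in grid units are directly comparable across axes. A minimal NumPy sketch of that quantization (the vertex values are made up; the scale factor and truncation match quantize() and make_bvh() in the diff below):

    import numpy as np

    # Hypothetical vertex array; the values are for illustration only.
    vertices = np.array([[  0.0,  0.0,   0.0],
                         [100.0, 20.0,   5.0],
                         [ 30.0, 80.0, -10.0]])

    world_origin = vertices.min(axis=0)
    # A single scale factor shared by all axes, sized so the largest extent
    # spans 2**16 - 2 counts (leaving room to round an upper bound up by one).
    world_scale = (vertices.max(axis=0) - world_origin).max() / (2**16 - 2)

    # Truncating quantization, matching quantize() in bvh.cu.
    quantized = ((vertices - world_origin) / world_scale).astype(np.uint32)
    print(quantized)
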
diff --git a/chroma/cuda/bvh.cu b/chroma/cuda/bvh.cu
index 8543649..50e877a 100644
--- a/chroma/cuda/bvh.cu
+++ b/chroma/cuda/bvh.cu
@@ -2,6 +2,7 @@
#include <cuda.h>
#include "linalg.h"
+#include "physical_constants.h"
__device__ float3
fminf(const float3 &a, const float3 &b)
@@ -47,17 +48,79 @@ __device__ unsigned long long spread3_16(unsigned int input)
return x;
}
+__device__ unsigned long long spread2_16(unsigned int input)
+{
+ unsigned long long x = input;
+ x = (x | (x << 16)) & 0x000000ff00ff00fful;
+ x = (x | (x << 8)) & 0x00000f0f0f0f0f0ful;
+ x = (x | (x << 4)) & 0x0000333333333333ul;
+ x = (x | (x << 2)) & 0x0000555555555555ul;
+ return x;
+}
+
+
__device__ unsigned int quantize(float v, float world_origin, float world_scale)
{
// truncate!
return (unsigned int) ((v - world_origin) / world_scale);
}
-__device__ uint3 quantize3(float3 v, float3 world_origin, float3 world_scale)
+__device__ uint3 quantize3(float3 v, float3 world_origin, float world_scale)
+{
+ return make_uint3(quantize(v.x, world_origin.x, world_scale),
+ quantize(v.y, world_origin.y, world_scale),
+ quantize(v.z, world_origin.z, world_scale));
+}
+
+__device__ uint3 quantize3_cyl(float3 v, float3 world_origin, float world_scale)
+{
+ float3 rescaled_v = (v - world_origin) / world_scale / sqrtf(3.0f);
+ unsigned int z = rescaled_v.z;
+ rescaled_v.z = 0.0f;
+ unsigned int rho = (unsigned int) norm(rescaled_v);
+ unsigned int phi = (unsigned int) ((atan2f(v.y, v.x)/PI/2.0f + 1.0f) * 65535.0f);
+
+ return make_uint3(rho, phi, z);
+}
+
+__device__ uint3 quantize3_sph(float3 v, float3 world_origin, float world_scale)
+{
+ float3 rescaled_v = (v - world_origin) / world_scale;
+
+ unsigned int r = (unsigned int) (norm(rescaled_v) / sqrt(3.0f));
+
+ unsigned int phi = (unsigned int) ((atan2f(rescaled_v.y, rescaled_v.x)/PI/2.0f + 1.0f) * 65535.0f);
+
+ unsigned int theta = (unsigned int) (acosf(rescaled_v.z / norm(rescaled_v)) / PI * 65535.0f);
+
+ return make_uint3(r, theta, phi);
+}
+
+__device__ uint4 node_union(const uint4 &a, const uint4 &b)
+{
+ uint3 lower = make_uint3(min(a.x & 0xFFFF, b.x & 0xFFFF),
+ min(a.y & 0xFFFF, b.y & 0xFFFF),
+ min(a.z & 0xFFFF, b.z & 0xFFFF));
+ uint3 upper = make_uint3(max(a.x >> 16, b.x >> 16),
+ max(a.y >> 16, b.y >> 16),
+ max(a.z >> 16, b.z >> 16));
+
+ return make_uint4(upper.x << 16 | lower.x,
+ upper.y << 16 | lower.y,
+ upper.z << 16 | lower.z,
+ 0);
+
+}
+
+
+
+__device__ unsigned int surface_half_area(const uint4 &node)
{
- return make_uint3(quantize(v.x, world_origin.x, world_scale.x),
- quantize(v.y, world_origin.y, world_scale.y),
- quantize(v.z, world_origin.z, world_scale.z));
+ unsigned int x = (node.x >> 16) - (node.x & 0xFFFF);
+ unsigned int y = (node.y >> 16) - (node.y & 0xFFFF);
+ unsigned int z = (node.z >> 16) - (node.z & 0xFFFF);
+
+ return x*y + y*z + z*x;
}
const unsigned int LEAF_BIT = (1U << 31);
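
Each axis of a node is packed into one 32-bit word: the quantized lower bound in the low 16 bits and the upper bound in the high 16 bits. node_union therefore takes the per-axis min of the lower bounds and max of the upper bounds, and surface_half_area returns x*y + y*z + z*x of the extents, i.e. half the surface area of the box in grid units (the factor of two cancels in comparisons). A small NumPy sketch of the same two helpers, with hypothetical packed values:

    import numpy as np

    def node_union(a, b):
        # Lower bound lives in the low 16 bits, upper bound in the high 16 bits.
        lower = np.minimum(a & 0xFFFF, b & 0xFFFF)
        upper = np.maximum(a >> 16, b >> 16)
        return (upper << 16) | lower

    def surface_half_area(node):
        x, y, z = (node >> 16) - (node & 0xFFFF)
        return x * y + y * z + z * x

    # Two made-up nodes; each entry is one axis word packed as upper << 16 | lower.
    a = np.array([(5 << 16) | 1, (7 << 16) | 2, (9 << 16) | 3], dtype=np.uint32)
    b = np.array([(6 << 16) | 0, (4 << 16) | 1, (8 << 16) | 2], dtype=np.uint32)
    print(surface_half_area(node_union(a, b)))
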
@@ -66,9 +129,25 @@ extern "C"
{
__global__ void
+ node_area(unsigned int first_node,
+ unsigned int nnodes_this_round,
+ uint4 *nodes,
+ unsigned int *areas)
+ {
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+ if (thread_id >= nnodes_this_round)
+ return;
+
+ unsigned int node_id = first_node + thread_id;
+
+ areas[node_id] = surface_half_area(nodes[node_id]);
+ }
+
+
+ __global__ void
make_leaves(unsigned int first_triangle,
unsigned int ntriangles, uint3 *triangles, float3 *vertices,
- float3 world_origin, float3 world_scale,
+ float3 world_origin, float world_scale,
uint4 *leaf_nodes, unsigned long long *morton_codes)
{
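
node_area and the other new kernels are launched from Python in fixed-size chunks via chunk_iterator, which is defined elsewhere in chroma and not part of this diff. A plausible pure-Python sketch of what it yields, assuming triples of (first_index, elements_this_iter, nblocks_this_iter) with at most max_blocks blocks per launch:

    def chunk_iterator(nelements, nthreads_per_block, max_blocks=10000):
        # Sketch only; the real helper is part of chroma's GPU utilities.
        first = 0
        while first < nelements:
            elements_this_iter = min(nelements - first,
                                     max_blocks * nthreads_per_block)
            nblocks_this_iter = (elements_this_iter + nthreads_per_block - 1) \
                                // nthreads_per_block
            yield first, elements_this_iter, nblocks_this_iter
            first += elements_this_iter
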
@@ -107,6 +186,17 @@ extern "C"
spread3_16(q_centroid.x)
| (spread3_16(q_centroid.y) << 1)
| (spread3_16(q_centroid.z) << 2);
+
+ //unsigned long long morton =
+ // spread3_16(triangle.x & 0xFFFF)
+ // | (spread3_16(triangle.y & 0xFFFF) << 1)
+ // | (spread3_16(triangle.z & 0xFFFF) << 2);
+
+
+ //unsigned long long morton =
+ // ( ((unsigned long long) q_centroid.x) << 32 )
+ // | ( ((unsigned long long) q_centroid.y) << 16 )
+ // | ( ((unsigned long long) q_centroid.z) << 0 );
// Write leaf and morton code
uint4 leaf_node;
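
The leaf ordering comes from a Morton (Z-order) key: the bits of each 16-bit quantized centroid coordinate are spread apart and the three results interleaved into one 48-bit value, so sorting leaves by key places spatially nearby triangles next to each other. A small Python sketch of the interleaving; it mirrors what spread3_16 computes, using a plain loop instead of the mask-and-shift tricks:

    def spread3_16(x):
        # Insert two zero bits between each of the low 16 bits of x.
        out = 0
        for i in range(16):
            out |= ((x >> i) & 1) << (3 * i)
        return out

    def morton3(qx, qy, qz):
        return spread3_16(qx) | (spread3_16(qy) << 1) | (spread3_16(qz) << 2)

    # Hypothetical quantized centroid.
    print(hex(morton3(0x1234, 0x00ff, 0x8000)))
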
@@ -180,4 +270,118 @@ extern "C"
nodes[parent_layer_offset + parent_id] = parent_node;
}
+ __global__ void distance_to_prev(unsigned int first_node, unsigned int threads_this_round,
+ uint4 *node, unsigned int *area)
+ {
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+ if (thread_id >= threads_this_round)
+ return;
+
+ unsigned int node_id = first_node + thread_id;
+
+ uint4 a = node[node_id - 1];
+ uint4 b = node[node_id];
+ uint4 u = node_union(a, b);
+
+ area[node_id] = surface_half_area(u);
+ }
+
+ __global__ void distance_to(unsigned int first_node, unsigned int threads_this_round,
+ unsigned int target_index,
+ uint4 *node, unsigned int *area)
+ {
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+ if (thread_id >= threads_this_round)
+ return;
+
+ unsigned int node_id = first_node + thread_id;
+
+ if (node_id == target_index) {
+ area[node_id] = 0xFFFFFFFF;
+ } else {
+ uint4 a = node[target_index];
+ uint4 b = node[node_id];
+ uint4 u = node_union(a, b);
+
+ area[node_id] = surface_half_area(u);
+ }
+ }
+
+ __global__ void min_distance_to(unsigned int first_node, unsigned int threads_this_round,
+ unsigned int target_index,
+ uint4 *node,
+ unsigned int block_offset,
+ unsigned int *min_area_block,
+ unsigned int *min_index_block,
+ unsigned int *flag)
+ {
+ __shared__ unsigned int min_area;
+ __shared__ unsigned int adjacent_area;
+
+ target_index += blockIdx.y;
+
+ uint4 a = node[target_index];
+
+ if (threadIdx.x == 0) {
+ min_area = 0xFFFFFFFF;
+ adjacent_area = surface_half_area(node_union(a, node[target_index+1]));
+ }
+
+ __syncthreads();
+
+ unsigned int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+
+ unsigned int node_id = first_node + thread_id;
+
+ if (thread_id >= threads_this_round)
+ node_id = target_index;
+
+ unsigned int area;
+
+ if (node_id == target_index) {
+ area = 0xFFFFFFFF;
+ } else {
+ uint4 b = node[node_id];
+
+ if (b.x == 0) {
+ area = 0xFFFFFFFF;
+ } else {
+ uint4 u = node_union(a, b);
+ area = surface_half_area(u);
+ }
+ }
+
+ atomicMin(&min_area, area);
+
+ __syncthreads();
+
+ if (min_area == area) {
+
+ if (blockIdx.y == 0) {
+ if (min_area < adjacent_area) {
+ min_index_block[block_offset + blockIdx.x] = node_id;
+ min_area_block[block_offset + blockIdx.x] = area;
+ flag[0] = 1;
+ } else {
+ min_area_block[block_offset + blockIdx.x] = 0xFFFFFFFF;
+ }
+ } else {
+
+ if (min_area < adjacent_area)
+ flag[blockIdx.y] = 1;
+ }
+
+ }
+ }
+
+
+
+ __global__ void swap(unsigned int a_index, unsigned int b_index,
+ uint4 *node)
+ {
+ uint4 temp4 = node[a_index];
+ node[a_index] = node[b_index];
+ node[b_index] = temp4;
+ }
+
} // extern "C"
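
min_distance_to implements, per CUDA block, the inner loop of a greedy sibling search: every candidate node contributes the half-area of its union with the target node, an atomicMin picks the block minimum, and the winner is only recorded if it beats adjacent_area, the cost of simply pairing the target with the node that already follows it (the flag array signals when no swap is needed). A sequential Python sketch of the same acceptance criterion; the box representation and helper names here are illustrative, not chroma's API:

    import numpy as np

    def half_area_of_union(a, b):
        # a, b are (lower, upper) corner pairs in quantized grid units.
        lower = np.minimum(a[0], b[0])
        upper = np.maximum(a[1], b[1])
        x, y, z = upper - lower
        return x * y + y * z + z * x

    def greedy_pairing_pass(boxes):
        # For each even index i, keep the current partner i+1 unless some later
        # node j gives a strictly smaller union half-area, in which case swap
        # j into position i+1 (mirroring the swap kernel above).
        i = 0
        while i + 1 < len(boxes):
            current = half_area_of_union(boxes[i], boxes[i + 1])
            candidates = [half_area_of_union(boxes[i], boxes[j])
                          for j in range(i + 2, len(boxes))]
            if candidates and min(candidates) < current:
                j = i + 2 + int(np.argmin(candidates))
                boxes[i + 1], boxes[j] = boxes[j], boxes[i + 1]
            i += 2
        return boxes

In the GPU version the candidate scan is bounded (look_forward in geometry.py) and spread across many blocks, but the swap-or-skip decision is the same.
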
diff --git a/chroma/cuda/geometry_types.h b/chroma/cuda/geometry_types.h
index 31ff3c6..89c53bc 100644
--- a/chroma/cuda/geometry_types.h
+++ b/chroma/cuda/geometry_types.h
@@ -47,8 +47,7 @@ struct Geometry
Material **materials;
Surface **surfaces;
float3 world_origin;
- float _dummy1; // for alignment
- float3 world_scale;
+ float world_scale;
unsigned int branch_degree;
};
diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py
index f3a3b19..6cb991c 100644
--- a/chroma/gpu/geometry.py
+++ b/chroma/gpu/geometry.py
@@ -33,8 +33,201 @@ def compute_layer_configuration(n, branch_degree):
return layer_conf
+def optimize_bvh_layer(layer, bvh_funcs):
+ n = len(layer)
+ areas = ga.empty(shape=n, dtype=np.uint32)
+ union_areas = ga.empty(shape=n, dtype=np.uint32)
+ nthreads_per_block = 128
+ min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint32)
+ min_index = ga.empty_like(min_areas)
+
+ update = 50000
+
+ skip_size = 1
+ flag = cuda.pagelocked_empty(shape=skip_size, dtype=np.uint32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
+ flag_gpu = np.intp(flag.base.get_device_pointer())
+ print 'starting optimization'
+
+ i = 0
+ skips = 0
+ while i < (n/2 - 1):
+ # How are we doing?
+ if i % update == 0:
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
+ np.uint32(elements_this_iter),
+ layer,
+ union_areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ union_areas_host = union_areas.get()[1::2]
+ print 'Area of parent layer: %1.12e' % union_areas_host.astype(float).sum()
+ print 'Area of parent layer so far (%d): %1.12e' % (i*2, union_areas_host.astype(float)[:i].\
+sum())
+ print 'Skips:', skips
+
+ test_index = i * 2
+
+ blocks = 0
+ look_forward = min(8192*400, n - test_index - 2)
+ skip_this_round = min(skip_size, n - test_index - 1)
+ flag[:] = 0
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
+ np.uint32(elements_this_iter),
+ np.uint32(test_index),
+ layer,
+ np.uint32(blocks),
+ min_areas,
+ min_index,
+ flag_gpu,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter, skip_this_round))
+ blocks += nblocks_this_iter
+ cuda.Context.get_current().synchronize()
+
+ if flag[0] == 0:
+ flag_nonzero = flag.nonzero()[0]
+ if len(flag_nonzero) == 0:
+ no_swap_required = skip_size
+ else:
+ no_swap_required = flag_nonzero[0]
+ i += no_swap_required
+ skips += no_swap_required
+ continue
+
+ areas_host = min_areas[:blocks].get()
+ min_index_host = min_index[:blocks].get()
+ best_block = areas_host.argmin()
+ better_i = min_index_host[best_block]
+
+ if i % update == 0:
+ print 'swapping %d and %d' % (test_index + 1, better_i)
+
+ bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
+ layer, block=(1,1,1), grid=(1,1))
+ i += 1
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
+
+ bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
+ np.uint32(elements_this_iter),
+ layer,
+ union_areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ union_areas_host = union_areas.get()[1::2]
+ print 'Final area of parent layer: %1.12e' % union_areas_host.sum()
+ print 'Skips:', skips
+
+def make_bvh(vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
+ assert branch_degree > 1
+ bvh_module = get_cu_module('bvh.cu', options=cuda_options,
+ include_source_directory=True)
+ bvh_funcs = GPUFuncs(bvh_module)
+
+ world_min = vertices.min(axis=0)
+ # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
+ # up by one count after quantization
+ world_scale = np.max((vertices.max(axis=0) - world_min)) / (2**16 - 2)
+
+ world_origin = ga.vec.make_float3(*world_min)
+ world_scale = np.float32(world_scale)
+
+ layer_conf = compute_layer_configuration(ntriangles, branch_degree)
+ layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
+
+ # Last entry is number of nodes, trim off and add zero to get offset of each layer
+ n_nodes = int(layer_offsets[-1])
+ layer_offsets = [0] + layer_offsets[:-1]
+
+ leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
+ morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
+
+ # Step 1: Make leaves
+ nthreads_per_block=256
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.make_leaves(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ gpu_triangles, gpu_vertices,
+ world_origin, world_scale,
+ leaf_nodes, morton_codes,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ # argsort on the CPU because I'm too lazy to do it on the GPU
+ argsort = morton_codes.get().argsort().astype(np.uint32)
+ del morton_codes
+ local_leaf_nodes = leaf_nodes.get()[argsort]
+ del leaf_nodes
+ #del remap_order
+ #
+ #remap_order = ga.to_gpu(argsort)
+ #m = morton_codes.get()
+ #m.sort()
+ #print m
+ #assert False
+ # Step 2: sort leaf nodes into full node list
+ #print cuda.mem_get_info(), leaf_nodes.nbytes
+ nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
+ areas = ga.zeros(shape=n_nodes, dtype=np.uint32)
+ cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes)
+
+ #for first_index, elements_this_iter, nblocks_this_iter in \
+ # chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
+ # bvh_funcs.reorder_leaves(np.uint32(first_index),
+ # np.uint32(elements_this_iter),
+ # leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
+ # block=(nthreads_per_block,1,1),
+ # grid=(nblocks_this_iter,1))
+
+
+ # Step 3: Create parent layers in reverse order
+ layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
+ layer_parameters.reverse()
+
+ i = len(layer_parameters)
+ for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
+ #if i < 30:
+ # optimize_bvh_layer(nodes[child_offset:child_offset+nparent*branch_degree],
+ # bvh_funcs)
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(nparent * branch_degree, nthreads_per_block,
+ max_blocks=10000):
+ bvh_funcs.node_area(np.uint32(first_index+child_offset),
+ np.uint32(elements_this_iter),
+ nodes,
+ areas,
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ print 'area', i, nparent * branch_degree, '%e' % areas[child_offset:child_offset+nparent*branch_degree].get().astype(float).sum()
+
+ for first_index, elements_this_iter, nblocks_this_iter in \
+ chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
+ bvh_funcs.build_layer(np.uint32(first_index),
+ np.uint32(elements_this_iter),
+ np.uint32(branch_degree),
+ nodes,
+ np.uint32(parent_offset),
+ np.uint32(child_offset),
+ block=(nthreads_per_block,1,1),
+ grid=(nblocks_this_iter,1))
+
+ i -= 1
+
+ return world_origin, world_scale, nodes
+
class GPUGeometry(object):
- def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=3):
+ def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=2):
if wavelengths is None:
wavelengths = standard_wavelengths
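
A quick note on the index bookkeeping in make_bvh: compute_layer_configuration yields (n, npad) pairs per layer, the running sum of the padded sizes ends with the total node count, and prepending a zero while dropping that last entry gives the starting offset of each layer in the flat node array (the leaf layer sits at layer_offsets[-1], where the sorted leaf nodes are copied). A small sketch with a made-up layer configuration:

    import numpy as np

    # Hypothetical (n, npad) pairs; the leaf layer is the last entry.
    layer_conf = [(1, 1), (3, 4), (10, 12), (37, 40)]

    layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
    n_nodes = int(layer_offsets[-1])           # 57: total length of the node array
    layer_offsets = [0] + layer_offsets[:-1]   # [0, 1, 5, 17]: start of each layer
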
@@ -142,11 +335,11 @@ class GPUGeometry(object):
self.branch_degree = branch_degree
print 'bvh', cuda.mem_get_info()
- self.world_origin, self.world_scale, self.nodes = self.make_bvh(geometry.mesh.vertices,
- self.vertices,
- len(geometry.mesh.triangles),
- self.triangles,
- self.branch_degree)
+ self.world_origin, self.world_scale, self.nodes = make_bvh(geometry.mesh.vertices,
+ self.vertices,
+ len(geometry.mesh.triangles),
+ self.triangles,
+ self.branch_degree)
print 'bvh after', cuda.mem_get_info()
material_codes = (((geometry.material1_index & 0xff) << 24) |
@@ -164,7 +357,6 @@ class GPUGeometry(object):
self.material_pointer_array,
self.surface_pointer_array,
self.world_origin,
- np.float32(0.0), # dummy1
self.world_scale,
np.uint32(self.branch_degree)])
@@ -174,88 +366,6 @@ class GPUGeometry(object):
self.print_device_usage()
logger.info(self.device_usage_str())
- def make_bvh(self, vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
- assert branch_degree > 1
- bvh_module = get_cu_module('bvh.cu', options=cuda_options,
- include_source_directory=True)
- bvh_funcs = GPUFuncs(bvh_module)
-
- world_min = vertices.min(axis=0)
- # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
- # up by one count after quantization
- world_scale = (vertices.max(axis=0) - world_min) / (2**16 - 2)
-
- world_origin = ga.vec.make_float3(*world_min)
- world_scale = ga.vec.make_float3(*world_scale)
-
- layer_conf = compute_layer_configuration(ntriangles, branch_degree)
- layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
-
- # Last entry is number of nodes, trim off and add zero to get offset of each layer
- n_nodes = int(layer_offsets[-1])
- layer_offsets = [0] + layer_offsets[:-1]
-
- leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
- morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
-
- # Step 1: Make leaves
- nthreads_per_block=256
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
- bvh_funcs.make_leaves(np.uint32(first_index),
- np.uint32(elements_this_iter),
- gpu_triangles, gpu_vertices,
- world_origin, world_scale,
- leaf_nodes, morton_codes,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- # argsort on the CPU because I'm too lazy to do it on the GPU
- argsort = morton_codes.get().argsort().astype(np.uint32)
- del morton_codes
- #local_leaf_nodes = leaf_nodes.get()[argsort]
- #
- remap_order = ga.to_gpu(argsort)
- #m = morton_codes.get()
- #m.sort()
- #print m
- #assert False
- # Step 2: sort leaf nodes into full node list
- print cuda.mem_get_info(), leaf_nodes.nbytes
- nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
- #cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes)
-
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
- bvh_funcs.reorder_leaves(np.uint32(first_index),
- np.uint32(elements_this_iter),
- leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- del leaf_nodes
- del remap_order
-
- # Step 3: Create parent layers in reverse order
- layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
- layer_parameters.reverse()
-
- for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
- print parent_offset, first_index, elements_this_iter, nblocks_this_iter
- bvh_funcs.build_layer(np.uint32(first_index),
- np.uint32(elements_this_iter),
- np.uint32(branch_degree),
- nodes,
- np.uint32(parent_offset),
- np.uint32(child_offset),
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- return world_origin, world_scale, nodes
-
-
def device_usage_str(self):
'''Returns a formatted string displaying the memory usage.'''
s = 'device usage:\n'