summary | refs | log | tree | commit | diff
path: root/chroma/gpu
diff options
context:
space:
mode:
Diffstat (limited to 'chroma/gpu')
-rw-r--r-- chroma/gpu/geometry.py 257
1 files changed, 17 insertions, 240 deletions
diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py
index 6cb991c..77d33b2 100644
--- a/chroma/gpu/geometry.py
+++ b/chroma/gpu/geometry.py
@@ -6,228 +6,12 @@ from pycuda import characterize
from chroma.geometry import standard_wavelengths
from chroma.gpu.tools import get_cu_module, get_cu_source, cuda_options, \
chunk_iterator, format_array, format_size, to_uint3, to_float3, \
- make_gpu_struct, GPUFuncs
+ make_gpu_struct, GPUFuncs, mapped_empty, Mapped
from chroma.log import logger
-def round_up_to_multiple(x, multiple):
- remainder = x % multiple
- if remainder == 0:
- return x
- else:
- return x + multiple - remainder
-
-def compute_layer_configuration(n, branch_degree):
- if n == 1:
- # Special case for root
- return [ (1, 1) ]
- else:
- layer_conf = [ (n, round_up_to_multiple(n, branch_degree)) ]
-
- while layer_conf[0][1] > 1:
- nparent = int(np.ceil( float(layer_conf[0][1]) / branch_degree ))
- if nparent == 1:
- layer_conf = [ (1, 1) ] + layer_conf
- else:
- layer_conf = [ (nparent, round_up_to_multiple(nparent, branch_degree)) ] + layer_conf
-
- return layer_conf
-
-def optimize_bvh_layer(layer, bvh_funcs):
- n = len(layer)
- areas = ga.empty(shape=n, dtype=np.uint32)
- union_areas = ga.empty(shape=n, dtype=np.uint32)
- nthreads_per_block = 128
- min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint32)
- min_index = ga.empty_like(min_areas)
-
- update = 50000
-
- skip_size = 1
- flag = cuda.pagelocked_empty(shape=skip_size, dtype=np.uint32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
- flag_gpu = np.intp(flag.base.get_device_pointer())
- print 'starting optimization'
-
- i = 0
- skips = 0
- while i < (n/2 - 1):
- # How are we doing?
- if i % update == 0:
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
-
- bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
- np.uint32(elements_this_iter),
- layer,
- union_areas,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- union_areas_host = union_areas.get()[1::2]
- print 'Area of parent layer: %1.12e' % union_areas_host.astype(float).sum()
- print 'Area of parent layer so far (%d): %1.12e' % (i*2, union_areas_host.astype(float)[:i].\
-sum())
- print 'Skips:', skips
-
- test_index = i * 2
-
- blocks = 0
- look_forward = min(8192*400, n - test_index - 2)
- skip_this_round = min(skip_size, n - test_index - 1)
- flag[:] = 0
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
- bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
- np.uint32(elements_this_iter),
- np.uint32(test_index),
- layer,
- np.uint32(blocks),
- min_areas,
- min_index,
- flag_gpu,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter, skip_this_round))
- blocks += nblocks_this_iter
- cuda.Context.get_current().synchronize()
-
- if flag[0] == 0:
- flag_nonzero = flag.nonzero()[0]
- if len(flag_nonzero) == 0:
- no_swap_required = skip_size
- else:
- no_swap_required = flag_nonzero[0]
- i += no_swap_required
- skips += no_swap_required
- continue
-
- areas_host = min_areas[:blocks].get()
- min_index_host = min_index[:blocks].get()
- best_block = areas_host.argmin()
- better_i = min_index_host[best_block]
-
- if i % update == 0:
- print 'swapping %d and %d' % (test_index + 1, better_i)
-
- bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
- layer, block=(1,1,1), grid=(1,1))
- i += 1
-
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
-
- bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
- np.uint32(elements_this_iter),
- layer,
- union_areas,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- union_areas_host = union_areas.get()[1::2]
- print 'Final area of parent layer: %1.12e' % union_areas_host.sum()
- print 'Skips:', skips
-
-def make_bvh(vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
- assert branch_degree > 1
- bvh_module = get_cu_module('bvh.cu', options=cuda_options,
- include_source_directory=True)
- bvh_funcs = GPUFuncs(bvh_module)
-
- world_min = vertices.min(axis=0)
- # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
- # up by one count after quantization
- world_scale = np.max((vertices.max(axis=0) - world_min)) / (2**16 - 2)
-
- world_origin = ga.vec.make_float3(*world_min)
- world_scale = np.float32(world_scale)
-
- layer_conf = compute_layer_configuration(ntriangles, branch_degree)
- layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
-
- # Last entry is number of nodes, trim off and add zero to get offset of each layer
- n_nodes = int(layer_offsets[-1])
- layer_offsets = [0] + layer_offsets[:-1]
-
- leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
- morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
-
- # Step 1: Make leaves
- nthreads_per_block=256
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
- bvh_funcs.make_leaves(np.uint32(first_index),
- np.uint32(elements_this_iter),
- gpu_triangles, gpu_vertices,
- world_origin, world_scale,
- leaf_nodes, morton_codes,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- # argsort on the CPU because I'm too lazy to do it on the GPU
- argsort = morton_codes.get().argsort().astype(np.uint32)
- del morton_codes
- local_leaf_nodes = leaf_nodes.get()[argsort]
- del leaf_nodes
- #del remap_order
- #
- #remap_order = ga.to_gpu(argsort)
- #m = morton_codes.get()
- #m.sort()
- #print m
- #assert False
- # Step 2: sort leaf nodes into full node list
- #print cuda.mem_get_info(), leaf_nodes.nbytes
- nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
- areas = ga.zeros(shape=n_nodes, dtype=np.uint32)
- cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes)
-
- #for first_index, elements_this_iter, nblocks_this_iter in \
- # chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
- # bvh_funcs.reorder_leaves(np.uint32(first_index),
- # np.uint32(elements_this_iter),
- # leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
- # block=(nthreads_per_block,1,1),
- # grid=(nblocks_this_iter,1))
-
-
- # Step 3: Create parent layers in reverse order
- layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
- layer_parameters.reverse()
-
- i = len(layer_parameters)
- for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
- #if i < 30:
- # optimize_bvh_layer(nodes[child_offset:child_offset+nparent*branch_degree],
- # bvh_funcs)
-
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(nparent * branch_degree, nthreads_per_block,
- max_blocks=10000):
- bvh_funcs.node_area(np.uint32(first_index+child_offset),
- np.uint32(elements_this_iter),
- nodes,
- areas,
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- print 'area', i, nparent * branch_degree, '%e' % areas[child_offset:child_offset+nparent*branch_degree].get().astype(float).sum()
-
- for first_index, elements_this_iter, nblocks_this_iter in \
- chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
- bvh_funcs.build_layer(np.uint32(first_index),
- np.uint32(elements_this_iter),
- np.uint32(branch_degree),
- nodes,
- np.uint32(parent_offset),
- np.uint32(child_offset),
- block=(nthreads_per_block,1,1),
- grid=(nblocks_this_iter,1))
-
- i -= 1
-
- return world_origin, world_scale, nodes
-
class GPUGeometry(object):
- def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=2):
+ def __init__(self, geometry, wavelengths=None, print_usage=False):
if wavelengths is None:
wavelengths = standard_wavelengths
@@ -321,26 +105,18 @@ class GPUGeometry(object):
self.surface_pointer_array = \
make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)
- self.pagelocked_vertices = cuda.pagelocked_empty(shape=len(geometry.mesh.vertices),
- dtype=ga.vec.float3,
- mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED)
- self.pagelocked_triangles = cuda.pagelocked_empty(shape=len(geometry.mesh.triangles),
- dtype=ga.vec.uint3,
- mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED)
- self.pagelocked_vertices[:] = to_float3(geometry.mesh.vertices)
- self.pagelocked_triangles[:] = to_uint3(geometry.mesh.triangles)
- self.vertices = np.intp(self.pagelocked_vertices.base.get_device_pointer())
- self.triangles = np.intp(self.pagelocked_triangles.base.get_device_pointer())
-
-
- self.branch_degree = branch_degree
- print 'bvh', cuda.mem_get_info()
- self.world_origin, self.world_scale, self.nodes = make_bvh(geometry.mesh.vertices,
- self.vertices,
- len(geometry.mesh.triangles),
- self.triangles,
- self.branch_degree)
- print 'bvh after', cuda.mem_get_info()
+ self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
+ dtype=ga.vec.float3,
+ write_combined=True)
+ self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
+ dtype=ga.vec.uint3,
+ write_combined=True)
+ self.vertices[:] = to_float3(geometry.mesh.vertices)
+ self.triangles[:] = to_uint3(geometry.mesh.triangles)
+
+ self.nodes = ga.to_gpu(geometry.bvh.nodes)
+ self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
+ self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)
material_codes = (((geometry.material1_index & 0xff) << 24) |
((geometry.material2_index & 0xff) << 16) |
@@ -351,14 +127,15 @@ class GPUGeometry(object):
self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
self.gpudata = make_gpu_struct(geometry_struct_size,
- [self.vertices, self.triangles,
+ [Mapped(self.vertices),
+ Mapped(self.triangles),
self.material_codes,
self.colors, self.nodes,
self.material_pointer_array,
self.surface_pointer_array,
self.world_origin,
self.world_scale,
- np.uint32(self.branch_degree)])
+ np.uint32(geometry.bvh.degree)])
self.geometry = geometry