1 files changed, 17 insertions, 240 deletions
diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py
index 6cb991c..77d33b2 100644
--- a/chroma/gpu/geometry.py
+++ b/chroma/gpu/geometry.py
@@ -6,228 +6,12 @@ from pycuda import characterize
 from chroma.geometry import standard_wavelengths
 from chroma.gpu.tools import get_cu_module, get_cu_source, cuda_options, \
     chunk_iterator, format_array, format_size, to_uint3, to_float3, \
-    make_gpu_struct, GPUFuncs
+    make_gpu_struct, GPUFuncs, mapped_empty, Mapped
 
 from chroma.log import logger
 
-def round_up_to_multiple(x, multiple):
-    remainder = x % multiple
-    if remainder == 0:
-        return x
-    else:
-        return x + multiple - remainder
-
-def compute_layer_configuration(n, branch_degree):
-    if n == 1:
-        # Special case for root
-        return [ (1, 1) ]
-    else:
-        layer_conf = [ (n, round_up_to_multiple(n, branch_degree)) ]
-
-    while layer_conf[0][1] > 1:
-        nparent = int(np.ceil( float(layer_conf[0][1]) / branch_degree ))
-        if nparent == 1:
-            layer_conf = [ (1, 1) ] + layer_conf
-        else:
-            layer_conf = [ (nparent, round_up_to_multiple(nparent, branch_degree)) ] + layer_conf
-
-    return layer_conf
-
-def optimize_bvh_layer(layer, bvh_funcs):
-    n = len(layer)
-    areas = ga.empty(shape=n, dtype=np.uint32)
-    union_areas = ga.empty(shape=n, dtype=np.uint32)
-    nthreads_per_block = 128
-    min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint32)
-    min_index = ga.empty_like(min_areas)
-
-    update = 50000
-
-    skip_size = 1
-    flag = cuda.pagelocked_empty(shape=skip_size, dtype=np.uint32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
-    flag_gpu = np.intp(flag.base.get_device_pointer())
-    print 'starting optimization'
-
-    i = 0
-    skips = 0
-    while i < (n/2 - 1):
-        # How are we doing?
-        if i % update == 0:
-            for first_index, elements_this_iter, nblocks_this_iter in \
-                    chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
-
-                bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
-                                           np.uint32(elements_this_iter),
-                                           layer,
-                                           union_areas,
-                                           block=(nthreads_per_block,1,1),
-                                           grid=(nblocks_this_iter,1))
-
-            union_areas_host = union_areas.get()[1::2]
-            print 'Area of parent layer: %1.12e' % union_areas_host.astype(float).sum()
-            print 'Area of parent layer so far (%d): %1.12e' % (i*2, union_areas_host.astype(float)[:i].\
-sum())
-            print 'Skips:', skips
-
-        test_index = i * 2
-
-        blocks = 0
-        look_forward = min(8192*400, n - test_index - 2)
-        skip_this_round = min(skip_size, n - test_index - 1)
-        flag[:] = 0
-        for first_index, elements_this_iter, nblocks_this_iter in \
-                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
-            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
-                                      np.uint32(elements_this_iter),
-                                      np.uint32(test_index),
-                                      layer,
-                                      np.uint32(blocks),
-                                      min_areas,
-                                      min_index,
-                                      flag_gpu,
-                                      block=(nthreads_per_block,1,1),
-                                      grid=(nblocks_this_iter, skip_this_round))
-            blocks += nblocks_this_iter
-        cuda.Context.get_current().synchronize()
-
-        if flag[0] == 0:
-            flag_nonzero = flag.nonzero()[0]
-            if len(flag_nonzero) == 0:
-                no_swap_required = skip_size
-            else:
-                no_swap_required = flag_nonzero[0]
-            i += no_swap_required
-            skips += no_swap_required
-            continue
-
-        areas_host = min_areas[:blocks].get()
-        min_index_host = min_index[:blocks].get()
-        best_block = areas_host.argmin()
-        better_i = min_index_host[best_block]
-
-        if i % update == 0:
-            print 'swapping %d and %d' % (test_index + 1, better_i)
-
-        bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
-                       layer, block=(1,1,1), grid=(1,1))
-        i += 1
-
-    for first_index, elements_this_iter, nblocks_this_iter in \
-            chunk_iterator(n-1, nthreads_per_block, max_blocks=10000):
-
-        bvh_funcs.distance_to_prev(np.uint32(first_index + 1),
-                                   np.uint32(elements_this_iter),
-                                   layer,
-                                   union_areas,
-                                   block=(nthreads_per_block,1,1),
-                                   grid=(nblocks_this_iter,1))
-
-    union_areas_host = union_areas.get()[1::2]
-    print 'Final area of parent layer: %1.12e' % union_areas_host.sum()
-    print 'Skips:', skips
-
-def make_bvh(vertices, gpu_vertices, ntriangles, gpu_triangles, branch_degree):
-    assert branch_degree > 1
-    bvh_module = get_cu_module('bvh.cu', options=cuda_options, 
-                                include_source_directory=True)
-    bvh_funcs = GPUFuncs(bvh_module)
-
-    world_min = vertices.min(axis=0)
-    # Full scale at 2**16 - 2 in order to ensure there is dynamic range to round
-    # up by one count after quantization
-    world_scale = np.max((vertices.max(axis=0) - world_min)) / (2**16 - 2)
-
-    world_origin = ga.vec.make_float3(*world_min)
-    world_scale  = np.float32(world_scale)
-
-    layer_conf = compute_layer_configuration(ntriangles, branch_degree)
-    layer_offsets = list(np.cumsum([npad for n, npad in layer_conf]))
-
-    # Last entry is number of nodes, trim off and add zero to get offset of each layer
-    n_nodes = int(layer_offsets[-1])
-    layer_offsets = [0] + layer_offsets[:-1]
-
-    leaf_nodes = ga.empty(shape=ntriangles, dtype=ga.vec.uint4)
-    morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)
-
-    # Step 1: Make leaves
-    nthreads_per_block=256
-    for first_index, elements_this_iter, nblocks_this_iter in \
-            chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
-        bvh_funcs.make_leaves(np.uint32(first_index),
-                              np.uint32(elements_this_iter),
-                              gpu_triangles, gpu_vertices, 
-                              world_origin, world_scale,
-                              leaf_nodes, morton_codes,
-                              block=(nthreads_per_block,1,1),
-                              grid=(nblocks_this_iter,1))
-
-    # argsort on the CPU because I'm too lazy to do it on the GPU
-    argsort = morton_codes.get().argsort().astype(np.uint32)
-    del morton_codes
-    local_leaf_nodes = leaf_nodes.get()[argsort]
-    del leaf_nodes
-    #del remap_order
-    #
-    #remap_order = ga.to_gpu(argsort)
-    #m = morton_codes.get()
-    #m.sort()
-    #print m
-    #assert False
-    # Step 2: sort leaf nodes into full node list
-    #print cuda.mem_get_info(), leaf_nodes.nbytes
-    nodes = ga.zeros(shape=n_nodes, dtype=ga.vec.uint4)
-    areas = ga.zeros(shape=n_nodes, dtype=np.uint32)
-    cuda.memcpy_htod(int(nodes.gpudata)+int(layer_offsets[-1]), local_leaf_nodes)
-
-    #for first_index, elements_this_iter, nblocks_this_iter in \
-    #       chunk_iterator(ntriangles, nthreads_per_block, max_blocks=10000):
-    #   bvh_funcs.reorder_leaves(np.uint32(first_index),
-    #                            np.uint32(elements_this_iter),
-    #                            leaf_nodes, nodes[layer_offsets[-1]:], remap_order,
-    #                            block=(nthreads_per_block,1,1),
-    #                            grid=(nblocks_this_iter,1))
-
-
-    # Step 3: Create parent layers in reverse order
-    layer_parameters = zip(layer_offsets[:-1], layer_offsets[1:], layer_conf)
-    layer_parameters.reverse()
-
-    i = len(layer_parameters)
-    for parent_offset, child_offset, (nparent, nparent_pad) in layer_parameters:
-        #if i < 30:
-        #    optimize_bvh_layer(nodes[child_offset:child_offset+nparent*branch_degree],
-        #                  bvh_funcs)
-
-        for first_index, elements_this_iter, nblocks_this_iter in \
-                chunk_iterator(nparent * branch_degree, nthreads_per_block,
-                               max_blocks=10000):
-            bvh_funcs.node_area(np.uint32(first_index+child_offset),
-                                np.uint32(elements_this_iter),
-                                nodes,
-                                areas,
-                                block=(nthreads_per_block,1,1),
-                                grid=(nblocks_this_iter,1))
-
-        print 'area', i, nparent * branch_degree, '%e' % areas[child_offset:child_offset+nparent*branch_degree].get().astype(float).sum()
-
-        for first_index, elements_this_iter, nblocks_this_iter in \
-                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
-            bvh_funcs.build_layer(np.uint32(first_index),
-                                  np.uint32(elements_this_iter),
-                                  np.uint32(branch_degree),
-                                  nodes,
-                                  np.uint32(parent_offset),
-                                  np.uint32(child_offset),
-                                  block=(nthreads_per_block,1,1),
-                                  grid=(nblocks_this_iter,1))
-
-        i -= 1
-
-    return world_origin, world_scale, nodes
-
 class GPUGeometry(object):
-    def __init__(self, geometry, wavelengths=None, print_usage=False, branch_degree=2):
+    def __init__(self, geometry, wavelengths=None, print_usage=False):
         if wavelengths is None:
             wavelengths = standard_wavelengths
 
@@ -321,26 +105,18 @@ class GPUGeometry(object):
         self.surface_pointer_array = \
             make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)
 
-        self.pagelocked_vertices = cuda.pagelocked_empty(shape=len(geometry.mesh.vertices),
-                                                         dtype=ga.vec.float3,
-                                                         mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED)
-        self.pagelocked_triangles = cuda.pagelocked_empty(shape=len(geometry.mesh.triangles),
-                                                         dtype=ga.vec.uint3,
-                                                         mem_flags=cuda.host_alloc_flags.DEVICEMAP | cuda.host_alloc_flags.WRITECOMBINED)
-        self.pagelocked_vertices[:] = to_float3(geometry.mesh.vertices)
-        self.pagelocked_triangles[:] = to_uint3(geometry.mesh.triangles)
-        self.vertices = np.intp(self.pagelocked_vertices.base.get_device_pointer())
-        self.triangles = np.intp(self.pagelocked_triangles.base.get_device_pointer())
-
-
-        self.branch_degree = branch_degree
-        print 'bvh', cuda.mem_get_info()
-        self.world_origin, self.world_scale, self.nodes = make_bvh(geometry.mesh.vertices,
-                                                                   self.vertices,
-                                                                   len(geometry.mesh.triangles),
-                                                                   self.triangles,
-                                                                   self.branch_degree)
-        print 'bvh after', cuda.mem_get_info()
+        self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
+                                     dtype=ga.vec.float3,
+                                     write_combined=True)
+        self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
+                                      dtype=ga.vec.uint3,
+                                      write_combined=True)
+        self.vertices[:] = to_float3(geometry.mesh.vertices)
+        self.triangles[:] = to_uint3(geometry.mesh.triangles)
+
+        self.nodes = ga.to_gpu(geometry.bvh.nodes)
+        self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
+        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)
 
         material_codes = (((geometry.material1_index & 0xff) << 24) |
                           ((geometry.material2_index & 0xff) << 16) |
@@ -351,14 +127,15 @@ class GPUGeometry(object):
         self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
 
         self.gpudata = make_gpu_struct(geometry_struct_size,
-                                       [self.vertices, self.triangles,
+                                       [Mapped(self.vertices), 
+                                        Mapped(self.triangles),
                                         self.material_codes,
                                         self.colors, self.nodes,
                                         self.material_pointer_array,
                                         self.surface_pointer_array,
                                         self.world_origin,
                                         self.world_scale,
-                                        np.uint32(self.branch_degree)])
+                                        np.uint32(geometry.bvh.degree)])
 
         self.geometry = geometry