Add an argsort_direction() function to chroma.tools and use it to

group photons so that they take similar paths on the GPU. argsort_direction() morton-orders an array of normalized direction vectors according to their spherical coordinates. Photons sorted in this way tend to follow similar paths through a detector geometry, which enhances cache locality. As a result, get_node() uses the GPU L1 cache again, with good results.
author: Stan Seibert <stan@mtrr.org> 2012-02-17 18:26:41 -0500
committer: tlatorre <tlatorre@uchicago.edu> 2021-05-09 08:42:38 -0700
commit: cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3 (patch)
tree: 017fb4cc93f526dfc61951f282fb9f26526fd478
parent: e2bf16de5d94a7b438e9c1af52f0bb4dc632b35f (diff)
download: chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.tar.gz
chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.tar.bz2
chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.zip
4 files changed, 66 insertions, 10 deletions
diff --git a/chroma/benchmark.py b/chroma/benchmark.py
index 06a8eba..3c3e919 100755
--- a/chroma/benchmark.py
+++ b/chroma/benchmark.py
@@ -14,6 +14,7 @@ from chroma import generator
 from chroma import tools
 from chroma.transform import normalize
 from chroma.demo.optics import water
+from chroma.loader import create_geometry_from_obj
 
 # Generator processes need to fork BEFORE the GPU context is setup
 g4generator = generator.photon.G4ParallelGenerator(4, water)
@@ -29,7 +30,9 @@ def intersect(gpu_geometry, number=100, nphotons=500000, nthreads_per_block=64,
     run_times = []
     for i in tools.progress(range(number)):
         pos = ga.zeros(nphotons, dtype=ga.vec.float3)
-        dir = ga.to_gpu(gpu.to_float3(sample.uniform_sphere(nphotons)))
+        dir = sample.uniform_sphere(nphotons)
+        reorder = tools.argsort_direction(dir)
+        dir = ga.to_gpu(gpu.to_float3(dir[reorder]))
 
         t0 = time.time()
         gpu_funcs.distance_to_mesh(np.int32(pos.size), pos, dir, gpu_geometry.gpudata, distances_gpu, block=(nthreads_per_block,1,1), grid=(pos.size//nthreads_per_block+1,1))
@@ -73,6 +76,8 @@ def propagate(gpu_detector, number=10, nphotons=500000, nthreads_per_block=64,
     for i in tools.progress(range(number)):
         pos = np.zeros((nphotons,3))
         dir = sample.uniform_sphere(nphotons)
+        reorder = tools.argsort_direction(dir)
+        dir = dir[reorder]
         pol = normalize(np.cross(sample.uniform_sphere(nphotons), dir))
         wavelengths = np.random.uniform(400,800,size=nphotons)
         photons = event.Photons(pos, dir, pol, wavelengths)
@@ -132,8 +137,10 @@ def pdf(gpu_detector, npdfs=10, nevents=100, nreps=16, ndaq=1,
                                   nthreads_per_block, max_blocks)
             for gpu_photon_slice in gpu_photons.iterate_copies():
                 for idaq in xrange(ndaq):
-                    gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
-                                                   nthreads_per_block, max_blocks)
+                    gpu_daq.begin_acquire()
+                    gpu_daq.acquire(gpu_photon_slice, rng_states,
+                                    nthreads_per_block, max_blocks)
+                    gpu_channels = gpu_daq.end_acquire()
                     gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
 
         hitcount, pdf = gpu_pdf.get_pdfs()
@@ -177,7 +184,9 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
     gpu_photons.propagate(gpu_detector, rng_states,
                           nthreads_per_block, max_blocks)
     gpu_daq = gpu.GPUDaq(gpu_detector)
-    data_ev_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get()
+    gpu_daq.begin_acquire()
+    gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get()
+    data_ev_channels = gpu_daq.end_acquire()
     
     # Setup PDF evaluation
     gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq)
@@ -207,9 +216,11 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
             gpu_photons.propagate(gpu_detector, rng_states,
                                   nthreads_per_block, max_blocks)
             for gpu_photon_slice in gpu_photons.iterate_copies():
+                gpu_daq.begin_acquire()
                 gpu_photon_slice = gpu_photon_slice.select(event.SURFACE_DETECT)
-                gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
-                                               nthreads_per_block, max_blocks)
+                gpu_daq.acquire(gpu_photon_slice, rng_states,
+                                nthreads_per_block, max_blocks)
+                gpu_channels = gpu_daq.end_acquire()
                 gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)
 
         cuda.Context.get_current().synchronize()        
@@ -233,8 +244,7 @@ if __name__ == '__main__':
     if len(sys.argv) > 1:
         tests = sys.argv[1:] # unless test names given on command line
 
-    detector = demo.detector()
-    detector.build(bits=11)
+    detector = create_geometry_from_obj(demo.detector())
 
     context = gpu.create_cuda_context()
     gpu_detector = gpu.GPUDetector(detector)
diff --git a/chroma/cuda/geometry.h b/chroma/cuda/geometry.h
index 0735792..3db6dad 100644
--- a/chroma/cuda/geometry.h
+++ b/chroma/cuda/geometry.h
@@ -22,7 +22,7 @@ __device__ uint4 read_skip_l1(uint4 *ptr)
 __device__ Node
 get_node(Geometry *geometry, const unsigned int &i)
 {
-    uint4 node = read_skip_l1(geometry->nodes + i);
+    uint4 node = geometry->nodes[i];
     Node node_struct;
 
     if (node.x == 0) {
diff --git a/chroma/generator/g4gen.py b/chroma/generator/g4gen.py
index 0132381..0b8169d 100644
--- a/chroma/generator/g4gen.py
+++ b/chroma/generator/g4gen.py
@@ -3,6 +3,7 @@ from chroma.generator.mute import *
 import pyublas
 import numpy as np
 from chroma.event import Photons, Vertex
+from chroma.tools import argsort_direction
 
 g4mute()
 from Geant4 import *
@@ -75,7 +76,7 @@ class G4Generator(object):
         g4material.SetMaterialPropertiesTable(prop_table)
         return g4material
 
-    def _extract_photons_from_tracking_action(self):
+    def _extract_photons_from_tracking_action(self, sort=True):
         n = self.tracking_action.GetNumPhotons()        
         pos = np.zeros(shape=(n,3), dtype=np.float32)
         pos[:,0] = self.tracking_action.GetX()
@@ -96,6 +97,14 @@ class G4Generator(object):
 
         t0 = self.tracking_action.GetT0().astype(np.float32)
 
+        if sort:
+            reorder = argsort_direction(dir)
+            pos = pos[reorder]
+            dir = dir[reorder]
+            pol = pol[reorder]
+            wavelengths = wavelengths[reorder]
+            t0 = t0[reorder]
+
         return Photons(pos, dir, pol, wavelengths, t0)
 
     def generate_photons(self, vertices):
diff --git a/chroma/tools.py b/chroma/tools.py
index 1e111ff..3a930c1 100644
--- a/chroma/tools.py
+++ b/chroma/tools.py
@@ -153,3 +153,40 @@ def memoize_method_with_dictionary_arg(func):
             return result
     return lookup
 
+def interleave3d(arr, bits):
+    """
+    Interleave the bits of quantized three-dimensional points in space.
+
+    Example
+        >>> interleave(np.identity(3, dtype=np.int))
+        array([4, 2, 1], dtype=uint64)
+    """
+    if len(arr.shape) != 2 or arr.shape[1] != 3:
+        raise Exception('shape mismatch')
+
+    z = np.zeros(arr.shape[0], dtype=np.uint64)
+    for i in range(bits):
+        z |= (arr[:,2] & 1 << i) << (2*i) | \
+             (arr[:,1] & 1 << i) << (2*i+1) | \
+             (arr[:,0] & 1 << i) << (2*i+2)
+    return z
+
+def argsort_direction(dir):
+    '''Return the indicies required to resort the ndarray(shape=(n,3))
+    array of direction vectors based on a morton ordering of the
+    spherical coordinates.
+
+    This is used to organize photons in such a way that they enhance
+    the cache benefits of the GPU.
+    '''
+
+    bits = 16
+    MAXINT = 2**bits - 1
+    theta = (np.arccos(dir[:,2]) / np.pi * MAXINT).astype(np.uint32)
+    phi = ((np.arctan2(dir[:,1], dir[:,0]) / np.pi / 2.0 + 0.5) * MAXINT).astype(np.uint32)
+    
+    morton = np.zeros(len(dir), dtype=np.uint32)
+    for i in xrange(bits):
+        morton |= (theta & 1 << i) << (i) | \
+                  (phi & 1 << i) << (i+1)
+    return np.argsort(morton)
author	Stan Seibert <stan@mtrr.org>	2012-02-17 18:26:41 -0500
committer	tlatorre <tlatorre@uchicago.edu>	2021-05-09 08:42:38 -0700
commit	cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3 (patch)
tree	017fb4cc93f526dfc61951f282fb9f26526fd478
parent	e2bf16de5d94a7b438e9c1af52f0bb4dc632b35f (diff)
download	chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.tar.gz chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.tar.bz2 chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.zip