diff options
author | Stan Seibert <stan@mtrr.org> | 2012-02-17 18:26:41 -0500 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2021-05-09 08:42:38 -0700 |
commit | cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3 (patch) | |
tree | 017fb4cc93f526dfc61951f282fb9f26526fd478 | |
parent | e2bf16de5d94a7b438e9c1af52f0bb4dc632b35f (diff) | |
download | chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.tar.gz chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.tar.bz2 chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.zip |
Add an argsort_direction() function to chroma.tools and use it to
group photons so that they take similar paths on the GPU.
argsort_direction() morton-orders an array of normalized direction
vectors according to their spherical coordinates. Photons sorted in
this way tend to follow similar paths through a detector geometry,
which enhances cache locality. As a result, get_node() uses the GPU
L1 cache again, with good results.
-rwxr-xr-x | chroma/benchmark.py | 26 | ||||
-rw-r--r-- | chroma/cuda/geometry.h | 2 | ||||
-rw-r--r-- | chroma/generator/g4gen.py | 11 | ||||
-rw-r--r-- | chroma/tools.py | 37 |
4 files changed, 66 insertions, 10 deletions
diff --git a/chroma/benchmark.py b/chroma/benchmark.py index 06a8eba..3c3e919 100755 --- a/chroma/benchmark.py +++ b/chroma/benchmark.py @@ -14,6 +14,7 @@ from chroma import generator from chroma import tools from chroma.transform import normalize from chroma.demo.optics import water +from chroma.loader import create_geometry_from_obj # Generator processes need to fork BEFORE the GPU context is setup g4generator = generator.photon.G4ParallelGenerator(4, water) @@ -29,7 +30,9 @@ def intersect(gpu_geometry, number=100, nphotons=500000, nthreads_per_block=64, run_times = [] for i in tools.progress(range(number)): pos = ga.zeros(nphotons, dtype=ga.vec.float3) - dir = ga.to_gpu(gpu.to_float3(sample.uniform_sphere(nphotons))) + dir = sample.uniform_sphere(nphotons) + reorder = tools.argsort_direction(dir) + dir = ga.to_gpu(gpu.to_float3(dir[reorder])) t0 = time.time() gpu_funcs.distance_to_mesh(np.int32(pos.size), pos, dir, gpu_geometry.gpudata, distances_gpu, block=(nthreads_per_block,1,1), grid=(pos.size//nthreads_per_block+1,1)) @@ -73,6 +76,8 @@ def propagate(gpu_detector, number=10, nphotons=500000, nthreads_per_block=64, for i in tools.progress(range(number)): pos = np.zeros((nphotons,3)) dir = sample.uniform_sphere(nphotons) + reorder = tools.argsort_direction(dir) + dir = dir[reorder] pol = normalize(np.cross(sample.uniform_sphere(nphotons), dir)) wavelengths = np.random.uniform(400,800,size=nphotons) photons = event.Photons(pos, dir, pol, wavelengths) @@ -132,8 +137,10 @@ def pdf(gpu_detector, npdfs=10, nevents=100, nreps=16, ndaq=1, nthreads_per_block, max_blocks) for gpu_photon_slice in gpu_photons.iterate_copies(): for idaq in xrange(ndaq): - gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states, - nthreads_per_block, max_blocks) + gpu_daq.begin_acquire() + gpu_daq.acquire(gpu_photon_slice, rng_states, + nthreads_per_block, max_blocks) + gpu_channels = gpu_daq.end_acquire() gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block) hitcount, pdf = gpu_pdf.get_pdfs() @@ -177,7 +184,9 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128, gpu_photons.propagate(gpu_detector, rng_states, nthreads_per_block, max_blocks) gpu_daq = gpu.GPUDaq(gpu_detector) - data_ev_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get() + gpu_daq.begin_acquire() + gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get() + data_ev_channels = gpu_daq.end_acquire() # Setup PDF evaluation gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq) @@ -207,9 +216,11 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128, gpu_photons.propagate(gpu_detector, rng_states, nthreads_per_block, max_blocks) for gpu_photon_slice in gpu_photons.iterate_copies(): + gpu_daq.begin_acquire() gpu_photon_slice = gpu_photon_slice.select(event.SURFACE_DETECT) - gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states, - nthreads_per_block, max_blocks) + gpu_daq.acquire(gpu_photon_slice, rng_states, + nthreads_per_block, max_blocks) + gpu_channels = gpu_daq.end_acquire() gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block) cuda.Context.get_current().synchronize() @@ -233,8 +244,7 @@ if __name__ == '__main__': if len(sys.argv) > 1: tests = sys.argv[1:] # unless test names given on command line - detector = demo.detector() - detector.build(bits=11) + detector = create_geometry_from_obj(demo.detector()) context = gpu.create_cuda_context() gpu_detector = gpu.GPUDetector(detector) diff --git a/chroma/cuda/geometry.h b/chroma/cuda/geometry.h index 0735792..3db6dad 100644 --- a/chroma/cuda/geometry.h +++ b/chroma/cuda/geometry.h @@ -22,7 +22,7 @@ __device__ uint4 read_skip_l1(uint4 *ptr) __device__ Node get_node(Geometry *geometry, const unsigned int &i) { - uint4 node = read_skip_l1(geometry->nodes + i); + uint4 node = geometry->nodes[i]; Node node_struct; if (node.x == 0) { diff --git a/chroma/generator/g4gen.py b/chroma/generator/g4gen.py index 0132381..0b8169d 100644 --- a/chroma/generator/g4gen.py +++ b/chroma/generator/g4gen.py @@ -3,6 +3,7 @@ from chroma.generator.mute import * import pyublas import numpy as np from chroma.event import Photons, Vertex +from chroma.tools import argsort_direction g4mute() from Geant4 import * @@ -75,7 +76,7 @@ class G4Generator(object): g4material.SetMaterialPropertiesTable(prop_table) return g4material - def _extract_photons_from_tracking_action(self): + def _extract_photons_from_tracking_action(self, sort=True): n = self.tracking_action.GetNumPhotons() pos = np.zeros(shape=(n,3), dtype=np.float32) pos[:,0] = self.tracking_action.GetX() @@ -96,6 +97,14 @@ class G4Generator(object): t0 = self.tracking_action.GetT0().astype(np.float32) + if sort: + reorder = argsort_direction(dir) + pos = pos[reorder] + dir = dir[reorder] + pol = pol[reorder] + wavelengths = wavelengths[reorder] + t0 = t0[reorder] + return Photons(pos, dir, pol, wavelengths, t0) def generate_photons(self, vertices): diff --git a/chroma/tools.py b/chroma/tools.py index 1e111ff..3a930c1 100644 --- a/chroma/tools.py +++ b/chroma/tools.py @@ -153,3 +153,40 @@ def memoize_method_with_dictionary_arg(func): return result return lookup +def interleave3d(arr, bits): + """ + Interleave the bits of quantized three-dimensional points in space. + + Example + >>> interleave(np.identity(3, dtype=np.int)) + array([4, 2, 1], dtype=uint64) + """ + if len(arr.shape) != 2 or arr.shape[1] != 3: + raise Exception('shape mismatch') + + z = np.zeros(arr.shape[0], dtype=np.uint64) + for i in range(bits): + z |= (arr[:,2] & 1 << i) << (2*i) | \ + (arr[:,1] & 1 << i) << (2*i+1) | \ + (arr[:,0] & 1 << i) << (2*i+2) + return z + +def argsort_direction(dir): + '''Return the indicies required to resort the ndarray(shape=(n,3)) + array of direction vectors based on a morton ordering of the + spherical coordinates. + + This is used to organize photons in such a way that they enhance + the cache benefits of the GPU. + ''' + + bits = 16 + MAXINT = 2**bits - 1 + theta = (np.arccos(dir[:,2]) / np.pi * MAXINT).astype(np.uint32) + phi = ((np.arctan2(dir[:,1], dir[:,0]) / np.pi / 2.0 + 0.5) * MAXINT).astype(np.uint32) + + morton = np.zeros(len(dir), dtype=np.uint32) + for i in xrange(bits): + morton |= (theta & 1 << i) << (i) | \ + (phi & 1 << i) << (i+1) + return np.argsort(morton) |