summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStan Seibert <stan@mtrr.org>2012-02-17 18:26:41 -0500
committertlatorre <tlatorre@uchicago.edu>2021-05-09 08:42:38 -0700
commitcc386a0dbad610f4c2ed6a79ed1cfc2af57aade3 (patch)
tree017fb4cc93f526dfc61951f282fb9f26526fd478
parente2bf16de5d94a7b438e9c1af52f0bb4dc632b35f (diff)
downloadchroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.tar.gz
chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.tar.bz2
chroma-cc386a0dbad610f4c2ed6a79ed1cfc2af57aade3.zip
Add an argsort_direction() function to chroma.tools and use it to
group photons so that they take similar paths on the GPU. argsort_direction() morton-orders an array of normalized direction vectors according to their spherical coordinates. Photons sorted in this way tend to follow similar paths through a detector geometry, which enhances cache locality. As a result, get_node() uses the GPU L1 cache again, with good results.
-rwxr-xr-xchroma/benchmark.py26
-rw-r--r--chroma/cuda/geometry.h2
-rw-r--r--chroma/generator/g4gen.py11
-rw-r--r--chroma/tools.py37
4 files changed, 66 insertions, 10 deletions
diff --git a/chroma/benchmark.py b/chroma/benchmark.py
index 06a8eba..3c3e919 100755
--- a/chroma/benchmark.py
+++ b/chroma/benchmark.py
@@ -14,6 +14,7 @@ from chroma import generator
from chroma import tools
from chroma.transform import normalize
from chroma.demo.optics import water
+from chroma.loader import create_geometry_from_obj
# Generator processes need to fork BEFORE the GPU context is setup
g4generator = generator.photon.G4ParallelGenerator(4, water)
@@ -29,7 +30,9 @@ def intersect(gpu_geometry, number=100, nphotons=500000, nthreads_per_block=64,
run_times = []
for i in tools.progress(range(number)):
pos = ga.zeros(nphotons, dtype=ga.vec.float3)
- dir = ga.to_gpu(gpu.to_float3(sample.uniform_sphere(nphotons)))
+ dir = sample.uniform_sphere(nphotons)
+ reorder = tools.argsort_direction(dir)
+ dir = ga.to_gpu(gpu.to_float3(dir[reorder]))
t0 = time.time()
gpu_funcs.distance_to_mesh(np.int32(pos.size), pos, dir, gpu_geometry.gpudata, distances_gpu, block=(nthreads_per_block,1,1), grid=(pos.size//nthreads_per_block+1,1))
@@ -73,6 +76,8 @@ def propagate(gpu_detector, number=10, nphotons=500000, nthreads_per_block=64,
for i in tools.progress(range(number)):
pos = np.zeros((nphotons,3))
dir = sample.uniform_sphere(nphotons)
+ reorder = tools.argsort_direction(dir)
+ dir = dir[reorder]
pol = normalize(np.cross(sample.uniform_sphere(nphotons), dir))
wavelengths = np.random.uniform(400,800,size=nphotons)
photons = event.Photons(pos, dir, pol, wavelengths)
@@ -132,8 +137,10 @@ def pdf(gpu_detector, npdfs=10, nevents=100, nreps=16, ndaq=1,
nthreads_per_block, max_blocks)
for gpu_photon_slice in gpu_photons.iterate_copies():
for idaq in xrange(ndaq):
- gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
- nthreads_per_block, max_blocks)
+ gpu_daq.begin_acquire()
+ gpu_daq.acquire(gpu_photon_slice, rng_states,
+ nthreads_per_block, max_blocks)
+ gpu_channels = gpu_daq.end_acquire()
gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
hitcount, pdf = gpu_pdf.get_pdfs()
@@ -177,7 +184,9 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
gpu_photons.propagate(gpu_detector, rng_states,
nthreads_per_block, max_blocks)
gpu_daq = gpu.GPUDaq(gpu_detector)
- data_ev_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get()
+ gpu_daq.begin_acquire()
+ gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get()
+ data_ev_channels = gpu_daq.end_acquire()
# Setup PDF evaluation
gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq)
@@ -207,9 +216,11 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
gpu_photons.propagate(gpu_detector, rng_states,
nthreads_per_block, max_blocks)
for gpu_photon_slice in gpu_photons.iterate_copies():
+ gpu_daq.begin_acquire()
gpu_photon_slice = gpu_photon_slice.select(event.SURFACE_DETECT)
- gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
- nthreads_per_block, max_blocks)
+ gpu_daq.acquire(gpu_photon_slice, rng_states,
+ nthreads_per_block, max_blocks)
+ gpu_channels = gpu_daq.end_acquire()
gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)
cuda.Context.get_current().synchronize()
@@ -233,8 +244,7 @@ if __name__ == '__main__':
if len(sys.argv) > 1:
tests = sys.argv[1:] # unless test names given on command line
- detector = demo.detector()
- detector.build(bits=11)
+ detector = create_geometry_from_obj(demo.detector())
context = gpu.create_cuda_context()
gpu_detector = gpu.GPUDetector(detector)
diff --git a/chroma/cuda/geometry.h b/chroma/cuda/geometry.h
index 0735792..3db6dad 100644
--- a/chroma/cuda/geometry.h
+++ b/chroma/cuda/geometry.h
@@ -22,7 +22,7 @@ __device__ uint4 read_skip_l1(uint4 *ptr)
__device__ Node
get_node(Geometry *geometry, const unsigned int &i)
{
- uint4 node = read_skip_l1(geometry->nodes + i);
+ uint4 node = geometry->nodes[i];
Node node_struct;
if (node.x == 0) {
diff --git a/chroma/generator/g4gen.py b/chroma/generator/g4gen.py
index 0132381..0b8169d 100644
--- a/chroma/generator/g4gen.py
+++ b/chroma/generator/g4gen.py
@@ -3,6 +3,7 @@ from chroma.generator.mute import *
import pyublas
import numpy as np
from chroma.event import Photons, Vertex
+from chroma.tools import argsort_direction
g4mute()
from Geant4 import *
@@ -75,7 +76,7 @@ class G4Generator(object):
g4material.SetMaterialPropertiesTable(prop_table)
return g4material
- def _extract_photons_from_tracking_action(self):
+ def _extract_photons_from_tracking_action(self, sort=True):
n = self.tracking_action.GetNumPhotons()
pos = np.zeros(shape=(n,3), dtype=np.float32)
pos[:,0] = self.tracking_action.GetX()
@@ -96,6 +97,14 @@ class G4Generator(object):
t0 = self.tracking_action.GetT0().astype(np.float32)
+ if sort:
+ reorder = argsort_direction(dir)
+ pos = pos[reorder]
+ dir = dir[reorder]
+ pol = pol[reorder]
+ wavelengths = wavelengths[reorder]
+ t0 = t0[reorder]
+
return Photons(pos, dir, pol, wavelengths, t0)
def generate_photons(self, vertices):
diff --git a/chroma/tools.py b/chroma/tools.py
index 1e111ff..3a930c1 100644
--- a/chroma/tools.py
+++ b/chroma/tools.py
@@ -153,3 +153,40 @@ def memoize_method_with_dictionary_arg(func):
return result
return lookup
+def interleave3d(arr, bits):
+ """
+ Interleave the bits of quantized three-dimensional points in space.
+
+ Example
+ >>> interleave(np.identity(3, dtype=np.int))
+ array([4, 2, 1], dtype=uint64)
+ """
+ if len(arr.shape) != 2 or arr.shape[1] != 3:
+ raise Exception('shape mismatch')
+
+ z = np.zeros(arr.shape[0], dtype=np.uint64)
+ for i in range(bits):
+ z |= (arr[:,2] & 1 << i) << (2*i) | \
+ (arr[:,1] & 1 << i) << (2*i+1) | \
+ (arr[:,0] & 1 << i) << (2*i+2)
+ return z
+
+def argsort_direction(dir):
+ '''Return the indicies required to resort the ndarray(shape=(n,3))
+ array of direction vectors based on a morton ordering of the
+ spherical coordinates.
+
+ This is used to organize photons in such a way that they enhance
+ the cache benefits of the GPU.
+ '''
+
+ bits = 16
+ MAXINT = 2**bits - 1
+ theta = (np.arccos(dir[:,2]) / np.pi * MAXINT).astype(np.uint32)
+ phi = ((np.arctan2(dir[:,1], dir[:,0]) / np.pi / 2.0 + 0.5) * MAXINT).astype(np.uint32)
+
+ morton = np.zeros(len(dir), dtype=np.uint32)
+ for i in xrange(bits):
+ morton |= (theta & 1 << i) << (i) | \
+ (phi & 1 << i) << (i+1)
+ return np.argsort(morton)