summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStan Seibert <stan@mtrr.org>2011-08-08 20:16:33 -0400
committerStan Seibert <stan@mtrr.org>2011-08-08 20:16:33 -0400
commitedd78c209c88a652691cc5602b37d79085fee795 (patch)
treedfd12bb15893dbbea5d5c907d12c9d01f6fee9a0
parent2e80fdc7e7dbb84da0f37bb159e08ed618fe15f5 (diff)
downloadchroma-edd78c209c88a652691cc5602b37d79085fee795.tar.gz
chroma-edd78c209c88a652691cc5602b37d79085fee795.tar.bz2
chroma-edd78c209c88a652691cc5602b37d79085fee795.zip
propagate() takes an array of photon offsets and a range of
offsets to load. Now events with more photons than RNG states can be propagated through multiple kernel calls. Also lays the groundwork for consolidating photons between steps to reduce the amount of propagation work required.
-rw-r--r--gpu.py89
-rw-r--r--src/daq.cu9
-rw-r--r--src/kernel.cu34
3 files changed, 83 insertions, 49 deletions
diff --git a/gpu.py b/gpu.py
index 8de0f90..a21bc38 100644
--- a/gpu.py
+++ b/gpu.py
@@ -15,6 +15,28 @@ from color import map_to_color
cuda.init()
+def chunk_iterator(nelements, nthreads_per_block, max_blocks):
+ '''Iterator that yields tuples with the values requried to process
+ a long array in multiple kernel passes on the GPU.
+
+ Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration)
+
+ >>> list(chunk_iterator(300, 32, 2))
+ [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
+ '''
+ first = 0
+ while first < nelements:
+ elements_left = nelements - first
+ blocks = elements_left // nthreads_per_block
+ if elements_left % nthreads_per_block != 0:
+ blocks += 1 #Round up only if needed
+ blocks = min(max_blocks, blocks)
+ elements_this_round = min(elements_left, blocks * nthreads_per_block)
+
+ yield (first, elements_this_round, blocks)
+ first += elements_this_round
+
+
def format_size(size):
if size < 1e3:
return '%.1f%s' % (size, ' ')
@@ -54,8 +76,8 @@ class GPU(object):
self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
- self.nblocks = 64
- self.max_nthreads = 200000
+ self.nthread_per_block = 64
+ self.max_blocks = 1024
self.daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
self.daq_funcs = CUDAFuncs(self.daq_module,
['reset_earliest_time_int', 'run_daq',
@@ -166,15 +188,16 @@ class GPU(object):
solid_ids_gpu = gpuarray.to_gpu(np.array(solid_ids, dtype=np.int32))
solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
- self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblocks,1,1), grid=(solid_ids_gpu.size//self.nblocks+1,1))
+ self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblock,1,1), grid=(solid_ids_gpu.size//self.nthread_per_block+1,1))
self.context.synchronize()
def setup_propagate(self, seed=1):
- self.rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
- self.prop_funcs.init_rng(np.int32(self.max_nthreads), self.rng_states_gpu,
+ self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks
+ * sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+ self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu,
np.uint64(seed), np.uint64(0),
- block=(self.nblocks,1,1),
- grid=(self.max_nthreads//self.nblocks+1,1))
+ block=(self.nthread_per_block,1,1),
+ grid=(self.max_blocks,1))
#self.context.synchronize()
def load_photons(self, pos, dir, pol, wavelength, t0,
@@ -193,7 +216,6 @@ class GPU(object):
if any. -1 if no triangle was hit in the last step
'''
self.nphotons = len(pos)
- assert self.nphotons < self.max_nthreads
assert len(dir) == self.nphotons
assert len(pol) == self.nphotons
assert len(wavelength) == self.nphotons
@@ -204,6 +226,7 @@ class GPU(object):
self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))
self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))
self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32))
+ self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)
if histories is not None:
self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32))
@@ -216,8 +239,6 @@ class GPU(object):
self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32)
self.last_hit_triangles_gpu.fill(-1)
- self.per_photon_kernel_config = {'block': (self.nblocks,1,1),
- 'grid': (self.nphotons//self.nblocks+1,1)}
def propagate(self, max_steps=10):
'''Propagate photons on GPU to termination or max_steps, whichever comes first.
@@ -225,13 +246,19 @@ class GPU(object):
May be called repeatedly without reloading photon information if single-stepping
through photon history.
'''
- self.prop_funcs.propagate(np.int32(self.nphotons),
- self.rng_states_gpu, self.positions_gpu,
- self.directions_gpu,
- self.wavelengths_gpu, self.polarizations_gpu,
- self.times_gpu, self.histories_gpu,
- self.last_hit_triangles_gpu,
- np.int32(max_steps), **self.per_photon_kernel_config)
+ for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+ self.prop_funcs.propagate(np.int32(first_photon),
+ np.int32(photons_this_round),
+ self.photon_offsets_gpu,
+ self.rng_states_gpu, self.positions_gpu,
+ self.directions_gpu,
+ self.wavelengths_gpu, self.polarizations_gpu,
+ self.times_gpu, self.histories_gpu,
+ self.last_hit_triangles_gpu,
+ np.int32(max_steps),
+ block=(self.nthread_per_block,1,1),
+ grid=(blocks, 1))
+
#self.context.synchronize()
def get_photons(self):
@@ -257,24 +284,26 @@ class GPU(object):
self.daq_funcs.reset_earliest_time_int(np.float32(1e9),
np.int32(len(self.earliest_time_int_gpu)),
self.earliest_time_int_gpu,
- block=(self.nblocks,1,1),
- grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1))
+ block=(self.nthread_per_block,1,1),
+ grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
#self.context.synchronize()
- self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2),
- np.float32(self.daq_pmt_rms),
- np.int32(len(self.positions_gpu)),
- self.times_gpu, self.histories_gpu,
- self.last_hit_triangles_gpu,
- self.solid_id_map_gpu,
- np.int32(len(self.earliest_time_int_gpu)),
- self.earliest_time_int_gpu,
- block=(self.nblocks,1,1), grid=(self.nphotons//self.nblocks+1,1))
+ for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+ self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2),
+ np.float32(self.daq_pmt_rms),
+ np.int32(first_photon),
+ np.int32(photons_this_round),
+ self.times_gpu, self.histories_gpu,
+ self.last_hit_triangles_gpu,
+ self.solid_id_map_gpu,
+ np.int32(len(self.earliest_time_int_gpu)),
+ self.earliest_time_int_gpu,
+ block=(self.nthread_per_block,1,1), grid=(blocks,1))
#self.context.synchronize()
self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)),
self.earliest_time_int_gpu,
self.earliest_time_gpu,
- block=(self.nblocks,1,1),
- grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1))
+ block=(self.nthread_per_block,1,1),
+ grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
#self.context.synchronize()
diff --git a/src/daq.cu b/src/daq.cu
index 2b5f9b4..7c5e6a5 100644
--- a/src/daq.cu
+++ b/src/daq.cu
@@ -30,6 +30,7 @@ __global__ void reset_earliest_time_int(float maxtime, int ntime_ints, unsigned
__global__ void run_daq(curandState *s, unsigned int detection_state,
float time_rms,
+ int first_photon,
int nphotons, float *photon_times,
unsigned int *photon_histories,
int *last_hit_triangles, int *solid_map,
@@ -41,17 +42,17 @@ __global__ void run_daq(curandState *s, unsigned int detection_state,
if (id < nphotons)
{
curandState rng = s[id];
-
- int triangle_id = last_hit_triangles[id];
+ int photon_id = id + first_photon;
+ int triangle_id = last_hit_triangles[photon_id];
if (triangle_id > -1)
{
int solid_id = solid_map[triangle_id];
- int history = photon_histories[id];
+ int history = photon_histories[photon_id];
if (solid_id < nsolids && (history & detection_state))
{
- float time = photon_times[id] + curand_normal(&rng) * time_rms;
+ float time = photon_times[photon_id] + curand_normal(&rng) * time_rms;
unsigned int time_int = float_to_sortable_int(time);
atomicMin(earliest_time_int + solid_id, time_int);
diff --git a/src/kernel.cu b/src/kernel.cu
index 82efe88..17b829c 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -260,7 +260,9 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
} // ray_trace
-__global__ void propagate(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, unsigned int *histories, int *last_hit_triangles, int max_steps)
+__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states,
+ float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times,
+ unsigned int *histories, int *last_hit_triangles, int max_steps)
{
int id = blockIdx.x*blockDim.x + threadIdx.x;
@@ -269,16 +271,18 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio
curandState rng = rng_states[id];
+ int photon_id = photon_offsets[first_photon + id];
+
Photon p;
- p.position = positions[id];
- p.direction = directions[id];
+ p.position = positions[photon_id];
+ p.direction = directions[photon_id];
p.direction /= norm(p.direction);
- p.polarization = polarizations[id];
+ p.polarization = polarizations[photon_id];
p.polarization /= norm(p.polarization);
- p.wavelength = wavelengths[id];
- p.time = times[id];
- p.last_hit_triangle = last_hit_triangles[id];
- p.history = histories[id];
+ p.wavelength = wavelengths[photon_id];
+ p.time = times[photon_id];
+ p.last_hit_triangle = last_hit_triangles[photon_id];
+ p.history = histories[photon_id];
if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB))
return;
@@ -321,13 +325,13 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio
} // while (steps < max_steps)
rng_states[id] = rng;
- positions[id] = p.position;
- directions[id] = p.direction;
- polarizations[id] = p.polarization;
- wavelengths[id] = p.wavelength;
- times[id] = p.time;
- histories[id] = p.history;
- last_hit_triangles[id] = p.last_hit_triangle;
+ positions[photon_id] = p.position;
+ directions[photon_id] = p.direction;
+ polarizations[photon_id] = p.polarization;
+ wavelengths[photon_id] = p.wavelength;
+ times[photon_id] = p.time;
+ histories[photon_id] = p.history;
+ last_hit_triangles[photon_id] = p.last_hit_triangle;
} // propagate