summaryrefslogtreecommitdiff
path: root/gpu.py
diff options
context:
space:
mode:
Diffstat (limited to 'gpu.py')
-rw-r--r--gpu.py89
1 files changed, 59 insertions, 30 deletions
diff --git a/gpu.py b/gpu.py
index 8de0f90..a21bc38 100644
--- a/gpu.py
+++ b/gpu.py
@@ -15,6 +15,28 @@ from color import map_to_color
cuda.init()
+def chunk_iterator(nelements, nthreads_per_block, max_blocks):
+ '''Iterator that yields tuples with the values requried to process
+ a long array in multiple kernel passes on the GPU.
+
+ Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration)
+
+ >>> list(chunk_iterator(300, 32, 2))
+ [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
+ '''
+ first = 0
+ while first < nelements:
+ elements_left = nelements - first
+ blocks = elements_left // nthreads_per_block
+ if elements_left % nthreads_per_block != 0:
+ blocks += 1 #Round up only if needed
+ blocks = min(max_blocks, blocks)
+ elements_this_round = min(elements_left, blocks * nthreads_per_block)
+
+ yield (first, elements_this_round, blocks)
+ first += elements_this_round
+
+
def format_size(size):
if size < 1e3:
return '%.1f%s' % (size, ' ')
@@ -54,8 +76,8 @@ class GPU(object):
self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
- self.nblocks = 64
- self.max_nthreads = 200000
+ self.nthread_per_block = 64
+ self.max_blocks = 1024
self.daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
self.daq_funcs = CUDAFuncs(self.daq_module,
['reset_earliest_time_int', 'run_daq',
@@ -166,15 +188,16 @@ class GPU(object):
solid_ids_gpu = gpuarray.to_gpu(np.array(solid_ids, dtype=np.int32))
solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
- self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblocks,1,1), grid=(solid_ids_gpu.size//self.nblocks+1,1))
+ self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblock,1,1), grid=(solid_ids_gpu.size//self.nthread_per_block+1,1))
self.context.synchronize()
def setup_propagate(self, seed=1):
- self.rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
- self.prop_funcs.init_rng(np.int32(self.max_nthreads), self.rng_states_gpu,
+ self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks
+ * sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+ self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu,
np.uint64(seed), np.uint64(0),
- block=(self.nblocks,1,1),
- grid=(self.max_nthreads//self.nblocks+1,1))
+ block=(self.nthread_per_block,1,1),
+ grid=(self.max_blocks,1))
#self.context.synchronize()
def load_photons(self, pos, dir, pol, wavelength, t0,
@@ -193,7 +216,6 @@ class GPU(object):
if any. -1 if no triangle was hit in the last step
'''
self.nphotons = len(pos)
- assert self.nphotons < self.max_nthreads
assert len(dir) == self.nphotons
assert len(pol) == self.nphotons
assert len(wavelength) == self.nphotons
@@ -204,6 +226,7 @@ class GPU(object):
self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))
self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))
self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32))
+ self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)
if histories is not None:
self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32))
@@ -216,8 +239,6 @@ class GPU(object):
self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32)
self.last_hit_triangles_gpu.fill(-1)
- self.per_photon_kernel_config = {'block': (self.nblocks,1,1),
- 'grid': (self.nphotons//self.nblocks+1,1)}
def propagate(self, max_steps=10):
'''Propagate photons on GPU to termination or max_steps, whichever comes first.
@@ -225,13 +246,19 @@ class GPU(object):
May be called repeatedly without reloading photon information if single-stepping
through photon history.
'''
- self.prop_funcs.propagate(np.int32(self.nphotons),
- self.rng_states_gpu, self.positions_gpu,
- self.directions_gpu,
- self.wavelengths_gpu, self.polarizations_gpu,
- self.times_gpu, self.histories_gpu,
- self.last_hit_triangles_gpu,
- np.int32(max_steps), **self.per_photon_kernel_config)
+ for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+ self.prop_funcs.propagate(np.int32(first_photon),
+ np.int32(photons_this_round),
+ self.photon_offsets_gpu,
+ self.rng_states_gpu, self.positions_gpu,
+ self.directions_gpu,
+ self.wavelengths_gpu, self.polarizations_gpu,
+ self.times_gpu, self.histories_gpu,
+ self.last_hit_triangles_gpu,
+ np.int32(max_steps),
+ block=(self.nthread_per_block,1,1),
+ grid=(blocks, 1))
+
#self.context.synchronize()
def get_photons(self):
@@ -257,24 +284,26 @@ class GPU(object):
self.daq_funcs.reset_earliest_time_int(np.float32(1e9),
np.int32(len(self.earliest_time_int_gpu)),
self.earliest_time_int_gpu,
- block=(self.nblocks,1,1),
- grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1))
+ block=(self.nthread_per_block,1,1),
+ grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
#self.context.synchronize()
- self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2),
- np.float32(self.daq_pmt_rms),
- np.int32(len(self.positions_gpu)),
- self.times_gpu, self.histories_gpu,
- self.last_hit_triangles_gpu,
- self.solid_id_map_gpu,
- np.int32(len(self.earliest_time_int_gpu)),
- self.earliest_time_int_gpu,
- block=(self.nblocks,1,1), grid=(self.nphotons//self.nblocks+1,1))
+ for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+ self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2),
+ np.float32(self.daq_pmt_rms),
+ np.int32(first_photon),
+ np.int32(photons_this_round),
+ self.times_gpu, self.histories_gpu,
+ self.last_hit_triangles_gpu,
+ self.solid_id_map_gpu,
+ np.int32(len(self.earliest_time_int_gpu)),
+ self.earliest_time_int_gpu,
+ block=(self.nthread_per_block,1,1), grid=(blocks,1))
#self.context.synchronize()
self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)),
self.earliest_time_int_gpu,
self.earliest_time_gpu,
- block=(self.nblocks,1,1),
- grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1))
+ block=(self.nthread_per_block,1,1),
+ grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
#self.context.synchronize()