summaryrefslogtreecommitdiff
path: root/gpu.py
diff options
context:
space:
mode:
Diffstat (limited to 'gpu.py')
-rw-r--r--gpu.py84
1 files changed, 41 insertions, 43 deletions
diff --git a/gpu.py b/gpu.py
index 4eb2fab..eeb759e 100644
--- a/gpu.py
+++ b/gpu.py
@@ -18,6 +18,9 @@ import event
cuda.init()
+def to_float3(arr):
+ return arr.astype(np.float32).view(gpuarray.vec.float3)
+
def boolean_argsort(condition):
'''Returns two arrays of indicies indicating which elements of the
boolean array `condition` should be exchanged so that all of the
@@ -84,28 +87,31 @@ class CUDAFuncs(object):
setattr(self, name, cuda_module.get_function(name))
class GPU(object):
- def __init__(self, device_id=None):
+ def __init__(self, device_id=None, nthreads_per_block=64, max_blocks=1024):
'''Initialize a GPU context on the specified device. If device_id==None,
the default device is used.'''
if device_id is None:
self.context = make_default_context()
else:
- print 'device_id = %s' % device_id
device = cuda.Device(device_id)
self.context = device.make_context()
+
print 'device %s' % self.context.get_device().name()
+ self.nthreads_per_block = nthreads_per_block
+ self.max_blocks = max_blocks
+
self.context.set_cache_config(cuda.func_cache.PREFER_L1)
cuda_options = ['-I' + dirname(chroma.src.__file__), '--use_fast_math', '--ptxas-options=-v']
self.module = SourceModule(chroma.src.kernel, options=cuda_options, no_extern_c=True)
+ self.kernels = CUDAFuncs(self.module, ['ray_trace', 'ray_trace_alpha', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng'])
+
self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])
- self.nthread_per_block = 64
- self.max_blocks = 1024
self.daq_module = SourceModule(chroma.src.daq, options=cuda_options, no_extern_c=True)
self.daq_funcs = CUDAFuncs(self.daq_module,
['reset_earliest_time_int', 'run_daq',
@@ -121,7 +127,7 @@ class GPU(object):
print format_array('node_map_end', self.node_map_end_gpu)
print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
- def load_geometry(self, geometry):
+ def load_geometry(self, geometry, print_usage=True):
if not hasattr(geometry, 'mesh'):
print 'WARNING: building geometry with 8-bits'
geometry.build(bits=8)
@@ -159,7 +165,7 @@ class GPU(object):
self.surfaces.append(surface)
- self.vertices_gpu = gpuarray.to_gpu(geometry.mesh.vertices.astype(np.float32).view(gpuarray.vec.float3))
+ self.vertices_gpu = gpuarray.to_gpu(to_float3(geometry.mesh.vertices))
triangles = \
np.empty(len(geometry.mesh.triangles), dtype=gpuarray.vec.uint4)
@@ -169,17 +175,9 @@ class GPU(object):
triangles['w'] = ((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8)
self.triangles_gpu = gpuarray.to_gpu(triangles)
- lower_bounds_float3 = np.empty(geometry.lower_bounds.shape[0], dtype=gpuarray.vec.float3)
- lower_bounds_float3['x'] = geometry.lower_bounds[:,0]
- lower_bounds_float3['y'] = geometry.lower_bounds[:,1]
- lower_bounds_float3['z'] = geometry.lower_bounds[:,2]
- self.lower_bounds_gpu = gpuarray.to_gpu(lower_bounds_float3)
+ self.lower_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.lower_bounds))
- upper_bounds_float3 = np.empty(geometry.upper_bounds.shape[0], dtype=gpuarray.vec.float3)
- upper_bounds_float3['x'] = geometry.upper_bounds[:,0]
- upper_bounds_float3['y'] = geometry.upper_bounds[:,1]
- upper_bounds_float3['z'] = geometry.upper_bounds[:,2]
- self.upper_bounds_gpu = gpuarray.to_gpu(upper_bounds_float3)
+ self.upper_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.upper_bounds))
self.colors_gpu = gpuarray.to_gpu(geometry.colors.astype(np.uint32))
self.node_map_gpu = gpuarray.to_gpu(geometry.node_map.astype(np.uint32))
@@ -199,14 +197,11 @@ class GPU(object):
self.geometry = geometry
- print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]))
-
- print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1))
-
- print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1))
-
-
- self.print_device_usage()
+ if print_usage:
+ print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]))
+ print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1))
+ print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1))
+ self.print_device_usage()
def reset_colors(self):
self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32))
@@ -215,22 +210,22 @@ class GPU(object):
solid_hit_gpu = gpuarray.to_gpu(np.array(solid_hit, dtype=np.bool))
solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
- for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthread_per_block, self.max_blocks):
+ for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthreads_per_block, self.max_blocks):
self.geo_funcs.color_solids(np.int32(first_triangle),
np.int32(triangles_this_round),
self.solid_id_map_gpu,
solid_hit_gpu,
solid_colors_gpu,
- block=(self.nthread_per_block,1,1),
+ block=(self.nthreads_per_block,1,1),
grid=(blocks,1))
self.context.synchronize()
def setup_propagate(self, seed=1):
- self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks
+ self.rng_states_gpu = cuda.mem_alloc(self.nthreads_per_block * self.max_blocks
* sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
- self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu,
+ self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthreads_per_block), self.rng_states_gpu,
np.uint64(seed), np.uint64(0),
- block=(self.nthread_per_block,1,1),
+ block=(self.nthreads_per_block,1,1),
grid=(self.max_blocks,1))
#self.context.synchronize()
@@ -244,13 +239,16 @@ class GPU(object):
assert len(photons.directions) == self.nphotons
assert len(photons.polarizations) == self.nphotons
assert len(photons.wavelengths) == self.nphotons
- assert len(photons.times) == self.nphotons
- self.positions_gpu = gpuarray.to_gpu(photons.positions.astype(np.float32).view(gpuarray.vec.float3))
- self.directions_gpu = gpuarray.to_gpu(photons.directions.astype(np.float32).view(gpuarray.vec.float3))
- self.polarizations_gpu = gpuarray.to_gpu(photons.polarizations.astype(np.float32).view(gpuarray.vec.float3))
+ self.positions_gpu = gpuarray.to_gpu(to_float3(photons.positions))#.astype(np.float32).view(gpuarray.vec.float3))
+ self.directions_gpu = gpuarray.to_gpu(to_float3(photons.directions))#.astype(np.float32).view(gpuarray.vec.float3))
+ self.polarizations_gpu = gpuarray.to_gpu(to_float3(photons.polarizations))#.astype(np.float32).view(gpuarray.vec.float3))
self.wavelengths_gpu = gpuarray.to_gpu(photons.wavelengths.astype(np.float32))
- self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32))
+
+ if photons.times is not None:
+ self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32))
+ else:
+ self.times_gpu = gpuarray.zeros(self.nphotons, dtype=np.float32)
if photons.histories is not None:
self.histories_gpu = gpuarray.to_gpu(photons.histories.astype(np.uint32))
@@ -282,12 +280,12 @@ class GPU(object):
while step < max_steps:
## Just finish the rest of the steps if the # of photons is low
- if nphotons < self.nthread_per_block * 16 * 8:
+ if nphotons < self.nthreads_per_block * 16 * 8:
nsteps = max_steps - step
else:
nsteps = 1
- for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks):
+ for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthreads_per_block, self.max_blocks):
self.prop_funcs.propagate(np.int32(first_photon),
np.int32(photons_this_round),
input_queue_gpu[1:],
@@ -298,7 +296,7 @@ class GPU(object):
self.times_gpu, self.histories_gpu,
self.last_hit_triangles_gpu,
np.int32(nsteps),
- block=(self.nthread_per_block,1,1),
+ block=(self.nthreads_per_block,1,1),
grid=(blocks, 1))
step += nsteps
@@ -341,13 +339,13 @@ class GPU(object):
self.daq_funcs.reset_earliest_time_int(np.float32(1e9),
np.int32(len(self.earliest_time_int_gpu)),
self.earliest_time_int_gpu,
- block=(self.nthread_per_block,1,1),
- grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
+ block=(self.nthreads_per_block,1,1),
+ grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1))
self.channel_q_gpu.fill(0)
self.channel_history_gpu.fill(0)
#self.context.synchronize()
- for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+ for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthreads_per_block, self.max_blocks):
self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2),
np.float32(self.daq_pmt_rms),
np.int32(first_photon),
@@ -359,13 +357,13 @@ class GPU(object):
self.earliest_time_int_gpu,
self.channel_q_gpu,
self.channel_history_gpu,
- block=(self.nthread_per_block,1,1), grid=(blocks,1))
+ block=(self.nthreads_per_block,1,1), grid=(blocks,1))
#self.context.synchronize()
self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)),
self.earliest_time_int_gpu,
self.earliest_time_gpu,
- block=(self.nthread_per_block,1,1),
- grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
+ block=(self.nthreads_per_block,1,1),
+ grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1))
if 'profile' in __builtins__:
self.context.synchronize()